From 5a9849293c1f1a2dcc4d5d4ba1a8c75777f2be38 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sat, 14 Feb 2026 13:14:45 +0100
Subject: [PATCH 01/36] cmakelists eigen3

---
 samples/CMakeLists.txt | 6 +++---
 tests/CMakeLists.txt   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 3bbe11f..a7e73d7 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,10 +1,10 @@
 add_executable(extract extract.cpp)
-target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 add_executable(train train.cpp)
-target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 add_executable(sparsematch sparsematch.cpp)
-target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index acf86db..a211cac 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ FetchContent_MakeAvailable(approvaltests)
 
 find_package(GTest REQUIRED)
 add_executable(test_single_matching test_single_matching.cpp)
-target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main)
+target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen)
 
 add_test(NAME single_matching COMMAND test_single_matching)
 

From a4436e7660dade09c94c50e5cc2bdb5bb2213d18 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sat, 14 Feb 2026 14:09:18 +0100
Subject: [PATCH 02/36] decouple forest.hpp

---
 CMakeLists.txt                 |  10 +
 lib/gpc/buffer.hpp             |   4 +-
 lib/gpc/filter.hpp             |  24 +-
 lib/gpc/forest.cpp             | 374 ++++++++++++++++++++++++
 lib/gpc/forest.hpp             | 283 ++++++++++++++++++
 lib/gpc/inference.hpp          | 505 ---------------------------------
 samples/CMakeLists.txt         |   6 +-
 samples/sparsematch.cpp        |  10 +-
 tests/CMakeLists.txt           |   8 +-
 tests/test_single_matching.cpp |   8 +-
 10 files changed, 700 insertions(+), 532 deletions(-)
 create mode 100644 lib/gpc/forest.cpp
 create mode 100644 lib/gpc/forest.hpp
 delete mode 100644 lib/gpc/inference.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55e3851..535b559 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,16 @@ if(SSE)
     endif()
 endif()
 
+add_library(gpc_core 
+    lib/gpc/forest.cpp 
+)
+target_link_libraries(gpc_core 
+    PUBLIC 
+        Eigen3::Eigen 
+        ${PNG_LIBRARIES} 
+        Threads::Threads
+)
+target_include_directories(gpc_core PUBLIC lib)
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)
diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp
index dd6dce6..453ecaa 100644
--- a/lib/gpc/buffer.hpp
+++ b/lib/gpc/buffer.hpp
@@ -896,7 +896,7 @@ class RGBBuffer : public Buffer<RGBColor> {
         free(rowPointers);
     }
 };
-Buffer<RGBColor> getDisparityVisualization(
+inline Buffer<RGBColor> getDisparityVisualization(
     ndb::Buffer<uint8_t>& srcImg,
     std::vector<int>& validEstimateIndices,
     ndb::Buffer<float>& disparity) {
@@ -969,7 +969,7 @@ Buffer<RGBColor> getDisparityVisualization(
     }
     return dispVis;
 }
-Buffer<RGBColor> getDisparityVisualization(ndb::Buffer<uint8_t>& srcImg,
+inline Buffer<RGBColor> getDisparityVisualization(ndb::Buffer<uint8_t>& srcImg,
                                            std::vector<Support>& support) {
     float min_disparity = 0;
     float max_disparity = 128;
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
index 49384f0..2caecc2 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/filter.hpp
@@ -58,7 +58,7 @@ namespace ndb {
  * @param ind output array (indices into n of nonzero elements)
  * @param m   number of elements in output
  */
-__attribute__((noinline)) void arr2ind(const unsigned char* a,
+inline void arr2ind(const unsigned char* a,
                                        int n,
                                        int* ind,
                                        int* m) {
@@ -132,7 +132,7 @@ void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
  * @param end      end of the range
  * @param nThreads number of threads to use
  */
-void parFor(std::function<void(int, int)> const& f,
+inline void parFor(std::function<void(int, int)> const& f,
             int start,
             int end,
             int nThreads) {
@@ -165,7 +165,7 @@ void parFor(std::function<void(int, int)> const& f,
  * @param[in]  numThreads number of threads to use
  * @param      threshold  threshold to binarize sobel filter output
  */
-void sobelNaive(
+inline void sobelNaive(
     uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
     int thresholdSq = threshold * threshold;
@@ -217,7 +217,7 @@ void sobelNaive(
  * @param[in]  height   The height
  * @param[in]  numThreads number of threads to use
  */
-void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
+inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
     // allocate space for result
     uint8_t* ptr = in;
@@ -269,7 +269,7 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
  */
-void gpcFilterNaive(uint8_t* in,
+inline void gpcFilterNaive(uint8_t* in,
                     const uint8_t* grad,
                     uint32_t* gpc,
                     std::vector<int32_t> fastmask,
@@ -303,7 +303,7 @@ void gpcFilterNaive(uint8_t* in,
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
  */
-void gpcFilterTauNaive(uint8_t* in,
+inline void gpcFilterTauNaive(uint8_t* in,
                        const uint8_t* grad,
                        uint32_t* gpc,
                        std::vector<int32_t> fastmask,
@@ -336,7 +336,7 @@ void gpcFilterTauNaive(uint8_t* in,
    * @param[in]  height   The height
    * @param[in]  numThreads number of threads to use
    */
-void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
+inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #ifndef _INTRINSICS_SSE
     boxNaive(in, blurred, width, height);
@@ -464,7 +464,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
  * @param[in]  numThreads number of threads to use
  */
 
-void sobel(uint8_t* in,
+inline void sobel(uint8_t* in,
            uint8_t* blurred,
            int width,
            int height,
@@ -645,7 +645,7 @@ inline bool isAllZeros(__m128i xmm) {
  * @param height    The height of the image at pointer *in
  * @param numThreadsNumber of threads to use
  */
-void gpcFilter(uint8_t* in,
+inline void gpcFilter(uint8_t* in,
                const uint8_t* grad,
                uint32_t* gpc,
                std::vector<int32_t> fastmask,
@@ -731,7 +731,7 @@ void gpcFilter(uint8_t* in,
  * @param height    The height of the image at pointer *in
  * @param numThreads Number of threads to use
  */
-void gpcFilterTau(uint8_t* in,
+inline void gpcFilterTau(uint8_t* in,
                   const uint8_t* grad,
                   uint32_t* gpc,
                   std::vector<int32_t> fastmask,
@@ -816,7 +816,7 @@ void gpcFilterTau(uint8_t* in,
  * @param width   Width of the image at *in pointer
  * @param height  Heiht of the image at *in pointer
  */
-void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
+inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
     uint32_t val;
     uint32_t* dst;
     for (int y = 2; y < height - 3; y++) {
@@ -850,7 +850,7 @@ void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
  * @param width
  * @param height
  */
-void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
+inline void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #ifndef _INTRINSICS_SSE
     census5x5Naive(in, census, width, height);
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
new file mode 100644
index 0000000..0809951
--- /dev/null
+++ b/lib/gpc/forest.cpp
@@ -0,0 +1,374 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include <Eigen/Dense>
+#include <chrono>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+// GPC includes
+#include "gpc/Feature.hpp"
+#include "gpc/SintelOpticalFlow.hpp"
+#include "gpc/SintelStereo.hpp"
+#include "gpc/buffer.hpp"
+#include "gpc/filter.hpp"
+#include "gpc/hashmatch.hpp"
+#include "gpc/forest.hpp"
+
+
+namespace gpc {
+namespace inference {
+
+    /**
+     * @brief Sparse matching of a rectified and smoothed image pair. The src
+     * image is the left view, the tar image is the right view.
+     *
+     * @param src       Preprocessed source(left) image
+     * @param tar       Preprocessed target(right) image
+     * @param fastmask  forest mask of relative integer offsets.
+     * @param settings  selects epipolar mode, the matcher (sorting or
+     *                  hashtable) and the thread count of the filter.
+     * @return          matched (source point, target point) pairs
+     */
+std::vector<ndb::Correspondence> Forest::depthPriorFast(
+    PreprocessedImage& src,
+    PreprocessedImage& tar,
+    FilterMask& fastmask,
+    InferenceSettings& settings) {
+    // One descriptor per masked pixel, computed separately for each view.
+    std::vector<ndb::Descriptor> statesSrc = evalFastMaskOnSubsetSSE(
+        src.smooth, src.grad, src.mask, fastmask, settings);
+    std::vector<ndb::Descriptor> statesTar = evalFastMaskOnSubsetSSE(
+        tar.smooth, tar.grad, tar.mask, fastmask, settings);
+
+    // Epipolar mode: the upper 32 bits of the 64 bit state hold the row.
+    if (settings.epipolarMode_) {
+        for (auto& d : statesSrc) d.state |= uint64_t(d.point.y) << 32;
+        for (auto& d : statesTar) d.state |= uint64_t(d.point.y) << 32;
+    }
+
+    // Sorting based matcher.
+    if (!settings.useHashtable_) {
+        return findCorrespondences(statesSrc, statesTar);
+    }
+
+    // Hashtable based matcher: tag every descriptor with its origin first.
+    for (auto& d : statesSrc) d.srcDescr = true;
+    for (auto& d : statesTar) d.srcDescr = false;
+
+    const auto numDescriptors = statesSrc.size() + statesTar.size();
+    ndb::Hashmatch<ndb::Descriptor> table(214673, numDescriptors);
+    for (auto& d : statesSrc) table.insert(d);
+    for (auto& d : statesTar) table.insert(d);
+
+    std::vector<std::pair<ndb::Descriptor, ndb::Descriptor>> duplicates;
+    table.getDuplicates(duplicates);
+
+    // Keep only the pixel positions of each descriptor pair.
+    std::vector<ndb::Correspondence> matches;
+    for (auto& pair : duplicates) {
+        matches.push_back(
+            ndb::Correspondence(pair.first.point, pair.second.point));
+    }
+
+    return matches;
+}
+// Pairs source and target descriptors that carry equal, unrepeated states.
+std::vector<ndb::Correspondence> Forest::findCorrespondences(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    // Both inputs are sorted in place so that equal states are adjacent.
+    std::sort(srcStates.begin(), srcStates.end());
+    std::sort(tarStates.begin(), tarStates.end());
+    std::vector<ndb::Correspondence> corr;
+    // size() - 1 below is unsigned and wraps around for an empty target set.
+    if (tarStates.empty()) return corr;
+    uint32_t j = 0;
+    for (uint32_t i = 0; i < srcStates.size(); ++i) {
+        bool unique = true;  // cleared when the source state repeats
+        while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1])
+            ++i, unique = false;
+        if (unique) {
+            // emulates std::lower_bound on tarStates (last element excluded)
+            for (; j < tarStates.size() - 1; ++j) {
+                if (!(tarStates[j] < srcStates[i])) break;
+            }
+            // NOTE(review): the last target descriptor can never match here.
+            if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] &&
+                ((j + 1) == tarStates.size() - 1 ||
+                 !(tarStates[j] == tarStates[j + 1])))
+                corr.push_back(ndb::Correspondence(srcStates[i].point,
+                                                   tarStates[j].point));
+        }
+    }
+    return corr;
+}
+
+/**
+ * @brief Runs the forest mask filter over an image and collects one
+ * descriptor for every listed pixel offset.
+ *
+ * @param img       The image
+ * @param grad      gradient image
+ * @param idx       offsets with high gradient pixels within the grad image
+ * @param fastmask  the forest mask; type 0 selects the filter without tau
+ * @param settings  provides the number of threads used by the filter
+ * @return          descriptors (pixel position and state) in idx order
+ */
+std::vector<ndb::Descriptor> Forest::evalFastMaskOnSubsetSSE(
+    ndb::Buffer<uint8_t>& img,
+    ndb::Buffer<uint8_t>& grad,
+    std::vector<int>& idx,
+    FilterMask& fastmask,
+    InferenceSettings& settings) {
+    const auto cols = img.cols();
+    const auto rows = img.rows();
+    // State buffer with the same dimensions as the input image.
+    ndb::Buffer<uint32_t> gpcstates(rows, cols, 0);
+    if (fastmask.type == 0) {
+        ndb::gpcFilter(img.data(),
+                       grad.data(),
+                       gpcstates.data(),
+                       fastmask.mask,
+                       idx,
+                       cols,
+                       rows,
+                       settings.numThreads_);
+    } else {
+        ndb::gpcFilterTau(img.data(),
+                          grad.data(),
+                          gpcstates.data(),
+                          fastmask.mask,
+                          fastmask.tau,
+                          idx,
+                          cols,
+                          rows,
+                          settings.numThreads_);
+    }
+
+    // Turn every listed offset into a (position, state) descriptor.
+    std::vector<ndb::Descriptor> out;
+    out.reserve(idx.size());
+    for (const int k : idx) {
+        const int x = k % cols;
+        const int y = k / cols;
+        out.push_back(ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]));
+    }
+    return out;
+}
+
+/**
+ * @brief Derives the three matcher inputs from a raw image: the smoothed
+ * image, the thresholded sobel image and the gradient pixel offsets.
+ *
+ * @param img       The raw input image to be preprocessed
+ * @param settings  provides gradientThreshold_ and numThreads_
+ *
+ * @return the preprocessed image (smooth, grad, mask)
+ */
+PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
+                                  InferenceSettings settings) {
+    assert((settings.gradientThreshold_ >= 0 &&
+            settings.gradientThreshold_ <= 255) &&
+           "gradientThreshold needs to be within 0...255");
+
+    // Box-filtered copy of the input with its boundary cleared.
+    ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
+    smooth.width = img.width;
+    ndb::box(img.data(),
+             smooth.data(),
+             img.cols(),
+             img.rows(),
+             settings.numThreads_);
+    smooth.clearBoundary();
+    // Thresholded sobel response of the raw input.
+    ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
+    grad.width = img.width;
+    ndb::Buffer<int> maskTmp;
+    ndb::sobel(img.data(),
+               grad.data(),
+               img.cols(),
+               img.rows(),
+               settings.gradientThreshold_,
+               settings.numThreads_);
+    // Offsets of all nonzero gradient pixels; m receives their count.
+    ndb::Buffer<int> idx;
+    idx.resize(grad.rows(), grad.cols());
+    int m;
+    ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m);
+
+    // Keep only offsets that are at least 13 pixels away from every border.
+    std::vector<int> mask;
+    for (int i = 0; i < m; i++) {
+        const int offset = idx.data()[i];
+        const int x = offset % grad.cols();
+        const int y = offset / grad.cols();
+        const bool rowInside = y >= 13 && y < grad.rows() - 13;
+        const bool colInside = x >= 13 && x < grad.cols() - 13;
+        if (rowInside && colInside) mask.push_back(offset);
+    }
+    return PreprocessedImage(smooth, grad, mask);
+}
+/**
+ * @brief Finds matches between two stereo images based on a given forest
+ * mask. In builds with assertions both images are checked against the
+ * dimensions stored in the mask.
+ * @param simg              source image (assumed to be the left image)
+ * @param timg              target image (assumed to be the right image)
+ * @param forestmask        forest mask, provided by readForest method
+ * @param settings          inference settings struct
+ * @return                  Set of correspondences (ptSrc, ptTar) where
+ * ptSrc and ptTar are points in the source and target images, respectively.
+ */
+std::vector<ndb::Correspondence> Forest::stereoMatch(PreprocessedImage& simg,
+                                             PreprocessedImage& timg,
+                                             FilterMask& forestmask,
+                                             InferenceSettings settings) {
+    // The mask offsets are computed for one image size, so both images
+    // have to match the dimensions stored in the mask.
+    assert(
+        (forestmask.width == simg.smooth.cols() &&
+         forestmask.height == simg.smooth.rows()) &&
+        "Source Image: dimension does not fit dimension of supplied forest "
+        "mask");
+    // The target check has to read both dimensions from timg.
+    assert(
+        (forestmask.width == timg.smooth.cols() &&
+         forestmask.height == timg.smooth.rows()) &&
+        "Target Image: dimension does not fit dimension of supplied forest "
+        "mask");
+
+    // Match
+    std::vector<ndb::Correspondence> corr =
+        depthPriorFast(simg, timg, forestmask, settings);
+
+    return corr;
+}
+
+/**
+ * @brief                   Computes the support (x,y coordinates plus
+ * disparity) for a pair of rectified images.
+ *
+ * @param simg              source image (assumed to be the left image)
+ * @param timg              target image (assumed to be the right image)
+ * @param forestmask        forest mask, provided by readForest method
+ * @param settings          inference settings struct; verticalTolerance_
+ *                          and dispHigh_ decide which matches are kept.
+ *
+ * @return                  Set of supports (x,y,d) with x,y the coordinate
+ * of a point in the left image and d the disparity (source x minus
+ * target x).
+ */
+std::vector<ndb::Support> Forest::rectifiedMatch(PreprocessedImage& simg,
+                                         PreprocessedImage& timg,
+                                         FilterMask& forestmask,
+                                         InferenceSettings settings) {
+    // Raw correspondences; the loop below filters them.
+    const auto matches = stereoMatch(simg, timg, forestmask, settings);
+
+    std::vector<ndb::Support> supp;
+    for (const auto& m : matches) {
+        const auto dy = m.srcPt.y - m.tarPt.y;
+        const auto disparity = m.srcPt.x - m.tarPt.x;
+        // epipolar constraint on the rows, upper limit on the disparity
+        const bool rowOk = std::abs(dy) <= settings.verticalTolerance_;
+        const bool dispOk = std::abs(disparity) <= settings.dispHigh_;
+        if (rowOk && dispOk)
+            supp.push_back(ndb::Support(m.srcPt.x, m.srcPt.y, disparity));
+    }
+    return supp;
+}
+
+/**
+ * @brief Reads text-based forest format and returns a mask for a given
+ * image size. At most 32 binary tests are kept and parsing stops at the
+ * first entry that cannot be read.
+ *
+ * @param path    Path to the file that contains the forest.
+ * @param width   16-Byte aligned width of the image in pixels
+ * @param height  height of the image in pixels
+ * @return        the filter mask; it holds no tests if the file can't open
+ */
+FilterMask Forest::readForest(std::string path, int width, int height) {
+    std::ifstream ff(path);
+
+    std::vector<int32_t> fastmask;
+    std::vector<int> taus;
+    if (ff.fail()) {
+        std::cout << "Error opening forest file" << std::endl;
+        return FilterMask(fastmask, width, height, 0);
+    }
+    int numNonZeroTau = 0;
+    // Zero-initialised so a failed read cannot leave an indeterminate count.
+    int numFerns = 0;
+    ff >> numFerns;
+    std::cout << "number of ferns:" << numFerns << std::endl;
+    // A failed stream ends the parse; the tests read so far are kept.
+    for (int i = 0; i < numFerns && ff; i++) {
+        int fernID = 0, numTests = 0;
+        std::string fernScale;
+        ff >> fernID >> fernScale >> numTests;
+        for (int j = 0; j < numTests; j++) {
+            int levelID, ix, iy, jx, jy, tau;
+            // Never use the values of a truncated or malformed test entry.
+            if (!(ff >> levelID >> ix >> iy >> jx >> jy >> tau)) break;
+            // Limit mask size to 32 binary tests
+            if (fastmask.size() < 64 && taus.size() < 32) {
+                fastmask.push_back(ix + iy * width);
+                fastmask.push_back(jx + jy * width);
+                taus.push_back(tau);
+            } else {
+                std::cout << "Note: A maximum of 32 fern features are allowed, "
+                        "discarding "
+                        "remainder of forest."
+                     << std::endl;
+            }
+            if (tau != 0) numNonZeroTau++;
+        }
+    }
+    if (ff.fail())
+        std::cout << "Warning: malformed forest file" << std::endl;
+    // type 0: zero forest (all tau=0); type 1: tau forest (some tau!=0)
+    if (numNonZeroTau == 0) {
+        return FilterMask(fastmask, width, height, 0);
+    }
+    return FilterMask(fastmask, taus, width, height, 1);
+}
+
+}  // namespace inference
+}
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
new file mode 100644
index 0000000..87939c1
--- /dev/null
+++ b/lib/gpc/forest.hpp
@@ -0,0 +1,283 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#ifndef _GPC_inference
+#define _GPC_inference
+#include <Eigen/Dense>
+#include <chrono>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+// GPC includes
+#include "gpc/Feature.hpp"
+#include "gpc/SintelOpticalFlow.hpp"
+#include "gpc/SintelStereo.hpp"
+#include "gpc/buffer.hpp"
+#include "gpc/filter.hpp"
+#include "gpc/hashmatch.hpp"
+
+/**
+ * @brief      The inference class of the GPC forest
+ *
+ */
+namespace gpc {
+namespace inference {
+using time_point = std::chrono::high_resolution_clock::time_point;
+// Current reading of the high resolution clock.
+inline time_point sysTick() {
+    return std::chrono::high_resolution_clock::now();
+}
+// Absolute time between two clock readings in milliseconds.
+inline float tickToMs(time_point t0, time_point t1) {
+    using Seconds = std::chrono::duration<double>;
+    const double secs = std::chrono::duration_cast<Seconds>(t1 - t0).count();
+    return std::abs(1000. * secs);
+}
+struct InferenceSettings {
+    // Threshold to be used for edge detection. Can be 0...255.
+    // In practice, values between 5...20 produce good results.
+    uint8_t gradientThreshold_ = 10;
+    // Upper absolute limit for disparity in pixels. The lower (implied)
+    // limit is 0.
+    int dispHigh_ = 128;
+    // vertical deviation tolerance in pixels for corresponding features in
+    // rectified stereo images.
+    int verticalTolerance_ = 1;
+    // Whether to use epipolar mode on matching or not.
+    bool epipolarMode_ = false;
+    // Use hashtable to match extracted descriptors. Usually only faster with a
+    // large number of descriptors (> 100k) or when using multiple threads.
+    // NOTE(review): the hashtable method presumably returns slightly fewer
+    // matches because of its small bucket size -- confirm against hashmatch.
+    // If false, the descriptors are sorted and matched by iterating
+    // alternatingly through both sets.
+    bool useHashtable_ = false;
+
+    // Number of threads to use for inference
+    int numThreads_ = 1;
+
+    // Sets every field explicitly; numThreads is stored without clamping.
+    InferenceSettings(uint8_t gradientThreshold,
+                      int dispHigh,
+                      int verticalTolerance,
+                      bool epipolarMode,
+                      bool useHashtable,
+                      int numThreads)
+        : gradientThreshold_(gradientThreshold),
+          dispHigh_(dispHigh),
+          verticalTolerance_(verticalTolerance),
+          epipolarMode_(epipolarMode),
+          useHashtable_(useHashtable),
+          numThreads_(numThreads) {}
+
+    InferenceSettings() {}
+    InferenceSettings& builder(void) { return *this; }
+    InferenceSettings& gradientThreshold(uint8_t gradientThreshold) {
+        this->gradientThreshold_ = gradientThreshold;
+        return *this;
+    }
+    InferenceSettings& dispHigh(int dispHigh) {
+        this->dispHigh_ = dispHigh;
+        return *this;
+    }
+    InferenceSettings& verticalTolerance(int verticalTolerance) {
+        this->verticalTolerance_ = verticalTolerance;
+        return *this;
+    }
+    InferenceSettings& epipolarMode(bool epipolarMode) {
+        this->epipolarMode_ = epipolarMode;
+        return *this;
+    }
+    InferenceSettings& useHashtable(bool useHashtable) {
+        this->useHashtable_ = useHashtable;
+        return *this;
+    }
+    // Clamps to 1...available threads; negative requests select all of them.
+    InferenceSettings& numThreads(int numThreads) {
+        const unsigned hw = std::thread::hardware_concurrency();  // 0: unknown
+        const int available = hw == 0 ? 1 : static_cast<int>(hw);
+        if (numThreads == 0) numThreads = 1;  // never run with zero threads
+        const bool tooMany = numThreads < 0 || numThreads > available;
+        this->numThreads_ = tooMany ? available : numThreads;
+        return *this;
+    }
+};
+/**
+ * @brief FilterMask object that is returned by the forest reader. It
+ * bundles the mask offsets, the optional tau values and the image size.
+ */
+struct FilterMask {
+    std::vector<int32_t> mask;
+    std::vector<int> tau;
+    int width;
+    int height;
+    int type;
+
+    // Mask without tau values: tau stays empty.
+    FilterMask(std::vector<int32_t> mask, int width, int height, int type)
+        : mask(mask), width(width), height(height), type(type) {}
+
+    // Mask with tau values.
+    FilterMask(std::vector<int32_t> mask,
+               std::vector<int> tau,
+               int width,
+               int height,
+               int type)
+        : mask(mask),
+          tau(tau),
+          width(width),
+          height(height),
+          type(type) {}
+};
+struct PreprocessedImage {  // bundle returned by Forest::preprocessImage
+    ndb::Buffer<uint8_t> smooth;  // smoothed image
+    ndb::Buffer<uint8_t> grad;    // gradient image
+    std::vector<int> mask;        // pixel offsets selected for matching
+    PreprocessedImage(ndb::Buffer<uint8_t>& smooth,
+                      ndb::Buffer<uint8_t>& grad,
+                      std::vector<int>& mask)
+        : smooth(smooth), grad(grad), mask(mask) {};  // copies all three inputs
+};
+
+enum CorrMethod { sorting = 's', hashtable = 'h' };  // NOTE(review): presumably the two matcher kinds; unused in the visible code
+struct MatchStats {  // NOTE(review): field meanings are inferred from names only
+    double prec, rec, timeProp, timeMatch;  // presumably precision, recall and two timings -- TODO confirm units
+    int numInlier, numStates, numMatches;   // presumably counters of one matching run -- TODO confirm
+};
+
+
+class Forest {
+   public:
+    /**
+     * @brief Computes sparse matches on a pair of rectified and smoothed
+     * images. src is the left image and tar is the right image.
+     *
+     * @param src       Preprocessed source(left) image
+     * @param tar       Preprocessed target(right) image
+     * @param fastmask  forest mask of relative integer offsets.
+     * @param settings  inference settings struct
+     * @return          matched (source point, target point) pairs
+     */
+    std::vector<ndb::Correspondence> depthPriorFast(
+        PreprocessedImage& src,
+        PreprocessedImage& tar,
+        FilterMask& fastmask,
+        InferenceSettings& settings);
+    /// Pairs descriptors with equal states; sorts both vectors in place.
+    std::vector<ndb::Correspondence> findCorrespondences(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
+    /**
+     * @brief Evaluates a given forest mask on an image and returns the
+     * descriptors
+     *
+     * @param img       The image
+     * @param grad      gradient image
+     * @param idx       offsets with high gradient pixels within the grad image
+     * @param fastmask  the forest mask
+     * @param settings  inference settings struct
+     * @return          one descriptor for every entry of idx
+     */
+    std::vector<ndb::Descriptor> evalFastMaskOnSubsetSSE(
+        ndb::Buffer<uint8_t>& img,
+        ndb::Buffer<uint8_t>& grad,
+        std::vector<int>& idx,
+        FilterMask& fastmask,
+        InferenceSettings& settings);
+        
+    /**
+     * @brief Preprocesses an image. (smooth, binary sobel image and gradient
+     * pixel indices)
+     *
+     * @param img       The raw input image to be preprocessed
+     * @param settings  inference settings struct
+     *
+     * @return the preprocessed image
+     */
+    PreprocessedImage preprocessImage(ndb::Buffer<uint8_t>& img,
+                                      InferenceSettings settings);
+    /**
+     * @brief Finds matches between two stereo images based on a given forest
+     * mask.
+     *
+     * @param simg              source image (assumed to be the left image)
+     * @param timg              target image (assumed to be the right image)
+     * @param forestmask        forest mask, provided by readForest method
+     * @param settings          inference settings struct
+     * @return                  Set of correspondences (ptSrc, ptTar) where
+     * ptSrc and ptTar are points in the source and target images, respectively.
+     */
+    std::vector<ndb::Correspondence> stereoMatch(PreprocessedImage& simg,
+                                                 PreprocessedImage& timg,
+                                                 FilterMask& forestmask,
+                                                 InferenceSettings settings);
+    /**
+     * @brief                   Returns support (set of x,y coordinates and
+     * disparity) of a pair of images that have been rectified.
+     *
+     * @param simg              source image (assumed to be the left image)
+     * @param timg              target image (assumed to be the right image)
+     * @param forestmask        forest mask, provided by readForest method
+     * @param settings          inference settings struct; its
+     *                          verticalTolerance_ and dispHigh_ limits decide
+     *                          which matches are kept.
+     *
+     * @return                  Set of supports (x,y,d) with x,y the coordinate
+     * of a point in the left image and d the disparity.
+     */
+    std::vector<ndb::Support> rectifiedMatch(PreprocessedImage& simg,
+                                             PreprocessedImage& timg,
+                                             FilterMask& forestmask,
+                                             InferenceSettings settings);
+                                            
+    /**
+     * @brief Reads text-based forest format and returns a mask for a given
+     * image size.
+     *
+     * @param path    Path to the file that contains the forest.
+     * @param width   16-Byte aligned width of the image in pixels
+     * @param height  height of the image in pixels
+     *
+     * @return the filter mask built from the tests in the file
+     */
+    FilterMask readForest(std::string path, int width, int height);
+};  // forest class
+}  // namespace inference
+}  // namespace gpc
+
+#endif
diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp
deleted file mode 100644
index e1a887a..0000000
--- a/lib/gpc/inference.hpp
+++ /dev/null
@@ -1,505 +0,0 @@
-// Copyright (c) 2018, ETH Zurich
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the copyright holder nor the names of its contributors
-// may be used to endorse or promote products derived from this software without
-// specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Implements and extends the method proposed in
-// The Global Patch Collider
-// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
-// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#ifndef _GPC_inference
-#define _GPC_inference
-#include <Eigen/Dense>
-#include <chrono>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <random>
-#include <string>
-#include <thread>
-#include <vector>
-
-// GPC includes
-#include "gpc/Feature.hpp"
-#include "gpc/SintelOpticalFlow.hpp"
-#include "gpc/SintelStereo.hpp"
-#include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
-#include "gpc/hashmatch.hpp"
-
-/**
- * @brief      The inference class of the GPC forest
- *
- */
-namespace gpc {
-namespace inference {
-typedef typename std::chrono::high_resolution_clock::time_point time_point;
-std::chrono::high_resolution_clock::time_point sysTick() {
-    return std::chrono::high_resolution_clock::now();
-}
-float tickToMs(std::chrono::high_resolution_clock::time_point t0,
-               std::chrono::high_resolution_clock::time_point t1) {
-    return std::abs(
-        1000. *
-        std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0)
-            .count());
-}
-struct InferenceSettings {
-    // Threshold to be used for edge detection. Can be 0...255.
-    // In practice, values between 5...20 produce good results.  uint8_t
-    // gradientThreshold;
-    uint8_t gradientThreshold_ = 10;
-    // upper absolute limit for disparity in pixels. The lower (implied) limit
-    // is
-    // 0
-    int dispHigh_ = 128;
-    // vertical deviation tolerance in pixels for corresponding features in
-    // rectified stereo images.
-    int verticalTolerance_ = 1;
-    // Whether to use epipolar mode on matching or not.
-    bool epipolarMode_ = false;
-    // Use hashtable to match extracted descriptors. Usually only faster with a
-    // large number of descriptors (> 100k) or when using multiple threads. Note
-    // that the hashtable method does not return a slightly reduced amount of
-    // matches as a result of the hash table implementation (small bucket size)
-    // if false, the descriptors are sorted and matched by iterating
-    // alternatingly through both sets.
-    bool useHashtable_ = false;
-
-    // Number of threads to use for inference
-    int numThreads_ = 1;
-
-    // Default contructor defaults to using a single thread
-    InferenceSettings(uint8_t gradientThreshold,
-                      int dispHigh,
-                      int verticalTolerance,
-                      bool epipolarMode,
-                      bool useHashtable,
-                      int numThreads)
-        : gradientThreshold_(gradientThreshold),
-          dispHigh_(dispHigh),
-          verticalTolerance_(verticalTolerance),
-          epipolarMode_(epipolarMode),
-          useHashtable_(useHashtable),
-          numThreads_(numThreads) {}
-
-    InferenceSettings() {}
-    InferenceSettings& builder(void) { return *this; }
-    InferenceSettings& gradientThreshold(uint8_t gradientThreshold) {
-        this->gradientThreshold_ = gradientThreshold;
-        return *this;
-    }
-    InferenceSettings& dispHigh(int dispHigh) {
-        this->dispHigh_ = dispHigh;
-        return *this;
-    }
-    InferenceSettings& verticalTolerance(int verticalTolerance) {
-        this->verticalTolerance_ = verticalTolerance;
-        return *this;
-    }
-    InferenceSettings& epipolarMode(bool epipolarMode) {
-        this->epipolarMode_ = epipolarMode;
-        return *this;
-    }
-    InferenceSettings& useHashtable(bool useHashtable) {
-        this->useHashtable_ = useHashtable;
-        return *this;
-    }
-    InferenceSettings& numThreads(int numThreads) {
-        if (numThreads > std::thread::hardware_concurrency())
-            this->numThreads_ = std::thread::hardware_concurrency();
-        else
-            this->numThreads_ = numThreads;
-        return *this;
-    }
-};
-class Forest {
-   public:
-    /**
-     * @brief FilterMask object that is returned by the forest reader
-     */
-    struct FilterMask {
-        std::vector<int32_t> mask;
-        std::vector<int> tau;
-        int width;
-        int height;
-        int type;
-        FilterMask(std::vector<int32_t> mask, int width, int height, int type) {
-            this->mask = mask;
-            this->width = width;
-            this->height = height;
-            this->type = type;
-        }
-        FilterMask(std::vector<int32_t> mask,
-                   std::vector<int> tau,
-                   int width,
-                   int height,
-                   int type) {
-            this->mask = mask;
-            this->tau = tau;
-            this->width = width;
-            this->height = height;
-            this->type = type;
-        }
-    };
-    struct PreprocessedImage {
-        ndb::Buffer<uint8_t> smooth;
-        ndb::Buffer<uint8_t> grad;
-        std::vector<int> mask;
-        PreprocessedImage(ndb::Buffer<uint8_t>& smooth,
-                          ndb::Buffer<uint8_t>& grad,
-                          std::vector<int>& mask)
-            : smooth(smooth), grad(grad), mask(mask) {};
-    };
-
-    enum CorrMethod { sorting = 's', hashtable = 'h' };
-    struct MatchStats {
-        double prec, rec, timeProp, timeMatch;
-        int numInlier, numStates, numMatches;
-    };
-
-    /**
-     * @brief Computes sparse matches on a pair of rectified and smoothed
-     * images. Here the src and tar images refer to the left and right images,
-     * respectively.
-     *
-     * @param src    Preprocessed source(left) image
-     * @param tar    Preprocessed target(right) image
-     * @param fastmask    forest mask of relative integer offsets.
-     *
-     * @return
-     */
-    std::vector<ndb::Correspondence> depthPriorFast(
-        PreprocessedImage& src,
-        PreprocessedImage& tar,
-        FilterMask& fastmask,
-        InferenceSettings& settings) {
-        std::vector<ndb::Descriptor> statesSrc = evalFastMaskOnSubsetSSE(
-            src.smooth, src.grad, src.mask, fastmask, settings);
-        std::vector<ndb::Descriptor> statesTar = evalFastMaskOnSubsetSSE(
-            tar.smooth, tar.grad, tar.mask, fastmask, settings);
-        // Epipolar mode. Use upper 32bit of 64bit descriptor to store y
-        // coordinate
-        if (settings.epipolarMode_) {
-            for (auto& el : statesSrc) el.state |= uint64_t(el.point.y) << 32;
-            for (auto& el : statesTar) el.state |= uint64_t(el.point.y) << 32;
-        }
-        // Use sort method for matching
-        if (settings.useHashtable_ == false) {
-            std::vector<ndb::Correspondence> corr =
-                findCorrespondences(statesSrc, statesTar);
-            return corr;
-        }
-        // Use hashtable matching
-        else {
-            for (auto& q : statesSrc) q.srcDescr = true;
-            for (auto& q : statesTar) q.srcDescr = false;
-
-            ndb::Hashmatch<ndb::Descriptor> hm(
-                214673,  // statesSrc.size() + statesTar.size() ,
-                statesSrc.size() + statesTar.size());
-            std::vector<std::pair<ndb::Descriptor, ndb::Descriptor>> corr;
-            for (auto& q : statesSrc) hm.insert(q);
-            for (auto& q : statesTar) hm.insert(q);
-            hm.getDuplicates(corr);
-            // Store vertices in a format that is more convenient for us:
-            std::vector<ndb::Correspondence> corr2;
-            for (auto& e : corr) {
-                corr2.push_back(
-                    ndb::Correspondence(e.first.point, e.second.point));
-            }
-
-            return corr2;
-        }
-    }
-    std::vector<ndb::Correspondence> findCorrespondences(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates) {
-        int numStates = std::min(srcStates.size(), tarStates.size());
-        // Limit search to rectified epipolar case.
-        std::sort(srcStates.begin(), srcStates.end());
-
-        std::sort(tarStates.begin(), tarStates.end());
-        std::vector<ndb::Correspondence> corr;
-        uint32_t j = 0;
-        for (uint32_t i = 0; i < srcStates.size(); ++i) {
-            bool unique = true;
-            while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1])
-                ++i, unique = false;
-
-            if (unique) {
-                // emulates std::lowerbound behavior for arrays
-                for (; j < tarStates.size() - 1; ++j) {
-                    if (!(tarStates[j] < srcStates[i])) break;
-                }
-
-                if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] &&
-                    ((j + 1) == tarStates.size() - 1 ||
-                     !(tarStates[j] == tarStates[j + 1])))
-                    corr.push_back(ndb::Correspondence(srcStates[i].point,
-                                                       tarStates[j].point));
-            }
-        }
-        return corr;
-    }
-
-    /**
-     * @brief Evaluates a given forest mask on an image and returns the
-     * descriptors
-     *
-     * @param img       The image
-     * @param grad      gradient image
-     * @param idx       offsets with high gradient pixels within the grad image
-     * @param fastmask  the forest mask
-     *
-     * @return
-     */
-    std::vector<ndb::Descriptor> evalFastMaskOnSubsetSSE(
-        ndb::Buffer<uint8_t>& img,
-        ndb::Buffer<uint8_t>& grad,
-        std::vector<int>& idx,
-        FilterMask& fastmask,
-        InferenceSettings& settings) {
-        std::chrono::high_resolution_clock::time_point t0, t1;
-
-        // output buffer of same size
-        ndb::Buffer<uint32_t> gpcstates(img.rows(), img.cols(), 0);
-        if (fastmask.type == 0) {
-            ndb::gpcFilter(img.data(),
-                           grad.data(),
-                           gpcstates.data(),
-                           fastmask.mask,
-                           idx,
-                           img.cols(),
-                           img.rows(),
-                           settings.numThreads_);
-        } else {
-            ndb::gpcFilterTau(img.data(),
-                              grad.data(),
-                              gpcstates.data(),
-                              fastmask.mask,
-                              fastmask.tau,
-                              idx,
-                              img.cols(),
-                              img.rows(),
-                              settings.numThreads_);
-        }
-        std::vector<ndb::Descriptor> out(idx.size());
-        int j = 0;
-
-        for (auto k : idx) {
-            int x = k % img.cols();
-            int y = k / img.cols();
-            out[j] = ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]);
-            j++;
-        }
-        return out;
-    }
-
-    /**
-     * @brief Preprocesses an image. (smooth, binary sobel image and gradient
-     * pixel indices)
-     *
-     * @param img     The raw input image to be preprocessed
-     * @param InferenceSettings inference settings struct
-     *
-     * @return the preprocessed image
-     */
-    PreprocessedImage preprocessImage(ndb::Buffer<uint8_t>& img,
-                                      InferenceSettings settings) {
-        assert((settings.gradientThreshold_ >= 0 &&
-                settings.gradientThreshold_ <= 255) &&
-               "gradientThreshold needs to be within 0...255");
-
-        ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
-        smooth.width = img.width;
-        ndb::box(img.data(),
-                 smooth.data(),
-                 img.cols(),
-                 img.rows(),
-                 settings.numThreads_);
-        smooth.clearBoundary();
-        ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
-        grad.width = img.width;
-        ndb::Buffer<int> maskTmp;
-        ndb::sobel(img.data(),
-                   grad.data(),
-                   img.cols(),
-                   img.rows(),
-                   settings.gradientThreshold_,
-                   settings.numThreads_);
-
-        ndb::Buffer<int> idx;
-        idx.resize(grad.rows(), grad.cols());
-        auto ff = [&](ndb::Buffer<int>& in, std::vector<int>& out, int m) {
-            for (int i = 0; i < m; i++) {
-                int x = in.data()[i] % grad.cols();
-                int y = in.data()[i] / grad.cols();
-                if (y >= 13 && y < grad.rows() - 13 && x >= 13 &&
-                    x < grad.cols() - 13)
-                    out.push_back(in.data()[i]);
-            }
-        };
-        int m;
-        // mask indexing gradient pixels
-        std::vector<int> mask;
-        ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m);
-        ff(idx, mask, m);
-        // Our outputs are: smooth, grad, mask;
-        return PreprocessedImage(smooth, grad, mask);
-    }
-    /**
-     * @brief Finds matches between two stereo images based on a given forest
-     * mask.
-     *
-     * @param simg              source image (assumed to be the left image)
-     * @param timg              target image (assumed to be the right image)
-     * @param forestmask        forest mask, provided by readForest method
-     * @param InferenceSettings inference settings struct
-     * @return                  Set of correspondences (ptSrc, ptTar) where
-     * ptSrc and ptTar are points in the source and target images, respectively.
-     */
-    std::vector<ndb::Correspondence> stereoMatch(PreprocessedImage& simg,
-                                                 PreprocessedImage& timg,
-                                                 FilterMask& forestmask,
-                                                 InferenceSettings settings) {
-        // make sure the delivered mask matches the image dimensions
-        assert(
-            (forestmask.width == simg.smooth.cols() &&
-             forestmask.height == simg.smooth.rows()) &&
-            "Source Image: dimension does not fit dimension of supplied forest "
-            "mask");
-        assert(
-            (forestmask.width == timg.smooth.cols() &&
-             forestmask.height == simg.smooth.rows()) &&
-            "Targe Image: dimension does not fit dimension of supplied forest "
-            "mask");
-        bool m_debug = false;
-        std::chrono::high_resolution_clock::time_point t0, t1;
-        // Match
-        std::vector<ndb::Correspondence> corr =
-            depthPriorFast(simg, timg, forestmask, settings);
-        t1 = sysTick();
-
-        return corr;
-    }
-
-    /**
-     * @brief                   Returns support (set of x,y coordinates and
-     * disparity) of a pair of images that have been rectified.
-     *
-     * @@param simg             source image (assumed to be the left image)
-     * @param timg              target image (assumed to be the right image)
-     * @param forestmask        forest mask, provided by readForest method
-     * @param InferenceSettings inference settings struct
-     *                          In practice, values between 5...20 produce good
-     * results.
-     *
-     * @return                  Set of supports (x,y,d) with x,y the coordinate
-     * of a point in the left image and d the disparity.
-     */
-    std::vector<ndb::Support> rectifiedMatch(PreprocessedImage& simg,
-                                             PreprocessedImage& timg,
-                                             FilterMask& forestmask,
-                                             InferenceSettings settings) {
-        // Do matching
-        std::vector<ndb::Correspondence> corr =
-            stereoMatch(simg, timg, forestmask, settings);
-        // Filter epipolar matches
-        std::vector<ndb::Support> supp;
-        for (auto& e : corr) {
-            // epipolar constraint
-            if (std::abs(e.srcPt.y - e.tarPt.y) <= settings.verticalTolerance_
-                // disparity filter
-                && std::abs(e.srcPt.x - e.tarPt.x) <= settings.dispHigh_)
-                supp.push_back(
-                    ndb::Support(e.srcPt.x, e.srcPt.y, e.srcPt.x - e.tarPt.x));
-        }
-        return supp;
-    }
-
-    /**
-     * @brief Reads text-based forest format and returns a mask for a given
-     * image size.
-     *
-     * @param path    Path to the file that contains the forest.
-     * @param width   16-Byte aligned width of the image in pixels
-     * @param height  height of the image in pixels
-     *
-     * @return
-     */
-    FilterMask readForest(std::string path, int width, int height) {
-        std::ifstream ff(path);
-
-        std::vector<int32_t> fastmask;
-        std::vector<int> taus;
-        if (ff.fail()) {
-            cout << "Error opening forest file" << endl;
-            return FilterMask(fastmask, width, height, 0);
-        }
-        int numNonZeroTau = 0;
-        int numFerns;
-        int type;
-        ff >> numFerns;
-        cout << "number of ferns:" << numFerns << endl;
-        for (int i = 0; i < numFerns; i++) {
-            int fernID, numTests;
-            std::string fernScale;
-            ff >> fernID >> fernScale >> numTests;
-            for (int j = 0; j < numTests; j++) {
-                int levelID, ix, iy, jx, jy, tau;
-                ff >> levelID >> ix >> iy >> jx >> jy >> tau;
-                // Limit mask size to 32 binary tests
-                if (fastmask.size() < 64 && taus.size() < 32) {
-                    fastmask.push_back(ix + iy * width);
-                    fastmask.push_back(jx + jy * width);
-                    taus.push_back(tau);
-                } else {
-                    cout << "Note: A maximum of 32 fern features are allowed, "
-                            "discarding "
-                            "remainder of forest."
-                         << endl;
-                }
-                if (tau != 0) numNonZeroTau++;
-            }
-        }
-        if (numNonZeroTau == 0) {
-            type = 0;  // We have a zero forest (all tau=0)
-            FilterMask fm(fastmask, width, height, type);
-            return fm;
-        } else {
-            type = 1;  // We have a tau forest (some tau!=0)
-            FilterMask fm(fastmask, taus, width, height, type);
-            return fm;
-        }
-    }
-
-};  // forest class
-}  // namespace inference
-}  // namespace gpc
-
-#endif
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index a7e73d7..a00cef3 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,10 +1,10 @@
 add_executable(extract extract.cpp)
-target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
+target_link_libraries(extract gpc_core)
 
 add_executable(train train.cpp)
-target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
+target_link_libraries(train gpc_core)
 
 add_executable(sparsematch sparsematch.cpp)
-target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
+target_link_libraries(sparsematch gpc_core)
 
 
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index ed43016..6834f61 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -1,6 +1,6 @@
 #include <iostream>
 
-#include "gpc/inference.hpp"
+#include "gpc/forest.hpp"
 using namespace std;
 int main(int argc, char** argv) {
     std::string forestPath = "../../forests/defaultZeroForest.txt";
@@ -46,15 +46,15 @@ int main(int argc, char** argv) {
     timg.readPNG(rightImgPath);
 
     // Get learned filter for the given image dimensions.
-    GPCForest_t::FilterMask fm =
+    gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
     // Preprocess images (box filter, sobel filter, indices of high gradient
     // pixels)
     gpc::inference::time_point t0 = gpc::inference::sysTick();
-    GPCForest_t::PreprocessedImage simgP =
+    gpc::inference::PreprocessedImage simgP =
         forest.preprocessImage(simg, inferencesettings);
-    GPCForest_t::PreprocessedImage timgP =
+    gpc::inference::PreprocessedImage timgP =
         forest.preprocessImage(timg, inferencesettings);
     gpc::inference::time_point t1 = gpc::inference::sysTick();
 
@@ -66,7 +66,7 @@ int main(int argc, char** argv) {
          << ", #candidatesL:" << simgP.mask.size()
          << ", #candidatesR:" << timgP.mask.size()
          << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms"
-         << ", num matches:" << supp.size() << endl;
+         << ", num matches:" << supp.size() << std::endl;
 
     // Output sparse disparities overlayed on left input image
     ndb::Buffer<ndb::RGBColor> renderDisp;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a211cac..564c125 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,7 +12,13 @@ FetchContent_MakeAvailable(approvaltests)
 
 find_package(GTest REQUIRED)
 add_executable(test_single_matching test_single_matching.cpp)
-target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen)
+#target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen)
+target_link_libraries(test_single_matching 
+    PRIVATE 
+        gpc_core 
+        ApprovalTests::ApprovalTests 
+        GTest::gtest_main
+)
 
 add_test(NAME single_matching COMMAND test_single_matching)
 
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index e9e6a75..fdff603 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -1,7 +1,7 @@
 #define APPROVALS_GOOGLETEST
 #include <ApprovalTests.hpp>     
 #include <gtest/gtest.h>
-#include "gpc/inference.hpp"
+#include "gpc/forest.hpp"
 
 
 TEST(Approval, Inference)
@@ -34,12 +34,12 @@ TEST(Approval, Inference)
     simg.readPNG(leftImgPath);
     timg.readPNG(rightImgPath); 
     // Get learned filter for the given image dimensions.
-    GPCForest_t::FilterMask fm =
+    gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
-    GPCForest_t::PreprocessedImage simgP =
+    gpc::inference::PreprocessedImage simgP =
         forest.preprocessImage(simg, inferencesettings);
-    GPCForest_t::PreprocessedImage timgP =
+    gpc::inference::PreprocessedImage timgP =
         forest.preprocessImage(timg, inferencesettings);
 
     // Match rectified stereo images

From 7d00a91072d06528299dece81291d5bfa4c35cc8 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sat, 14 Feb 2026 17:38:00 +0100
Subject: [PATCH 03/36] decouple fern

---
 CMakeLists.txt   |   1 +
 lib/gpc/Fern.hpp | 157 +++--------------------------------
 lib/gpc/fern.cpp | 208 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+), 145 deletions(-)
 create mode 100644 lib/gpc/fern.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 535b559..d123aaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ endif()
 
 add_library(gpc_core 
     lib/gpc/forest.cpp 
+    lib/gpc/fern.cpp 
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp
index 5f5f533..d9554fd 100644
--- a/lib/gpc/Fern.hpp
+++ b/lib/gpc/Fern.hpp
@@ -167,10 +167,7 @@ OptimizerSettings TauOptimizer(int taulo,
                                int tauhi,
                                int numResamples,
                                bool onlyScoreNonSplitSamples,
-                               double w1) {
-    return OptimizerSettings(
-        taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1);
-}
+                               double w1);
 /**
  * @brief Optimzer setting factory for a zero fern
  *
@@ -184,9 +181,7 @@ OptimizerSettings TauOptimizer(int taulo,
  */
 OptimizerSettings ZeroOptimizer(int numResamples,
                                 bool onlyScoreNonSplitSamples,
-                                double w1) {
-    return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1);
-}
+                                double w1) ;
 struct FernSettings {
     const int maxDepth;
     const int scale;
@@ -231,63 +226,8 @@ class Fern {
                    FernSettings fernsetting,
                    OptimizerSettings optsetting,
                    int scoreUntilLevel,
-                   splitStats& s) {
-        s.tp = 0;
-        s.fn = 0;
-        s.fp = 0;
-        s.prec = 0.;
-        s.rec = 0.;
-        s.hmean = 0.;
-        s.convcomb = 0.;
-        s.tot = 0;
-        for (auto& triplet : data) {
-            uint64_t ref = 0, pos = 0, neg = 0;
-            // Score the first scoreUntilLevel levels of a given fern
-            for (int i = 0; i < scoreUntilLevel + 1; i++) {
-                ref <<= 1;
-                pos <<= 1;
-                neg <<= 1;
-                bool refDec, posDec, negDec;
-
-                // Decisions need to be added into a codeword
-                Feature.getDecisions(
-                    refDec, posDec, negDec, params[i], triplet);
-                if (refDec) ref++;
-                if (posDec) pos++;
-                if (negDec) neg++;
-            }
-            // Only count those that haven't been true positives yet
-            // Ignore samples previously classified as True positive
-            if (!(triplet.pos.split == true && triplet.neg.split == true)) {
-                s.tot++;
-                // Decide which are equal (i.e. set the split indicators)
-                if (ref == pos) {      // 110(TP), 111, 001(TP), 000
-                    if (ref != neg) {  // 110 (TP), 001(TP)
-                        s.tp++;
-                    } else {  // 111(FN), 000(FN)
-                        s.fn++;
-                    }
-                } else {               // 100, 101, 011, 010
-                    if (ref != neg) {  // 100(FN), 011(FN) FN
-                        s.fn++;
-                    } else {  //  101(FP), 010(FP)
-                        s.fp++;
-                    }
-                }
-            }
-        }
-
-        // Compute statistics of this split
-        double w2 = 1. - optsetting.w1_;
-        s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp);
-        s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn);
-
-        s.hmean = (s.prec + s.rec == 0.)
-                      ? 0.
-                      : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec);
-        s.convcomb = (1. - w2) * s.prec + w2 * s.rec;
-    }
-    /**
+                   splitStats& s);
+     /**
      * @brief      Mark those samples in the set as "split" if they have been
      *             correctly classified(ref=pos and pos!=neg) with the parameter
      * set in params
@@ -298,26 +238,7 @@ class Fern {
      */
     void markSplitSamples(std::vector<GPCTriplet_t>& data,
                           std::vector<SplitParams_t>& params,
-                          int numParams) {
-        for (auto& triplet : data) {
-            // Evaluate triplet on all given parameters
-            uint64_t ref = 0, pos = 0, neg = 0;
-            for (int i = 0; i < numParams; i++) {
-                ref <<= 1;  // shift by one
-                pos <<= 1;  // shift by one
-                neg <<= 1;  // shift by one
-                bool refDec, posDec, negDec;
-
-                Feature.getDecisions(
-                    refDec, posDec, negDec, params[i], triplet);
-                if (refDec) ref++;
-                if (posDec) pos++;
-                if (negDec) neg++;
-            }
-            if (ref == pos) triplet.pos.split = true;
-            if (ref != neg) triplet.neg.split = true;
-        }
-    }
+                          int numParams) ;
     /**
      * @brief Reset the mark on the training samples on whether they have been
      * split correctly or not Since we do not operate on copies of the training
@@ -325,13 +246,8 @@ class Fern {
      *
      * @param data
      */
-    void resetMarkOnSamples(std::vector<GPCTriplet_t>& data) {
-        for (auto& triplet : data) {
-            triplet.pos.split = false;
-            triplet.neg.split = false;
-        }
-    }
-
+    void resetMarkOnSamples(std::vector<GPCTriplet_t>& data);
+   
     /**
      * @brief Train a fern given a set of training data and some optimizer
      * settings
@@ -340,70 +256,21 @@ class Fern {
      * @param optsetting      the optimizer settings
      */
     void train(std::vector<GPCTriplet_t>& trainingSamples,
-               OptimizerSettings optsetting) {
-        splitStats stats;
-        float maxScore = 0.f;
-        SplitParams_t bestParams;
-
-        fernparams.resize(fernsettings.maxDepth);
-
-        cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec"
-             << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP"
-             << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale"
-             << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl;
-        if (optsetting.onlyScoreNonSplitSamples_)
-            resetMarkOnSamples(trainingSamples);
-        for (int level = 0; level < fernsettings.maxDepth; level++) {
-            maxScore = 0.f;
-            for (int k = 0; k < optsetting.numResamples_; k++) {
-                // Samples a hyperplane in the requested scale
-                Feature.sampleHyperplane(fernsettings.scale, fernparams[level]);
-                // Iterates over a small range of tau (intercept)
-                for (int tau = optsetting.taulo_; tau < optsetting.tauhi_;
-                     tau++) {
-                    fernparams[level].tau = tau;
-                    // Score hyperplane set we have so far
-                    evalSplit(trainingSamples,
-                              fernparams,
-                              fernsettings,
-                              optsetting,
-                              level,
-                              stats);
-                    // If score exceeds previously best, replace paramset
-                    if (stats.hmean > maxScore) {
-                        bestParams = fernparams[level];
-                        maxScore = stats.hmean;
-                    }
-                }  // tau loop
-            }  // k loop
-            // Store best performing parameters
-            fernparams[level] = bestParams;
-
-            // Mark samples as split if they were labeled true positive
-            if (optsetting.onlyScoreNonSplitSamples_)
-                markSplitSamples(trainingSamples, fernparams, level);
-            cout << setw(7) << level << setw(10) << stats.prec << setw(10)
-                 << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot
-                 << setw(8) << stats.tp << setw(8) << stats.fp << setw(8)
-                 << stats.fn << setw(6) << fernsettings.scale << setw(5)
-                 << fernparams[level].tau << setw(5) << fernparams[level].i
-                 << setw(5) << fernparams[level].j << endl;
-        }  // level loop
-    }  // train
-
+               OptimizerSettings optsetting) ;
+   
     /**
      * @brief      Returns the decision of the first five levels of the ferns
      *
      * @return     The parameters.
      */
-    std::vector<SplitParams_t> getParameters() { return fernparams; }
+    std::vector<SplitParams_t> getParameters();
 
     /**
      * @brief Return the scale that this fern uses
      *
      * @return The scale
      */
-    int getScale() { return fernsettings.scale; }
+    int getScale();
 
 };  // Fern
 
@@ -417,7 +284,7 @@ class Fern {
  *
  * @return
  */
-std::vector<Fern> FernFactory(int num_S, int num_M, int num_L, int maxDepth) {
+inline std::vector<Fern> FernFactory(int num_S, int num_M, int num_L, int maxDepth) {
     std::vector<Fern> ferns;
     for (int i = 0; i < num_S; i++)
         ferns.push_back(Fern(FernSettings(maxDepth, 2)));
diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp
new file mode 100644
index 0000000..a171218
--- /dev/null
+++ b/lib/gpc/fern.cpp
@@ -0,0 +1,208 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include <Eigen/Dense>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gpc/Feature.hpp"
+#include "gpc/Fern.hpp"
+
+using namespace std;
+namespace gpc {
+namespace training {
+// Builds OptimizerSettings from an explicit tau range; all arguments are
+// handed to the OptimizerSettings constructor unchanged and in this order.
+OptimizerSettings TauOptimizer(int taulo, int tauhi, int numResamples,
+                               bool onlyScoreNonSplitSamples, double w1) {
+    OptimizerSettings settings(
+        taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1);
+    return settings;
+}
+// Same as TauOptimizer with the tau range arguments fixed to (0, 1).
+OptimizerSettings ZeroOptimizer(int numResamples,
+                                bool onlyScoreNonSplitSamples, double w1) {
+    return TauOptimizer(0, 1, numResamples, onlyScoreNonSplitSamples, w1);
+}
+void Fern::evalSplit(std::vector<GPCTriplet_t>& data,
+               std::vector<SplitParams_t>& params,
+               FernSettings fernsetting,
+               OptimizerSettings optsetting,
+               int scoreUntilLevel,
+               splitStats& s) {
+    // Reset all counters and derived scores before tallying.
+    s.tot = 0;
+    s.tp = 0;
+    s.fp = 0;
+    s.fn = 0;
+    s.prec = 0.;
+    s.rec = 0.;
+    s.hmean = 0.;
+    s.convcomb = 0.;
+
+    const int numLevels = scoreUntilLevel + 1;
+    for (auto& sample : data) {
+        // Pack one decision bit per level into a codeword for each patch.
+        uint64_t refCode = 0, posCode = 0, negCode = 0;
+        for (int lvl = 0; lvl < numLevels; lvl++) {
+            bool refBit, posBit, negBit;
+            Feature.getDecisions(
+                refBit, posBit, negBit, params[lvl], sample);
+            refCode = (refCode << 1) + (refBit ? 1 : 0);
+            posCode = (posCode << 1) + (posBit ? 1 : 0);
+            negCode = (negCode << 1) + (negBit ? 1 : 0);
+        }
+
+        // Samples with both the pos and the neg flag set (see
+        // markSplitSamples) are left out of the statistics entirely.
+        if (sample.pos.split == true && sample.neg.split == true) continue;
+        s.tot++;
+
+        const bool posAgrees = (refCode == posCode);
+        const bool negDiffers = (refCode != negCode);
+        if (posAgrees && negDiffers) {
+            // ref shares pos' codeword and is separated from neg
+            s.tp++;
+        } else if (!posAgrees && !negDiffers) {
+            // ref is separated from pos but shares neg's codeword
+            s.fp++;
+        } else {
+            // ref shares its codeword with both patches or with neither
+            s.fn++;
+        }
+    }
+
+    // Precision / recall keep their reset value 0 on an empty denominator.
+    const double w2 = 1. - optsetting.w1_;
+    const auto predictedPos = s.tp + s.fp;
+    const auto actualPos = s.tp + s.fn;
+    if (predictedPos != 0) s.prec = double(s.tp) / predictedPos;
+    if (actualPos != 0) s.rec = double(s.tp) / actualPos;
+
+    // Weighted combination of precision and recall; the same expression is
+    // the denominator of the weighted harmonic-mean style score.
+    s.convcomb = (1. - w2) * s.prec + w2 * s.rec;
+    if (s.prec + s.rec != 0.)
+        s.hmean = s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec);
+}
+void Fern::markSplitSamples(std::vector<GPCTriplet_t>& data,
+                      std::vector<SplitParams_t>& params,
+                      int numParams) {
+    // Flags are only ever raised in this function, never cleared.
+    for (auto& sample : data) {
+        // Build one codeword per patch from the first numParams splits,
+        // most recent decision in the lowest bit.
+        uint64_t refCode = 0, posCode = 0, negCode = 0;
+        for (int k = 0; k < numParams; k++) {
+            bool refBit, posBit, negBit;
+            Feature.getDecisions(
+                refBit, posBit, negBit, params[k], sample);
+            refCode = (refCode << 1) + (refBit ? 1 : 0);
+            posCode = (posCode << 1) + (posBit ? 1 : 0);
+            negCode = (negCode << 1) + (negBit ? 1 : 0);
+        }
+
+        // pos is flagged when it shares ref's codeword, neg when it does not.
+        if (refCode == posCode) sample.pos.split = true;
+        if (refCode != negCode) sample.neg.split = true;
+    }
+}
+void Fern::resetMarkOnSamples(std::vector<GPCTriplet_t>& data) {
+    // Clear both split flags on every sample, in place.
+    for (auto& sample : data) {
+        sample.pos.split = sample.neg.split = false;
+    }
+}
+
+void Fern::train(std::vector<GPCTriplet_t>& trainingSamples,
+           OptimizerSettings optsetting) {
+    splitStats stats;  // score of the best candidate of the current level
+    splitStats trial;  // score of the candidate evaluated last
+    SplitParams_t bestParams;
+
+    fernparams.resize(fernsettings.maxDepth);
+
+    cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec"
+         << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP"
+         << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale"
+         << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl;
+    if (optsetting.onlyScoreNonSplitSamples_)
+        resetMarkOnSamples(trainingSamples);
+    for (int level = 0; level < fernsettings.maxDepth; level++) {
+        // Start below any reachable score (hmean is >= 0 for w1 in [0, 1]) so
+        // the first evaluated candidate is always kept as bestParams.
+        double maxScore = -1.;
+        for (int k = 0; k < optsetting.numResamples_; k++) {
+            // Samples a hyperplane in the requested scale
+            Feature.sampleHyperplane(fernsettings.scale, fernparams[level]);
+            // Iterates over a small range of tau (intercept)
+            for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; tau++) {
+                fernparams[level].tau = tau;
+                // Score hyperplane set we have so far
+                evalSplit(trainingSamples, fernparams, fernsettings, optsetting,
+                          level, trial);
+                // If score exceeds previously best, replace paramset
+                if (trial.hmean > maxScore) {
+                    bestParams = fernparams[level];
+                    stats = trial;
+                    maxScore = trial.hmean;
+                }
+            }  // tau loop
+        }  // k loop
+        // Store best performing parameters
+        fernparams[level] = bestParams;
+
+        // Mark samples split by levels 0..level. markSplitSamples() takes a
+        // count of levels, so the level just trained requires level + 1
+        // (evalSplit() above likewise scores scoreUntilLevel + 1 levels).
+        if (optsetting.onlyScoreNonSplitSamples_)
+            markSplitSamples(trainingSamples, fernparams, level + 1);
+        cout << setw(7) << level << setw(10) << stats.prec << setw(10)
+             << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot
+             << setw(8) << stats.tp << setw(8) << stats.fp << setw(8)
+             << stats.fn << setw(6) << fernsettings.scale << setw(5)
+             << fernparams[level].tau << setw(5) << fernparams[level].i
+             << setw(5) << fernparams[level].j << endl;
+    }  // level loop
+}  // train
+
+// Returns the per-level split parameters by value (a copy of fernparams).
+std::vector<Fern::SplitParams_t> Fern::getParameters() { return fernparams; }
+
+// Returns the scale stored in this fern's settings.
+int Fern::getScale() { return fernsettings.scale; }
+
+}  // namespace training
+}  // namespace gpc

From 5d43602b0e4e3574b3539a8c949bcfb029579977 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sat, 14 Feb 2026 20:27:42 +0100
Subject: [PATCH 04/36] decouple feature

---
 CMakeLists.txt      |   1 +
 lib/gpc/Feature.hpp | 160 ++-------------------------------
 lib/gpc/feature.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 223 insertions(+), 153 deletions(-)
 create mode 100644 lib/gpc/feature.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d123aaf..b0f34ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,7 @@ endif()
 add_library(gpc_core 
     lib/gpc/forest.cpp 
     lib/gpc/fern.cpp 
+    lib/gpc/feature.cpp 
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp
index f0f7072..82aff06 100644
--- a/lib/gpc/Feature.hpp
+++ b/lib/gpc/Feature.hpp
@@ -96,32 +96,13 @@ class Feature {
      * @param      params  The parameters for this split
      * @param[in]  trip    The triplet
      */
-    inline void getDecisions(bool& ref,
+    void getDecisions(bool& ref,
                              bool& pos,
                              bool& neg,
                              params& params,
-                             const GPCPatchTriplet& trip) {
-        ref =
-            ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) <
-             params.tau);
-        pos =
-            ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) <
-             params.tau);
-        neg =
-            ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) <
-             params.tau);
-    }
-
-    Feature() {
-        std::random_device rd2;
-        rng = std::mt19937(rd2());
-        randIJ7 = std::uniform_int_distribution<int>(0, 48);
-        randIJ17 = std::uniform_int_distribution<int>(0, 17 * 17 - 1);
-        randIJ27 = std::uniform_int_distribution<int>(0, 27 * 27 - 1);
-
-        randTAU = std::uniform_int_distribution<int>(-15, 15);
-    }
+                             const GPCPatchTriplet& trip);
 
+    Feature();
     /**
      * @brief Returns a random hyperplane within a 27 x 27
      *        pixel-sized patch. depending on the scale
@@ -132,50 +113,7 @@ class Feature {
      * @param scale Determines which patch size is used
      * @param params returns the parameters
      */
-    void inline sampleHyperplane(int scale, params& params) {
-        if (scale == 2) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                int i = randIJ7(rng);
-                int j = randIJ7(rng);
-                params.ix = i % 7 - 3;
-                params.iy = i / 7 - 3;
-                params.jx = j % 7 - 3;
-                params.jy = j / 7 - 3;
-
-                params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3);
-                params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3);
-            }
-        } else if (scale == 1) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                int i = randIJ17(rng);
-                int j = randIJ17(rng);
-                params.ix = i % 17 - 8;
-                params.iy = i / 17 - 8;
-                params.jx = j % 17 - 8;
-                params.jy = j / 17 - 8;
-
-                params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8);
-                params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8);
-            }
-        } else if (scale == 0) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                params.i = randIJ27(rng);
-                params.j = randIJ27(rng);
-                params.ix = params.i % 27 - 13;
-                params.iy = params.i / 27 - 13;
-                params.jx = params.j % 27 - 13;
-                params.jy = params.j / 27 - 13;
-
-                params.i = (params.ix + 13) + 27 * (params.iy + 13);
-                params.j = (params.jx + 13) + 27 * (params.jy + 13);
-            }
-        }
-        params.tau = randTAU(rng);
-    }
-
+    void sampleHyperplane(int scale, params& params);
     /**
      * @brief      Gets all descriptors (triplets) for an image pair for
      * training given the three keypoint vectors.
@@ -193,56 +131,7 @@ class Feature {
                             std::vector<ndb::Point>& ref,
                             std::vector<ndb::Point>& pos,
                             std::vector<ndb::Point>& neg,
-                            std::vector<GPCPatchTriplet>& triplets) {
-        ndb::Buffer<uint8_t> LL(bwL.rows(), bwL.cols());
-        LL.width = bwL.width;
-        ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1);
-        LL.clearBoundary();
-
-        ndb::Buffer<uint8_t> RR(bwL.rows(), bwL.cols());
-        RR.width = bwR.width;
-        ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1);
-        RR.clearBoundary();
-
-        auto f = [=](ndb::Point& kp) {
-            if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 &&
-                kp.y < bwL.rows() - 20)
-                return false;
-            else
-                return true;
-        };
-
-        for (std::vector<ndb::Point>::size_type i = 0; i != ref.size(); i++) {
-            if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) {
-                // Get all descriptors:
-                GPCPatchTriplet newPatch;
-
-                // Reference patch
-                //====================================
-                newPatch.ref.x = ref[i].x;
-                newPatch.ref.y = ref[i].y;
-
-                LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27);
-
-                // Extract a positive match in the right image
-                //====================================
-                newPatch.pos.x = pos[i].x;
-                newPatch.pos.y = pos[i].y;
-
-                RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27);
-
-                // Extract negative patch
-                //====================================
-                newPatch.neg.x = neg[i].x;
-                newPatch.neg.y = neg[i].y;
-
-                RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27);
-
-                triplets.push_back(std::move(newPatch));
-            }
-        }
-    }
-
+                            std::vector<GPCPatchTriplet>& triplets);
     /**
      * @brief Store a vector of triplets of training data to file
      *
@@ -251,16 +140,7 @@ class Feature {
      *             in binary form.
      */
     void storeAllTriplets(std::vector<GPCPatchTriplet>& data,
-                          std::string path) {
-        ofstream fout;
-        fout.open(path, ios::binary | ios::out);
-        for (auto& triplet : data) {
-            fout.write((char*)triplet.ref.feature.data(), 27 * 27);
-            fout.write((char*)triplet.pos.feature.data(), 27 * 27);
-            fout.write((char*)triplet.neg.feature.data(), 27 * 27);
-        }
-        fout.close();
-    }
+                          std::string path);
     /**
      * @brief Read triplets of training data from a binary file
      *        written by the storeAllTriplets method.
@@ -269,33 +149,7 @@ class Feature {
      *
      * @return The training set
      */
-    std::vector<GPCPatchTriplet> loadAllTriplets(std::string path) {
-        std::vector<GPCPatchTriplet> data;
-        std::ifstream in(path, std::ifstream::ate | std::ifstream::binary);
-        uint32_t filesize = in.tellg();
-        if (filesize % ((27 * 27) * 3)) {
-            cout << "ERR: File is not a training set of this feature type"
-                 << endl;
-            cout << "FS: " << filesize << endl;
-            return data;
-        }
-        int numSamples = filesize / ((27 * 27) * 3);
-        data.resize(numSamples);
-        ifstream fin;
-        fin.open(path, ios::binary | ios::in);
-        for (auto& datum : data) {
-            datum.ref.feature.resize(27, 27);
-            datum.pos.feature.resize(27, 27);
-            datum.neg.feature.resize(27, 27);
-
-            fin.read((char*)datum.ref.feature.data(), 27 * 27);
-            fin.read((char*)datum.pos.feature.data(), 27 * 27);
-            fin.read((char*)datum.neg.feature.data(), 27 * 27);
-        }
-        fin.close();
-        return data;
-    }
-
+    std::vector<GPCPatchTriplet> loadAllTriplets(std::string path);
 };  // Feature
 }  // namespace training
 }  // namespace gpc
diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp
new file mode 100644
index 0000000..f68d440
--- /dev/null
+++ b/lib/gpc/feature.cpp
@@ -0,0 +1,215 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include <Eigen/Dense>
+#include <algorithm>
+#include <cmath>  //for log2
+#include <fstream>
+#include <gpc/buffer.hpp>
+#include <gpc/filter.hpp>
+#include <gpc/Feature.hpp>
+#include <iostream>
+#include <iterator>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace gpc {
+namespace training {
+void Feature::getDecisions(bool& ref,
+                         bool& pos,
+                         bool& neg,
+                         params& params,
+                         const GPCPatchTriplet& trip) {
+    // The same binary test is applied to each patch of the triplet:
+    //   feature(i) - feature(j) < tau, with the difference taken as int.
+    const auto i = params.i;
+    const auto j = params.j;
+    const auto tau = params.tau;
+
+    ref = ((int)trip.ref.feature(i) - (int)trip.ref.feature(j) < tau);
+    pos = ((int)trip.pos.feature(i) - (int)trip.pos.feature(j) < tau);
+    neg = ((int)trip.neg.feature(i) - (int)trip.neg.feature(j) < tau);
+}
+
+Feature::Feature() {
+    using IntRange = std::uniform_int_distribution<int>;
+    rng = std::mt19937(std::random_device{}());
+    // Flat index ranges of the 7x7, 17x17 and 27x27 windows, then tau's range.
+    randIJ7 = IntRange(0, 7 * 7 - 1);
+    randIJ17 = IntRange(0, 17 * 17 - 1);
+    randIJ27 = IntRange(0, 27 * 27 - 1);
+    randTAU = IntRange(-15, 15);
+}
+void Feature::sampleHyperplane(int scale, params& params) {
+    // Each branch redraws until the two flat pixel indices differ. Indices
+    // address a 27-wide patch; 280 (= 10 * 27 + 10) and 140 (= 5 * 27 + 5)
+    // shift the 7x7 and 17x17 windows to the centre of that patch.
+    if (scale == 2) {
+        do {
+            const int a = randIJ7(rng);
+            const int b = randIJ7(rng);
+            params.ix = a % 7 - 3;
+            params.iy = a / 7 - 3;
+            params.jx = b % 7 - 3;
+            params.jy = b / 7 - 3;
+
+            params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3);
+            params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3);
+        } while (params.i == params.j);
+    } else if (scale == 1) {
+        do {
+            const int a = randIJ17(rng);
+            const int b = randIJ17(rng);
+            params.ix = a % 17 - 8;
+            params.iy = a / 17 - 8;
+            params.jx = b % 17 - 8;
+            params.jy = b / 17 - 8;
+
+            params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8);
+            params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8);
+        } while (params.i == params.j);
+    } else if (scale == 0) {
+        do {
+            params.i = randIJ27(rng);
+            params.j = randIJ27(rng);
+            params.ix = params.i % 27 - 13;
+            params.iy = params.i / 27 - 13;
+            params.jx = params.j % 27 - 13;
+            params.jy = params.j / 27 - 13;
+
+            params.i = (params.ix + 13) + 27 * (params.iy + 13);
+            params.j = (params.jx + 13) + 27 * (params.jy + 13);
+        } while (params.i == params.j);
+    }
+    params.tau = randTAU(rng);
+}
+void Feature::extractAllTriplets(ndb::Buffer<uint8_t>& bwL,
+                        ndb::Buffer<uint8_t>& bwR,
+                        std::vector<ndb::Point>& ref,
+                        std::vector<ndb::Point>& pos,
+                        std::vector<ndb::Point>& neg,
+                        std::vector<GPCPatchTriplet>& triplets) {
+    ndb::Buffer<uint8_t> LL(bwL.rows(), bwL.cols());
+    LL.width = bwL.width;
+    ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1);
+    LL.clearBoundary();
+
+    // Sized from bwR (was bwL): box() below is handed bwR's dimensions.
+    ndb::Buffer<uint8_t> RR(bwR.rows(), bwR.cols());
+    RR.width = bwR.width;
+    ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1);
+    RR.clearBoundary();
+
+    // Border test; [&] avoids the Buffer copy that the former [=] made of bwL.
+    auto f = [&](ndb::Point& kp) {
+        return !(kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 &&
+                 kp.y < bwL.rows() - 20);
+    };
+    // Stop at the shortest list so pos[i] / neg[i] are never out of range.
+    const auto count = std::min({ref.size(), pos.size(), neg.size()});
+    for (std::vector<ndb::Point>::size_type i = 0; i != count; i++) {
+        if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) {
+            // Get all descriptors:
+            GPCPatchTriplet newPatch;
+
+            // Reference patch
+            //====================================
+            newPatch.ref.x = ref[i].x;
+            newPatch.ref.y = ref[i].y;
+
+            LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27);
+
+            // Extract a positive match in the right image
+            //====================================
+            newPatch.pos.x = pos[i].x;
+            newPatch.pos.y = pos[i].y;
+
+            RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27);
+
+            // Extract negative patch
+            //====================================
+            newPatch.neg.x = neg[i].x;
+            newPatch.neg.y = neg[i].y;
+
+            RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27);
+
+            triplets.push_back(std::move(newPatch));
+        }
+    }
+}
+
+void Feature::storeAllTriplets(std::vector<GPCPatchTriplet>& data,
+                      std::string path) {
+    // Per triplet: ref, pos, neg, each written as 27 * 27 raw bytes.
+    ofstream out(path, ios::binary | ios::out);
+    for (auto& sample : data) {
+        out.write((char*)sample.ref.feature.data(), 27 * 27);
+        out.write((char*)sample.pos.feature.data(), 27 * 27);
+        out.write((char*)sample.neg.feature.data(), 27 * 27);
+    }
+    out.close();
+}
+std::vector<Feature::GPCPatchTriplet> Feature::loadAllTriplets(std::string path) {
+    std::vector<Feature::GPCPatchTriplet> data;
+    // One triplet is three raw patches (ref, pos, neg) of 27 * 27 bytes each.
+    const std::streamoff tripletBytes = (27 * 27) * 3;
+    // Open at the end so tellg() is the file size (-1 if the open failed).
+    std::ifstream fin(path, std::ifstream::ate | std::ifstream::binary);
+    // Kept as streamoff: the former uint32_t wrapped for files >= 4 GiB.
+    const std::streamoff filesize = fin.tellg();
+    if (filesize < 0 || filesize % tripletBytes) {
+        cout << "ERR: File is not a training set of this feature type"
+             << endl;
+        cout << "FS: " << filesize << endl;
+        return data;
+    }
+    data.resize(static_cast<std::size_t>(filesize / tripletBytes));
+    fin.seekg(0, std::ios::beg);  // rewind instead of opening the file twice
+    for (auto& datum : data) {
+        datum.ref.feature.resize(27, 27);
+        datum.pos.feature.resize(27, 27);
+        datum.neg.feature.resize(27, 27);
+        fin.read((char*)datum.ref.feature.data(), 27 * 27);
+        fin.read((char*)datum.pos.feature.data(), 27 * 27);
+        fin.read((char*)datum.neg.feature.data(), 27 * 27);
+    }
+    return data;
+}
+
+}  // namespace training
+}  // namespace gpc

From af850bb7436a5ff22099bbd68abd1b5682449c2b Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 08:50:30 +0100
Subject: [PATCH 05/36] decouple filter

---
 CMakeLists.txt          |   1 +
 lib/gpc/filter.hpp      | 775 ++--------------------------------------
 samples/sparsematch.cpp |   2 +-
 3 files changed, 29 insertions(+), 749 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0f34ec..60f919e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ add_library(gpc_core
     lib/gpc/forest.cpp 
     lib/gpc/fern.cpp 
     lib/gpc/feature.cpp 
+    lib/gpc/filter.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
index 2caecc2..3576650 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/filter.hpp
@@ -58,40 +58,12 @@ namespace ndb {
  * @param ind output array (indices into n of nonzero elements)
  * @param m   number of elements in output
  */
-inline void arr2ind(const unsigned char* a,
+void arr2ind(const unsigned char* a,
                                        int n,
                                        int* ind,
-                                       int* m) {
+                                       int* m);
+
 #ifdef _INTRINSICS_SSE
-    int i, m0, k;
-    __m256i msk;
-    m0 = 0;
-    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
-        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
-                                _mm256_setzero_si256());
-        k = _mm256_movemask_epi8(msk);
-        k = ~k; /* Search for nonzero bits instead of zero bits.  */
-        while (k) {
-            ind[m0] =
-                i + _tzcnt_u32(
-                        k); /* Count the number of trailing zero bits in k. */
-            m0++;
-            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
-        }
-    }
-    *m = m0;
-#else
-    int nnz = 0;
-    for (int i = 0; i < n; i++) {
-        if (a[i] != 0) {
-            nnz++;
-            *ind = i;
-            ind++;
-        }
-    }
-    *m = nnz;
-#endif
-}
 /**
  * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
  *             (8x16bit)
@@ -100,12 +72,8 @@ inline void arr2ind(const unsigned char* a,
  * @param      y0    The y 0
  * @param      y1    The y 1
  */
-#ifdef _INTRINSICS_SSE
-void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
-    __m128i zero = _mm_setzero_si128();
-    y0 = _mm_unpacklo_epi8(x, zero);
-    y1 = _mm_unpackhi_epi8(x, zero);
-}
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1);
+
 /**
  * @brief      Packs 2x128bit vars with 16bit values(where 8 upper bits are
  *             zero) into 1x128bit with 8bit values
@@ -114,10 +82,7 @@ void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
  * @param[in]  x1    The x 1
  * @param      y     the packed vector
  */
-void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
-    y = _mm_packus_epi16(x0, x1);
-}
-
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y);
 #endif
 /**
  * @brief Calls a given functional f with subranges based on the given start
@@ -132,29 +97,10 @@ void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
  * @param end      end of the range
  * @param nThreads number of threads to use
  */
-inline void parFor(std::function<void(int, int)> const& f,
+void parFor(std::function<void(int, int)> const& f,
             int start,
             int end,
-            int nThreads) {
-    // Range definition
-    // quantities derived from range
-    int segSize = (end - start) / nThreads;
-    int lastSeg = (end - start) % nThreads;
-
-    std::vector<std::thread> threads;
-    threads.reserve(nThreads);
-
-    // Spawn threads
-    for (int t = 0; t < nThreads - 1; t++) {
-        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
-    }
-    threads.emplace_back(f,
-                         start + (nThreads - 1) * segSize,
-                         start + (nThreads)*segSize + lastSeg);
-    // Join
-    for (auto& t : threads) t.join();
-}
-
+            int nThreads);
 /**
  * @brief Naive 3x3 sobel filter implementation
  *
@@ -165,49 +111,9 @@ inline void parFor(std::function<void(int, int)> const& f,
  * @param[in]  numThreads number of threads to use
  * @param      threshold  threshold to binarize sobel filter output
  */
-inline void sobelNaive(
-    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    int thresholdSq = threshold * threshold;
-    uint8_t* ptr = in;
-
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
-
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold);
 
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-
-    // output pointer
-    uint8_t* optr = gradient + 1 * width + 1;
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
-
-            int val = sx * sx + sy * sy;
-
-            *optr = val > thresholdSq ? 255 : 0;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
 /**
  * @brief      Naive 3x3 box filter implementation
  *
@@ -217,44 +123,8 @@ inline void sobelNaive(
  * @param[in]  height   The height
  * @param[in]  numThreads number of threads to use
  */
-inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    // allocate space for result
-    uint8_t* ptr = in;
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
-
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
-
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-    uint8_t* optr = blurred + 1 * width + 1;
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
 
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int res =
-                (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) /
-                9;
-            *optr = res;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
  * fastmask. Naive implementation
@@ -269,28 +139,14 @@ inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
  */
-inline void gpcFilterNaive(uint8_t* in,
+void gpcFilterNaive(uint8_t* in,
                     const uint8_t* grad,
                     uint32_t* gpc,
                     std::vector<int32_t> fastmask,
                     std::vector<int>& idx,
                     int width,
-                    int height) {
-    // output buffer of same size
-    uint32_t tmp;
+                    int height);
 
-    int j = 0;
-    for (auto k : idx) {
-        tmp = 0;
-        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
-            tmp <<= 1;  // shift by one
-            if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1]))
-                tmp++;  // set this test's result to 1
-        }
-        gpc[k] = tmp;
-        j++;
-    }
-}
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
  * fastmask. Additionally uses a threshold vector (tau) Naive implementation.
@@ -303,29 +159,15 @@ inline void gpcFilterNaive(uint8_t* in,
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
  */
-inline void gpcFilterTauNaive(uint8_t* in,
+void gpcFilterTauNaive(uint8_t* in,
                        const uint8_t* grad,
                        uint32_t* gpc,
                        std::vector<int32_t> fastmask,
                        std::vector<int> tau,
                        std::vector<int>& idx,
                        int width,
-                       int height) {
-    uint32_t tmp;
-
-    int j = 0;
-    for (auto k : idx) {
-        tmp = 0;
-        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
-            tmp <<= 1;  // shift by one
-            if (*(in + k + fastmask[i]) >
-                *(in + k + fastmask[i + 1]) - tau[i / 2])
-                tmp++;  // set this test's result to 1
-        }
-        gpc[k] = tmp;
-        j++;
-    }
-} /**
+                       int height);
+/**
    * @brief      boxfilter using SSE2 instructions. Loosely based on
    *             https://www.ignorantus.com/box_sse2/, published under
    *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
@@ -336,123 +178,8 @@ inline void gpcFilterTauNaive(uint8_t* in,
    * @param[in]  height   The height
    * @param[in]  numThreads number of threads to use
    */
-inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    boxNaive(in, blurred, width, height);
-#else
-    auto boxFilterSegment = [&](int start, int end) {
-        int x, y;
-        __m128i one_third;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        dst0 = (__m128i*)(blurred + width * (start));
-        dst1 = (__m128i*)(blurred + width * (start + 1));
-        for (y = start; y < end;
-             y += 2) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2, *row3;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-            row3 = row2 + width;
-
-            for (x = 0; x < width; x += 16) {
-                __m128i s00, s01, s02;
-                __m128i r00, r01, r02;
-                __m128i ra00, ra01, ra02;
-                __m128i rb00, rb01, rb02;
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
 
-                __m128i a00, a01, a02, b00, b01, b02;
-
-                __m128i tmp0, tmp1, res;
-
-                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
-                s02 = _mm_load_si128((__m128i*)(row0));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
-                s02 = _mm_load_si128((__m128i*)(row1));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
-                s02 = _mm_load_si128((__m128i*)(row2));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst0++, res);
-
-                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
-                s02 = _mm_load_si128((__m128i*)(row3));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst1++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-                row3 += 16;
-            }
-            // still storing 128bit, but now in 16 x 8bit format, so /16 instead
-            // of /8
-            dst0 += width / 16;
-            dst1 += width / 16;
-        }
-    };  // lambda
-
-    boxFilterSegment(1, height - 3);
-    // parFor(boxFilterSegment,1,height-3,4);
-#endif
-}
 /**
  * @brief      3x3 Sobel filter. Input dimension must be multiple of 16
  *
@@ -464,158 +191,12 @@ inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThr
  * @param[in]  numThreads number of threads to use
  */
 
-inline void sobel(uint8_t* in,
+void sobel(uint8_t* in,
            uint8_t* blurred,
            int width,
            int height,
            uint8_t threshold,
-           int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    sobelNaive(in, blurred, width, height, threshold);
-#else
-    auto sobelSSESegment = [&](int start, int end) {
-        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        int x, y;
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
-
-        binThres = _mm_set1_epi16(threshold * threshold);
-
-        dst0 = (__m128i*)(blurred + width * 1);
-        // dst1 = (__m128i *)(blurred + width * 2);
-        for (y = start; y < end;
-             y++) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-
-            for (x = 0; x < width; x += 16) {
-                // Note: Center element not used in sobel kernels!!
-                // Kernel indices:
-                // 00 01 02
-                // 10 11 12
-                // 20 21 22
-
-                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
-                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
-
-                __m128i raA, raB, rbA, rbB;
-                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
-                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
-                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
-
-                // Sobel kernels for x and y direction.
-                //      1 0 -1       1 2 1
-                // sx = 2 0 -2 sy =  0 0 0
-                //      1 0 -1      -1-2-1
-                //      Note that neither kernel uses the center element)
-
-                // In the following, mullo is used to multiply intermediate
-                // results with -1 To divide by 3, 16bit overflow divide by
-                // multiply is used, which thus uses the upper 16bit(_mm_mulhi)
-                // of the 32bit temporary result.
-
-                // sx column kernel vectors (1,2,1)
-                // Two chained add/sub are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
-                                  a10),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
-                                  b10),
-                    one_ninth);
-
-                // sx column kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
-                                  a12),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
-                                  b12),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                sxa = _mm_mullo_epi16(tmpa, tmpa);
-                sxb = _mm_mullo_epi16(tmpb, tmpb);
-
-                // sy row kernel vector (1,2,1)
-                // Two chained add are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
-                                  a01),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
-                                  b01),
-                    one_ninth);
-
-                // sy row kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
-                                  a21),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
-                                  b21),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                // watch out, can't overwrite this
-                sya = _mm_mullo_epi16(tmpa, tmpa);
-                syb = _mm_mullo_epi16(tmpb, tmpb);
-
-                __m128i zero = _mm_setzero_si128();
-
-                // The unpacklo is necessary because _mm_cmput_epi16 sets the
-                // output to 0xFFFF if the comparison is true. When packing
-                // 16bit to 8bit however, 0xFFFF will be interpreted (in a
-                // signed environment) as being negative, and hence set to 0,
-                // resulting in a 0 output everywhere. using unpacklo in between
-                // we get 0xFFFF->0xFF
-                pack16to8(
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
-                        zero),
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
-                        zero),
-                    res);
-
-                _mm_store_si128(dst0++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-            }  // cols
-        }  // rows
-    };  // Lambda
-    sobelSSESegment(1, height - 3);
-#endif
-}
+           int numThreads);
 
 /**
  * @brief Checks if the 128bits in xmm are all zero
@@ -625,10 +206,7 @@ inline void sobel(uint8_t* in,
  * @return true if all zeros, false otherwise
  */
 #ifdef _INTRINSICS_SSE
-inline bool isAllZeros(__m128i xmm) {
-    return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) ==
-           0xFFFF;
-}
+bool isAllZeros(__m128i xmm);
 #endif
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
@@ -645,79 +223,15 @@ inline bool isAllZeros(__m128i xmm) {
  * @param height    The height of the image at pointer *in
  * @param numThreadsNumber of threads to use
  */
-inline void gpcFilter(uint8_t* in,
+void gpcFilter(uint8_t* in,
                const uint8_t* grad,
                uint32_t* gpc,
                std::vector<int32_t> fastmask,
                std::vector<int>& idx,
                int width,
                int height,
-               int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
-#else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
-
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i + 1]))),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
-                    }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-                }
-            }  // col iteration
-        }  // row iteration
-    };
+               int numThreads);
 
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
-#endif
-}
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
  * fastmask. Additionally uses a threshold vector (tau)
@@ -731,7 +245,7 @@ inline void gpcFilter(uint8_t* in,
  * @param height    The height of the image at pointer *in
  * @param numThreads Number of threads to use
  */
-inline void gpcFilterTau(uint8_t* in,
+void gpcFilterTau(uint8_t* in,
                   const uint8_t* grad,
                   uint32_t* gpc,
                   std::vector<int32_t> fastmask,
@@ -739,75 +253,7 @@ inline void gpcFilterTau(uint8_t* in,
                   std::vector<int>& idx,
                   int width,
                   int height,
-                  int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
-#else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
-
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
-                                _mm_subs_epi8(
-                                    _mm_lddqu_si128(
-                                        (__m128i*)(center + fastmask[i + 1])),
-                                    _mm_set1_epi8(tau[i / 2]))  // deduct tau
-                                ),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
-                    }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-                }
-            }  // col iteration
-        }  // row iteration
-    };
-
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
-#endif
-}
+                  int numThreads); 
 /**
  * @brief Naive version of 5x5 census transoform
  *
@@ -816,30 +262,8 @@ inline void gpcFilterTau(uint8_t* in,
  * @param width   Width of the image at *in pointer
  * @param height  Heiht of the image at *in pointer
  */
-inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
-    uint32_t val;
-    uint32_t* dst;
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x++) {
-            val = 0;
-            dst = census + y * width + x;
-            int i = 0;
-            // patch loops
-            for (int px = -2; px <= 2; px++) {
-                for (int py = -2; py <= 2; py++) {
-                    if (!(px == 0 && py == 0)) {
-                        val |= (in[(y + py) * width + (x + px)] >
-                                in[y * width + x])
-                                   ? (1 << i)
-                                   : 0;
-                        i++;
-                    }
-                }
-            }  // End patch loops
-            *dst = val;
-        }
-    }  // End pixel loops
-}
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
+
 
 /**
  * @brief 5x5 dense census transform of input image. binary codes are returned
@@ -850,151 +274,6 @@ inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height)
  * @param width
  * @param height
  */
-inline void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    census5x5Naive(in, census, width, height);
-#else
-    __m128i zero = _mm_set1_epi8(0);
-    __m128i one = _mm_set1_epi8(1);
-
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x += 16) {
-            uint8_t* rowPtr;
-            rowPtr = in + (y - 2) * width + x;
-            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
-            __m128i* dst = (__m128i*)(census + y * width +
-                                      x);  // Set starting point to pixel (2,2)
-            // row 0
-            __m128i bitMask = one;
-            __m128i byte1 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 1
-            rowPtr += width;
-            bitMask += bitMask;  // 32
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte2 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 2
-            rowPtr += width;
-            bitMask += bitMask;  // 4
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 3
-            rowPtr += width;
-            bitMask += bitMask;  // 64
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte3 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 4
-            rowPtr += width;
-            bitMask += bitMask;  // 8
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // 8bit to 16bit
-            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
-            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
-            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
-            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
-
-            // 16bit to 32bit ints
-            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-
-        }  // col iteration
-    }  // row iteration
-    // if(numThreads == 1)
-    // gpcFilterSegment(13,height-15);
-    // else
-    // parFor(gpcFilterSegment,13,height-15,4);
-
-#endif
-}  // census5x5
+void census5x5(uint8_t* in, uint32_t* census, int width, int height);
 }  // namespace ndb
 #endif
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 6834f61..3d5f19b 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -32,7 +32,7 @@ int main(int argc, char** argv) {
     gpc::inference::InferenceSettings inferencesettings =
         gpc::inference::InferenceSettings()
             .builder()
-            .gradientThreshold(5)
+            .gradientThreshold(20)
             .verticalTolerance(
                 0)               // 0px tolerance for rectified epipolar matches
             .dispHigh(128)       // limit disparities to 128

From d4dcdf6327f931d2b695becc5f7c692abaf7e30e Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 09:03:49 +0100
Subject: [PATCH 06/36] add filter cpp

---
 lib/gpc/filter.cpp | 833 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 833 insertions(+)
 create mode 100644 lib/gpc/filter.cpp

diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp
new file mode 100644
index 0000000..d03c0af
--- /dev/null
+++ b/lib/gpc/filter.cpp
@@ -0,0 +1,833 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#ifndef __NDB__FILTER
+#define __NDB__FILTER
+
+#include <cassert>
+#include <thread>
+
+#include "gpc/filter.hpp"
+using namespace std;
+
+namespace ndb {
+void arr2ind(const unsigned char* a,
+                                       int n,
+                                       int* ind,
+                                       int* m) {
+#ifdef _INTRINSICS_SSE
+    int i, m0, k;
+    __m256i msk;
+    m0 = 0;
+    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
+        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
+                                _mm256_setzero_si256());
+        k = _mm256_movemask_epi8(msk);
+        k = ~k; /* Search for nonzero bits instead of zero bits.  */
+        while (k) {
+            ind[m0] =
+                i + _tzcnt_u32(
+                        k); /* Count the number of trailing zero bits in k. */
+            m0++;
+            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
+        }
+    }
+    *m = m0;
+#else
+    int nnz = 0;
+    for (int i = 0; i < n; i++) {
+        if (a[i] != 0) {
+            nnz++;
+            *ind = i;
+            ind++;
+        }
+    }
+    *m = nnz;
+#endif
+}
+#ifdef _INTRINSICS_SSE
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
+    __m128i zero = _mm_setzero_si128();
+    y0 = _mm_unpacklo_epi8(x, zero);
+    y1 = _mm_unpackhi_epi8(x, zero);
+}
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
+    y = _mm_packus_epi16(x0, x1);
+}
+
+#endif
+void parFor(std::function<void(int, int)> const& f,
+            int start,
+            int end,
+            int nThreads) {
+    // Range definition
+    // quantities derived from range
+    int segSize = (end - start) / nThreads;
+    int lastSeg = (end - start) % nThreads;
+
+    std::vector<std::thread> threads;
+    threads.reserve(nThreads);
+
+    // Spawn threads
+    for (int t = 0; t < nThreads - 1; t++) {
+        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
+    }
+    threads.emplace_back(f,
+                         start + (nThreads - 1) * segSize,
+                         start + (nThreads)*segSize + lastSeg);
+    // Join
+    for (auto& t : threads) t.join();
+}
+
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+    int thresholdSq = threshold * threshold;
+    uint8_t* ptr = in;
+
+    uint8_t* p11 = ptr + 0 * width;
+    uint8_t* p12 = ptr + 0 * width + 1;
+    uint8_t* p13 = ptr + 0 * width + 2;
+
+    uint8_t* p21 = ptr + 1 * width;
+    uint8_t* p22 = ptr + 1 * width + 1;
+    uint8_t* p23 = ptr + 1 * width + 2;
+
+    uint8_t* p31 = ptr + 2 * width;
+    uint8_t* p32 = ptr + 2 * width + 1;
+    uint8_t* p33 = ptr + 2 * width + 2;
+
+    // output pointer
+    uint8_t* optr = gradient + 1 * width + 1;
+    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
+    // boundary) (unoptimized)
+    for (int iy = 1; iy < height - 1; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
+            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
+
+            int val = sx * sx + sy * sy;
+
+            *optr = val > thresholdSq ? 255 : 0;
+            p11++;
+            p12++;
+            p13++;
+            p21++;
+            p22++;
+            p23++;
+            p31++;
+            p32++;
+            p33++;
+            optr++;
+        }
+    }
+}
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+    // allocate space for result
+    uint8_t* ptr = in;
+    uint8_t* p11 = ptr + 0 * width;
+    uint8_t* p12 = ptr + 0 * width + 1;
+    uint8_t* p13 = ptr + 0 * width + 2;
+
+    uint8_t* p21 = ptr + 1 * width;
+    uint8_t* p22 = ptr + 1 * width + 1;
+    uint8_t* p23 = ptr + 1 * width + 2;
+
+    uint8_t* p31 = ptr + 2 * width;
+    uint8_t* p32 = ptr + 2 * width + 1;
+    uint8_t* p33 = ptr + 2 * width + 2;
+    uint8_t* optr = blurred + 1 * width + 1;
+
+    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
+    // boundary) (unoptimized)
+    for (int iy = 1; iy < height - 1; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            int res =
+                (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) /
+                9;
+            *optr = res;
+            p11++;
+            p12++;
+            p13++;
+            p21++;
+            p22++;
+            p23++;
+            p31++;
+            p32++;
+            p33++;
+            optr++;
+        }
+    }
+}
+void gpcFilterNaive(uint8_t* in,
+                    const uint8_t* grad,
+                    uint32_t* gpc,
+                    std::vector<int32_t> fastmask,
+                    std::vector<int>& idx,
+                    int width,
+                    int height) {
+    // output buffer of same size
+    uint32_t tmp;
+
+    int j = 0;
+    for (auto k : idx) {
+        tmp = 0;
+        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
+            tmp <<= 1;  // shift by one
+            if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1]))
+                tmp++;  // set this test's result to 1
+        }
+        gpc[k] = tmp;
+        j++;
+    }
+}
+
+void gpcFilterTauNaive(uint8_t* in,
+                       const uint8_t* grad,
+                       uint32_t* gpc,
+                       std::vector<int32_t> fastmask,
+                       std::vector<int> tau,
+                       std::vector<int>& idx,
+                       int width,
+                       int height) {
+    uint32_t tmp;
+
+    int j = 0;
+    for (auto k : idx) {
+        tmp = 0;
+        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
+            tmp <<= 1;  // shift by one
+            if (*(in + k + fastmask[i]) >
+                *(in + k + fastmask[i + 1]) - tau[i / 2])
+                tmp++;  // set this test's result to 1
+        }
+        gpc[k] = tmp;
+        j++;
+    }
+} 
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    boxNaive(in, blurred, width, height);
+#else
+    auto boxFilterSegment = [&](int start, int end) {
+        int x, y;
+        __m128i one_third;
+        __m128i *dst0, *dst1;
+        __m128i zero = _mm_setzero_si128();
+
+        one_third = _mm_set1_epi16(
+            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
+        dst0 = (__m128i*)(blurred + width * (start));
+        dst1 = (__m128i*)(blurred + width * (start + 1));
+        for (y = start; y < end;
+             y += 2) {  // We compute results for two rows in one iteration
+            const uint8_t *row0, *row1, *row2, *row3;
+
+            row1 = in + y * width;
+            row0 = row1 - width;
+            row2 = row1 + width;
+            row3 = row2 + width;
+
+            for (x = 0; x < width; x += 16) {
+                __m128i s00, s01, s02;
+                __m128i r00, r01, r02;
+                __m128i ra00, ra01, ra02;
+                __m128i rb00, rb01, rb02;
+
+                __m128i a00, a01, a02, b00, b01, b02;
+
+                __m128i tmp0, tmp1, res;
+
+                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
+                s02 = _mm_load_si128((__m128i*)(row0));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
+                s02 = _mm_load_si128((__m128i*)(row1));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra01 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb01 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
+                s02 = _mm_load_si128((__m128i*)(row2));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra02 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb02 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                tmp0 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
+                    one_third);
+                tmp1 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
+                    one_third);
+
+                pack16to8(tmp0, tmp1, res);
+                _mm_store_si128(dst0++, res);
+
+                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
+                s02 = _mm_load_si128((__m128i*)(row3));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+                ra00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                tmp0 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
+                    one_third);
+                tmp1 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
+                    one_third);
+
+                pack16to8(tmp0, tmp1, res);
+                _mm_store_si128(dst1++, res);
+
+                row0 += 16;
+                row1 += 16;
+                row2 += 16;
+                row3 += 16;
+            }
+            // still storing 128bit, but now in 16 x 8bit format, so /16 instead
+            // of /8
+            dst0 += width / 16;
+            dst1 += width / 16;
+        }
+    };  // lambda
+
+    boxFilterSegment(1, height - 3);
+    // parFor(boxFilterSegment,1,height-3,4);
+#endif
+}
+void sobel(uint8_t* in,
+           uint8_t* blurred,
+           int width,
+           int height,
+           uint8_t threshold,
+           int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    sobelNaive(in, blurred, width, height, threshold);
+#else
+    auto sobelSSESegment = [&](int start, int end) {
+        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
+        __m128i *dst0, *dst1;
+        __m128i zero = _mm_setzero_si128();
+
+        int x, y;
+        one_third = _mm_set1_epi16(
+            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
+        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
+
+        binThres = _mm_set1_epi16(threshold * threshold);
+
+        dst0 = (__m128i*)(blurred + width * 1);
+        // dst1 = (__m128i *)(blurred + width * 2);
+        for (y = start; y < end;
+             y++) {  // We compute results for two rows in one iteration
+            const uint8_t *row0, *row1, *row2;
+
+            row1 = in + y * width;
+            row0 = row1 - width;
+            row2 = row1 + width;
+
+            for (x = 0; x < width; x += 16) {
+                // Note: Center element not used in sobel kernels!!
+                // Kernel indices:
+                // 00 01 02
+                // 10 11 12
+                // 20 21 22
+
+                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
+                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
+
+                __m128i raA, raB, rbA, rbB;
+                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
+
+                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
+                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
+
+                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
+
+                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
+                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
+
+                // Sobel kernels for x and y direction.
+                //      1 0 -1       1 2 1
+                // sx = 2 0 -2 sy =  0 0 0
+                //      1 0 -1      -1-2-1
+                //      Note that neither kernel uses the center element)
+
+                // In the following, mullo is used to multiply intermediate
+                // results with -1 To divide by 3, 16bit overflow divide by
+                // multiply is used, which thus uses the upper 16bit(_mm_mulhi)
+                // of the 32bit temporary result.
+
+                // sx column kernel vectors (1,2,1)
+                // Two chained add/sub are used for 2 and -2
+                raA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
+                                  a10),
+                    one_ninth);
+                rbA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
+                                  b10),
+                    one_ninth);
+
+                // sx column kernel vector (-1 -2 -1)
+                raB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
+                                  a12),
+                    one_ninth);
+                rbB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
+                                  b12),
+                    one_ninth);
+
+                // Square of sx: Add squares of above temporaries into final sum
+                tmpa = _mm_sub_epi16(raA, raB);
+                tmpb = _mm_sub_epi16(rbA, rbB);
+
+                sxa = _mm_mullo_epi16(tmpa, tmpa);
+                sxb = _mm_mullo_epi16(tmpb, tmpb);
+
+                // sy row kernel vector (1,2,1)
+                // Two chained add are used for 2 and -2
+                raA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
+                                  a01),
+                    one_ninth);
+                rbA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
+                                  b01),
+                    one_ninth);
+
+                // sy row kernel vector (-1 -2 -1)
+                raB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
+                                  a21),
+                    one_ninth);
+                rbB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
+                                  b21),
+                    one_ninth);
+
+                // Square of sx: Add squares of above temporaries into final sum
+                tmpa = _mm_sub_epi16(raA, raB);
+                tmpb = _mm_sub_epi16(rbA, rbB);
+
+                // watch out, can't overwrite this
+                sya = _mm_mullo_epi16(tmpa, tmpa);
+                syb = _mm_mullo_epi16(tmpb, tmpb);
+
+                __m128i zero = _mm_setzero_si128();
+
+                // The unpacklo is necessary because _mm_cmput_epi16 sets the
+                // output to 0xFFFF if the comparison is true. When packing
+                // 16bit to 8bit however, 0xFFFF will be interpreted (in a
+                // signed environment) as being negative, and hence set to 0,
+                // resulting in a 0 output everywhere. using unpacklo in between
+                // we get 0xFFFF->0xFF
+                pack16to8(
+                    _mm_unpacklo_epi8(
+                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
+                        zero),
+                    _mm_unpacklo_epi8(
+                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
+                        zero),
+                    res);
+
+                _mm_store_si128(dst0++, res);
+
+                row0 += 16;
+                row1 += 16;
+                row2 += 16;
+            }  // cols
+        }  // rows
+    };  // Lambda
+    sobelSSESegment(1, height - 3);
+#endif
+}
+
+#ifdef _INTRINSICS_SSE
+bool isAllZeros(__m128i xmm) {
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) ==
+           0xFFFF;
+}
+#endif
+void gpcFilter(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height,
+               int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
+#else
+    auto gpcFilterSegment = [&](int start, int end) {
+        __m128i zero = _mm_set1_epi8(0);
+        __m128i one = _mm_set1_epi8(1);
+        for (int y = start; y < end; y++) {
+            for (int x = 0; x < width; x += 16) {
+                uint8_t* rowPtr;
+                rowPtr = in + (y - 2) * width + x;
+                __m128i out[4];  // temporary output vector of 4 128bit words
+
+                const uint8_t* center = (in + y * width + x);
+                const uint8_t* centerGrad = (grad + y * width + x);
+                // We only process the current segment if there are any non-zero
+                // values (high gradient pixels)
+                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                    __m128i* dst =
+                        (__m128i*)(gpc + y * width +
+                                   x);  // Set starting point to pixel (2,2)
+                    out[0] = zero;
+                    out[1] = zero;
+                    out[2] = zero;
+                    out[3] = zero;
+                    uint8_t k = 0;
+                    __m128i bitMask = one;
+                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                        out[k] |= _mm_and_si128(
+                            _mm_cmpgt_epu8(
+                                _mm_lddqu_si128(
+                                    (__m128i*)(center + fastmask[i])),
+                                _mm_lddqu_si128(
+                                    (__m128i*)(center + fastmask[i + 1]))),
+                            bitMask);
+                        // Keeps index into output vector and updates bit mask
+                        if (i % 16 == 0 && i != 0) {
+                            bitMask = one;
+                            k++;
+                        } else {
+                            bitMask += bitMask;
+                        }
+                    }
+                    // 8bit to 16bit
+                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
+
+                    // 16bit to 32bit ints
+                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+                }
+            }  // col iteration
+        }  // row iteration
+    };
+
+    if (numThreads == 1)
+        gpcFilterSegment(13, height - 15);
+    else
+        parFor(gpcFilterSegment, 13, height - 15, 4);
+#endif
+}
+void gpcFilterTau(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height,
+                  int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
+#else
+    auto gpcFilterSegment = [&](int start, int end) {
+        __m128i zero = _mm_set1_epi8(0);
+        __m128i one = _mm_set1_epi8(1);
+        for (int y = start; y < end; y++) {
+            for (int x = 0; x < width; x += 16) {
+                uint8_t* rowPtr;
+                rowPtr = in + (y - 2) * width + x;
+                __m128i out[4];  // temporary output vector of 4 128bit words
+
+                const uint8_t* center = (in + y * width + x);
+                const uint8_t* centerGrad = (grad + y * width + x);
+                // We only process the current segment if there are any non-zero
+                // values (high gradient pixels)
+                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                    __m128i* dst =
+                        (__m128i*)(gpc + y * width +
+                                   x);  // Set starting point to pixel (2,2)
+                    out[0] = zero;
+                    out[1] = zero;
+                    out[2] = zero;
+                    out[3] = zero;
+                    uint8_t k = 0;
+                    __m128i bitMask = one;
+                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                        out[k] |= _mm_and_si128(
+                            _mm_cmpgt_epu8(
+                                _mm_lddqu_si128(
+                                    (__m128i*)(center + fastmask[i])),
+                                _mm_subs_epi8(
+                                    _mm_lddqu_si128(
+                                        (__m128i*)(center + fastmask[i + 1])),
+                                    _mm_set1_epi8(tau[i / 2]))  // deduct tau
+                                ),
+                            bitMask);
+                        // Keeps index into output vector and updates bit mask
+                        if (i % 16 == 0 && i != 0) {
+                            bitMask = one;
+                            k++;
+                        } else {
+                            bitMask += bitMask;
+                        }
+                    }
+                    // 8bit to 16bit
+                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
+
+                    // 16bit to 32bit ints
+                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+                }
+            }  // col iteration
+        }  // row iteration
+    };
+
+    if (numThreads == 1)
+        gpcFilterSegment(13, height - 15);
+    else
+        parFor(gpcFilterSegment, 13, height - 15, 4);
+#endif
+}
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
+    uint32_t val;
+    uint32_t* dst;
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x++) {
+            val = 0;
+            dst = census + y * width + x;
+            int i = 0;
+            // patch loops
+            for (int px = -2; px <= 2; px++) {
+                for (int py = -2; py <= 2; py++) {
+                    if (!(px == 0 && py == 0)) {
+                        val |= (in[(y + py) * width + (x + px)] >
+                                in[y * width + x])
+                                   ? (1 << i)
+                                   : 0;
+                        i++;
+                    }
+                }
+            }  // End patch loops
+            *dst = val;
+        }
+    }  // End pixel loops
+}
+void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    census5x5Naive(in, census, width, height);
+#else
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
+            __m128i* dst = (__m128i*)(census + y * width +
+                                      x);  // Set starting point to pixel (2,2)
+            // row 0
+            __m128i bitMask = one;
+            __m128i byte1 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 1
+            rowPtr += width;
+            bitMask += bitMask;  // 32
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte2 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 2
+            rowPtr += width;
+            bitMask += bitMask;  // 4
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 3
+            rowPtr += width;
+            bitMask += bitMask;  // 64
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte3 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 4
+            rowPtr += width;
+            bitMask += bitMask;  // 8
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // 8bit to 16bit
+            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
+            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
+            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
+            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
+
+            // 16bit to 32bit ints
+            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+
+        }  // col iteration
+    }  // row iteration
+    // if(numThreads == 1)
+    // gpcFilterSegment(13,height-15);
+    // else
+    // parFor(gpcFilterSegment,13,height-15,4);
+
+#endif
+}  // census5x5
+}  // namespace ndb
+#endif

From b682d4415fb1e9968cd15712a1278bfc82cb66f5 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 09:20:22 +0100
Subject: [PATCH 07/36] break out sobel kernel

---
 CMakeLists.txt            |   1 +
 lib/gpc/filter.cpp        | 193 --------------------------------
 lib/gpc/filter.hpp        |  31 ------
 lib/gpc/forest.cpp        |   1 +
 lib/gpc/kernels/sobel.cpp | 229 ++++++++++++++++++++++++++++++++++++++
 lib/gpc/kernels/sobel.hpp |  70 ++++++++++++
 6 files changed, 301 insertions(+), 224 deletions(-)
 create mode 100644 lib/gpc/kernels/sobel.cpp
 create mode 100644 lib/gpc/kernels/sobel.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60f919e..8f7182d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@ add_library(gpc_core
     lib/gpc/fern.cpp 
     lib/gpc/feature.cpp 
     lib/gpc/filter.cpp
+    lib/gpc/kernels/sobel.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp
index d03c0af..cbe18b9 100644
--- a/lib/gpc/filter.cpp
+++ b/lib/gpc/filter.cpp
@@ -106,49 +106,7 @@ void parFor(std::function<void(int, int)> const& f,
     for (auto& t : threads) t.join();
 }
 
-void sobelNaive(
-    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    int thresholdSq = threshold * threshold;
-    uint8_t* ptr = in;
-
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
 
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
-
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-
-    // output pointer
-    uint8_t* optr = gradient + 1 * width + 1;
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
-
-            int val = sx * sx + sy * sy;
-
-            *optr = val > thresholdSq ? 255 : 0;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
 void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
     // allocate space for result
@@ -350,158 +308,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     // parFor(boxFilterSegment,1,height-3,4);
 #endif
 }
-void sobel(uint8_t* in,
-           uint8_t* blurred,
-           int width,
-           int height,
-           uint8_t threshold,
-           int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    sobelNaive(in, blurred, width, height, threshold);
-#else
-    auto sobelSSESegment = [&](int start, int end) {
-        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
 
-        int x, y;
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
-
-        binThres = _mm_set1_epi16(threshold * threshold);
-
-        dst0 = (__m128i*)(blurred + width * 1);
-        // dst1 = (__m128i *)(blurred + width * 2);
-        for (y = start; y < end;
-             y++) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-
-            for (x = 0; x < width; x += 16) {
-                // Note: Center element not used in sobel kernels!!
-                // Kernel indices:
-                // 00 01 02
-                // 10 11 12
-                // 20 21 22
-
-                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
-                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
-
-                __m128i raA, raB, rbA, rbB;
-                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
-                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
-                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
-
-                // Sobel kernels for x and y direction.
-                //      1 0 -1       1 2 1
-                // sx = 2 0 -2 sy =  0 0 0
-                //      1 0 -1      -1-2-1
-                //      Note that neither kernel uses the center element)
-
-                // In the following, mullo is used to multiply intermediate
-                // results with -1 To divide by 3, 16bit overflow divide by
-                // multiply is used, which thus uses the upper 16bit(_mm_mulhi)
-                // of the 32bit temporary result.
-
-                // sx column kernel vectors (1,2,1)
-                // Two chained add/sub are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
-                                  a10),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
-                                  b10),
-                    one_ninth);
-
-                // sx column kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
-                                  a12),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
-                                  b12),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                sxa = _mm_mullo_epi16(tmpa, tmpa);
-                sxb = _mm_mullo_epi16(tmpb, tmpb);
-
-                // sy row kernel vector (1,2,1)
-                // Two chained add are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
-                                  a01),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
-                                  b01),
-                    one_ninth);
-
-                // sy row kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
-                                  a21),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
-                                  b21),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                // watch out, can't overwrite this
-                sya = _mm_mullo_epi16(tmpa, tmpa);
-                syb = _mm_mullo_epi16(tmpb, tmpb);
-
-                __m128i zero = _mm_setzero_si128();
-
-                // The unpacklo is necessary because _mm_cmput_epi16 sets the
-                // output to 0xFFFF if the comparison is true. When packing
-                // 16bit to 8bit however, 0xFFFF will be interpreted (in a
-                // signed environment) as being negative, and hence set to 0,
-                // resulting in a 0 output everywhere. using unpacklo in between
-                // we get 0xFFFF->0xFF
-                pack16to8(
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
-                        zero),
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
-                        zero),
-                    res);
-
-                _mm_store_si128(dst0++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-            }  // cols
-        }  // rows
-    };  // Lambda
-    sobelSSESegment(1, height - 3);
-#endif
-}
 
 #ifdef _INTRINSICS_SSE
 bool isAllZeros(__m128i xmm) {
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
index 3576650..2e6454a 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/filter.hpp
@@ -101,19 +101,6 @@ void parFor(std::function<void(int, int)> const& f,
             int start,
             int end,
             int nThreads);
-/**
- * @brief Naive 3x3 sobel filter implementation
- *
- * @param      in       input image
- * @param      blurred  The blurred output image
- * @param[in]  width    The width
- * @param[in]  height   The height
- * @param[in]  numThreads number of threads to use
- * @param      threshold  threshold to binarize sobel filter output
- */
-void sobelNaive(
-    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold);
-
 /**
  * @brief      Naive 3x3 box filter implementation
  *
@@ -180,24 +167,6 @@ void gpcFilterTauNaive(uint8_t* in,
    */
 void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
 
-/**
- * @brief      3x3 Sobel filter. Input dimension must be multiple of 16
- *
- * @param      in         { parameter_description }
- * @param      blurred    The blurred
- * @param[in]  width      The width
- * @param[in]  height     The height
- * @param[in]  threshold  The threshold
- * @param[in]  numThreads number of threads to use
- */
-
-void sobel(uint8_t* in,
-           uint8_t* blurred,
-           int width,
-           int height,
-           uint8_t threshold,
-           int numThreads);
-
 /**
  * @brief Checks if the 128bits in xmm are all zero
  *
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 0809951..8b9d2ef 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -47,6 +47,7 @@
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
 #include "gpc/filter.hpp"
+#include "gpc/kernels/sobel.hpp"
 #include "gpc/hashmatch.hpp"
 #include "gpc/forest.hpp"
 
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
new file mode 100644
index 0000000..4d56716
--- /dev/null
+++ b/lib/gpc/kernels/sobel.cpp
@@ -0,0 +1,229 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include "gpc/kernels/sobel.hpp"
+namespace ndb {
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+    int thresholdSq = threshold * threshold;  // compared against sx^2 + sy^2
+    uint8_t* ptr = in;
+    // Nine read cursors, one per tap of the 3x3 window (pRC = row R, column C).
+    uint8_t* p11 = ptr + 0 * width;
+    uint8_t* p12 = ptr + 0 * width + 1;
+    uint8_t* p13 = ptr + 0 * width + 2;
+
+    uint8_t* p21 = ptr + 1 * width;
+    uint8_t* p22 = ptr + 1 * width + 1;  // centre tap: advanced, never read
+    uint8_t* p23 = ptr + 1 * width + 2;
+
+    uint8_t* p31 = ptr + 2 * width;
+    uint8_t* p32 = ptr + 2 * width + 1;
+    uint8_t* p33 = ptr + 2 * width + 2;
+    // NOTE(review): the last iterations read p32/p33 at in[width*height] and
+    // in[width*height + 1]; confirm callers pad `in` beyond width*height bytes.
+    uint8_t* optr = gradient + 1 * width + 1;  // output cursor: row 1, column 1
+    // Apply the 3x3 Sobel kernels (unoptimized). The cursors advance linearly
+    // and are never reset per row, so (height - 2) * width positions are visited.
+    for (int iy = 1; iy < height - 1; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
+            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
+            // squared gradient magnitude (each component already divided by 9)
+            int val = sx * sx + sy * sy;
+            // binarize: 255 above threshold^2, otherwise 0
+            *optr = val > thresholdSq ? 255 : 0;
+            p11++;
+            p12++;
+            p13++;
+            p21++;
+            p22++;
+            p23++;
+            p31++;
+            p32++;
+            p33++;
+            optr++;
+        }
+    }
+}
+void sobel(uint8_t* in,
+           uint8_t* blurred,
+           int width,
+           int height,
+           uint8_t threshold,
+           int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    sobelNaive(in, blurred, width, height, threshold);  // scalar fallback
+#else
+    auto sobelSSESegment = [&](int start, int end) {
+        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
+        __m128i *dst0, *dst1;
+        __m128i zero = _mm_setzero_si128();
+        // one_third, one, two, mone, mtwo, dst1 and this zero are not read below.
+        int x, y;
+        one_third = _mm_set1_epi16(
+            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
+        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
+        // NOTE(review): threshold * threshold exceeds INT16_MAX for threshold >= 182.
+        binThres = _mm_set1_epi16(threshold * threshold);
+        // (_mm_cmpgt_epi16 below compares signed 16bit lanes - confirm the range.)
+        dst0 = (__m128i*)(blurred + width * 1);  // output row 1, whatever start is
+        // dst1 = (__m128i *)(blurred + width * 2);
+        for (y = start; y < end;
+             y++) {  // one input row yields one output row per iteration
+            const uint8_t *row0, *row1, *row2;
+
+            row1 = in + y * width;  // centre row
+            row0 = row1 - width;    // row above
+            row2 = row1 + width;    // row below
+
+            for (x = 0; x < width; x += 16) {
+                // Note: Center element not used in sobel kernels!!
+                // Kernel indices:
+                // 00 01 02
+                // 10 11 12
+                // 20 21 22
+                // aXY / bXY hold tap XY as 16bit lanes (presumably low / high 8 pixels).
+                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
+                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
+
+                __m128i raA, raB, rbA, rbB;
+                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
+                // NOTE(review): with y == 1 and x == 0, row0 - 1 points one byte before in.
+                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
+                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
+                // The centre row only contributes its left/right neighbours (taps 10, 12).
+                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
+                // _mm_load_si128 (taps 01, 21) needs 16-byte aligned rows; the rest use loadu.
+                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
+                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
+                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
+
+                // Sobel kernels for x and y direction.
+                //      1 0 -1       1 2 1
+                // sx = 2 0 -2 sy =  0 0 0
+                //      1 0 -1      -1-2-1
+                //      Note that neither kernel uses the center element)
+
+                // Each (1,2,1)-weighted sum of three taps is divided by 9 via
+                // _mm_mulhi_epi16 with one_ninth, i.e. the upper 16 bits of
+                // the 32bit product are kept. _mm_mullo_epi16 is used further
+                // down to square the difference of two such scaled sums.
+
+                // sx, left column (taps 00, 10, 20) weighted (1,2,1)
+                // The weight 2 is realised by adding the middle tap twice
+                raA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
+                                  a10),
+                    one_ninth);
+                rbA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
+                                  b10),
+                    one_ninth);
+
+                // sx, right column (taps 02, 12, 22) weighted (1,2,1); subtracted below
+                raB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
+                                  a12),
+                    one_ninth);
+                rbB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
+                                  b12),
+                    one_ninth);
+
+                // sx = left - right (each already divided by 9), then squared
+                tmpa = _mm_sub_epi16(raA, raB);
+                tmpb = _mm_sub_epi16(rbA, rbB);
+
+                sxa = _mm_mullo_epi16(tmpa, tmpa);
+                sxb = _mm_mullo_epi16(tmpb, tmpb);
+
+                // sy, top row (taps 00, 01, 02) weighted (1,2,1)
+                // Again the middle tap is added twice for the weight 2
+                raA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
+                                  a01),
+                    one_ninth);
+                rbA = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
+                                  b01),
+                    one_ninth);
+
+                // sy, bottom row (taps 20, 21, 22) weighted (1,2,1); subtracted below
+                raB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
+                                  a21),
+                    one_ninth);
+                rbB = _mm_mulhi_epi16(
+                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
+                                  b21),
+                    one_ninth);
+
+                // sy = top - bottom (each already divided by 9), then squared
+                tmpa = _mm_sub_epi16(raA, raB);
+                tmpb = _mm_sub_epi16(rbA, rbB);
+
+                // kept apart from sxa/sxb, which are still needed for the sum below
+                sya = _mm_mullo_epi16(tmpa, tmpa);
+                syb = _mm_mullo_epi16(tmpb, tmpb);
+
+                __m128i zero = _mm_setzero_si128();  // shadows the lambda-scope zero
+
+                // The unpacklo is necessary because _mm_cmpgt_epi16 sets a lane
+                // to 0xFFFF if the comparison is true. When packing 16bit to
+                // 8bit, 0xFFFF would be interpreted (signed) as negative and
+                // hence set to 0; unpacklo against zero turns it into 0x00FF.
+                // NOTE(review): _mm_unpacklo_epi8 only consumes the low 8 bytes
+                // (16bit lanes 0..3) of each mask; lanes 4..7 look dropped - verify.
+                pack16to8(
+                    _mm_unpacklo_epi8(
+                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
+                        zero),
+                    _mm_unpacklo_epi8(
+                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
+                        zero),
+                    res);
+
+                _mm_store_si128(dst0++, res);  // aligned store; output advances linearly
+
+                row0 += 16;
+                row1 += 16;
+                row2 += 16;
+            }  // cols
+        }  // rows
+    };  // Lambda
+    sobelSSESegment(1, height - 3);  // rows 1 .. height-4; numThreads is not read
+#endif
+}
+} // namespace ndb
diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp
new file mode 100644
index 0000000..038408a
--- /dev/null
+++ b/lib/gpc/kernels/sobel.hpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_SOBEL
+#define __NDB__KERNEL_SOBEL
+using namespace std;
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief Naive (scalar) 3x3 Sobel filter with binarized output
+ *
+ * @param      in         input image, one byte per pixel, rows of `width` bytes
+ * @param      gradient   output image: 255 where sx^2 + sy^2 > threshold^2, else 0
+ * @param[in]  width      The width; asserted to be a multiple of 16
+ * @param[in]  height     The height
+ * @param[in]  threshold  threshold to binarize sobel filter output
+ * @note       There is no numThreads parameter; this variant has no threading.
+ */
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold);
+
+/**
+ * @brief      3x3 Sobel filter with binarized output. Width must be multiple of 16
+ *
+ * @param      in         input image, one byte per pixel, rows of `width` bytes
+ * @param      blurred    output image; receives the thresholded gradient, not a blur
+ * @param[in]  width      The width; asserted to be a multiple of 16
+ * @param[in]  height     The height
+ * @param[in]  threshold  gradient threshold; the comparison uses its square
+ * @param[in]  numThreads number of threads; NOTE(review): not read by the implementation
+ */
+
+void sobel(uint8_t* in,
+           uint8_t* blurred,
+           int width,
+           int height,
+           uint8_t threshold,
+           int numThreads);
+}
+#endif

From 6ed662a91796c4b2009c9b45b3e5269617f364a8 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 09:26:27 +0100
Subject: [PATCH 08/36] break out box filter

---
 CMakeLists.txt          |   1 +
 lib/gpc/feature.cpp     |   1 +
 lib/gpc/filter.cpp      | 154 --------------------------------
 lib/gpc/filter.hpp      |  23 -----
 lib/gpc/forest.cpp      |   1 +
 lib/gpc/kernels/box.cpp | 190 ++++++++++++++++++++++++++++++++++++++++
 lib/gpc/kernels/box.hpp |  65 ++++++++++++++
 7 files changed, 258 insertions(+), 177 deletions(-)
 create mode 100644 lib/gpc/kernels/box.cpp
 create mode 100644 lib/gpc/kernels/box.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f7182d..d7f1d1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ add_library(gpc_core
     lib/gpc/feature.cpp 
     lib/gpc/filter.cpp
     lib/gpc/kernels/sobel.cpp
+    lib/gpc/kernels/box.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp
index f68d440..0ffc27b 100644
--- a/lib/gpc/feature.cpp
+++ b/lib/gpc/feature.cpp
@@ -38,6 +38,7 @@
 #include <fstream>
 #include <gpc/buffer.hpp>
 #include <gpc/filter.hpp>
+#include <gpc/kernels/box.hpp>
 #include <gpc/Feature.hpp>
 #include <iostream>
 #include <iterator>
diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp
index cbe18b9..713d0d9 100644
--- a/lib/gpc/filter.cpp
+++ b/lib/gpc/filter.cpp
@@ -107,44 +107,7 @@ void parFor(std::function<void(int, int)> const& f,
 }
 
 
-void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    // allocate space for result
-    uint8_t* ptr = in;
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
-
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
 
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-    uint8_t* optr = blurred + 1 * width + 1;
-
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int res =
-                (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) /
-                9;
-            *optr = res;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
 void gpcFilterNaive(uint8_t* in,
                     const uint8_t* grad,
                     uint32_t* gpc,
@@ -191,123 +154,6 @@ void gpcFilterTauNaive(uint8_t* in,
         j++;
     }
 } 
-void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    boxNaive(in, blurred, width, height);
-#else
-    auto boxFilterSegment = [&](int start, int end) {
-        int x, y;
-        __m128i one_third;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        dst0 = (__m128i*)(blurred + width * (start));
-        dst1 = (__m128i*)(blurred + width * (start + 1));
-        for (y = start; y < end;
-             y += 2) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2, *row3;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-            row3 = row2 + width;
-
-            for (x = 0; x < width; x += 16) {
-                __m128i s00, s01, s02;
-                __m128i r00, r01, r02;
-                __m128i ra00, ra01, ra02;
-                __m128i rb00, rb01, rb02;
-
-                __m128i a00, a01, a02, b00, b01, b02;
-
-                __m128i tmp0, tmp1, res;
-
-                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
-                s02 = _mm_load_si128((__m128i*)(row0));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
-                s02 = _mm_load_si128((__m128i*)(row1));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
-                s02 = _mm_load_si128((__m128i*)(row2));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst0++, res);
-
-                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
-                s02 = _mm_load_si128((__m128i*)(row3));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst1++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-                row3 += 16;
-            }
-            // still storing 128bit, but now in 16 x 8bit format, so /16 instead
-            // of /8
-            dst0 += width / 16;
-            dst1 += width / 16;
-        }
-    };  // lambda
-
-    boxFilterSegment(1, height - 3);
-    // parFor(boxFilterSegment,1,height-3,4);
-#endif
-}
 
 
 #ifdef _INTRINSICS_SSE
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
index 2e6454a..d48fd6f 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/filter.hpp
@@ -101,16 +101,6 @@ void parFor(std::function<void(int, int)> const& f,
             int start,
             int end,
             int nThreads);
-/**
- * @brief      Naive 3x3 box filter implementation
- *
- * @param      in       input image
- * @param      blurred  The blurred output image
- * @param[in]  width    The width
- * @param[in]  height   The height
- * @param[in]  numThreads number of threads to use
- */
-void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
 
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
@@ -154,19 +144,6 @@ void gpcFilterTauNaive(uint8_t* in,
                        std::vector<int>& idx,
                        int width,
                        int height);
-/**
-   * @brief      boxfilter using SSE2 instructions. Loosely based on
-   *             https://www.ignorantus.com/box_sse2/, published under
-   *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
-   *
-   * @param      in       input image
-   * @param      blurred  The blurred
-   * @param[in]  width    The width
-   * @param[in]  height   The height
-   * @param[in]  numThreads number of threads to use
-   */
-void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
-
 /**
  * @brief Checks if the 128bits in xmm are all zero
  *
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 8b9d2ef..f8d4e4c 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -48,6 +48,7 @@
 #include "gpc/buffer.hpp"
 #include "gpc/filter.hpp"
 #include "gpc/kernels/sobel.hpp"
+#include "gpc/kernels/box.hpp"
 #include "gpc/hashmatch.hpp"
 #include "gpc/forest.hpp"
 
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
new file mode 100644
index 0000000..9e444d4
--- /dev/null
+++ b/lib/gpc/kernels/box.cpp
@@ -0,0 +1,190 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include "gpc/kernels/box.hpp"
+namespace ndb {
+/**
+ * @brief Scalar 3x3 box blur, used when the SSE path of box() is compiled out.
+ *
+ * The image is walked as one flat array of width * height bytes. The window
+ * starting at flat index k covers in[k + r * width + c] for r, c in 0..2 and
+ * its integer mean is stored at blurred[k + width + 1], so the first and last
+ * column of a row blend in pixels of the neighbouring row.
+ *
+ * Only blurred[width + 1] .. blurred[(height - 1) * width - 2] are written;
+ * every other output byte keeps the value it had on entry.
+ *
+ * @param in       input image, width * height bytes
+ * @param blurred  output image, same size as in
+ * @param width    row length in pixels; must be a multiple of 16 (asserted)
+ * @param height   number of rows
+ */
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+
+    // The last window that fits has its bottom-right tap on the final pixel:
+    // k + 2 * width + 2 == width * height - 1. Running (height - 2) * width
+    // windows instead would read two bytes past the end of `in`.
+    const int numWindows = (height - 2) * width - 2;
+
+    // Row bases of the 3-row window; the column taps are k, k + 1 and k + 2.
+    const uint8_t* top = in;
+    const uint8_t* mid = in + width;
+    const uint8_t* bot = in + 2 * width;
+    uint8_t* out = blurred + width + 1;
+
+    for (int k = 0; k < numWindows; ++k) {
+        const int sum = top[k] + top[k + 1] + top[k + 2] +
+                        mid[k] + mid[k + 1] + mid[k + 2] +
+                        bot[k] + bot[k + 1] + bot[k + 2];
+        // sum <= 9 * 255, so the mean always fits into 8 bits
+        out[k] = static_cast<uint8_t>(sum / 9);
+    }
+}
+// 3x3 box blur. With _INTRINSICS_SSE two output rows are produced per pass,
+// 16 pixels at a time; otherwise the call is forwarded to boxNaive().
+// numThreads is not read anywhere in this body.
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    boxNaive(in, blurred, width, height);
+#else
+    auto boxFilterSegment = [&](int start, int end) {
+        int x, y;
+        __m128i one_third;
+        __m128i *dst0, *dst1;
+        __m128i zero = _mm_setzero_si128();
+
+        one_third = _mm_set1_epi16(
+            21846);  // 2^16/3+1: mulhi by this approximates a divide by 3
+        dst0 = (__m128i*)(blurred + width * (start));
+        dst1 = (__m128i*)(blurred + width * (start + 1));
+        for (y = start; y < end;
+             y += 2) {  // We compute results for two rows in one iteration
+            const uint8_t *row0, *row1, *row2, *row3;
+            // row0..row3 are input rows y-1 .. y+2; dst0 is row y, dst1 row y+1
+            row1 = in + y * width;
+            row0 = row1 - width;
+            row2 = row1 + width;
+            row3 = row2 + width;
+            // aligned loads/stores below need 16-byte aligned in and blurred
+            for (x = 0; x < width; x += 16) {
+                __m128i s00, s01, s02;
+                __m128i r00, r01, r02;
+                __m128i ra00, ra01, ra02;
+                __m128i rb00, rb01, rb02;
+                __m128i a00, a01, a02, b00, b01, b02;
+                __m128i tmp0, tmp1, res;
+                // NOTE(review): y == 1, x == 0 loads from in - 1 (row0 - 1).
+                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
+                s02 = _mm_load_si128((__m128i*)(row0));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
+                s02 = _mm_load_si128((__m128i*)(row1));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra01 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb01 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
+                s02 = _mm_load_si128((__m128i*)(row2));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+
+                ra02 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb02 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                tmp0 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
+                    one_third);
+                tmp1 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
+                    one_third);
+                // output row y: vertical mean of the row0/row1/row2 results
+                pack16to8(tmp0, tmp1, res);
+                _mm_store_si128(dst0++, res);
+                // output row y + 1: row0's result is replaced by row3's
+                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
+                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
+                s02 = _mm_load_si128((__m128i*)(row3));
+                unpack8to16(s00, a00, b00);
+                unpack8to16(s01, a01, b01);
+                unpack8to16(s02, a02, b02);
+                ra00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+                rb00 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+                tmp0 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
+                    one_third);
+                tmp1 = _mm_mulhi_epi16(
+                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
+                    one_third);
+
+                pack16to8(tmp0, tmp1, res);
+                _mm_store_si128(dst1++, res);
+
+                row0 += 16;
+                row1 += 16;
+                row2 += 16;
+                row3 += 16;
+            }
+            // step each dst over the row owned by the other (width/16 vectors)
+            dst0 += width / 16;
+            dst1 += width / 16;
+        }
+    };  // lambda
+
+    boxFilterSegment(1, height - 3);
+    // threaded variant, disabled: parFor(boxFilterSegment,1,height-3,4);
+#endif
+}
+
+}
diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp
new file mode 100644
index 0000000..bf9eea3
--- /dev/null
+++ b/lib/gpc/kernels/box.hpp
@@ -0,0 +1,65 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_BOX
+#define __NDB__KERNEL_BOX
+using namespace std;
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief      Naive (scalar) 3x3 box filter implementation
+ *
+ * @param      in       input image
+ * @param      blurred  The blurred output image
+ * @param[in]  width    The width; must be a multiple of 16 (asserted)
+ * @param[in]  height   The height
+ * @note       Single-threaded; this overload takes no thread-count argument.
+ */
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
+
+/**
+ * @brief      boxfilter using SSE2 instructions. Loosely based on
+ *             https://www.ignorantus.com/box_sse2/, published under
+ *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
+ *             Forwards to boxNaive() when _INTRINSICS_SSE is not defined.
+ * @param      in       input image
+ * @param      blurred  The blurred output image
+ * @param[in]  width    The width; must be a multiple of 16 (asserted)
+ * @param[in]  height   The height
+ * @param[in]  numThreads number of threads to use (not read by box.cpp yet)
+ */
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
+
+
+}  // namespace ndb
+#endif

From fb4d6261580b40011c585c769ca116fe2fc8d8f2 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 09:31:18 +0100
Subject: [PATCH 09/36] break out census filter

---
 CMakeLists.txt             |   1 +
 lib/gpc/filter.cpp         | 169 ------------------------------
 lib/gpc/filter.hpp         |  21 ----
 lib/gpc/kernels/census.cpp | 204 +++++++++++++++++++++++++++++++++++++
 lib/gpc/kernels/census.hpp |  61 +++++++++++
 lib/gpc/kernels/sobel.hpp  |   2 -
 6 files changed, 266 insertions(+), 192 deletions(-)
 create mode 100644 lib/gpc/kernels/census.cpp
 create mode 100644 lib/gpc/kernels/census.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7f1d1a..e4af04f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,7 @@ add_library(gpc_core
     lib/gpc/filter.cpp
     lib/gpc/kernels/sobel.cpp
     lib/gpc/kernels/box.cpp
+    lib/gpc/kernels/census.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp
index 713d0d9..fa39340 100644
--- a/lib/gpc/filter.cpp
+++ b/lib/gpc/filter.cpp
@@ -312,175 +312,6 @@ void gpcFilterTau(uint8_t* in,
         parFor(gpcFilterSegment, 13, height - 15, 4);
 #endif
 }
-void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
-    uint32_t val;
-    uint32_t* dst;
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x++) {
-            val = 0;
-            dst = census + y * width + x;
-            int i = 0;
-            // patch loops
-            for (int px = -2; px <= 2; px++) {
-                for (int py = -2; py <= 2; py++) {
-                    if (!(px == 0 && py == 0)) {
-                        val |= (in[(y + py) * width + (x + px)] >
-                                in[y * width + x])
-                                   ? (1 << i)
-                                   : 0;
-                        i++;
-                    }
-                }
-            }  // End patch loops
-            *dst = val;
-        }
-    }  // End pixel loops
-}
-void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    census5x5Naive(in, census, width, height);
-#else
-    __m128i zero = _mm_set1_epi8(0);
-    __m128i one = _mm_set1_epi8(1);
-
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x += 16) {
-            uint8_t* rowPtr;
-            rowPtr = in + (y - 2) * width + x;
-            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
-            __m128i* dst = (__m128i*)(census + y * width +
-                                      x);  // Set starting point to pixel (2,2)
-            // row 0
-            __m128i bitMask = one;
-            __m128i byte1 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
 
-            // row 1
-            rowPtr += width;
-            bitMask += bitMask;  // 32
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte2 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 2
-            rowPtr += width;
-            bitMask += bitMask;  // 4
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 3
-            rowPtr += width;
-            bitMask += bitMask;  // 64
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte3 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 4
-            rowPtr += width;
-            bitMask += bitMask;  // 8
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // 8bit to 16bit
-            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
-            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
-            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
-            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
-
-            // 16bit to 32bit ints
-            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-
-        }  // col iteration
-    }  // row iteration
-    // if(numThreads == 1)
-    // gpcFilterSegment(13,height-15);
-    // else
-    // parFor(gpcFilterSegment,13,height-15,4);
-
-#endif
-}  // census5x5
 }  // namespace ndb
 #endif
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
index d48fd6f..35f1325 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/filter.hpp
@@ -200,26 +200,5 @@ void gpcFilterTau(uint8_t* in,
                   int width,
                   int height,
                   int numThreads); 
-/**
- * @brief Naive version of 5x5 census transoform
- *
- * @param in      Input image
- * @param census  32bit census transform output
- * @param width   Width of the image at *in pointer
- * @param height  Heiht of the image at *in pointer
- */
-void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
-
-
-/**
- * @brief 5x5 dense census transform of input image. binary codes are returned
- * as a 32bit image
- *
- * @param in
- * @param census
- * @param width
- * @param height
- */
-void census5x5(uint8_t* in, uint32_t* census, int width, int height);
 }  // namespace ndb
 #endif
diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp
new file mode 100644
index 0000000..bd70613
--- /dev/null
+++ b/lib/gpc/kernels/census.cpp
@@ -0,0 +1,204 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include "gpc/kernels/census.hpp"
+namespace ndb {
+// Scalar 5x5 census transform: bit i of a code is set when the i-th neighbour
+// (x offset in the outer loop, y offset in the inner, centre skipped) is
+// brighter than the centre. Rows 2 .. height - 4 are written, all columns.
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x++) {
+            uint32_t val = 0;
+            uint32_t* dst = census + y * width + x;
+            int i = 0;
+            // patch loops
+            for (int px = -2; px <= 2; px++) {
+                for (int py = -2; py <= 2; py++) {
+                    if (!(px == 0 && py == 0)) {
+                        val |= (in[(y + py) * width + (x + px)] >
+                                in[y * width + x])
+                                   ? (1 << i)
+                                   : 0;
+                        i++;
+                    }
+                }
+            }  // End patch loops
+            *dst = val;
+        }
+    }  // End pixel loops
+}
+// SSE 5x5 census transform, 16 pixels per step; forwards to census5x5Naive
+// without _INTRINSICS_SSE. Neighbours are packed row by row into three bytes.
+// NOTE(review): the bit order differs from census5x5Naive (row-major here,
+// column-major there), and both paths read in[-2] for y == 2, x == 0.
+void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    census5x5Naive(in, census, width, height);
+#else
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
+            __m128i* dst = (__m128i*)(census + y * width +
+                                      x);  // Set starting point to pixel (2,2)
+            // row 0
+            __m128i bitMask = one;
+            __m128i byte1 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 1
+            rowPtr += width;
+            bitMask += bitMask;  // 32
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte2 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 2
+            rowPtr += width;
+            bitMask += bitMask;  // 4
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 3
+            rowPtr += width;
+            bitMask += bitMask;  // 64
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte3 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 4
+            rowPtr += width;
+            bitMask += bitMask;  // 8
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // 8bit to 16bit
+            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
+            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
+            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
+            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
+
+            // 16bit to 32bit ints
+            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+
+        }  // col iteration
+    }  // row iteration
+#endif
+}  // census5x5
+}  // namespace ndb
diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp
new file mode 100644
index 0000000..8353a4e
--- /dev/null
+++ b/lib/gpc/kernels/census.hpp
@@ -0,0 +1,61 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_CENSUS
+#define __NDB__KERNEL_CENSUS
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief Naive version of 5x5 census transform
+ *
+ * @param in      Input image
+ * @param census  32bit census transform output
+ * @param width   Width of the image at *in pointer
+ * @param height  Height of the image at *in pointer
+ */
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
+
+
+/**
+ * @brief 5x5 dense census transform of input image. binary codes are returned
+ * as a 32bit image
+ *
+ * @param in      Input image
+ * @param census  Output, one 32bit code per pixel (24 bits used)
+ * @param width   Width of the image; must be a multiple of 16 (asserted)
+ * @param height  Height of the image
+ */
+void census5x5(uint8_t* in, uint32_t* census, int width, int height);
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp
index 038408a..31749cb 100644
--- a/lib/gpc/kernels/sobel.hpp
+++ b/lib/gpc/kernels/sobel.hpp
@@ -31,8 +31,6 @@
 
 #ifndef __NDB__KERNEL_SOBEL
 #define __NDB__KERNEL_SOBEL
-using namespace std;
-
 #include "gpc/buffer.hpp"
 
 namespace ndb {

From 7d0ce8d504c36b97fa7b6f32b957447527aa9dd1 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 09:51:13 +0100
Subject: [PATCH 10/36] extract gpc filter, move utils

---
 CMakeLists.txt                          |   3 +-
 lib/gpc/Feature.hpp                     |   1 -
 lib/gpc/feature.cpp                     |   1 -
 lib/gpc/forest.cpp                      |   3 +-
 lib/gpc/forest.hpp                      |   1 -
 lib/gpc/{filter.cpp => kernels/gpc.cpp} |  82 +------------
 lib/gpc/{filter.hpp => kernels/gpc.hpp} | 157 +++++++-----------------
 lib/gpc/kernels/utils.cpp               | 109 ++++++++++++++++
 lib/gpc/kernels/utils.hpp               | 106 ++++++++++++++++
 lib/gpc/training.hpp                    |   1 -
 10 files changed, 268 insertions(+), 196 deletions(-)
 rename lib/gpc/{filter.cpp => kernels/gpc.cpp} (81%)
 rename lib/gpc/{filter.hpp => kernels/gpc.hpp} (68%)
 create mode 100644 lib/gpc/kernels/utils.cpp
 create mode 100644 lib/gpc/kernels/utils.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e4af04f..c0fe739 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,10 +41,11 @@ add_library(gpc_core
     lib/gpc/forest.cpp 
     lib/gpc/fern.cpp 
     lib/gpc/feature.cpp 
-    lib/gpc/filter.cpp
     lib/gpc/kernels/sobel.cpp
     lib/gpc/kernels/box.cpp
     lib/gpc/kernels/census.cpp
+    lib/gpc/kernels/gpc.cpp
+    lib/gpc/kernels/utils.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp
index 82aff06..8a8b55c 100644
--- a/lib/gpc/Feature.hpp
+++ b/lib/gpc/Feature.hpp
@@ -39,7 +39,6 @@
 #include <cmath>  //for log2
 #include <fstream>
 #include <gpc/buffer.hpp>
-#include <gpc/filter.hpp>
 #include <iostream>
 #include <iterator>
 #include <random>
diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp
index 0ffc27b..529970c 100644
--- a/lib/gpc/feature.cpp
+++ b/lib/gpc/feature.cpp
@@ -37,7 +37,6 @@
 #include <cmath>  //for log2
 #include <fstream>
 #include <gpc/buffer.hpp>
-#include <gpc/filter.hpp>
 #include <gpc/kernels/box.hpp>
 #include <gpc/Feature.hpp>
 #include <iostream>
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index f8d4e4c..52046f2 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -46,9 +46,10 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
 #include "gpc/kernels/sobel.hpp"
 #include "gpc/kernels/box.hpp"
+#include "gpc/kernels/gpc.hpp"
+#include "gpc/kernels/utils.hpp"
 #include "gpc/hashmatch.hpp"
 #include "gpc/forest.hpp"
 
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index 87939c1..d0b5c32 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -48,7 +48,6 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
 #include "gpc/hashmatch.hpp"
 
 /**
diff --git a/lib/gpc/filter.cpp b/lib/gpc/kernels/gpc.cpp
similarity index 81%
rename from lib/gpc/filter.cpp
rename to lib/gpc/kernels/gpc.cpp
index fa39340..636a9cb 100644
--- a/lib/gpc/filter.cpp
+++ b/lib/gpc/kernels/gpc.cpp
@@ -28,86 +28,9 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#ifndef __NDB__FILTER
-#define __NDB__FILTER
-
-#include <cassert>
-#include <thread>
-
-#include "gpc/filter.hpp"
-using namespace std;
 
+#include "gpc/kernels/gpc.hpp"
 namespace ndb {
-void arr2ind(const unsigned char* a,
-                                       int n,
-                                       int* ind,
-                                       int* m) {
-#ifdef _INTRINSICS_SSE
-    int i, m0, k;
-    __m256i msk;
-    m0 = 0;
-    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
-        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
-                                _mm256_setzero_si256());
-        k = _mm256_movemask_epi8(msk);
-        k = ~k; /* Search for nonzero bits instead of zero bits.  */
-        while (k) {
-            ind[m0] =
-                i + _tzcnt_u32(
-                        k); /* Count the number of trailing zero bits in k. */
-            m0++;
-            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
-        }
-    }
-    *m = m0;
-#else
-    int nnz = 0;
-    for (int i = 0; i < n; i++) {
-        if (a[i] != 0) {
-            nnz++;
-            *ind = i;
-            ind++;
-        }
-    }
-    *m = nnz;
-#endif
-}
-#ifdef _INTRINSICS_SSE
-void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
-    __m128i zero = _mm_setzero_si128();
-    y0 = _mm_unpacklo_epi8(x, zero);
-    y1 = _mm_unpackhi_epi8(x, zero);
-}
-void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
-    y = _mm_packus_epi16(x0, x1);
-}
-
-#endif
-void parFor(std::function<void(int, int)> const& f,
-            int start,
-            int end,
-            int nThreads) {
-    // Range definition
-    // quantities derived from range
-    int segSize = (end - start) / nThreads;
-    int lastSeg = (end - start) % nThreads;
-
-    std::vector<std::thread> threads;
-    threads.reserve(nThreads);
-
-    // Spawn threads
-    for (int t = 0; t < nThreads - 1; t++) {
-        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
-    }
-    threads.emplace_back(f,
-                         start + (nThreads - 1) * segSize,
-                         start + (nThreads)*segSize + lastSeg);
-    // Join
-    for (auto& t : threads) t.join();
-}
-
-
-
 void gpcFilterNaive(uint8_t* in,
                     const uint8_t* grad,
                     uint32_t* gpc,
@@ -312,6 +235,5 @@ void gpcFilterTau(uint8_t* in,
         parFor(gpcFilterSegment, 13, height - 15, 4);
 #endif
 }
+}
 
-}  // namespace ndb
-#endif
diff --git a/lib/gpc/filter.hpp b/lib/gpc/kernels/gpc.hpp
similarity index 68%
rename from lib/gpc/filter.hpp
rename to lib/gpc/kernels/gpc.hpp
index 35f1325..5f43743 100644
--- a/lib/gpc/filter.hpp
+++ b/lib/gpc/kernels/gpc.hpp
@@ -28,79 +28,38 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#ifndef __NDB__FILTER
-#define __NDB__FILTER
 
-#include <cassert>
-#include <thread>
+#ifndef __NDB__KERNEL_GPC
+#define __NDB__KERNEL_GPC
+using namespace std;
 
 #include "gpc/buffer.hpp"
-using namespace std;
 
-#ifdef _INTRINSICS_SSE
-#include <immintrin.h>
-// greater and lesser than simd ops for unsigned 8bit integer (epu8)
-#define _mm_cmpgt_epu8(v0, v1)                             \
-    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
-                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
-#define _mm_cmplt_epu8(v1, v0)                             \
-    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
-                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
-#endif
 namespace ndb {
 /**
- * @brief Gets indices of non-zero values in array  a.
- *    Credits:
- *    https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528
- *
- * @param     input array
- * @param n   number of input elements
- * @param ind output array (indices into n of nonzero elements)
- * @param m   number of elements in output
- */
-void arr2ind(const unsigned char* a,
-                                       int n,
-                                       int* ind,
-                                       int* m);
-
-#ifdef _INTRINSICS_SSE
-/**
- * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
- *             (8x16bit)
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Accelerated with SSE.
  *
- * @param[in]  x     the 128 bit vector to be unpacked
- * @param      y0    The y 0
- * @param      y1    The y 1
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param idx       The gradient indices. Only used if no intrinsics are available
+ *                  and the call gets forwarded to the naive implementation.
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ * @param numThreads Number of threads to use
  */
-void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1);
+void gpcFilter(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height,
+               int numThreads);
 
-/**
- * @brief      Packs 2x128bit vars with 16bit values(where 8 upper bits are
- *             zero) into 1x128bit with 8bit values
- *
- * @param[in]  x0    The x 0
- * @param[in]  x1    The x 1
- * @param      y     the packed vector
- */
-void pack16to8(const __m128i x0, const __m128i x1, __m128i& y);
-#endif
-/**
- * @brief Calls a given functional f with subranges based on the given start
- *        and end indices. Here the functional is assumed to take two integer
- *        arguments indicating their respective start and end ranges.
- *        nThreads determines the number of threads the given range shall be
- * split into. The range is inclusive on the lower bound and exclusive on the
- * upper bound, i.e. [start,end)
- *
- * @param f        function object (e.g. a lambda functional)
- * @param start    start of the range
- * @param end      end of the range
- * @param nThreads number of threads to use
- */
-void parFor(std::function<void(int, int)> const& f,
-            int start,
-            int end,
-            int nThreads);
 
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
@@ -123,6 +82,28 @@ void gpcFilterNaive(uint8_t* in,
                     std::vector<int>& idx,
                     int width,
                     int height);
+/**
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Additionally uses a threshold vector (tau)
+ *
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ * @param numThreads Number of threads to use
+ */
+void gpcFilterTau(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height,
+                  int numThreads); 
 
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
@@ -154,51 +135,7 @@ void gpcFilterTauNaive(uint8_t* in,
 #ifdef _INTRINSICS_SSE
 bool isAllZeros(__m128i xmm);
 #endif
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Accelerated with SSE.
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param idx       The gradient indices. Only used if no intrincs are available
- *                  and the call gets forwarded to the naive implementation.
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- * @param numThreadsNumber of threads to use
- */
-void gpcFilter(uint8_t* in,
-               const uint8_t* grad,
-               uint32_t* gpc,
-               std::vector<int32_t> fastmask,
-               std::vector<int>& idx,
-               int width,
-               int height,
-               int numThreads);
 
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Additionally uses a threshold vector (tau)
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- * @param numThreads Number of threads to use
- */
-void gpcFilterTau(uint8_t* in,
-                  const uint8_t* grad,
-                  uint32_t* gpc,
-                  std::vector<int32_t> fastmask,
-                  std::vector<int> tau,
-                  std::vector<int>& idx,
-                  int width,
-                  int height,
-                  int numThreads); 
-}  // namespace ndb
+
+}
 #endif
diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp
new file mode 100644
index 0000000..dd5d146
--- /dev/null
+++ b/lib/gpc/kernels/utils.cpp
@@ -0,0 +1,109 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include <cassert>
+#include <thread>
+
+#include "gpc/kernels/utils.hpp"
+using namespace std;
+namespace ndb {
+void arr2ind(const unsigned char* a,
+                                       int n,
+                                       int* ind,
+                                       int* m) {
+#ifdef _INTRINSICS_SSE  // vector path: 256-bit loads plus tzcnt/blsr bit scanning
+    int i, m0, k;
+    __m256i msk;
+    m0 = 0; /* number of indices written to ind so far */
+    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
+        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
+                                _mm256_setzero_si256()); /* aligned load: a + i must be 32-byte aligned */
+        k = _mm256_movemask_epi8(msk); /* bit j set <=> a[i + j] == 0 */
+        k = ~k; /* Search for nonzero bits instead of zero bits.  */
+        while (k) {
+            ind[m0] =
+                i + _tzcnt_u32(
+                        k); /* Count the number of trailing zero bits in k. */
+            m0++;
+            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
+        }
+    }
+    *m = m0; /* NOTE(review): the last 32-byte group is scanned in full even if n % 32 != 0 */
+#else  // scalar fallback: one byte per iteration
+    int nnz = 0;
+    for (int i = 0; i < n; i++) {
+        if (a[i] != 0) {
+            nnz++;
+            *ind = i;  // append this index, then advance the output cursor
+            ind++;
+        }
+    }
+    *m = nnz;  // report how many non-zero entries were found
+#endif
+}
+#ifdef _INTRINSICS_SSE
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
+    __m128i zero = _mm_setzero_si128();  // interleaving with zero bytes zero-extends each input byte
+    y0 = _mm_unpacklo_epi8(x, zero);     // bytes 0..7 of x  -> eight 16-bit lanes
+    y1 = _mm_unpackhi_epi8(x, zero);     // bytes 8..15 of x -> eight 16-bit lanes
+}
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
+    y = _mm_packus_epi16(x0, x1);  // x0 -> bytes 0..7, x1 -> bytes 8..15, each lane saturated to 0..255
+}
+
+#endif
+void parFor(std::function<void(int, int)> const& f,
+            int start,
+            int end,
+            int nThreads) {
+    // A non-positive thread count would divide by zero below; run on one thread.
+    if (nThreads < 1) nThreads = 1;
+    int segSize = (end - start) / nThreads;
+    int lastSeg = (end - start) % nThreads;
+
+    std::vector<std::thread> threads;
+    threads.reserve(nThreads);
+
+    // The first nThreads - 1 workers each take one segSize-wide slice [lo, hi).
+    for (int t = 0; t < nThreads - 1; t++) {
+        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
+    }
+    threads.emplace_back(f,  // last worker also absorbs the division remainder
+                         start + (nThreads - 1) * segSize,
+                         start + (nThreads)*segSize + lastSeg);
+    // Block until every worker has finished.
+    for (auto& t : threads) t.join();
+}
+
+
+
+
+
+}  // namespace ndb
diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp
new file mode 100644
index 0000000..e9ce569
--- /dev/null
+++ b/lib/gpc/kernels/utils.hpp
@@ -0,0 +1,106 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#ifndef __NDB__KERNEL_UTILS
+#define __NDB__KERNEL_UTILS
+
+#include <cassert>
+#include <thread>
+
+#include "gpc/buffer.hpp"
+using namespace std;
+
+#ifdef _INTRINSICS_SSE
+#include <immintrin.h>
+// greater and lesser than simd ops for unsigned 8bit integer (epu8)
+#define _mm_cmpgt_epu8(v0, v1)                             \
+    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
+                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
+#define _mm_cmplt_epu8(v1, v0)                             \
+    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
+                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
+#endif
+namespace ndb {
+/**
+ * @brief Gets indices of non-zero values in array  a.
+ *    Credits:
+ *    https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528
+ *
+ * @param a   input array
+ * @param n   number of input elements
+ * @param ind output array (indices into n of nonzero elements)
+ * @param m   number of elements in output
+ */
+void arr2ind(const unsigned char* a,
+                                       int n,
+                                       int* ind,
+                                       int* m);
+
+#ifdef _INTRINSICS_SSE
+/**
+ * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
+ *             (8x16bit)
+ *
+ * @param[in]  x     the 128 bit vector to be unpacked
+ * @param      y0    The y 0
+ * @param      y1    The y 1
+ */
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1);
+
+/**
+ * @brief      Packs 2x128bit vars with 16bit values(where 8 upper bits are
+ *             zero) into 1x128bit with 8bit values
+ *
+ * @param[in]  x0    The x 0
+ * @param[in]  x1    The x 1
+ * @param      y     the packed vector
+ */
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y);
+#endif
+/**
+ * @brief Calls a given functional f with subranges based on the given start
+ *        and end indices. Here the functional is assumed to take two integer
+ *        arguments indicating their respective start and end ranges.
+ *        nThreads determines the number of threads the given range shall be
+ * split into. The range is inclusive on the lower bound and exclusive on the
+ * upper bound, i.e. [start,end)
+ *
+ * @param f        function object (e.g. a lambda functional)
+ * @param start    start of the range
+ * @param end      end of the range
+ * @param nThreads number of threads to use
+ */
+void parFor(std::function<void(int, int)> const& f,
+            int start,
+            int end,
+            int nThreads);
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/training.hpp b/lib/gpc/training.hpp
index f1e398b..8d557cc 100644
--- a/lib/gpc/training.hpp
+++ b/lib/gpc/training.hpp
@@ -49,7 +49,6 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
 #include "gpc/hashmatch.hpp"
 
 namespace gpc {

From f0ed437fae3e5a9a116eb4eb2968a21abffc99e5 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 10:07:51 +0100
Subject: [PATCH 11/36] add highway

---
 CMakeLists.txt          | 10 ++++++++++
 samples/sparsematch.cpp | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0fe739..4e01b6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,15 @@ if(SSE)
       add_compile_options(-mavx2 -march=core-avx2)
     endif()
 endif()
+include(FetchContent)
+set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE)
+set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE)
+FetchContent_Declare(
+  highway
+  GIT_REPOSITORY https://github.com/google/highway.git
+  GIT_TAG        1.3.0  
+)
+FetchContent_MakeAvailable(highway)
 
 add_library(gpc_core 
     lib/gpc/forest.cpp 
@@ -52,6 +61,7 @@ target_link_libraries(gpc_core
         Eigen3::Eigen 
         ${PNG_LIBRARIES} 
         Threads::Threads
+        hwy
 )
 target_include_directories(gpc_core PUBLIC lib)
 enable_testing()
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 3d5f19b..2554271 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -1,7 +1,26 @@
 #include <iostream>
+#include <hwy/highway.h>
 
 #include "gpc/forest.hpp"
 using namespace std;
+void test_hwy_neon() {
+    namespace hn = hwy::HWY_NAMESPACE;
+
+    // d is a "descriptor" for a vector of 8-bit unsigned ints
+    const hn::ScalableTag<uint8_t> d;
+
+    // If this is NEON, hn::Lanes(d) will be 16
+    size_t lanes = hn::Lanes(d);
+
+    auto v1 = hn::Set(d, 10);
+    auto v2 = hn::Set(d, 20);
+    auto res = hn::Add(v1, v2); // res lanes all contain 30
+
+    std::cout << "--- Highway Status ---" << std::endl;
+    std::cout << "Target: " << hwy::TargetName(HWY_TARGET) << std::endl;  // single id of the target hn:: maps to
+    std::cout << "Vector lanes (uint8): " << lanes << std::endl;
+    std::cout << "----------------------" << std::endl;
+}
 int main(int argc, char** argv) {
     std::string forestPath = "../../forests/defaultZeroForest.txt";
     std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png";
@@ -72,4 +91,5 @@ int main(int argc, char** argv) {
     ndb::Buffer<ndb::RGBColor> renderDisp;
     renderDisp = ndb::getDisparityVisualization(simg, supp);
     renderDisp.writePNGRGB("disparity.png");
+    test_hwy_neon();
 }

From 06bda6e1b9019ccf69d45ca7d45b19c144195258 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 13:44:50 +0100
Subject: [PATCH 12/36] add highway implementation of box filter

---
 CMakeLists.txt              |   1 +
 lib/gpc/forest.cpp          |   3 +
 lib/gpc/kernels/box.cpp     | 214 +++++++++++++++++-------------------
 lib/gpc/kernels/box.hpp     |   4 +
 lib/gpc/kernels/box_hwy.cpp | 146 ++++++++++++++++++++++++
 5 files changed, 255 insertions(+), 113 deletions(-)
 create mode 100644 lib/gpc/kernels/box_hwy.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e01b6e..56c5314 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ add_library(gpc_core
     lib/gpc/kernels/census.cpp
     lib/gpc/kernels/gpc.cpp
     lib/gpc/kernels/utils.cpp
+    lib/gpc/kernels/box_hwy.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 52046f2..5ba9f38 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -212,11 +212,14 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
 
     ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
     smooth.width = img.width;
+    gpc::inference::time_point t0 = gpc::inference::sysTick();
     ndb::box(img.data(),
              smooth.data(),
              img.cols(),
              img.rows(),
              settings.numThreads_);
+    gpc::inference::time_point t1 = gpc::inference::sysTick();
+    cout << "box: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl;
     smooth.clearBoundary();
     ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
     grad.width = img.width;
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index 9e444d4..10215d4 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -31,6 +31,9 @@
 
 #include "gpc/kernels/box.hpp"
 namespace ndb {
+namespace testing { 
+    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); 
+}
 void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
     // allocate space for result
@@ -69,122 +72,107 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
         }
     }
 }
+#ifdef _INTRINSICS_SSE
+/**
+ * @brief SSE implementation of the 3x3 box filter.
+ * Processes two rows at a time using fixed-point multiplication for division.
+ */
+#include <immintrin.h>
+void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
+    int start = 1;         // first output row; it needs row 0 as its upper neighbour
+    int end = height - 3;  // loop bound: each pass reads rows y - 1 .. y + 2
+    
+    int x, y;
+    __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1; mulhi by this approximates /3
+    
+    __m128i *dst0 = (__m128i*)(blurred + width * start);        // output cursor for row y
+    __m128i *dst1 = (__m128i*)(blurred + width * (start + 1));  // output cursor for row y + 1
+
+    for (y = start; y < end; y += 2) {  // two output rows per pass
+        const uint8_t *row0, *row1, *row2, *row3;
+
+        row1 = in + y * width;
+        row0 = row1 - width;
+        row2 = row1 + width;
+        row3 = row2 + width;
+
+        for (x = 0; x < width; x += 16) {  // 16 pixels per step
+            __m128i s00, s01, s02;
+            __m128i ra00, ra01, ra02, rb00, rb01, rb02;
+            __m128i a00, a01, a02, b00, b01, b02;
+            __m128i tmp0, tmp1, res;
+
+            // Row 0: horizontal mean of left, right and centre pixels (16-bit lanes)
+            s00 = _mm_loadu_si128((__m128i*)(row0 - 1));  // NOTE(review): reads in[-1] when y == 1, x == 0
+            s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            s02 = _mm_load_si128((__m128i*)(row0));  // aligned load: row start must be 16-byte aligned
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Row 1: same horizontal mean, kept in ra01 / rb01
+            s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            s02 = _mm_load_si128((__m128i*)(row1));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Row 2: same horizontal mean, kept in ra02 / rb02
+            s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            s02 = _mm_load_si128((__m128i*)(row2));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Vertical mean of rows 0, 1, 2 gives output row y (dst0)
+            tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third);
+            tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third);
+            pack16to8(tmp0, tmp1, res);
+            _mm_store_si128(dst0++, res);
+
+            // Row 3: horizontal mean; reuses ra00 / rb00 since row 0 is no longer needed
+            s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            s02 = _mm_load_si128((__m128i*)(row3));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Vertical mean of rows 1, 2, 3 gives output row y + 1 (dst1)
+            tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third);
+            tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third);
+            pack16to8(tmp0, tmp1, res);
+            _mm_store_si128(dst1++, res);
+
+            row0 += 16; row1 += 16; row2 += 16; row3 += 16;
+        }
+        dst0 += width / 16;  // skip the row just written through dst1
+        dst1 += width / 16;  // likewise skip the next pass's dst0 row
+    }
+}
+#endif
 void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    boxNaive(in, blurred, width, height);
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Force use of our new Highway kernel on Mac
+    testing::box_hwy(in, blurred, width, height);
 #else
-    auto boxFilterSegment = [&](int start, int end) {
-        int x, y;
-        __m128i one_third;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        dst0 = (__m128i*)(blurred + width * (start));
-        dst1 = (__m128i*)(blurred + width * (start + 1));
-        for (y = start; y < end;
-             y += 2) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2, *row3;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-            row3 = row2 + width;
-
-            for (x = 0; x < width; x += 16) {
-                __m128i s00, s01, s02;
-                __m128i r00, r01, r02;
-                __m128i ra00, ra01, ra02;
-                __m128i rb00, rb01, rb02;
-
-                __m128i a00, a01, a02, b00, b01, b02;
-
-                __m128i tmp0, tmp1, res;
-
-                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
-                s02 = _mm_load_si128((__m128i*)(row0));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
-                s02 = _mm_load_si128((__m128i*)(row1));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
-                s02 = _mm_load_si128((__m128i*)(row2));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst0++, res);
-
-                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
-                s02 = _mm_load_si128((__m128i*)(row3));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst1++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-                row3 += 16;
-            }
-            // still storing 128bit, but now in 16 x 8bit format, so /16 instead
-            // of /8
-            dst0 += width / 16;
-            dst1 += width / 16;
-        }
-    };  // lambda
-
-    boxFilterSegment(1, height - 3);
-    // parFor(boxFilterSegment,1,height-3,4);
+    #ifndef _INTRINSICS_SSE
+        boxNaive(in, blurred, width, height);
+    #else
+        boxSSE(in, blurred, width, height);
+    #endif
 #endif
 }
 
-}
+}  // namespace ndb
diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp
index bf9eea3..c5f2d0e 100644
--- a/lib/gpc/kernels/box.hpp
+++ b/lib/gpc/kernels/box.hpp
@@ -63,3 +63,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
 
 }
 #endif
+
+
+
+
diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
new file mode 100644
index 0000000..8573e40
--- /dev/null
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -0,0 +1,146 @@
+
+// We define the target BEFORE including highway.h
+// On Mac, this forces Highway to use NEON mode without the inclusion loop.
+#define HWY_TARGET HWY_NEON 
+#include <hwy/highway.h>
+
+// We skip foreach_target.h entirely to avoid the "redefinition" and "path" errors.
+
+HWY_BEFORE_NAMESPACE(); 
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+// 3x3 box blur, one output row per iteration: each 3x3 window is summed in 16-bit lanes and scaled by 7282/65536 (~1/9).
+void BoxKernelNaive(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N = hn::Lanes(d8);
+    const auto divisor = hn::Set(d16, (uint16_t)7282); // ~65536 / 9; MulHigh keeps the upper 16 bits of the product, i.e. (sum * 7282) >> 16
+
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        // The loads below start at columns x, x+1 and x+2, so the window centre is column x+1: hence the +1 on the destination.
+        uint8_t* out_row = blurred + y * width + 1;
+        // NOTE(review): the last vector of a row loads 2 columns past the row end and stores 1 column past it (more if width is not a multiple of N) - confirm the buffers tolerate this.
+        for (int x = 0; x < width; x += N) {
+            // Row y-1: left, centre and right column of the window
+            auto v11 = hn::LoadU(d8, r0 + x);
+            auto v12 = hn::LoadU(d8, r0 + x + 1);
+            auto v13 = hn::LoadU(d8, r0 + x + 2);
+            
+            // Row y
+            auto v21 = hn::LoadU(d8, r1 + x);
+            auto v22 = hn::LoadU(d8, r1 + x + 1);
+            auto v23 = hn::LoadU(d8, r1 + x + 2);
+
+            // Row y+1
+            auto v31 = hn::LoadU(d8, r2 + x);
+            auto v32 = hn::LoadU(d8, r2 + x + 1);
+            auto v33 = hn::LoadU(d8, r2 + x + 2);
+
+            // Vertical sums per window column, widened to 16 bit; _lo/_hi cover the lower/upper half of the 8-bit vector
+            auto sum_col1_lo = hn::Add(hn::PromoteLowerTo(d16, v11), hn::Add(hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v31)));
+            auto sum_col1_hi = hn::Add(hn::PromoteUpperTo(d16, v11), hn::Add(hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v31)));
+
+            auto sum_col2_lo = hn::Add(hn::PromoteLowerTo(d16, v12), hn::Add(hn::PromoteLowerTo(d16, v22), hn::PromoteLowerTo(d16, v32)));
+            auto sum_col2_hi = hn::Add(hn::PromoteUpperTo(d16, v12), hn::Add(hn::PromoteUpperTo(d16, v22), hn::PromoteUpperTo(d16, v32)));
+
+            auto sum_col3_lo = hn::Add(hn::PromoteLowerTo(d16, v13), hn::Add(hn::PromoteLowerTo(d16, v23), hn::PromoteLowerTo(d16, v33)));
+            auto sum_col3_hi = hn::Add(hn::PromoteUpperTo(d16, v13), hn::Add(hn::PromoteUpperTo(d16, v23), hn::PromoteUpperTo(d16, v33)));
+
+            // Horizontal accumulation: add the three column sums to get the full 3x3 sum (at most 9 * 255 = 2295)
+            auto total_lo = hn::Add(sum_col1_lo, hn::Add(sum_col2_lo, sum_col3_lo));
+            auto total_hi = hn::Add(sum_col1_hi, hn::Add(sum_col2_hi, sum_col3_hi));
+
+            // Fixed-point division by 9
+            auto res_lo = hn::MulHigh(total_lo, divisor);
+            auto res_hi = hn::MulHigh(total_hi, divisor);
+            // Narrow both halves back to 8 bit and rejoin them. NOTE(review): DemoteTo is given the full-width d8 tag; Highway documents a half-width (Rebind) tag for demotion - confirm this overload resolves.
+            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, res_hi), hn::DemoteTo(d8, res_lo)), d8, out_row + x);
+        }
+    }
+}
+void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N = hn::Lanes(d8);
+    const auto divisor = hn::Set(d16, (uint16_t)7282);
+    // 3x3 box blur (sum scaled by 7282/65536, ~1/9) producing two output rows (y and y+1) per iteration.
+    // This requires 4 input rows (r0, r1, r2, r3); the sums of the shared rows r1 and r2 are computed once.
+    // NOTE(review): with `y < height - 2; y += 2` an odd height leaves interior row height-2 unwritten - confirm callers only pass even heights.
+    for (int y = 1; y < height - 2; y += 2) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        const uint8_t* r3 = in + (y + 2) * width;
+        
+        uint8_t* out0 = blurred + y * width + 1;
+        uint8_t* out1 = blurred + (y + 1) * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            // Load all 4 rows needed for 2 output rows, each at columns x, x+1, x+2 (window centre = x+1, matching the +1 on out0/out1)
+            auto v0_0 = hn::LoadU(d8, r0 + x);
+            auto v0_1 = hn::LoadU(d8, r0 + x + 1);
+            auto v0_2 = hn::LoadU(d8, r0 + x + 2);
+
+            auto v1_0 = hn::LoadU(d8, r1 + x);
+            auto v1_1 = hn::LoadU(d8, r1 + x + 1);
+            auto v1_2 = hn::LoadU(d8, r1 + x + 2);
+
+            auto v2_0 = hn::LoadU(d8, r2 + x);
+            auto v2_1 = hn::LoadU(d8, r2 + x + 1);
+            auto v2_2 = hn::LoadU(d8, r2 + x + 2);
+
+            auto v3_0 = hn::LoadU(d8, r3 + x);
+            auto v3_1 = hn::LoadU(d8, r3 + x + 1);
+            auto v3_2 = hn::LoadU(d8, r3 + x + 2);
+
+            // s<k> is the horizontal 3-tap sum of input row r<k> (columns x, x+1, x+2), widened to 16 bit.
+            // Output row y adds s0 + s1 + s2, output row y+1 adds s1 + s2 + s3.
+            // Note: Rows 1 and 2 are REUSED.
+            
+            auto s1_lo = hn::Add(hn::PromoteLowerTo(d16, v1_1), hn::Add(hn::PromoteLowerTo(d16, v1_0), hn::PromoteLowerTo(d16, v1_2)));
+            auto s2_lo = hn::Add(hn::PromoteLowerTo(d16, v2_1), hn::Add(hn::PromoteLowerTo(d16, v2_0), hn::PromoteLowerTo(d16, v2_2)));
+            
+            // Output row y (lower half of the lanes)
+            auto s0_lo = hn::Add(hn::PromoteLowerTo(d16, v0_1), hn::Add(hn::PromoteLowerTo(d16, v0_0), hn::PromoteLowerTo(d16, v0_2)));
+            auto row0_lo = hn::Add(s0_lo, hn::Add(s1_lo, s2_lo));
+
+            // Output row y+1 (lower half of the lanes)
+            auto s3_lo = hn::Add(hn::PromoteLowerTo(d16, v3_1), hn::Add(hn::PromoteLowerTo(d16, v3_0), hn::PromoteLowerTo(d16, v3_2)));
+            auto row1_lo = hn::Add(s3_lo, hn::Add(s1_lo, s2_lo));
+
+            // Same computation for the upper half of the 8-bit lanes
+            auto s1_hi = hn::Add(hn::PromoteUpperTo(d16, v1_1), hn::Add(hn::PromoteUpperTo(d16, v1_0), hn::PromoteUpperTo(d16, v1_2)));
+            auto s2_hi = hn::Add(hn::PromoteUpperTo(d16, v2_1), hn::Add(hn::PromoteUpperTo(d16, v2_0), hn::PromoteUpperTo(d16, v2_2)));
+            
+            auto s0_hi = hn::Add(hn::PromoteUpperTo(d16, v0_1), hn::Add(hn::PromoteUpperTo(d16, v0_0), hn::PromoteUpperTo(d16, v0_2)));
+            auto row0_hi = hn::Add(s0_hi, hn::Add(s1_hi, s2_hi));
+
+            auto s3_hi = hn::Add(hn::PromoteUpperTo(d16, v3_1), hn::Add(hn::PromoteUpperTo(d16, v3_0), hn::PromoteUpperTo(d16, v3_2)));
+            auto row1_hi = hn::Add(s3_hi, hn::Add(s1_hi, s2_hi));
+
+            // Scale by ~1/9 via MulHigh, narrow to 8 bit and store both rows. NOTE(review): DemoteTo receives the full-width d8 tag - confirm.
+            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row0_hi, divisor)), 
+                                       hn::DemoteTo(d8, hn::MulHigh(row0_lo, divisor))), d8, out0 + x);
+            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row1_hi, divisor)), 
+                                       hn::DemoteTo(d8, hn::MulHigh(row1_lo, divisor))), d8, out1 + x);
+        }
+    }
+}
+
+} // namespace HWY_NAMESPACE
+} // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {
+    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
+        // We call ghwthe NEON version directly. 
+        // Highway maps HWY_NAMESPACE to N_NEON because of our #define above.
+        ndb::N_NEON::BoxKernel(in, blurred, width, height);
+    }
+}
+}

From 5e06b649af9363643a7b40f8be333b745eb381de Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Feb 2026 14:14:31 +0100
Subject: [PATCH 13/36] highway sobel kernel

---
 CMakeLists.txt                |   1 +
 lib/gpc/forest.cpp            |   6 +-
 lib/gpc/kernels/box.cpp       |   1 -
 lib/gpc/kernels/box_hwy.cpp   |   7 -
 lib/gpc/kernels/sobel.cpp     | 232 +++++++++++++---------------------
 lib/gpc/kernels/sobel_hwy.cpp | 165 ++++++++++++++++++++++++
 6 files changed, 259 insertions(+), 153 deletions(-)
 create mode 100644 lib/gpc/kernels/sobel_hwy.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56c5314..06b1991 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ add_library(gpc_core
     lib/gpc/kernels/gpc.cpp
     lib/gpc/kernels/utils.cpp
     lib/gpc/kernels/box_hwy.cpp
+    lib/gpc/kernels/sobel_hwy.cpp
 )
 target_link_libraries(gpc_core 
     PUBLIC 
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 5ba9f38..79f6894 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -212,18 +212,16 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
 
     ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
     smooth.width = img.width;
-    gpc::inference::time_point t0 = gpc::inference::sysTick();
     ndb::box(img.data(),
              smooth.data(),
              img.cols(),
              img.rows(),
              settings.numThreads_);
-    gpc::inference::time_point t1 = gpc::inference::sysTick();
-    cout << "box: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl;
     smooth.clearBoundary();
     ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
     grad.width = img.width;
     ndb::Buffer<int> maskTmp;
+    gpc::inference::time_point t0 = gpc::inference::sysTick();
     ndb::sobel(img.data(),
                grad.data(),
                img.cols(),
@@ -231,6 +229,8 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
                settings.gradientThreshold_,
                settings.numThreads_);
 
+    gpc::inference::time_point t1 = gpc::inference::sysTick();
+    cout << "sobel: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl;
     ndb::Buffer<int> idx;
     idx.resize(grad.rows(), grad.cols());
     auto ff = [&](ndb::Buffer<int>& in, std::vector<int>& out, int m) {
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index 10215d4..0b611fb 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -174,5 +174,4 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     #endif
 #endif
 }
-
 }  // namespace ndb
diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
index 8573e40..384360e 100644
--- a/lib/gpc/kernels/box_hwy.cpp
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -1,11 +1,6 @@
-
-// We define the target BEFORE including highway.h
-// On Mac, this forces Highway to use NEON mode without the inclusion loop.
 #define HWY_TARGET HWY_NEON 
 #include <hwy/highway.h>
 
-// We skip foreach_target.h entirely to avoid the "redefinition" and "path" errors.
-
 HWY_BEFORE_NAMESPACE(); 
 namespace ndb {
 namespace HWY_NAMESPACE {
@@ -138,8 +133,6 @@ HWY_AFTER_NAMESPACE();
 namespace ndb {
 namespace testing {
     void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
-        // We call ghwthe NEON version directly. 
-        // Highway maps HWY_NAMESPACE to N_NEON because of our #define above.
         ndb::N_NEON::BoxKernel(in, blurred, width, height);
     }
 }
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index 4d56716..c6a14bc 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -31,6 +31,9 @@
 
 #include "gpc/kernels/sobel.hpp"
 namespace ndb {
+namespace testing { 
+    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); 
+}
 void sobelNaive(
     uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
@@ -74,6 +77,84 @@ void sobelNaive(
         }
     }
 }
+#ifdef _INTRINSICS_SSE
+#include <immintrin.h>
+
+// Binary Sobel edge mask (SSE2) for rows [start, end): blurred[y*width + x] becomes 0xFF where gx^2 + gy^2 > threshold^2, else 0.
+// gx/gy are the 3x3 Sobel responses with every 1-2-1 half-kernel sum scaled by 7282/65536 (~1/9) before the subtraction.
+// Relies on the project helpers pack16to8(lo, hi, res) and unpack8to16(in, lo, hi), which are defined elsewhere.
+// width is expected to be a multiple of 16 (sobel() asserts this before calling).
+void sobelSSE(const uint8_t* in, uint8_t* blurred, 
+                            int width, int start, int end, 
+                            uint8_t threshold) {
+    __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9
+    // threshold^2 no longer fits a signed 16-bit lane from 182 upwards (182^2 = 33124) and would wrap negative, flagging
+    // every pixel. The largest reachable gx^2 + gy^2 is 2 * 113^2 = 25538, so clamping to 32767 keeps the compare exact.
+    __m128i binThres = _mm_set1_epi16(static_cast<short>(threshold > 181 ? 32767 : threshold * threshold));
+
+    for (int y = start; y < end; y++) {
+        const uint8_t* row1 = in + y * width;
+        const uint8_t* row0 = row1 - width;
+        const uint8_t* row2 = row1 + width;
+        // The loads below are centred on column x (x-1, x, x+1), so the result for column x
+        // is stored at column x as well: the destination carries no +1 offset.
+        __m128i* dst = (__m128i*)(blurred + y * width);
+
+        for (int x = 0; x < width; x += 16) {
+            __m128i a00, a01, a02, a10, a12, a20, a21, a22;
+            __m128i b00, b01, b02, b10, b12, b20, b21, b22;
+            __m128i raA, raB, rbA, rbB;
+            __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
+            // NOTE(review): at x == 0 the "x - 1" loads start one byte before the row; for y == 1 that is in[-1] - confirm the buffer is padded.
+            // Load and unpack 3x3 neighborhood into 16-bit lanes (excluding center a11/b11, which Sobel does not use)
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)),     a01, b01);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02);
+
+            unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12);
+
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)),     a21, b21);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22);
+
+            // --- SX Calculation ---
+            // Left col, weights (1,2,1); the doubled centre tap is written as a10 + a10
+            raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), one_ninth);
+            rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), one_ninth);
+            // Right col, weights (1,2,1); its negative sign is applied by the subtraction below
+            raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)), one_ninth);
+            rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), one_ninth);
+
+            tmpa = _mm_sub_epi16(raA, raB);
+            tmpb = _mm_sub_epi16(rbA, rbB);
+            sxa = _mm_mullo_epi16(tmpa, tmpa);
+            sxb = _mm_mullo_epi16(tmpb, tmpb);
+
+            // --- SY Calculation ---
+            // Top row, weights (1,2,1)
+            raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), one_ninth);
+            rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), one_ninth);
+            // Bottom row, weights (1,2,1); its negative sign is applied by the subtraction below
+            raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), one_ninth);
+            rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), one_ninth);
+
+            tmpa = _mm_sub_epi16(raA, raB);
+            tmpb = _mm_sub_epi16(rbA, rbB);
+            sya = _mm_mullo_epi16(tmpa, tmpa);
+            syb = _mm_mullo_epi16(tmpb, tmpb);
+            // --- Thresholding and Packing ---
+            // cmpgt yields 0xFFFF per lane; shifting right by 8 turns it into 0x00FF in all eight lanes, so the 16->8 pack sees 255 and not -1.
+            pack16to8(
+                _mm_srli_epi16(_mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), 8),
+                _mm_srli_epi16(_mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), 8),
+                res);
+
+            _mm_storeu_si128(dst++, res);
+        }
+    }
+}
+#endif
 void sobel(uint8_t* in,
            uint8_t* blurred,
            int width,
@@ -81,149 +162,16 @@ void sobel(uint8_t* in,
            uint8_t threshold,
            int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    sobelNaive(in, blurred, width, height, threshold);
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Force use of our new Highway kernel on Mac
+        sobelNaive(in, blurred, width, height, threshold);
+    //testing::sobel_hwy(in, blurred, width, height, threshold);
 #else
-    auto sobelSSESegment = [&](int start, int end) {
-        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        int x, y;
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
-
-        binThres = _mm_set1_epi16(threshold * threshold);
-
-        dst0 = (__m128i*)(blurred + width * 1);
-        // dst1 = (__m128i *)(blurred + width * 2);
-        for (y = start; y < end;
-             y++) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-
-            for (x = 0; x < width; x += 16) {
-                // Note: Center element not used in sobel kernels!!
-                // Kernel indices:
-                // 00 01 02
-                // 10 11 12
-                // 20 21 22
-
-                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
-                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
-
-                __m128i raA, raB, rbA, rbB;
-                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
-                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
-                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
-
-                // Sobel kernels for x and y direction.
-                //      1 0 -1       1 2 1
-                // sx = 2 0 -2 sy =  0 0 0
-                //      1 0 -1      -1-2-1
-                //      Note that neither kernel uses the center element)
-
-                // In the following, mullo is used to multiply intermediate
-                // results with -1 To divide by 3, 16bit overflow divide by
-                // multiply is used, which thus uses the upper 16bit(_mm_mulhi)
-                // of the 32bit temporary result.
-
-                // sx column kernel vectors (1,2,1)
-                // Two chained add/sub are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
-                                  a10),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
-                                  b10),
-                    one_ninth);
-
-                // sx column kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
-                                  a12),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
-                                  b12),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                sxa = _mm_mullo_epi16(tmpa, tmpa);
-                sxb = _mm_mullo_epi16(tmpb, tmpb);
-
-                // sy row kernel vector (1,2,1)
-                // Two chained add are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
-                                  a01),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
-                                  b01),
-                    one_ninth);
-
-                // sy row kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
-                                  a21),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
-                                  b21),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                // watch out, can't overwrite this
-                sya = _mm_mullo_epi16(tmpa, tmpa);
-                syb = _mm_mullo_epi16(tmpb, tmpb);
-
-                __m128i zero = _mm_setzero_si128();
-
-                // The unpacklo is necessary because _mm_cmput_epi16 sets the
-                // output to 0xFFFF if the comparison is true. When packing
-                // 16bit to 8bit however, 0xFFFF will be interpreted (in a
-                // signed environment) as being negative, and hence set to 0,
-                // resulting in a 0 output everywhere. using unpacklo in between
-                // we get 0xFFFF->0xFF
-                pack16to8(
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
-                        zero),
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
-                        zero),
-                    res);
-
-                _mm_store_si128(dst0++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-            }  // cols
-        }  // rows
-    };  // Lambda
-    sobelSSESegment(1, height - 3);
+    #ifndef _INTRINSICS_SSE
+        sobelNaive(in, blurred, width, height, threshold);
+    #else
+        sobelSSE(in, blurred, width, 1, height - 1, threshold);
+    #endif
 #endif
 }
 } // namespace ndb
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
new file mode 100644
index 0000000..1e395c4
--- /dev/null
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -0,0 +1,165 @@
+#define HWY_TARGET HWY_NEON 
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE(); 
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+// Binary Sobel edge mask: gradient = 255 where gx^2 + gy^2 > threshold^2, else 0, with gx/gy being the 3x3 Sobel
+// responses scaled by 7282/65536 (~1/9). Rows 1..height-2 are processed; results are written from column 1 of each row.
+void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
+                 int width, int height, uint8_t threshold) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; // Signed 16-bit, half the lanes of d8
+    const hn::Half<decltype(d8)> d8_half; // Tag for the half-width 8-bit vectors fed to PromoteTo / returned by DemoteTo
+    const size_t N = hn::Lanes(d8);
+    const auto divisor = hn::Set(d16, (int16_t)7282); // ~65536 / 9, applied with MulHigh (upper 16 bits of the product)
+    // threshold^2 no longer fits an int16 lane from 182 upwards (182^2 = 33124) and would wrap negative, flagging every
+    // pixel. The largest reachable magnitude is 2 * 114^2 = 25992, so clamping to 32767 makes such thresholds match nothing.
+    const auto threshSq = hn::Set(d16, static_cast<int16_t>(threshold > 181 ? 32767 : threshold * threshold));
+    const auto v255 = hn::Set(d16, 255);
+    const auto v0 = hn::Zero(d16);
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        uint8_t* out = gradient + y * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            // Load full vectors at columns x, x+1, x+2 (window centre = x+1, matching `out`). NOTE(review): the last vector of a row reads 2 bytes past the row end.
+            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
+
+            // LOWER HALF PROCESSING
+            {
+                // PromoteTo signed 16-bit from the lower half of our 8-bit vectors
+                auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11));
+                auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12));
+                auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13));
+                auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21));
+                auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23));
+                auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31));
+                auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32));
+                auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33));
+
+                auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
+                                  hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
+                sx = hn::MulHigh(sx, divisor);
+
+                auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
+                                  hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
+                sy = hn::MulHigh(sy, divisor);
+
+                auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
+                auto mask = hn::Gt(mag, threshSq);
+                auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0));
+
+                // UPPER HALF PROCESSING (same arithmetic on the upper half of the 8-bit lanes)
+                auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11));
+                auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12));
+                auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13));
+                auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21));
+                auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23));
+                auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31));
+                auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32));
+                auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33));
+
+                auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), 
+                                    hn::Add(hn::Add(u13, u33), hn::Add(u23, u23)));
+                sx_u = hn::MulHigh(sx_u, divisor);
+
+                auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), 
+                                    hn::Add(hn::Add(u31, u33), hn::Add(u32, u32)));
+                sy_u = hn::MulHigh(sy_u, divisor);
+
+                auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u));
+                auto mask_u = hn::Gt(mag_u, threshSq);
+                auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0));
+
+                hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x);
+            }
+        }
+    }
+}
+void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
+                 int width, int height, uint8_t threshold) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; 
+    const hn::Half<decltype(d8)> d8_half; 
+    // Variant of SobelKernel that skips the /9 scaling of sx/sy and instead compares against (9 * threshold)^2.
+    const size_t N = hn::Lanes(d8);
+    // NOTE(review): (9 * threshold)^2 only fits an int16 lane for threshold <= 20 (180^2 = 32400); larger values are narrowed when passed to Set - confirm.
+    int16_t tScaled = (int16_t)threshold * 9;
+    const auto threshSq = hn::Set(d16, tScaled * tScaled);
+    
+    const auto v255 = hn::Set(d16, 255);
+    const auto v0 = hn::Zero(d16);
+
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        uint8_t* out = gradient + y * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
+
+            // LOWER HALF of the 8-bit lanes, widened to signed 16 bit
+            {
+                auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11));
+                auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12));
+                auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13));
+                auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21));
+                auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23));
+                auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31));
+                auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32));
+                auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33));
+
+                auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
+                                  hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
+                auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
+                                  hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
+
+                // No MulHigh (division) here. NOTE(review): sx/sy can reach +-1020, so sx*sx (up to 1040400) presumably wraps in the int16 lanes of hn::Mul - this needs 32-bit lanes; confirm before use.
+                auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
+                auto mask = hn::Gt(mag, threshSq);
+                auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0));
+
+                // UPPER HALF (same arithmetic, same overflow caveat)
+                auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11));
+                auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12));
+                auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13));
+                auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21));
+                auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23));
+                auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31));
+                auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32));
+                auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33));
+
+                auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), 
+                                    hn::Add(hn::Add(u13, u33), hn::Add(u23, u23)));
+                auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), 
+                                    hn::Add(hn::Add(u31, u33), hn::Add(u32, u32)));
+
+                auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u));
+                auto mask_u = hn::Gt(mag_u, threshSq);
+                auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0));
+
+                hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x);
+            }
+        }
+    }
+}
+} // namespace HWY_NAMESPACE
+} // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {  // direct (non-dispatching) entry points into the Highway kernels
+    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {  // `blurred` receives the binary gradient mask
+        ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);  // N_NEON is what HWY_NAMESPACE expands to, given the HWY_TARGET define at the top of this file
+    }
+}
+}

From 0285f71f78ed87d8c9fc2cd7b06b02003edee541 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Wed, 18 Feb 2026 16:00:53 +0100
Subject: [PATCH 14/36] checkin

---
 CMakeLists.txt                 | 22 +++++++++++++++++++++-
 lib/gpc/forest.cpp             | 12 +++++++-----
 lib/gpc/kernels/box.cpp        |  1 +
 lib/gpc/kernels/census.cpp     |  2 +-
 lib/gpc/kernels/gpc.cpp        |  2 +-
 lib/gpc/kernels/sobel.cpp      |  4 ++--
 samples/sparsematch.cpp        | 18 +++++++-----------
 tests/CMakeLists.txt           | 26 +++++++++++++++++---------
 tests/test_single_matching.cpp |  2 ++
 9 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06b1991..c072f48 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,9 @@ include(CheckCXXCompilerFlag)
 include(CheckCXXSourceRuns)
 project(openGPC CXX)
 set (REQ_CPP11_FEATURES  cxx_strong_enums cxx_auto_type)
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+endif()
 
 set(CMAKE_CXX_STANDARD 17) 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -19,7 +22,12 @@ include_directories(lib)
 #By default, use SSE intrinsics
 option(SSE "Enable SSE/AVX optimizations if available" ON)
 
-add_compile_options(-O3 -funroll-loops)
+add_compile_options(-O3 -funroll-loops -flto)
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    add_compile_options(-mcpu=apple-m1)
+elseif(NOT MSVC)
+    add_compile_options(-march=native)
+endif()
 if(SSE)
     message(STATUS "Checking if target CPU supports AVX2 instructions...")
     check_cxx_source_runs("
@@ -46,6 +54,17 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(highway)
 
+include(FetchContent)
+set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_GTEST_LIB OFF CACHE BOOL "" FORCE)
+
+FetchContent_Declare(
+  google_benchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG        v1.9.5  
+)
+FetchContent_MakeAvailable(google_benchmark)
 add_library(gpc_core 
     lib/gpc/forest.cpp 
     lib/gpc/fern.cpp 
@@ -69,4 +88,5 @@ target_include_directories(gpc_core PUBLIC lib)
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)
+add_subdirectory(benchmarks)
 
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 79f6894..e39eaff 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -211,26 +211,26 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
            "gradientThreshold needs to be within 0...255");
 
     ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
+
     smooth.width = img.width;
+    // 0.2ms
     ndb::box(img.data(),
              smooth.data(),
              img.cols(),
              img.rows(),
              settings.numThreads_);
+    //4.2 *10^-5 ms
     smooth.clearBoundary();
     ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
     grad.width = img.width;
-    ndb::Buffer<int> maskTmp;
-    gpc::inference::time_point t0 = gpc::inference::sysTick();
+    //4.2*10-5ms (unclear how)
     ndb::sobel(img.data(),
                grad.data(),
                img.cols(),
                img.rows(),
                settings.gradientThreshold_,
                settings.numThreads_);
-
-    gpc::inference::time_point t1 = gpc::inference::sysTick();
-    cout << "sobel: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl;
+    gpc::inference::time_point t0 = gpc::inference::sysTick();
     ndb::Buffer<int> idx;
     idx.resize(grad.rows(), grad.cols());
     auto ff = [&](ndb::Buffer<int>& in, std::vector<int>& out, int m) {
@@ -247,6 +247,8 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
     std::vector<int> mask;
     ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m);
     ff(idx, mask, m);
+
+    gpc::inference::time_point t1 = gpc::inference::sysTick();
     // Our outputs are: smooth, grad, mask;
     return PreprocessedImage(smooth, grad, mask);
 }
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index 0b611fb..c9984dd 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -30,6 +30,7 @@
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
 
 #include "gpc/kernels/box.hpp"
+#include <cassert>
 namespace ndb {
 namespace testing { 
     void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); 
diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp
index bd70613..6235b06 100644
--- a/lib/gpc/kernels/census.cpp
+++ b/lib/gpc/kernels/census.cpp
@@ -28,7 +28,7 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-
+#include <cassert>
 #include "gpc/kernels/census.hpp"
 void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
     uint32_t val;
diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp
index 636a9cb..5e22e23 100644
--- a/lib/gpc/kernels/gpc.cpp
+++ b/lib/gpc/kernels/gpc.cpp
@@ -28,7 +28,7 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-
+#include <cassert>
 #include "gpc/kernels/gpc.hpp"
 namespace ndb {
 void gpcFilterNaive(uint8_t* in,
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index c6a14bc..dc6e46b 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -28,7 +28,7 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-
+#include <cassert>
 #include "gpc/kernels/sobel.hpp"
 namespace ndb {
 namespace testing { 
@@ -165,7 +165,7 @@ void sobel(uint8_t* in,
 #if defined(__ARM_NEON) || defined(__aarch64__)
     // Force use of our new Highway kernel on Mac
         sobelNaive(in, blurred, width, height, threshold);
-    //testing::sobel_hwy(in, blurred, width, height, threshold);
+    //testing::sobel_hwy(in, blurred, width, height, threshold); // not exact!
 #else
     #ifndef _INTRINSICS_SSE
         sobelNaive(in, blurred, width, height, threshold);
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 2554271..be0015a 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -51,7 +51,7 @@ int main(int argc, char** argv) {
     gpc::inference::InferenceSettings inferencesettings =
         gpc::inference::InferenceSettings()
             .builder()
-            .gradientThreshold(20)
+            .gradientThreshold(2) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. 
             .verticalTolerance(
                 0)               // 0px tolerance for rectified epipolar matches
             .dispHigh(128)       // limit disparities to 128
@@ -68,9 +68,12 @@ int main(int argc, char** argv) {
     gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
+    for(int i = 0; i<10000; i++) {
     // Preprocess images (box filter, sobel filter, indices of high gradient
     // pixels)
+
     gpc::inference::time_point t0 = gpc::inference::sysTick();
+
     gpc::inference::PreprocessedImage simgP =
         forest.preprocessImage(simg, inferencesettings);
     gpc::inference::PreprocessedImage timgP =
@@ -81,15 +84,8 @@ int main(int argc, char** argv) {
     std::vector<ndb::Support> supp =
         forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
     gpc::inference::time_point t2 = gpc::inference::sysTick();
-    cout << "tPreprocess: " << gpc::inference::tickToMs(t1, t0) << " ms"
-         << ", #candidatesL:" << simgP.mask.size()
-         << ", #candidatesR:" << timgP.mask.size()
-         << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms"
-         << ", num matches:" << supp.size() << std::endl;
-
-    // Output sparse disparities overlayed on left input image
-    ndb::Buffer<ndb::RGBColor> renderDisp;
-    renderDisp = ndb::getDisparityVisualization(simg, supp);
-    renderDisp.writePNGRGB("disparity.png");
+    std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
+    std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl;
+    }
     test_hwy_neon();
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 564c125..5e527f7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,14 +11,22 @@ FetchContent_MakeAvailable(approvaltests)
 
 
 find_package(GTest REQUIRED)
-add_executable(test_single_matching test_single_matching.cpp)
-#target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen)
-target_link_libraries(test_single_matching 
-    PRIVATE 
-        gpc_core 
-        ApprovalTests::ApprovalTests 
-        GTest::gtest_main
-)
+function(add_gpc_approval_test TEST_NAME SOURCE_FILE)
+    add_executable(${TEST_NAME} ${SOURCE_FILE})
+    
+    target_link_libraries(${TEST_NAME} 
+        PRIVATE 
+            gpc_core 
+            ApprovalTests::ApprovalTests 
+            GTest::gtest_main
+    )
+    
+
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+endfunction()
+
+add_gpc_approval_test(test_single_matching test_single_matching.cpp)
+add_gpc_approval_test(test_kernel_box test_kernel_box.cpp)
+add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp)
 
-add_test(NAME single_matching COMMAND test_single_matching)
 
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index fdff603..e675a7c 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -52,3 +52,5 @@ TEST(Approval, Inference)
     EXPECT_EQ(866, supp.size());
     ApprovalTests::Approvals::verify(ss.str());
 }
+
+

From 832a6516c791b86de798e989f1ac9b10fc985092 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Wed, 18 Feb 2026 19:35:51 +0100
Subject: [PATCH 15/36] cmakelist eigen include

---
 lib/gpc/inference.hpp  | 1 -
 samples/CMakeLists.txt | 6 +++---
 tests/CMakeLists.txt   | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp
index e1a887a..eddc709 100644
--- a/lib/gpc/inference.hpp
+++ b/lib/gpc/inference.hpp
@@ -465,7 +465,6 @@ class Forest {
         int numFerns;
         int type;
         ff >> numFerns;
-        cout << "number of ferns:" << numFerns << endl;
         for (int i = 0; i < numFerns; i++) {
             int fernID, numTests;
             std::string fernScale;
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 3bbe11f..a7e73d7 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,10 +1,10 @@
 add_executable(extract extract.cpp)
-target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 add_executable(train train.cpp)
-target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 add_executable(sparsematch sparsematch.cpp)
-target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen)
 
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index acf86db..a211cac 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ FetchContent_MakeAvailable(approvaltests)
 
 find_package(GTest REQUIRED)
 add_executable(test_single_matching test_single_matching.cpp)
-target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main)
+target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen)
 
 add_test(NAME single_matching COMMAND test_single_matching)
 

From ca65a4284e7e73f1640093bb942045bdc0d45a58 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Wed, 18 Feb 2026 20:55:42 +0100
Subject: [PATCH 16/36] update dir

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55e3851..56b841c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,4 +40,5 @@ endif()
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)
+add_subdirectory(benchmark)
 

From d3e1c320d62830c6012a8f9c2c76dbd247d204fe Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Wed, 18 Feb 2026 21:02:29 +0100
Subject: [PATCH 17/36] update

---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56b841c..e96a542 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,13 @@ if(SSE)
     endif()
 endif()
 
+FetchContent_Declare(
+  google_benchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG        v1.9.5  
+)
+FetchContent_MakeAvailable(google_benchmark)
+
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)

From 362f3f5be2960e6b601ce97dfc5df75c8d985f03 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Wed, 18 Feb 2026 21:26:04 +0100
Subject: [PATCH 18/36] add benchmark. coarse runtime measurement

---
 CMakeLists.txt              |  3 +-
 benchmarks/CMakeLists.txt   |  8 +++++
 benchmarks/kernel_bench.cpp | 69 +++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/kernel_bench.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e96a542..81d06f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+include(FetchContent)
 find_package(Eigen3 REQUIRED)
 find_package(PNG REQUIRED)
 find_package(Threads REQUIRED)
@@ -47,5 +48,5 @@ FetchContent_MakeAvailable(google_benchmark)
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)
-add_subdirectory(benchmark)
+add_subdirectory(benchmarks)
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..ecbaa4d
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(kernel_bench kernel_bench.cpp)
+
+target_link_libraries(kernel_bench 
+    PRIVATE 
+        benchmark::benchmark 
+        Eigen3::Eigen
+        ${PNG_LIBRARIES} 
+)
diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp
new file mode 100644
index 0000000..13c1927
--- /dev/null
+++ b/benchmarks/kernel_bench.cpp
@@ -0,0 +1,69 @@
+#include <benchmark/benchmark.h>
+#include "gpc/inference.hpp"
+
+typedef gpc::inference::Forest GPCForest_t;
+GPCForest_t forest;
+
+static void fullInference(
+        benchmark::State& state){
+
+    std::string forestPath = "../forests/defaultZeroForest.txt";
+    std::string leftImgPath = "../data/middlebury/im0.png";
+    std::string rightImgPath = "../data/middlebury/im1.png";
+    gpc::inference::InferenceSettings inferencesettings =
+        gpc::inference::InferenceSettings()
+            .builder()
+            .gradientThreshold(state.range(0)) // 0...255 gradient threshold for sobel filter
+            .verticalTolerance(
+                0)               // 0px tolerance for rectified epipolar matches
+            .dispHigh(128)       // limit disparities to 128
+            .epipolarMode(true)  // match GPC states in epipolar mode. more
+                                 // matches, lower accuracy than global
+            .useHashtable(false);  // use sort method for matching. faster for
+                                   // <100K descriptors
+
+    ndb::Buffer<uint8_t> simg, timg;
+    // Load images
+    simg.readPNG(leftImgPath);
+    timg.readPNG(rightImgPath);
+
+    // Get learned filter for the given image dimensions.
+    GPCForest_t::FilterMask fm =
+        forest.readForest(forestPath, simg.cols(), simg.rows());
+
+
+
+    for (auto _ : state) {
+        GPCForest_t::PreprocessedImage simgP =
+            forest.preprocessImage(simg, inferencesettings);
+        GPCForest_t::PreprocessedImage timgP =
+            forest.preprocessImage(timg, inferencesettings);
+        std::vector<ndb::Support> supp =
+            forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
+        state.counters["f_s"] = simgP.mask.size();
+        state.counters["f_t"] = timgP.mask.size();
+        state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(supp);
+        benchmark::ClobberMemory();
+    }
+ 
+}
+
+BENCHMARK(fullInference)
+    ->Unit(benchmark::kMillisecond)
+    ->Args({0})
+    ->Args({5})
+    ->Args({100});
+
+
+BENCHMARK_MAIN();
+/*
+int main(int argc, char** argv) {
+
+        BenchmarkResults b = fullInference(simg,timg, fm, inferenceSettings);
+    for (const auto& [name, time] : b) {
+        cout << name << ", " << time << " ms" << endl;
+    }
+
+}
+*/

From 75846ed2296655307fd74e2d7662f8df1511bcd8 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 08:11:03 +0100
Subject: [PATCH 19/36] update legacy bench

---
 benchmarks/CMakeLists.txt         | 16 ++++++++++++++++
 benchmarks/box_legacy_bench.cpp   | 21 +++++++++++++++++++++
 benchmarks/kernel_bench.cpp       | 17 +++++------------
 benchmarks/sobel_legacy_bench.cpp | 21 +++++++++++++++++++++
 4 files changed, 63 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/box_legacy_bench.cpp
 create mode 100644 benchmarks/sobel_legacy_bench.cpp

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index ecbaa4d..66b886b 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -6,3 +6,19 @@ target_link_libraries(kernel_bench
         Eigen3::Eigen
         ${PNG_LIBRARIES} 
 )
+add_executable(sobel_legacy_bench sobel_legacy_bench.cpp)
+
+target_link_libraries(sobel_legacy_bench
+    PRIVATE 
+        benchmark::benchmark 
+        Eigen3::Eigen
+        ${PNG_LIBRARIES} 
+)
+add_executable(box_legacy_bench box_legacy_bench.cpp)
+
+target_link_libraries(box_legacy_bench
+    PRIVATE 
+        benchmark::benchmark 
+        Eigen3::Eigen
+        ${PNG_LIBRARIES} 
+)
diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp
new file mode 100644
index 0000000..9f06107
--- /dev/null
+++ b/benchmarks/box_legacy_bench.cpp
@@ -0,0 +1,21 @@
+#include <benchmark/benchmark.h>
+#include "gpc/filter.hpp" 
+
+static void BM_BoxHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+    // Time ndb::box over one constant-valued frame per benchmark iteration.
+    for (auto _ : state) {
+        // 5th argument is the thread count, not a threshold: run on one thread.
+        ndb::box(in.data(), out.data(), w, h, 1);
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+    
+    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
+}
+BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp
index 13c1927..d86cfe8 100644
--- a/benchmarks/kernel_bench.cpp
+++ b/benchmarks/kernel_bench.cpp
@@ -40,8 +40,8 @@ static void fullInference(
             forest.preprocessImage(timg, inferencesettings);
         std::vector<ndb::Support> supp =
             forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
-        state.counters["f_s"] = simgP.mask.size();
-        state.counters["f_t"] = timgP.mask.size();
+        state.counters["candidates_s"] = simgP.mask.size();
+        state.counters["candidates_t"] = timgP.mask.size();
         state.counters["matches"] = supp.size();
         benchmark::DoNotOptimize(supp);
         benchmark::ClobberMemory();
@@ -53,17 +53,10 @@ BENCHMARK(fullInference)
     ->Unit(benchmark::kMillisecond)
     ->Args({0})
     ->Args({5})
+    ->Args({10})
+    ->Args({20})
+    ->Args({50})
     ->Args({100});
 
 
 BENCHMARK_MAIN();
-/*
-int main(int argc, char** argv) {
-
-        BenchmarkResults b = fullInference(simg,timg, fm, inferenceSettings);
-    for (const auto& [name, time] : b) {
-        cout << name << ", " << time << " ms" << endl;
-    }
-
-}
-*/
diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp
new file mode 100644
index 0000000..406dd6a
--- /dev/null
+++ b/benchmarks/sobel_legacy_bench.cpp
@@ -0,0 +1,21 @@
+#include <benchmark/benchmark.h>
+#include "gpc/filter.hpp" 
+
+static void BM_SobelHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    for (auto _ : state) {
+        ndb::sobel(in.data(), out.data(), w, h, 50, 1);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+    
+    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
+}
+BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();

From 0b37d4538dbe3cf42617bfac86e83b982363f36b Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 10:28:24 +0100
Subject: [PATCH 20/36] benchmark, perf, box and sobel acceptance tests

---
 benchmarks/CMakeLists_decouple_branch.txt | 11 ++++++
 benchmarks/sobel_bench.cpp                | 22 ++++++++++++
 lib/gpc/kernels/box_hwy.hpp               | 18 ++++++++++
 lib/gpc/kernels/sobel_hwy.hpp             | 17 +++++++++
 tests/test_kernel_box.cpp                 | 44 +++++++++++++++++++++++
 tests/test_kernel_sobel.cpp               | 44 +++++++++++++++++++++++
 6 files changed, 156 insertions(+)
 create mode 100644 benchmarks/CMakeLists_decouple_branch.txt
 create mode 100644 benchmarks/sobel_bench.cpp
 create mode 100644 lib/gpc/kernels/box_hwy.hpp
 create mode 100644 lib/gpc/kernels/sobel_hwy.hpp
 create mode 100644 tests/test_kernel_box.cpp
 create mode 100644 tests/test_kernel_sobel.cpp

diff --git a/benchmarks/CMakeLists_decouple_branch.txt b/benchmarks/CMakeLists_decouple_branch.txt
new file mode 100644
index 0000000..efecb51
--- /dev/null
+++ b/benchmarks/CMakeLists_decouple_branch.txt
@@ -0,0 +1,11 @@
+add_executable(kernel_bench sobel_bench.cpp)
+
+target_link_libraries(kernel_bench 
+    PRIVATE 
+        gpc_core 
+        benchmark::benchmark 
+        hwy
+)
+
+# allows the compiler to inline Highway kernels into the benchmark loop
+set_target_properties(kernel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp
new file mode 100644
index 0000000..e0a65d1
--- /dev/null
+++ b/benchmarks/sobel_bench.cpp
@@ -0,0 +1,22 @@
+#include <benchmark/benchmark.h>
+#include "gpc/kernels/sobel_hwy.hpp" // declares ndb::testing::sobel_hwy
+
+static void BM_SobelHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    // Warmup is handled automatically by the library
+    for (auto _ : state) {
+        ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+    
+    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
+}
+BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();
diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp
new file mode 100644
index 0000000..6c256b0
--- /dev/null
+++ b/lib/gpc/kernels/box_hwy.hpp
@@ -0,0 +1,18 @@
+#ifndef  __NDB__KERNEL_BOX_HWY
+#define __NDB__KERNEL_BOX_HWY
+
+#include <cstdint>
+
+namespace ndb {
+
+namespace testing {
+    /**
+     * Entry point for benchmarking the MulHigh (approximate) version.
+     */
+    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
+
+}
+
+}  // namespace ndb
+
+#endif  // __NDB__KERNEL_BOX_HWY
diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp
new file mode 100644
index 0000000..bc99199
--- /dev/null
+++ b/lib/gpc/kernels/sobel_hwy.hpp
@@ -0,0 +1,17 @@
+#ifndef  __NDB__KERNEL_SOBEL_HWY
+#define __NDB__KERNEL_SOBEL_HWY
+
+#include <cstdint>
+
+namespace ndb {
+
+namespace testing {
+    /**
+     * Entry point for testing and benchmarking the Highway Sobel kernel.
+     */
+    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
+}
+
+}  // namespace ndb
+
+#endif  // __NDB__KERNEL_SOBEL_HWY
diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp
new file mode 100644
index 0000000..8fff2ce
--- /dev/null
+++ b/tests/test_kernel_box.cpp
@@ -0,0 +1,44 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/kernels/box.hpp"     // Naive version
+#include "gpc/kernels/box_hwy.hpp" // Highway version
+
+TEST(Approval, BoxKernel) {
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2; // Border margin (pixels) excluded from the comparison
+
+    // 1. Prepare randomized input
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42); 
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> outNaive(width * height, 0);
+    std::vector<uint8_t> outHighway(width * height, 0);
+
+    // 3. Run Naive version
+    ndb::boxNaive(input.data(), outNaive.data(), width, height);
+
+    // 4. Run Highway version.
+    // Always go through ndb::testing::box_hwy: it is the only Highway
+    // entry point declared by box_hwy.hpp, so this test does not rely
+    // on a per-target namespace such as ndb::N_NEON being visible here.
+    // The same call is used on every platform, so results are comparable.
+    ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
+
+    // 5. Compare results
+    // Only the interior is checked: pixels closer than `radius` to any
+    // image edge are skipped because the two implementations might
+    // handle edges differently; an exact match is required
+    // everywhere else.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int idx = y * width + x;
+            ASSERT_EQ(outNaive[idx], outHighway[idx]) 
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}
diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp
new file mode 100644
index 0000000..fd5b30d
--- /dev/null
+++ b/tests/test_kernel_sobel.cpp
@@ -0,0 +1,44 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/kernels/sobel.hpp"     // Naive version
+#include "gpc/kernels/sobel_hwy.hpp" // Highway version
+
+TEST(Approval, SobelKernel) {
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2; // Border margin (pixels) excluded from the comparison
+
+    // 1. Prepare randomized input
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42); 
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> outNaive(width * height, 0);
+    std::vector<uint8_t> outHighway(width * height, 0);
+
+    // 3. Run Naive version
+    ndb::sobelNaive(input.data(), outNaive.data(), width, height, 30);
+
+    // 4. Run Highway version with the same threshold as the naive run.
+    // Always go through ndb::testing::sobel_hwy: it is the only Highway
+    // entry point declared by sobel_hwy.hpp, so this test does not rely
+    // on a per-target namespace such as ndb::N_NEON being visible here.
+    // The same call is used on every platform, so results are comparable.
+    ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30);
+
+    // 5. Compare results
+    // Only the interior is checked: pixels closer than `radius` to any
+    // image edge are skipped because the two implementations might
+    // handle edges differently; an exact match is required
+    // everywhere else.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int idx = y * width + x;
+            ASSERT_EQ(outNaive[idx], outHighway[idx]) 
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}

From 018e23c91a240a086a89242263e115ce5531d6bf Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 10:31:09 +0100
Subject: [PATCH 21/36] move

---
 benchmarks/{CMakeLists_decouple_branch.txt => CMakeLists.txt} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename benchmarks/{CMakeLists_decouple_branch.txt => CMakeLists.txt} (76%)

diff --git a/benchmarks/CMakeLists_decouple_branch.txt b/benchmarks/CMakeLists.txt
similarity index 76%
rename from benchmarks/CMakeLists_decouple_branch.txt
rename to benchmarks/CMakeLists.txt
index efecb51..6a97132 100644
--- a/benchmarks/CMakeLists_decouple_branch.txt
+++ b/benchmarks/CMakeLists.txt
@@ -7,5 +7,5 @@ target_link_libraries(kernel_bench
         hwy
 )
 
-# allows the compiler to inline Highway kernels into the benchmark loop
+# Enable IPO/LTO so the compiler can inline Highway kernels into the benchmark loop
 set_target_properties(kernel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)

From 83c18b50239a7c3141f2d7f0e6bcf88ee07f97e4 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 12:36:08 +0100
Subject: [PATCH 22/36] rename

---
 benchmarks/box_legacy_bench.cpp   | 4 ++--
 benchmarks/sobel_legacy_bench.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp
index 39a5edf..c8960a3 100644
--- a/benchmarks/box_legacy_bench.cpp
+++ b/benchmarks/box_legacy_bench.cpp
@@ -1,7 +1,7 @@
 #include <benchmark/benchmark.h>
 #include "gpc/kernels/box.hpp" 
 
-static void BM_BoxHighway(benchmark::State& state) {
+static void BM_BoxLegacy(benchmark::State& state) {
     int w = 1920, h = 1080;
     std::vector<uint8_t> in(w * h, 128);
     std::vector<uint8_t> out(w * h, 0);
@@ -16,6 +16,6 @@ static void BM_BoxHighway(benchmark::State& state) {
     
     state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
 }
-BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BoxLegacy)->Unit(benchmark::kMillisecond);
 
 BENCHMARK_MAIN();
diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp
index b20d502..d00c2c6 100644
--- a/benchmarks/sobel_legacy_bench.cpp
+++ b/benchmarks/sobel_legacy_bench.cpp
@@ -1,7 +1,7 @@
 #include <benchmark/benchmark.h>
 #include "gpc/kernels/sobel.hpp" 
 
-static void BM_SobelHighway(benchmark::State& state) {
+static void BM_SobelLegacy(benchmark::State& state) {
     int w = 1920, h = 1080;
     std::vector<uint8_t> in(w * h, 128);
     std::vector<uint8_t> out(w * h, 0);
@@ -16,6 +16,6 @@ static void BM_SobelHighway(benchmark::State& state) {
     
     state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
 }
-BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_SobelLegacy)->Unit(benchmark::kMillisecond);
 
 BENCHMARK_MAIN();

From 5358de18d0d47f10622ee6e765cb5185f797ec0b Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 12:57:26 +0100
Subject: [PATCH 23/36] update hwy kernels

---
 lib/gpc/kernels/box_hwy.cpp   | 125 +++++++-----------------------
 lib/gpc/kernels/sobel_hwy.cpp | 139 +++++++---------------------------
 lib/gpc/kernels/utils.cpp     |   1 +
 3 files changed, 56 insertions(+), 209 deletions(-)

diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
index 384360e..7cdd983 100644
--- a/lib/gpc/kernels/box_hwy.cpp
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -6,65 +6,15 @@ namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-void BoxKernelNaive(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
-    const hn::ScalableTag<uint8_t> d8;
-    const hn::ScalableTag<uint16_t> d16;
-    const size_t N = hn::Lanes(d8);
-    const auto divisor = hn::Set(d16, (uint16_t)7282); // 65536 / 9
-
-    for (int y = 1; y < height - 1; ++y) {
-        const uint8_t* r0 = in + (y - 1) * width;
-        const uint8_t* r1 = in + y * width;
-        const uint8_t* r2 = in + (y + 1) * width;
-        
-        uint8_t* out_row = blurred + y * width + 1;
-
-        for (int x = 0; x < width; x += N) {
-            // Row 0
-            auto v11 = hn::LoadU(d8, r0 + x);
-            auto v12 = hn::LoadU(d8, r0 + x + 1);
-            auto v13 = hn::LoadU(d8, r0 + x + 2);
-            
-            // Row 1
-            auto v21 = hn::LoadU(d8, r1 + x);
-            auto v22 = hn::LoadU(d8, r1 + x + 1);
-            auto v23 = hn::LoadU(d8, r1 + x + 2);
-
-            // Row 2
-            auto v31 = hn::LoadU(d8, r2 + x);
-            auto v32 = hn::LoadU(d8, r2 + x + 1);
-            auto v33 = hn::LoadU(d8, r2 + x + 2);
-
-            // Vertical sums first (3 instructions per half-vector)
-            auto sum_col1_lo = hn::Add(hn::PromoteLowerTo(d16, v11), hn::Add(hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v31)));
-            auto sum_col1_hi = hn::Add(hn::PromoteUpperTo(d16, v11), hn::Add(hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v31)));
-
-            auto sum_col2_lo = hn::Add(hn::PromoteLowerTo(d16, v12), hn::Add(hn::PromoteLowerTo(d16, v22), hn::PromoteLowerTo(d16, v32)));
-            auto sum_col2_hi = hn::Add(hn::PromoteUpperTo(d16, v12), hn::Add(hn::PromoteUpperTo(d16, v22), hn::PromoteUpperTo(d16, v32)));
-
-            auto sum_col3_lo = hn::Add(hn::PromoteLowerTo(d16, v13), hn::Add(hn::PromoteLowerTo(d16, v23), hn::PromoteLowerTo(d16, v33)));
-            auto sum_col3_hi = hn::Add(hn::PromoteUpperTo(d16, v13), hn::Add(hn::PromoteUpperTo(d16, v23), hn::PromoteUpperTo(d16, v33)));
 
-            // Horizontal accumulation
-            auto total_lo = hn::Add(sum_col1_lo, hn::Add(sum_col2_lo, sum_col3_lo));
-            auto total_hi = hn::Add(sum_col1_hi, hn::Add(sum_col2_hi, sum_col3_hi));
-
-            // Fixed-point division by 9
-            auto res_lo = hn::MulHigh(total_lo, divisor);
-            auto res_hi = hn::MulHigh(total_hi, divisor);
-            
-            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, res_hi), hn::DemoteTo(d8, res_lo)), d8, out_row + x);
-        }
-    }
-}
 void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
     const hn::ScalableTag<uint8_t> d8;
-    const hn::ScalableTag<uint16_t> d16;
+    // We need d16 to be the "Promoted" version of the half-width d8 to stay lane-consistent
+    const hn::Rebind<uint16_t, hn::Half<decltype(d8)>> d16;
+    
     const size_t N = hn::Lanes(d8);
     const auto divisor = hn::Set(d16, (uint16_t)7282);
 
-    // We process two output rows at a time (y and y+1)
-    // This requires 4 input rows (r0, r1, r2, r3)
     for (int y = 1; y < height - 2; y += 2) {
         const uint8_t* r0 = in + (y - 1) * width;
         const uint8_t* r1 = in + y * width;
@@ -75,65 +25,46 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in
         uint8_t* out1 = blurred + (y + 1) * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            // Load all 4 rows needed for 2 output rows
-            auto v0_0 = hn::LoadU(d8, r0 + x);
-            auto v0_1 = hn::LoadU(d8, r0 + x + 1);
-            auto v0_2 = hn::LoadU(d8, r0 + x + 2);
-
-            auto v1_0 = hn::LoadU(d8, r1 + x);
-            auto v1_1 = hn::LoadU(d8, r1 + x + 1);
-            auto v1_2 = hn::LoadU(d8, r1 + x + 2);
-
-            auto v2_0 = hn::LoadU(d8, r2 + x);
-            auto v2_1 = hn::LoadU(d8, r2 + x + 1);
-            auto v2_2 = hn::LoadU(d8, r2 + x + 2);
-
-            auto v3_0 = hn::LoadU(d8, r3 + x);
-            auto v3_1 = hn::LoadU(d8, r3 + x + 1);
-            auto v3_2 = hn::LoadU(d8, r3 + x + 2);
-
-            // Vertical sums for Row Pair 1 (Rows 0, 1, 2)
-            // Vertical sums for Row Pair 2 (Rows 1, 2, 3)
-            // Note: Rows 1 and 2 are REUSED.
-            
-            auto s1_lo = hn::Add(hn::PromoteLowerTo(d16, v1_1), hn::Add(hn::PromoteLowerTo(d16, v1_0), hn::PromoteLowerTo(d16, v1_2)));
-            auto s2_lo = hn::Add(hn::PromoteLowerTo(d16, v2_1), hn::Add(hn::PromoteLowerTo(d16, v2_0), hn::PromoteLowerTo(d16, v2_2)));
+            auto v0_0 = hn::LoadU(d8, r0 + x); auto v0_1 = hn::LoadU(d8, r0 + x + 1); auto v0_2 = hn::LoadU(d8, r0 + x + 2);
+            auto v1_0 = hn::LoadU(d8, r1 + x); auto v1_1 = hn::LoadU(d8, r1 + x + 1); auto v1_2 = hn::LoadU(d8, r1 + x + 2);
+            auto v2_0 = hn::LoadU(d8, r2 + x); auto v2_1 = hn::LoadU(d8, r2 + x + 1); auto v2_2 = hn::LoadU(d8, r2 + x + 2);
+            auto v3_0 = hn::LoadU(d8, r3 + x); auto v3_1 = hn::LoadU(d8, r3 + x + 1); auto v3_2 = hn::LoadU(d8, r3 + x + 2);
+
+            // Helper to sum 3 promoted pixels
+            auto sum3 = [&](auto v0, auto v1, auto v2) {
+                return hn::Add(v1, hn::Add(v0, v2));
+            };
+
+            // LOWER HALF
+            auto s1_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v1_0)), hn::PromoteTo(d16, hn::LowerHalf(v1_1)), hn::PromoteTo(d16, hn::LowerHalf(v1_2)));
+            auto s2_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v2_0)), hn::PromoteTo(d16, hn::LowerHalf(v2_1)), hn::PromoteTo(d16, hn::LowerHalf(v2_2)));
             
-            // Output Row 0 logic
-            auto s0_lo = hn::Add(hn::PromoteLowerTo(d16, v0_1), hn::Add(hn::PromoteLowerTo(d16, v0_0), hn::PromoteLowerTo(d16, v0_2)));
-            auto row0_lo = hn::Add(s0_lo, hn::Add(s1_lo, s2_lo));
+            auto row0_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v0_0)), hn::PromoteTo(d16, hn::LowerHalf(v0_1)), hn::PromoteTo(d16, hn::LowerHalf(v0_2))), hn::Add(s1_lo, s2_lo));
+            auto row1_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v3_0)), hn::PromoteTo(d16, hn::LowerHalf(v3_1)), hn::PromoteTo(d16, hn::LowerHalf(v3_2))), hn::Add(s1_lo, s2_lo));
 
-            // Output Row 1 logic
-            auto s3_lo = hn::Add(hn::PromoteLowerTo(d16, v3_1), hn::Add(hn::PromoteLowerTo(d16, v3_0), hn::PromoteLowerTo(d16, v3_2)));
-            auto row1_lo = hn::Add(s3_lo, hn::Add(s1_lo, s2_lo));
-
-            // Repeat for high bits...
-            auto s1_hi = hn::Add(hn::PromoteUpperTo(d16, v1_1), hn::Add(hn::PromoteUpperTo(d16, v1_0), hn::PromoteUpperTo(d16, v1_2)));
-            auto s2_hi = hn::Add(hn::PromoteUpperTo(d16, v2_1), hn::Add(hn::PromoteUpperTo(d16, v2_0), hn::PromoteUpperTo(d16, v2_2)));
+            // UPPER HALF
+            auto s1_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v1_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_2)));
+            auto s2_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v2_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_2)));
             
-            auto s0_hi = hn::Add(hn::PromoteUpperTo(d16, v0_1), hn::Add(hn::PromoteUpperTo(d16, v0_0), hn::PromoteUpperTo(d16, v0_2)));
-            auto row0_hi = hn::Add(s0_hi, hn::Add(s1_hi, s2_hi));
-
-            auto s3_hi = hn::Add(hn::PromoteUpperTo(d16, v3_1), hn::Add(hn::PromoteUpperTo(d16, v3_0), hn::PromoteUpperTo(d16, v3_2)));
-            auto row1_hi = hn::Add(s3_hi, hn::Add(s1_hi, s2_hi));
+            auto row0_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v0_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_2))), hn::Add(s1_hi, s2_hi));
+            auto row1_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v3_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_2))), hn::Add(s1_hi, s2_hi));
 
-            // Store both rows
-            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row0_hi, divisor)), 
-                                       hn::DemoteTo(d8, hn::MulHigh(row0_lo, divisor))), d8, out0 + x);
-            hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row1_hi, divisor)), 
-                                       hn::DemoteTo(d8, hn::MulHigh(row1_lo, divisor))), d8, out1 + x);
+            // Perform normalization and store using OrderedDemote2To
+            hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x);
+            hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x);
         }
     }
 }
-
 } // namespace HWY_NAMESPACE
 } // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
+#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
     void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
         ndb::N_NEON::BoxKernel(in, blurred, width, height);
     }
+#endif
 }
 }
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index 1e395c4..97abee3 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -6,12 +6,11 @@ namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-
 void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
                  int width, int height, uint8_t threshold) {
     const hn::ScalableTag<uint8_t> d8;
-    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; // Signed 16-bit, half the lanes of d8
-    const hn::Half<decltype(d8)> d8_half; // Tag for half-width 8-bit loads
+    // d16 will have half the lanes of d8, regardless of whether N=16 (NEON) or N=32 (AVX2)
+    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; 
     
     const size_t N = hn::Lanes(d8);
     const auto divisor = hn::Set(d16, (int16_t)7282); 
@@ -26,23 +25,13 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
         uint8_t* out = gradient + y * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            // Load full 128-bit vectors
+            // Load full vectors (128-bit on NEON, 256-bit on AVX2)
             auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
             auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
             auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
 
-            // LOWER HALF PROCESSING
-            {
-                // PromoteTo signed 16-bit from the lower half of our 8-bit vectors
-                auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11));
-                auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12));
-                auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13));
-                auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21));
-                auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23));
-                auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31));
-                auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32));
-                auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33));
-
+            // Helper lambda to process 8-bit to 16-bit math for a specific half
+            auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
                 auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
                                   hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
                 sx = hn::MulHigh(sx, divisor);
@@ -52,114 +41,40 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
                 sy = hn::MulHigh(sy, divisor);
 
                 auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
-                auto mask = hn::Gt(mag, threshSq);
-                auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0));
-
-                // UPPER HALF PROCESSING
-                auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11));
-                auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12));
-                auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13));
-                auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21));
-                auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23));
-                auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31));
-                auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32));
-                auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33));
-
-                auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), 
-                                    hn::Add(hn::Add(u13, u33), hn::Add(u23, u23)));
-                sx_u = hn::MulHigh(sx_u, divisor);
-
-                auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), 
-                                    hn::Add(hn::Add(u31, u33), hn::Add(u32, u32)));
-                sy_u = hn::MulHigh(sy_u, divisor);
-
-                auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u));
-                auto mask_u = hn::Gt(mag_u, threshSq);
-                auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0));
-
-                hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x);
-            }
+                return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0);
+            };
+
+            // 1. Promote and process Lower Half
+            auto res_lo = process_half(
+                hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)),
+                hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)),
+                hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33))
+            );
+
+            // 2. Promote and process Upper Half
+            auto res_hi = process_half(
+                hn::PromoteTo(d16, hn::UpperHalf(d8, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8, v13)),
+                hn::PromoteTo(d16, hn::UpperHalf(d8, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8, v23)),
+                hn::PromoteTo(d16, hn::UpperHalf(d8, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8, v33))
+            );
+
+            // 3. The "Magic" fix: OrderedDemote2To handles the cross-lane logic for AVX2 automatically
+            auto result8 = hn::OrderedDemote2To(d8, res_lo, res_hi);
+            hn::StoreU(result8, d8, out + x);
         }
     }
 }
-void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
-                 int width, int height, uint8_t threshold) {
-    const hn::ScalableTag<uint8_t> d8;
-    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; 
-    const hn::Half<decltype(d8)> d8_half; 
-    
-    const size_t N = hn::Lanes(d8);
-    // Multiply threshold by 9 BEFORE squaring to match the "no-division" math
-    int16_t tScaled = (int16_t)threshold * 9;
-    const auto threshSq = hn::Set(d16, tScaled * tScaled);
-    
-    const auto v255 = hn::Set(d16, 255);
-    const auto v0 = hn::Zero(d16);
 
-    for (int y = 1; y < height - 1; ++y) {
-        const uint8_t* r0 = in + (y - 1) * width;
-        const uint8_t* r1 = in + y * width;
-        const uint8_t* r2 = in + (y + 1) * width;
-        uint8_t* out = gradient + y * width + 1;
-
-        for (int x = 0; x < width; x += N) {
-            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
-            auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
-            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
-
-            // LOWER HALF
-            {
-                auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11));
-                auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12));
-                auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13));
-                auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21));
-                auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23));
-                auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31));
-                auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32));
-                auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33));
-
-                auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
-                                  hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
-                auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
-                                  hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
-
-                // Removed MulHigh (division). Math is now: (sx*sx + sy*sy) > (threshold*9)^2
-                auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
-                auto mask = hn::Gt(mag, threshSq);
-                auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0));
-
-                // UPPER HALF
-                auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11));
-                auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12));
-                auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13));
-                auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21));
-                auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23));
-                auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31));
-                auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32));
-                auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33));
-
-                auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), 
-                                    hn::Add(hn::Add(u13, u33), hn::Add(u23, u23)));
-                auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), 
-                                    hn::Add(hn::Add(u31, u33), hn::Add(u32, u32)));
-
-                auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u));
-                auto mask_u = hn::Gt(mag_u, threshSq);
-                auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0));
-
-                hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x);
-            }
-        }
-    }
-}
 } // namespace HWY_NAMESPACE
 } // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
+#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
     void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
         ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
     }
+#endif  
 }
 }
diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp
index dd5d146..796c2b1 100644
--- a/lib/gpc/kernels/utils.cpp
+++ b/lib/gpc/kernels/utils.cpp
@@ -30,6 +30,7 @@
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
 #include <cassert>
 #include <thread>
+#include <functional>
 
 using namespace std;
 

From 8f654841f0cf45e1eeeb4e0c97995051aa6203fb Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 15:04:26 +0100
Subject: [PATCH 24/36] static dispatch

---
 CMakeLists.txt                    | 34 ++++++-----------------
 benchmarks/sobel_legacy_bench.cpp |  4 ++-
 lib/gpc/kernels/box_hwy.cpp       | 45 ++++++++++++++-----------------
 lib/gpc/kernels/sobel.cpp         |  8 +++---
 lib/gpc/kernels/sobel.hpp         |  6 +++++
 lib/gpc/kernels/sobel_hwy.cpp     | 43 +++++++++++++----------------
 lib/gpc/kernels/utils.cpp         |  5 ++--
 lib/gpc/kernels/utils.hpp         |  4 +--
 samples/CMakeLists.txt            |  4 +++
 tests/test_kernel_box.cpp         |  5 ++--
 tests/test_kernel_sobel.cpp       |  5 ++--
 11 files changed, 73 insertions(+), 90 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 629477a..dfcdd61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.10)
 include(CheckCXXCompilerFlag)
 include(CheckCXXSourceRuns)
+include(CMakePushCheckState)
 project(openGPC CXX)
 set (REQ_CPP11_FEATURES  cxx_strong_enums cxx_auto_type)
 if(NOT CMAKE_BUILD_TYPE)
@@ -19,32 +20,6 @@ find_package(Threads REQUIRED)
 include_directories(${EIGEN3_INCLUDE_DIR})
 include_directories(${PNG_INCLUDE_DIRS})
 include_directories(lib)
-
-#By default, use SSE intrinsics
-option(SSE "Enable SSE/AVX optimizations if available" ON)
-
-add_compile_options(-O3 -funroll-loops -flto)
-if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-    add_compile_options(-mcpu=apple-m1)
-elseif(NOT MSVC)
-    add_compile_options(-march=native)
-endif()
-if(SSE)
-    message(STATUS "Checking if target CPU supports AVX2 instructions...")
-    check_cxx_source_runs("
-      #include <immintrin.h>
-      int main() {
-        __m256i x = _mm256_set1_epi32(1);
-        return _mm256_extract_epi32(x, 0);
-      }
-    " CPU_HAS_AVX2)
-
-    if(CPU_HAS_AVX2)
-      message(STATUS "AVX2: supported and enabled")
-      add_compile_definitions(_INTRINSICS_SSE)
-      add_compile_options(-mavx2 -march=core-avx2)
-    endif()
-endif()
 include(FetchContent)
 set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE)
 set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE)
@@ -78,6 +53,13 @@ add_library(gpc_core
     lib/gpc/kernels/box_hwy.cpp
     lib/gpc/kernels/sobel_hwy.cpp
 )
+if(MSVC)
+    target_compile_options(gpc_core PUBLIC /arch:AVX2)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+    target_compile_options(gpc_core PUBLIC -march=native)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    target_compile_options(gpc_core PUBLIC -mcpu=native)
+endif()
 target_link_libraries(gpc_core 
     PUBLIC 
         Eigen3::Eigen 
diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp
index d00c2c6..7ffb183 100644
--- a/benchmarks/sobel_legacy_bench.cpp
+++ b/benchmarks/sobel_legacy_bench.cpp
@@ -7,7 +7,9 @@ static void BM_SobelLegacy(benchmark::State& state) {
     std::vector<uint8_t> out(w * h, 0);
 
     for (auto _ : state) {
-        ndb::sobel(in.data(), out.data(), w, h, 50, 1);
+        //ndb::sobel(in.data(), out.data(), w, h, 50, 1);
+        //ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1);
+        ndb::sobelNaive(in.data(), out.data(), w, h, 1);
         
         // Ensure the compiler doesn't skip the work
         benchmark::DoNotOptimize(out.data());
diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
index 7cdd983..3cc2736 100644
--- a/lib/gpc/kernels/box_hwy.cpp
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -9,8 +9,8 @@ namespace hn = hwy::HWY_NAMESPACE;
 
 void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
     const hn::ScalableTag<uint8_t> d8;
-    // We need d16 to be the "Promoted" version of the half-width d8 to stay lane-consistent
-    const hn::Rebind<uint16_t, hn::Half<decltype(d8)>> d16;
+    const hn::Half<decltype(d8)> d8_h;
+    const hn::Rebind<uint16_t, decltype(d8_h)> d16;
     
     const size_t N = hn::Lanes(d8);
     const auto divisor = hn::Set(d16, (uint16_t)7282);
@@ -25,31 +25,25 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in
         uint8_t* out1 = blurred + (y + 1) * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            auto v0_0 = hn::LoadU(d8, r0 + x); auto v0_1 = hn::LoadU(d8, r0 + x + 1); auto v0_2 = hn::LoadU(d8, r0 + x + 2);
-            auto v1_0 = hn::LoadU(d8, r1 + x); auto v1_1 = hn::LoadU(d8, r1 + x + 1); auto v1_2 = hn::LoadU(d8, r1 + x + 2);
-            auto v2_0 = hn::LoadU(d8, r2 + x); auto v2_1 = hn::LoadU(d8, r2 + x + 1); auto v2_2 = hn::LoadU(d8, r2 + x + 2);
-            auto v3_0 = hn::LoadU(d8, r3 + x); auto v3_1 = hn::LoadU(d8, r3 + x + 1); auto v3_2 = hn::LoadU(d8, r3 + x + 2);
+            auto v00 = hn::LoadU(d8, r0+x); auto v01 = hn::LoadU(d8, r0+x+1); auto v02 = hn::LoadU(d8, r0+x+2);
+            auto v10 = hn::LoadU(d8, r1+x); auto v11 = hn::LoadU(d8, r1+x+1); auto v12 = hn::LoadU(d8, r1+x+2);
+            auto v20 = hn::LoadU(d8, r2+x); auto v21 = hn::LoadU(d8, r2+x+1); auto v22 = hn::LoadU(d8, r2+x+2);
+            auto v30 = hn::LoadU(d8, r3+x); auto v31 = hn::LoadU(d8, r3+x+1); auto v32 = hn::LoadU(d8, r3+x+2);
 
-            // Helper to sum 3 promoted pixels
-            auto sum3 = [&](auto v0, auto v1, auto v2) {
-                return hn::Add(v1, hn::Add(v0, v2));
-            };
-
-            // LOWER HALF
-            auto s1_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v1_0)), hn::PromoteTo(d16, hn::LowerHalf(v1_1)), hn::PromoteTo(d16, hn::LowerHalf(v1_2)));
-            auto s2_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v2_0)), hn::PromoteTo(d16, hn::LowerHalf(v2_1)), hn::PromoteTo(d16, hn::LowerHalf(v2_2)));
+            // Lower Half Math
+            auto s1_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), hn::PromoteTo(d16, hn::LowerHalf(v12))));
+            auto s2_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), hn::PromoteTo(d16, hn::LowerHalf(v22))));
             
-            auto row0_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v0_0)), hn::PromoteTo(d16, hn::LowerHalf(v0_1)), hn::PromoteTo(d16, hn::LowerHalf(v0_2))), hn::Add(s1_lo, s2_lo));
-            auto row1_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v3_0)), hn::PromoteTo(d16, hn::LowerHalf(v3_1)), hn::PromoteTo(d16, hn::LowerHalf(v3_2))), hn::Add(s1_lo, s2_lo));
+            auto row0_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), hn::PromoteTo(d16, hn::LowerHalf(v02)))), hn::Add(s1_lo, s2_lo));
+            auto row1_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), hn::PromoteTo(d16, hn::LowerHalf(v32)))), hn::Add(s1_lo, s2_lo));
 
-            // UPPER HALF
-            auto s1_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v1_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_2)));
-            auto s2_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v2_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_2)));
+            // Upper Half Math
+            auto s1_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12))));
+            auto s2_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22))));
             
-            auto row0_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v0_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_2))), hn::Add(s1_hi, s2_hi));
-            auto row1_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v3_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_2))), hn::Add(s1_hi, s2_hi));
+            auto row0_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), hn::Add(s1_hi, s2_hi));
+            auto row1_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), hn::Add(s1_hi, s2_hi));
 
-            // Perform normalization and store using OrderedDemote2To
             hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x);
             hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x);
         }
@@ -61,10 +55,11 @@ HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
-#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
     void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
-        ndb::N_NEON::BoxKernel(in, blurred, width, height);
+        //ndb::N_NEON::BoxKernel(in, blurred, width, height);
+        HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height);
     }
-#endif
+//#endif
 }
 }
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index dc6e46b..2817622 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -30,6 +30,7 @@
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
 #include <cassert>
 #include "gpc/kernels/sobel.hpp"
+#include "gpc/kernels/utils.hpp"
 namespace ndb {
 namespace testing { 
     void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); 
@@ -77,13 +78,10 @@ void sobelNaive(
         }
     }
 }
-#ifdef _INTRINSICS_SSE
+//#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
 #include <immintrin.h>
 
-// Assuming your helper macros/inline funcs are defined elsewhere
-// pack16to8(lo, hi, res) 
-// unpack8to16(in, lo, hi)
-
 void sobelSSE(const uint8_t* in, uint8_t* blurred, 
                             int width, int start, int end, 
                             uint8_t threshold) {
diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp
index 31749cb..c14b950 100644
--- a/lib/gpc/kernels/sobel.hpp
+++ b/lib/gpc/kernels/sobel.hpp
@@ -34,6 +34,12 @@
 #include "gpc/buffer.hpp"
 
 namespace ndb {
+#if HWY_TARGET == HWY_AVX2
+void sobelSSE(const uint8_t* in, uint8_t* blurred, 
+                            int width, int start, int end, 
+                            uint8_t threshold);
+ 
+#endif
 /**
  * @brief Naive 3x3 sobel filter implementation
  *
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index 97abee3..24afc32 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -6,12 +6,13 @@ namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
+
 void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
                  int width, int height, uint8_t threshold) {
     const hn::ScalableTag<uint8_t> d8;
-    // d16 will have half the lanes of d8, regardless of whether N=16 (NEON) or N=32 (AVX2)
-    const hn::Rebind<int16_t, hn::Half<decltype(d8)>> d16; 
-    
+    const hn::Half<decltype(d8)> d8_h; 
+    const hn::Rebind<int16_t, decltype(d8_h)> d16; 
+
     const size_t N = hn::Lanes(d8);
     const auto divisor = hn::Set(d16, (int16_t)7282); 
     const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold));
@@ -25,56 +26,48 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
         uint8_t* out = gradient + y * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            // Load full vectors (128-bit on NEON, 256-bit on AVX2)
             auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
             auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
             auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
 
-            // Helper lambda to process 8-bit to 16-bit math for a specific half
-            auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
+            auto process = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
                 auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
                                   hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
                 sx = hn::MulHigh(sx, divisor);
-
                 auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
                                   hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
                 sy = hn::MulHigh(sy, divisor);
-
                 auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
                 return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0);
             };
 
-            // 1. Promote and process Lower Half
-            auto res_lo = process_half(
+            // Process Lower Half
+            auto res_lo = process(
                 hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)),
                 hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)),
-                hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33))
-            );
+                hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33)));
 
-            // 2. Promote and process Upper Half
-            auto res_hi = process_half(
-                hn::PromoteTo(d16, hn::UpperHalf(d8, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8, v13)),
-                hn::PromoteTo(d16, hn::UpperHalf(d8, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8, v23)),
-                hn::PromoteTo(d16, hn::UpperHalf(d8, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8, v33))
-            );
+            // Process Upper Half - Using correct d8_h tag
+            auto res_hi = process(
+                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)),
+                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)),
+                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33)));
 
-            // 3. The "Magic" fix: OrderedDemote2To handles the cross-lane logic for AVX2 automatically
-            auto result8 = hn::OrderedDemote2To(d8, res_lo, res_hi);
-            hn::StoreU(result8, d8, out + x);
+            hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x);
         }
     }
 }
-
 } // namespace HWY_NAMESPACE
 } // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
-#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
     void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
-        ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
+        //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
+        HWY_STATIC_DISPATCH(SobelKernel)(in, blurred, width, height, threshold);
     }
-#endif  
+//#endif  
 }
 }
diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp
index 796c2b1..ce920e8 100644
--- a/lib/gpc/kernels/utils.cpp
+++ b/lib/gpc/kernels/utils.cpp
@@ -31,6 +31,7 @@
 #include <cassert>
 #include <thread>
 #include <functional>
+#include "gpc/kernels/utils.hpp"
 
 using namespace std;
 
@@ -39,7 +40,7 @@ void arr2ind(const unsigned char* a,
                                        int n,
                                        int* ind,
                                        int* m) {
-#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
     int i, m0, k;
     __m256i msk;
     m0 = 0;
@@ -69,7 +70,7 @@ void arr2ind(const unsigned char* a,
     *m = nnz;
 #endif
 }
-#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
 void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
     __m128i zero = _mm_setzero_si128();
     y0 = _mm_unpacklo_epi8(x, zero);
diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp
index e9ce569..b2b0c2a 100644
--- a/lib/gpc/kernels/utils.hpp
+++ b/lib/gpc/kernels/utils.hpp
@@ -37,7 +37,7 @@
 #include "gpc/buffer.hpp"
 using namespace std;
 
-#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
 #include <immintrin.h>
 // greater and lesser than simd ops for unsigned 8bit integer (epu8)
 #define _mm_cmpgt_epu8(v0, v1)                             \
@@ -63,7 +63,7 @@ void arr2ind(const unsigned char* a,
                                        int* ind,
                                        int* m);
 
-#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
 /**
  * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
  *             (8x16bit)
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index d5be853..14f4a2c 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -7,3 +7,7 @@ target_link_libraries(train gpc_core)
 add_executable(sparsematch sparsematch.cpp)
 target_link_libraries(sparsematch gpc_core)
 
+add_executable(target target.cpp)
+target_link_libraries(target gpc_core)
+
+
diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp
index 8fff2ce..9772ecd 100644
--- a/tests/test_kernel_box.cpp
+++ b/tests/test_kernel_box.cpp
@@ -24,10 +24,11 @@ TEST(Approval, BoxKernel) {
 
     // 4. Run Highway version (only if compiled for the target)
 #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height);
+    ndb::BoxFilter(input.data(), outHighway.data(), width, height);
 #else
+    ndb::boxNaive(input.data(), outNaive.data(), width, height);
     // Fallback if the specific NEON namespace isn't exposed
-    ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
+    //ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
 
 #endif
 
diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp
index fd5b30d..3d4b7d0 100644
--- a/tests/test_kernel_sobel.cpp
+++ b/tests/test_kernel_sobel.cpp
@@ -24,10 +24,11 @@ TEST(Approval, SobelKernel) {
 
     // 4. Run Highway version (only if compiled for the target)
 #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height);
+    ndb::BoxFilter(input.data(), outHighway.data(), width, height);
 #else
     // Fallback if the specific NEON namespace isn't exposed
-    ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30);
+    //ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30);
+    ndb::sobelNaive(input.data(), outHighway.data(), width, height, 30);
 
 #endif
 

From a9443c0b7ce59530d9a8313c9c97461218de5a3d Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 22 Feb 2026 15:46:39 +0100
Subject: [PATCH 25/36] update hwy sobel filter to be pixel accurate with naive
 version (although inefficient)

---
 lib/gpc/kernels/sobel_hwy.cpp | 118 +++++++++++++++++++++++++++++++++-
 lib/gpc/kernels/utils.hpp     |   1 +
 tests/test_kernel_box.cpp     |   8 +--
 tests/test_kernel_sobel.cpp   |  14 ++--
 4 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index 24afc32..35d0b72 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -5,10 +5,126 @@ HWY_BEFORE_NAMESPACE();
 namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
+void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
+                 int width, int height, uint8_t threshold) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::Half<decltype(d8)> d8_h; 
+    const hn::Rebind<int16_t, decltype(d8_h)> d16;
+    // d32 has half the lanes of d16
+    const hn::Rebind<int32_t, hn::Half<decltype(d16)>> d32;
+
+    const size_t N = hn::Lanes(d8);
+    const auto vDivMult = hn::Set(d16, (int16_t)7282); 
+    const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold);
+    const auto v255_16 = hn::Set(d16, (int16_t)255);
+    const auto v255_8 = hn::Set(d8, (uint8_t)255);
+    const auto v0_8 = hn::Zero(d8);
+
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        uint8_t* out = gradient + y * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);                                       auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
+
+            // Helper to process 8 pixels into a 16-bit mask-like result
+            auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
+                // Sobel derivatives in 16-bit
+                auto sx16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
+                                                hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDivMult);
+                auto sy16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
+                                                hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDivMult);
+
+                // Magnitude squared in 32-bit
+                auto sx_lo = hn::PromoteLowerTo(d32, sx16);
+                auto sy_lo = hn::PromoteLowerTo(d32, sy16);
+                auto mag_lo = hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo));
+
+                auto sx_hi = hn::PromoteUpperTo(d32, sx16);
+                auto sy_hi = hn::PromoteUpperTo(d32, sy16);
+                auto mag_hi = hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi));
+
+                // Comparison in 32-bit, returning 16-bit values (0 or 255) to avoid mask issues
+                auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), hn::Set(d32, 255), hn::Zero(d32));
+                auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), hn::Set(d32, 255), hn::Zero(d32));
+
+                return hn::OrderedDemote2To(d16, m_lo, m_hi);
+            };
 
+            // Process halves using standard Highway promotion
+            auto res_lo = process_half(
+                hn::PromoteLowerTo(d16, v11), hn::PromoteLowerTo(d16, v12), hn::PromoteLowerTo(d16, v13),
+                hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v23),
+                hn::PromoteLowerTo(d16, v31), hn::PromoteLowerTo(d16, v32), hn::PromoteLowerTo(d16, v33));
 
+            auto res_hi = process_half(
+                hn::PromoteUpperTo(d16, v11), hn::PromoteUpperTo(d16, v12), hn::PromoteUpperTo(d16, v13),
+                hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v23),
+                hn::PromoteUpperTo(d16, v31), hn::PromoteUpperTo(d16, v32), hn::PromoteUpperTo(d16, v33));
+
+            // Final store: 16-bit to 8-bit demotion
+            auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi);
+            hn::StoreU(final_val, d8, out + x);
+        }
+    }
+}
 void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
                  int width, int height, uint8_t threshold) {
+    // We target 4 pixels at a time as our base 'Scalable' unit.
+    // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane counts identical.
+    const hn::FixedTag<uint8_t, 4> d8;
+    const hn::FixedTag<int16_t, 4> d16;
+    const hn::FixedTag<int32_t, 4> d32;
+
+    const auto vDiv = hn::Set(d32, 9);
+    const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold);
+    const auto v255 = hn::Set(d32, 255);
+    const auto v0 = hn::Zero(d32);
+
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        uint8_t* out = gradient + y * width + 1;
+
+        for (int x = 0; x < width; x += 4) {
+            // Load and promote immediately to 32-bit to match naive 'int' math
+            auto load32 = [&](const uint8_t* p) {
+                return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p)));
+            };
+
+            auto p11 = load32(r0 + x);     auto p12 = load32(r0 + x + 1); auto p13 = load32(r0 + x + 2);
+            auto p21 = load32(r1 + x);                                    auto p23 = load32(r1 + x + 2);
+            auto p31 = load32(r2 + x);     auto p32 = load32(r2 + x + 1); auto p33 = load32(r2 + x + 2);
+
+            // Note:: Division is very slow - we use it for now to match exactly with the naive non simd-implementation
+            // sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
+            auto sx = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
+                                      hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDiv);
+            
+            // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
+            auto sy = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
+                                      hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDiv);
+
+            // int val = sx * sx + sy * sy;
+            auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
+
+            // *optr = val > thresholdSq ? 255 : 0;
+            auto mask = hn::Gt(magSq, vThreshSq);
+            auto res32 = hn::IfThenElse(mask, v255, v0);
+            
+            // Demote 32 -> 16 -> 8
+            auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32));
+            hn::StoreU(res8, d8, out + x);
+        }
+    }
+}
+void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
+                 int width, int height, uint8_t threshold) {
     const hn::ScalableTag<uint8_t> d8;
     const hn::Half<decltype(d8)> d8_h; 
     const hn::Rebind<int16_t, decltype(d8_h)> d16; 
@@ -47,7 +163,7 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
                 hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)),
                 hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33)));
 
-            // Process Upper Half - Using correct d8_h tag
+            // Process Upper Half 
             auto res_hi = process(
                 hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)),
                 hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)),
diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp
index b2b0c2a..18227ba 100644
--- a/lib/gpc/kernels/utils.hpp
+++ b/lib/gpc/kernels/utils.hpp
@@ -33,6 +33,7 @@
 
 #include <cassert>
 #include <thread>
+#include <hwy/highway.h>
 
 #include "gpc/buffer.hpp"
 using namespace std;
diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp
index 9772ecd..9913a7a 100644
--- a/tests/test_kernel_box.cpp
+++ b/tests/test_kernel_box.cpp
@@ -23,14 +23,8 @@ TEST(Approval, BoxKernel) {
     ndb::boxNaive(input.data(), outNaive.data(), width, height);
 
     // 4. Run Highway version (only if compiled for the target)
-#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    ndb::BoxFilter(input.data(), outHighway.data(), width, height);
-#else
-    ndb::boxNaive(input.data(), outNaive.data(), width, height);
-    // Fallback if the specific NEON namespace isn't exposed
-    //ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
+    ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
 
-#endif
 
     // 5. Compare results
     // We skip the border (radius) because different implementations 
diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp
index 3d4b7d0..bfc56ff 100644
--- a/tests/test_kernel_sobel.cpp
+++ b/tests/test_kernel_sobel.cpp
@@ -7,7 +7,8 @@
 TEST(Approval, SobelKernel) {
     const int width = 640;
     const int height = 480;
-    const int radius = 2; // Typical for 5x5 box
+    const int radius = 2; // Typical for 5x5 bo
+    const int threshold = 30; // Example threshold for binarization
 
     // 1. Prepare randomized input
     std::vector<uint8_t> input(width * height);
@@ -20,17 +21,10 @@ TEST(Approval, SobelKernel) {
     std::vector<uint8_t> outHighway(width * height, 0);
 
     // 3. Run Naive version
-    ndb::sobelNaive(input.data(), outNaive.data(), width, height, 30);
+    ndb::sobelNaive(input.data(), outNaive.data(), width, height, threshold);
 
     // 4. Run Highway version (only if compiled for the target)
-#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    ndb::BoxFilter(input.data(), outHighway.data(), width, height);
-#else
-    // Fallback if the specific NEON namespace isn't exposed
-    //ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30);
-    ndb::sobelNaive(input.data(), outHighway.data(), width, height, 30);
-
-#endif
+    ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, threshold);
 
     // 5. Compare results
     // We skip the border (radius) because different implementations 

From 35063490d53aa7155006f4a950fd1773a4fef586 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 24 Feb 2026 14:55:14 +0100
Subject: [PATCH 26/36] wip dense gpc hwy kernel

---
 CMakeLists.txt                    |   1 +
 benchmarks/CMakeLists.txt         |  11 +-
 benchmarks/box_bench.cpp          |  56 ++++++
 benchmarks/box_legacy_bench.cpp   |  21 ---
 benchmarks/sobel_bench.cpp        |  44 ++++-
 benchmarks/sobel_legacy_bench.cpp |  23 ---
 lib/gpc/forest.cpp                |   6 +-
 lib/gpc/inference.hpp             |   6 +-
 lib/gpc/kernels/box.cpp           |   9 +-
 lib/gpc/kernels/box.hpp           |   3 +
 lib/gpc/kernels/gpc.cpp           | 272 ++++++++++++++++--------------
 lib/gpc/kernels/gpc.hpp           |  27 ++-
 lib/gpc/kernels/gpc_hwy.cpp       | 157 +++++++++++++++++
 lib/gpc/kernels/gpc_hwy.hpp       |  17 ++
 lib/gpc/kernels/sobel_hwy.cpp     |   2 +-
 tests/CMakeLists.txt              |   1 +
 tests/test_kernel_gpc.cpp         |  82 +++++++++
 17 files changed, 535 insertions(+), 203 deletions(-)
 create mode 100644 benchmarks/box_bench.cpp
 delete mode 100644 benchmarks/box_legacy_bench.cpp
 delete mode 100644 benchmarks/sobel_legacy_bench.cpp
 create mode 100644 lib/gpc/kernels/gpc_hwy.cpp
 create mode 100644 lib/gpc/kernels/gpc_hwy.hpp
 create mode 100644 tests/test_kernel_gpc.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dfcdd61..6957189 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,6 +52,7 @@ add_library(gpc_core
     lib/gpc/kernels/utils.cpp
     lib/gpc/kernels/box_hwy.cpp
     lib/gpc/kernels/sobel_hwy.cpp
+    lib/gpc/kernels/gpc_hwy.cpp
 )
 if(MSVC)
     target_compile_options(gpc_core PUBLIC /arch:AVX2)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 0fc8bf3..9735e84 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -15,16 +15,9 @@ target_link_libraries(kernel_bench
         gpc_core
         benchmark::benchmark 
 )
-add_executable(sobel_legacy_bench sobel_legacy_bench.cpp)
+add_executable(box_bench box_bench.cpp)
 
-target_link_libraries(sobel_legacy_bench
-    PRIVATE 
-        gpc_core
-        benchmark::benchmark 
-)
-add_executable(box_legacy_bench box_legacy_bench.cpp)
-
-target_link_libraries(box_legacy_bench
+target_link_libraries(box_bench
     PRIVATE 
         gpc_core
         benchmark::benchmark 
diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp
new file mode 100644
index 0000000..7b7ff5c
--- /dev/null
+++ b/benchmarks/box_bench.cpp
@@ -0,0 +1,56 @@
+#include <benchmark/benchmark.h>
+#include <hwy/highway.h>
+#include "gpc/kernels/box.hpp" 
+#include "gpc/kernels/box_hwy.hpp" 
+static void BM_BoxHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+    state.SetLabel(hwy::TargetName(HWY_TARGET));    
+    // Warmup is handled automatically by the library
+    for (auto _ : state) {
+        ndb::testing::box_hwy(in.data(), out.data(), w, h);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+
+#if HWY_TARGET == HWY_AVX2
+static void BM_BoxLegacySIMD(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("AVX2_legacy");    
+    for (auto _ : state) {
+        ndb::boxSSE(in.data(), out.data(), w, h);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+#endif
+static void BM_BoxNaive(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("naive");    
+    for (auto _ : state) {
+        ndb::boxNaive(in.data(), out.data(), w, h);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond);
+#if HWY_TARGET == HWY_AVX2
+BENCHMARK(BM_BoxLegacySIMD)->Unit(benchmark::kMillisecond);
+#endif
+BENCHMARK(BM_BoxNaive)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp
deleted file mode 100644
index c8960a3..0000000
--- a/benchmarks/box_legacy_bench.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include <benchmark/benchmark.h>
-#include "gpc/kernels/box.hpp" 
-
-static void BM_BoxLegacy(benchmark::State& state) {
-    int w = 1920, h = 1080;
-    std::vector<uint8_t> in(w * h, 128);
-    std::vector<uint8_t> out(w * h, 0);
-
-    for (auto _ : state) {
-        ndb::box(in.data(), out.data(), w, h, 50);
-        
-        // Ensure the compiler doesn't skip the work
-        benchmark::DoNotOptimize(out.data());
-        benchmark::ClobberMemory();
-    }
-    
-    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
-}
-BENCHMARK(BM_BoxLegacy)->Unit(benchmark::kMillisecond);
-
-BENCHMARK_MAIN();
diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp
index e0a65d1..5c26d89 100644
--- a/benchmarks/sobel_bench.cpp
+++ b/benchmarks/sobel_bench.cpp
@@ -1,11 +1,12 @@
 #include <benchmark/benchmark.h>
-#include "gpc/kernels/sobel_hwy.hpp" // Your header
-
+#include <hwy/highway.h>
+#include "gpc/kernels/sobel.hpp" 
+#include "gpc/kernels/sobel_hwy.hpp" 
 static void BM_SobelHighway(benchmark::State& state) {
     int w = 1920, h = 1080;
     std::vector<uint8_t> in(w * h, 128);
     std::vector<uint8_t> out(w * h, 0);
-
+    state.SetLabel(hwy::TargetName(HWY_TARGET));    
     // Warmup is handled automatically by the library
     for (auto _ : state) {
         ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50);
@@ -14,9 +15,42 @@ static void BM_SobelHighway(benchmark::State& state) {
         benchmark::DoNotOptimize(out.data());
         benchmark::ClobberMemory();
     }
-    
-    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
+}
+
+#if HWY_TARGET == HWY_AVX2
+static void BM_SobelLegacySIMD(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("AVX2_legacy");    
+    for (auto _ : state) {
+        ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+#endif
+static void BM_SobelNaive(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("naive");    
+    for (auto _ : state) {
+        ndb::sobelNaive(in.data(), out.data(), w, h, 1);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
 }
 BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
+#if HWY_TARGET == HWY_AVX2
+BENCHMARK(BM_SobelLegacySIMD)->Unit(benchmark::kMillisecond);
+#endif
+BENCHMARK(BM_SobelNaive)->Unit(benchmark::kMillisecond);
 
 BENCHMARK_MAIN();
diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp
deleted file mode 100644
index 7ffb183..0000000
--- a/benchmarks/sobel_legacy_bench.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <benchmark/benchmark.h>
-#include "gpc/kernels/sobel.hpp" 
-
-static void BM_SobelLegacy(benchmark::State& state) {
-    int w = 1920, h = 1080;
-    std::vector<uint8_t> in(w * h, 128);
-    std::vector<uint8_t> out(w * h, 0);
-
-    for (auto _ : state) {
-        //ndb::sobel(in.data(), out.data(), w, h, 50, 1);
-        //ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1);
-        ndb::sobelNaive(in.data(), out.data(), w, h, 1);
-        
-        // Ensure the compiler doesn't skip the work
-        benchmark::DoNotOptimize(out.data());
-        benchmark::ClobberMemory();
-    }
-    
-    state.SetBytesProcessed(int64_t(state.iterations()) * w * h);
-}
-BENCHMARK(BM_SobelLegacy)->Unit(benchmark::kMillisecond);
-
-BENCHMARK_MAIN();
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index e39eaff..6aca590 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -170,8 +170,7 @@ std::vector<ndb::Descriptor> Forest::evalFastMaskOnSubsetSSE(
                        fastmask.mask,
                        idx,
                        img.cols(),
-                       img.rows(),
-                       settings.numThreads_);
+                       img.rows());
     } else {
         ndb::gpcFilterTau(img.data(),
                           grad.data(),
@@ -180,8 +179,7 @@ std::vector<ndb::Descriptor> Forest::evalFastMaskOnSubsetSSE(
                           fastmask.tau,
                           idx,
                           img.cols(),
-                          img.rows(),
-                          settings.numThreads_);
+                          img.rows());
     }
     std::vector<ndb::Descriptor> out(idx.size());
     int j = 0;
diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp
index 5136010..e074290 100644
--- a/lib/gpc/inference.hpp
+++ b/lib/gpc/inference.hpp
@@ -298,8 +298,7 @@ class Forest {
                            fastmask.mask,
                            idx,
                            img.cols(),
-                           img.rows(),
-                           settings.numThreads_);
+                           img.rows());
         } else {
             ndb::gpcFilterTau(img.data(),
                               grad.data(),
@@ -308,8 +307,7 @@ class Forest {
                               fastmask.tau,
                               idx,
                               img.cols(),
-                              img.rows(),
-                              settings.numThreads_);
+                              img.rows());
         }
         std::vector<ndb::Descriptor> out(idx.size());
         int j = 0;
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index c9984dd..605daa2 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -30,6 +30,7 @@
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
 
 #include "gpc/kernels/box.hpp"
+#include "gpc/kernels/utils.hpp"
 #include <cassert>
 namespace ndb {
 namespace testing { 
@@ -73,7 +74,7 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
         }
     }
 }
-#ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
 /**
  * @brief SSE implementation of the 3x3 box filter.
  * Processed two rows at a time using fixed-point multiplication for division.
@@ -168,10 +169,10 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     // Force use of our new Highway kernel on Mac
     testing::box_hwy(in, blurred, width, height);
 #else
-    #ifndef _INTRINSICS_SSE
-        boxNaive(in, blurred, width, height);
-    #else
+    #if HWY_TARGET == HWY_AVX2
         boxSSE(in, blurred, width, height);
+    #else
+        boxNaive(in, blurred, width, height);
     #endif
 #endif
 }
diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp
index c5f2d0e..eef0b3d 100644
--- a/lib/gpc/kernels/box.hpp
+++ b/lib/gpc/kernels/box.hpp
@@ -60,6 +60,9 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
    */
 void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
 
+#if HWY_TARGET == HWY_AVX2
+void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); 
+#endif
 
 }
 #endif
diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp
index 5e22e23..62ffa3e 100644
--- a/lib/gpc/kernels/gpc.cpp
+++ b/lib/gpc/kernels/gpc.cpp
@@ -79,11 +79,73 @@ void gpcFilterTauNaive(uint8_t* in,
 } 
 
 
-#ifdef _INTRINSICS_SSE
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
 bool isAllZeros(__m128i xmm) {
     return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) ==
            0xFFFF;
 }
+void gpcFilterSSE(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height) {
+    const int start  = 13; 
+    const int end = height - 15;
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+    for (int y = start; y < end; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i out[4];  // temporary output vector of 4 128bit words
+
+            const uint8_t* center = (in + y * width + x);
+            const uint8_t* centerGrad = (grad + y * width + x);
+            // We only process the current segment if there are any non-zero
+            // values (high gradient pixels)
+            if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                __m128i* dst =
+                    (__m128i*)(gpc + y * width +
+                               x);  // Set starting point to pixel (2,2)
+                out[0] = zero;
+                out[1] = zero;
+                out[2] = zero;
+                out[3] = zero;
+                uint8_t k = 0;
+                __m128i bitMask = one;
+                for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                    out[k] |= _mm_and_si128(
+                        _mm_cmpgt_epu8(
+                            _mm_lddqu_si128(
+                                (__m128i*)(center + fastmask[i])),
+                            _mm_lddqu_si128(
+                                (__m128i*)(center + fastmask[i + 1]))),
+                        bitMask);
+                    // Keeps index into output vector and updates bit mask
+                    if (i % 16 == 0 && i != 0) {
+                        bitMask = one;
+                        k++;
+                    } else {
+                        bitMask += bitMask;
+                    }
+                }
+                // 8bit to 16bit
+                __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
+
+                // 16bit to 32bit ints
+                _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+            }
+        }  // col iteration
+    }  // row iteration
+}
 #endif
 void gpcFilter(uint8_t* in,
                const uint8_t* grad,
@@ -91,73 +153,89 @@ void gpcFilter(uint8_t* in,
                std::vector<int32_t> fastmask,
                std::vector<int>& idx,
                int width,
-               int height,
-               int numThreads) {
+               int height){
     assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Replace with call to highway
     gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
 #else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
+    #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+        gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height);
+    #else 
+        gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
+#endif
+#endif
+}
 
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+void gpcFilterTauSSE(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height){
+    const int start  = 13; 
+    const int end = height - 15;
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+    for (int y = start; y < end; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i out[4];  // temporary output vector of 4 128bit words
+
+            const uint8_t* center = (in + y * width + x);
+            const uint8_t* centerGrad = (grad + y * width + x);
+            // We only process the current segment if there are any non-zero
+            // values (high gradient pixels)
+            if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                __m128i* dst =
+                    (__m128i*)(gpc + y * width +
+                               x);  // Set starting point to pixel (2,2)
+                out[0] = zero;
+                out[1] = zero;
+                out[2] = zero;
+                out[3] = zero;
+                uint8_t k = 0;
+                __m128i bitMask = one;
+                for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                    out[k] |= _mm_and_si128(
+                        _mm_cmpgt_epu8(
+                            _mm_lddqu_si128(
+                                (__m128i*)(center + fastmask[i])),
+                            _mm_subs_epi8(
                                 _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i + 1]))),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
+                                    (__m128i*)(center + fastmask[i + 1])),
+                                _mm_set1_epi8(tau[i / 2]))  // deduct tau
+                            ),
+                        bitMask);
+                    // Keeps index into output vector and updates bit mask
+                    if (i % 16 == 0 && i != 0) {
+                        bitMask = one;
+                        k++;
+                    } else {
+                        bitMask += bitMask;
                     }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
                 }
-            }  // col iteration
-        }  // row iteration
-    };
+                // 8bit to 16bit
+                __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
 
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
-#endif
+                // 16bit to 32bit ints
+                _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+            }
+        }  // col iteration
+    }  // row iteration
 }
+#endif
+
 void gpcFilterTau(uint8_t* in,
                   const uint8_t* grad,
                   uint32_t* gpc,
@@ -165,75 +243,19 @@ void gpcFilterTau(uint8_t* in,
                   std::vector<int> tau,
                   std::vector<int>& idx,
                   int width,
-                  int height,
-                  int numThreads) {
+                  int height){
     assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Replace with call to highway
     gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
 #else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
-
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
-                                _mm_subs_epi8(
-                                    _mm_lddqu_si128(
-                                        (__m128i*)(center + fastmask[i + 1])),
-                                    _mm_set1_epi8(tau[i / 2]))  // deduct tau
-                                ),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
-                    }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-                }
-            }  // col iteration
-        }  // row iteration
-    };
-
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
+    #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+        gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height);
+    #else 
+        gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
 #endif
+#endif
+
 }
-}
+} // namespace ndb
 
diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp
index 5f43743..49db7ae 100644
--- a/lib/gpc/kernels/gpc.hpp
+++ b/lib/gpc/kernels/gpc.hpp
@@ -49,7 +49,6 @@ namespace ndb {
  *                  and the call gets forwarded to the naive implementation.
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
- * @param numThreadsNumber of threads to use
  */
 void gpcFilter(uint8_t* in,
                const uint8_t* grad,
@@ -57,8 +56,7 @@ void gpcFilter(uint8_t* in,
                std::vector<int32_t> fastmask,
                std::vector<int>& idx,
                int width,
-               int height,
-               int numThreads);
+               int height);
 
 
 /**
@@ -93,7 +91,6 @@ void gpcFilterNaive(uint8_t* in,
  * @param fastmask  The fastmask containing the gpc filter
  * @param width     The width of the image at pointer *in
  * @param height    The height of the image at pointer *in
- * @param numThreads Number of threads to use
  */
 void gpcFilterTau(uint8_t* in,
                   const uint8_t* grad,
@@ -102,8 +99,7 @@ void gpcFilterTau(uint8_t* in,
                   std::vector<int> tau,
                   std::vector<int>& idx,
                   int width,
-                  int height,
-                  int numThreads); 
+                  int height);
 
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
@@ -132,8 +128,25 @@ void gpcFilterTauNaive(uint8_t* in,
  *
  * @return true if all zeros, false otherwise
  */
-#ifdef _INTRINSICS_SSE
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
 bool isAllZeros(__m128i xmm);
+void gpcFilterTauSSE(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height);
+void gpcFilterSSE(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height);
+
+
 #endif
 
 
diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp
new file mode 100644
index 0000000..21dae2d
--- /dev/null
+++ b/lib/gpc/kernels/gpc_hwy.cpp
@@ -0,0 +1,157 @@
+//#define HWY_TARGET HWY_NEON 
+#include "gpc_hwy.hpp"
+HWY_BEFORE_NAMESPACE(); 
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+//dense!
+#include <hwy/highway.h>
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Dense Version
+// Dense GPC kernel: runs 32 pixel-pair tests for every pixel at least 13 px
+// away from each image edge and packs the outcomes into one uint32_t per
+// pixel (test 0 lands in bit 31, test 31 in bit 0).
+//   in       row-major 8-bit image of `width` x `height` pixels
+//   gpc      output codes, indexed like `in`; border pixels are not written
+//   fastmask offset pairs added to the pixel index; entries 0..63 are read,
+//            so it must hold at least 64 values
+//   grad/tau accepted by the signature but not read by this kernel
+void GPCKernel(const uint8_t* HWY_RESTRICT in,
+               const uint8_t* HWY_RESTRICT grad,
+               uint32_t* HWY_RESTRICT gpc,
+               const std::vector<int32_t>& fastmask,
+               const std::vector<int32_t>& tau,
+               int width, int height) {
+    const hn::ScalableTag<uint8_t> d8;
+    const int lanes = static_cast<int>(hn::Lanes(d8));
+    const int border = 13;
+    const int x_end = width - border;  // one past the last interior column
+    // An interior narrower than one vector was never processed; keep that.
+    if (x_end - border < lanes) return;
+    const auto v_zero8 = hn::Zero(d8);
+    const auto v_one8 = hn::Set(d8, 1);
+    const int32_t* fm = fastmask.data();
+
+    for (int y = border; y < height - border; ++y) {
+        const int row_base = y * width;
+        uint32_t* HWY_RESTRICT row_out = gpc + row_base;
+        for (int x0 = border; x0 < x_end; x0 += lanes) {
+            // Pull the final block back so it ends exactly at x_end: leftover
+            // columns get written and the overlap recomputes equal values.
+            const int x = (x0 + lanes <= x_end) ? x0 : x_end - lanes;
+            const int k = row_base + x;
+            // One 8-bit accumulator per output byte keeps the hot loop 8-bit.
+            auto v_acc0 = hn::Zero(d8); // result bits 31..24
+            auto v_acc1 = hn::Zero(d8); // result bits 23..16
+            auto v_acc2 = hn::Zero(d8); // result bits 15..8
+            auto v_acc3 = hn::Zero(d8); // result bits 7..0
+            // Pass 1: tests 0-7 (shift left, then OR the newest test in)
+            for (int i = 0; i < 16; i += 2) {
+                v_acc0 = hn::Add(v_acc0, v_acc0);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i+1]));
+                v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+            // Pass 2: tests 8-15
+            for (int i = 16; i < 32; i += 2) {
+                v_acc1 = hn::Add(v_acc1, v_acc1);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i+1]));
+                v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+            // Pass 3: tests 16-23
+            for (int i = 32; i < 48; i += 2) {
+                v_acc2 = hn::Add(v_acc2, v_acc2);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i+1]));
+                v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+            // Pass 4: tests 24-31
+            for (int i = 48; i < 64; i += 2) {
+                v_acc3 = hn::Add(v_acc3, v_acc3);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i+1]));
+                v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+            // Final assembly: combine the four bytes of each lane into one
+            // 32-bit code, lane by lane, which works on every Highway target.
+            for (int lane = 0; lane < lanes; ++lane) {
+                uint32_t final_val = (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) |
+                                     (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) |
+                                     (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8)  |
+                                     (uint32_t(hn::ExtractLane(v_acc3, lane)));
+                row_out[x + lane] = final_val;
+            }
+        }
+    }
+}
+// Gradient-gated GPC kernel. Pixels are handled in groups of N (the uint32
+// lane count); a group whose N gradient bytes are all zero is skipped and its
+// gpc entries stay untouched. Otherwise each pixel's code is built by shifting
+// left and adding 1 per passing test (fastmask.size()/2 tests in total).
+void GPCKerneli(const uint8_t* HWY_RESTRICT in,
+                      const uint8_t* HWY_RESTRICT grad,
+                      uint32_t* HWY_RESTRICT gpc,
+                      const std::vector<int32_t>& fastmask,
+                      const std::vector<int32_t>& tau,
+                      int width, int height) {
+    // We use the ScalableTag, but we will "Narrow" our view manually
+    const hn::ScalableTag<uint32_t> d32;
+    const hn::Rebind<uint8_t, decltype(d32)> d8_n; // Same number of lanes as d32
+    const size_t N = hn::Lanes(d32); 
+    const auto v_zero = hn::Zero(d32);
+    const bool use_tau = !tau.empty();
+    // NOTE(review): no border is skipped here (GPCKernel skips 13 px) - confirm
+    // that in + index + fastmask[i] stays inside the image for edge pixels.
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; x += N) {
+            const uint8_t* centerGrad = grad + y * width + x;
+            // 1. Load the gradient bytes for the current N lanes
+            auto v_grad = hn::LoadU(d8_n, centerGrad);
+
+            // 2. Promotion-free zero check
+            if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) {
+                continue;
+            }
+            auto v_tmp = hn::Zero(d32);
+            for (size_t i = 0; i < fastmask.size(); i += 2) {
+                v_tmp = hn::ShiftLeft<1>(v_tmp); 
+                // 3. The "Promotion" that actually works on all platforms:
+                // Promote N lanes of uint8 to N lanes of uint32
+                auto v1 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i]));
+                auto v2 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1]));
+                // NOTE(review): the uint32 Sub below wraps when v2 < tau - confirm intended
+                hn::Mask<decltype(d32)> mask;
+                if (use_tau) {
+                    auto v_tau = hn::Set(d32, tau[i / 2]);
+                    mask = hn::Gt(v1, hn::Sub(v2, v_tau));
+                } else {
+                    mask = hn::Gt(v1, v2);
+                }
+
+                v_tmp = hn::Add(v_tmp, hn::IfThenElse(mask, hn::Set(d32, 1), v_zero));
+            }
+
+            hn::StoreU(v_tmp, d32, gpc + y * width + x);
+        }
+    }
+}
+
+} // namespace HWY_NAMESPACE
+} // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {
+// Test-facing entry point: forwards every argument unchanged to the GPCKernel
+// instance that HWY_STATIC_DISPATCH selects.
+void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau, int width, int height) {
+    HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height);
+}
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp
new file mode 100644
index 0000000..f49d05a
--- /dev/null
+++ b/lib/gpc/kernels/gpc_hwy.hpp
@@ -0,0 +1,17 @@
+#ifndef  __NDB__KERNEL_GPC_HWY
+#define __NDB__KERNEL_GPC_HWY
+
+#include <hwy/highway.h>
+#include <cstdint>
+#include <vector>  // std::vector appears in the declaration below
+
+namespace ndb {
+namespace testing {
+// Entry point used by the kernel tests: runs the Highway GPC kernel on the
+// `width` x `height` image `in` and writes one 32-bit code per pixel to `gpc`.
+void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau, int width, int height);
+}  // namespace testing
+}  // namespace ndb
+#endif  // __NDB__KERNEL_GPC_HWY
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index 35d0b72..d1b331f 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -1,4 +1,4 @@
-#define HWY_TARGET HWY_NEON 
+//#define HWY_TARGET HWY_NEON 
 #include <hwy/highway.h>
 
 HWY_BEFORE_NAMESPACE(); 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 26ef573..e14ceda 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -28,4 +28,5 @@ endfunction()
 add_gpc_approval_test(test_single_matching test_single_matching.cpp)
 add_gpc_approval_test(test_kernel_box test_kernel_box.cpp)
 add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp)
+add_gpc_approval_test(test_kernel_gpc test_kernel_gpc.cpp)
 
diff --git a/tests/test_kernel_gpc.cpp b/tests/test_kernel_gpc.cpp
new file mode 100644
index 0000000..7989f35
--- /dev/null
+++ b/tests/test_kernel_gpc.cpp
@@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/forest.hpp"
+#include "gpc/kernels/gpc.hpp"     // Naive version
+#include "gpc/kernels/sobel.hpp" // Highway version
+#include "gpc/kernels/gpc_hwy.hpp" // Highway version
+#include "gpc/kernels/utils.hpp" // Highway version
+
+TEST(Approval, GPCKernel) {
+    auto file = std::filesystem::absolute(__FILE__);
+    auto dir  = file.parent_path();
+    std::filesystem::path forestPath = dir / ".." / "forests" / "defaultZeroForest.txt";
+
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2;     // margin left out when the outputs are compared
+    const int threshold = 0;  // threshold argument handed to sobelNaive
+
+    typedef gpc::inference::Forest GPCForest_t;
+    GPCForest_t forest;
+    gpc::inference::FilterMask fm =
+        forest.readForest(forestPath, width, height);
+
+    // 1. Prepare randomized input (fixed seed keeps the run reproducible)
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42);
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> grad(width * height, 0);
+    std::vector<uint32_t> outNaive(width * height, 0);
+    std::vector<uint32_t> outHighway(width * height, 0);
+
+    // 3. Compute the gradient image
+    ndb::sobelNaive(input.data(), grad.data(), width, height, threshold);
+
+    // Keep only the indices that lie at least 13 px away from every edge
+    std::vector<int> idx(grad.size());
+    auto keepInterior = [&](std::vector<int>& in, std::vector<int>& out, int m) {
+        for (int i = 0; i < m; i++) {
+            int x = in.data()[i] % width;
+            int y = in.data()[i] / width;
+            if (y >= 13 && y < height - 13 && x >= 13 && x < width - 13)
+                out.push_back(in.data()[i]);
+        }
+    };
+    int m = 0;
+    // index list handed to gpcFilterNaive as its `idx` argument
+    std::vector<int> interiorIdx;
+    ndb::arr2ind(grad.data(), width * height, idx.data(), &m);
+    keepInterior(idx, interiorIdx, m);
+
+    std::vector<int> tau;
+    // 4. Run Naive version
+    ndb::gpcFilterNaive(input.data(), grad.data(), outNaive.data(),
+            fm.mask, interiorIdx, width, height);
+    /*
+     * Argument roles: fm.mask comes from the forest file and is passed as
+     * the `fastmask` (offset pattern) argument; interiorIdx is the list of
+     * pixel indices. gpc_hwy has no index-list parameter, so its `fastmask`
+     * argument has to be the same fm.mask for the outputs to be comparable.
+     */
+
+    // 5. Run Highway version
+    // fm.mask, not the pixel-index list, is what the kernel reads as offset
+    // pairs; tau stays empty.
+    ndb::testing::gpc_hwy(input.data(), grad.data(), outHighway.data(),
+            fm.mask, tau, width, height);
+
+    // 6. Compare results
+    // We skip the border (radius) because different implementations
+    // might handle edges differently.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int p = y * width + x;  // flat pixel index
+            ASSERT_EQ(outNaive[p], outHighway[p])
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}

From b1b5dee72d855c4ced4e6ab5b4c0a02f8653c965 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 8 Mar 2026 12:13:56 +0100
Subject: [PATCH 27/36] add matching kernels

---
 CMakeLists.txt            |   10 +
 benchmarks/CMakeLists.txt |    7 +
 lib/gpc/forest.cpp        | 1244 ++++++++++++++++++++++++++++++++++++-
 lib/gpc/forest.hpp        |  108 +++-
 samples/sparsematch.cpp   |   35 +-
 5 files changed, 1374 insertions(+), 30 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6957189..57b44ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,16 @@ FetchContent_Declare(
   GIT_REPOSITORY https://github.com/google/benchmark.git
   GIT_TAG        v1.9.5  
 )
+# MUST go before FetchContent_MakeAvailable
+set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+
+# NOTE: this FORCEd cache entry sets CMAKE_BUILD_TYPE for the whole build tree, not only google_benchmark
+set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
+
+#add_definitions(-DNDEBUG) 
+
+FetchContent_MakeAvailable(google_benchmark)
 FetchContent_MakeAvailable(google_benchmark)
 add_library(gpc_core 
     lib/gpc/forest.cpp 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 9735e84..7942fa6 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -22,3 +22,10 @@ target_link_libraries(box_bench
         gpc_core
         benchmark::benchmark 
 )
+add_executable(correspondence_bench correspondence_bench.cpp)
+
+target_link_libraries(correspondence_bench
+    PRIVATE 
+        gpc_core
+        benchmark::benchmark 
+)
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 6aca590..876fe0b 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -32,6 +32,7 @@
 // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
 // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
 #include <Eigen/Dense>
+//#include <arm_neon.h>
 #include <chrono>
 #include <cstring>
 #include <fstream>
@@ -56,6 +57,1001 @@
 
 namespace gpc {
 namespace inference {
+// Buckets both descriptor sets by the low 8 bits of `state` and scatters
+// {state, original index} pairs into each frame's slab, bucket after bucket.
+void Forest::prepareSoAFramesPersistentSingleSlab(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates,
+    SoAFramePersistentSingleSlab& srcFrame,
+    SoAFramePersistentSingleSlab& tarFrame) {
+    uint32_t srcCounts[256] = {0}, tarCounts[256] = {0};
+    for (const auto& s : srcStates) srcCounts[s.state & 0xFF]++;
+    for (const auto& t : tarStates) tarCounts[t.state & 0xFF]++;
+    // Slabs are not resized here; each must already hold one entry per state.
+    StateIdx *sP = srcFrame.slab.data(), *tP = tarFrame.slab.data();
+    for (int i = 0; i < 256; ++i) {
+        srcFrame.bucketData[i] = sP; srcFrame.bucketSizes[i] = srcCounts[i];
+        tarFrame.bucketData[i] = tP; tarFrame.bucketSizes[i] = tarCounts[i];
+        sP += srcCounts[i]; tP += tarCounts[i];
+    }
+    // Each set is scattered over its own size, so differing sizes are safe.
+    uint32_t sW[256] = {0}, tW[256] = {0};
+    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
+        const uint64_t sv = srcStates[i].state;
+        srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i};
+    }
+    for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) {
+        const uint64_t tv = tarStates[i].state;
+        tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i};
+    }
+}
+// Partitions src/tar descriptors into 256 buckets keyed by the low 8 bits of
+// `state`, writing each state and its original index into the frame's slabs.
+// The slabs are written through data() and are never resized here.
+void Forest::prepareSoAFramesPersistent(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates,
+    SoAFramePersistent& srcFrame, 
+    SoAFramePersistent& tarFrame) {
+    assert(srcStates.size() == tarStates.size());
+    assert(srcStates.size() <= 256 * 16384); // limit for max unique items in our table design
+/*
+ // Earlier non-fused variant, kept for reference; only slightly slower than the live code below.
+    const uint32_t BUCKET_COUNT = 256;
+    const uint64_t BUCKET_MASK = 0xFF;
+    // 1. Histogram (To find bucket boundaries)
+    uint32_t srcCounts[BUCKET_COUNT] = {0};
+    uint32_t tarCounts[BUCKET_COUNT] = {0};
+    for (const auto& s : srcStates) srcCounts[s.state & BUCKET_MASK]++;
+    for (const auto& t : tarStates) tarCounts[t.state & BUCKET_MASK]++;
+    // 2. Setup Bucket Pointers into the Slab
+    // We treat the slab like a custom allocator
+    uint64_t* srcPtr = srcFrame.statesSlab.data();
+    uint32_t* srcIdxPtr = srcFrame.indicesSlab.data();
+    uint64_t* tarPtr = tarFrame.statesSlab.data();
+    uint32_t* tarIdxPtr = tarFrame.indicesSlab.data();
+    for (uint32_t i = 0; i < BUCKET_COUNT; ++i) {
+        srcFrame.bucketStates[i] = srcPtr;
+        srcFrame.bucketIndices[i] = srcIdxPtr;
+        srcFrame.bucketSizes[i] = srcCounts[i];
+        
+        tarFrame.bucketStates[i] = tarPtr;
+        tarFrame.bucketIndices[i] = tarIdxPtr;
+        tarFrame.bucketSizes[i] = tarCounts[i];
+
+        srcPtr += srcCounts[i];
+        srcIdxPtr += srcCounts[i];
+        tarPtr += tarCounts[i];
+        tarIdxPtr += tarCounts[i];
+    }
+
+    // 3. The "Pure Scatter" (No push_back, no resize, no zeroing)
+    uint32_t srcWriteIdx[BUCKET_COUNT] = {0};
+    uint32_t tarWriteIdx[BUCKET_COUNT] = {0};
+
+    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
+        uint64_t s = srcStates[i].state;
+        uint32_t b = s & BUCKET_MASK;
+        uint32_t pos = srcWriteIdx[b]++;
+        srcFrame.bucketStates[b][pos] = s;
+        srcFrame.bucketIndices[b][pos] = i;
+    }
+
+    for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) {
+        uint64_t s = tarStates[i].state;
+        uint32_t b = s & BUCKET_MASK;
+        uint32_t pos = tarWriteIdx[b]++;
+        tarFrame.bucketStates[b][pos] = s;
+        tarFrame.bucketIndices[b][pos] = i;
+    }
+    */
+    const uint32_t BUCKET_COUNT = 256;
+    const uint64_t BUCKET_MASK = 0xFF;
+
+    uint32_t srcCounts[BUCKET_COUNT] = {0};
+    uint32_t tarCounts[BUCKET_COUNT] = {0};
+
+    // 1. Fused histogram pass: one loop counts both sets, which relies on the
+    //    equal-size precondition asserted at the top of the function.
+    const uint32_t totalSize = (uint32_t)srcStates.size();
+    for (uint32_t i = 0; i < totalSize; ++i) {
+        srcCounts[srcStates[i].state & BUCKET_MASK]++;
+        tarCounts[tarStates[i].state & BUCKET_MASK]++;
+    }
+    // 2. Setup bucket pointers: bucket i starts where bucket i-1 ends
+    uint64_t* sP = srcFrame.statesSlab.data();
+    uint32_t* sI = srcFrame.indicesSlab.data();
+    uint64_t* tP = tarFrame.statesSlab.data();
+    uint32_t* tI = tarFrame.indicesSlab.data();
+
+    for (uint32_t i = 0; i < BUCKET_COUNT; ++i) {
+        srcFrame.bucketStates[i] = sP;
+        srcFrame.bucketIndices[i] = sI;
+        srcFrame.bucketSizes[i] = srcCounts[i];
+        tarFrame.bucketStates[i] = tP;
+        tarFrame.bucketIndices[i] = tI;
+        tarFrame.bucketSizes[i] = tarCounts[i];
+        sP += srcCounts[i]; sI += srcCounts[i];
+        tP += tarCounts[i]; tI += tarCounts[i];
+    }
+
+    // 3. Fused scatter; the write cursors count entries placed per bucket
+    uint32_t srcWriteIdx[BUCKET_COUNT] = {0};
+    uint32_t tarWriteIdx[BUCKET_COUNT] = {0};
+
+    // Unrolled by 2 (author's note: meant to keep the M3's execution ports saturated)
+    uint32_t i = 0;
+    for (; i + 1 < totalSize; i += 2) {
+        // Source pair: the post-increment claims a slot, [..]-1 addresses it again
+        uint64_t s0 = srcStates[i].state;
+        uint64_t s1 = srcStates[i+1].state;
+        uint32_t bS0 = s0 & BUCKET_MASK;
+        uint32_t bS1 = s1 & BUCKET_MASK;
+
+        srcFrame.bucketStates[bS0][srcWriteIdx[bS0]++] = s0;
+        srcFrame.bucketIndices[bS0][srcWriteIdx[bS0]-1] = i;
+        srcFrame.bucketStates[bS1][srcWriteIdx[bS1]++] = s1;
+        srcFrame.bucketIndices[bS1][srcWriteIdx[bS1]-1] = i+1;
+
+        // Target pair
+        uint64_t t0 = tarStates[i].state;
+        uint64_t t1 = tarStates[i+1].state;
+        uint32_t bT0 = t0 & BUCKET_MASK;
+        uint32_t bT1 = t1 & BUCKET_MASK;
+
+        tarFrame.bucketStates[bT0][tarWriteIdx[bT0]++] = t0;
+        tarFrame.bucketIndices[bT0][tarWriteIdx[bT0]-1] = i;
+        tarFrame.bucketStates[bT1][tarWriteIdx[bT1]++] = t1;
+        tarFrame.bucketIndices[bT1][tarWriteIdx[bT1]-1] = i+1;
+    }
+
+    // Handle remainder (at most one element when totalSize is odd)
+    for (; i < totalSize; ++i) {
+        uint64_t s = srcStates[i].state;
+        uint32_t bS = s & BUCKET_MASK;
+        srcFrame.bucketStates[bS][srcWriteIdx[bS]++] = s;
+        srcFrame.bucketIndices[bS][srcWriteIdx[bS]-1] = i;
+
+        uint64_t t = tarStates[i].state;
+        uint32_t bT = t & BUCKET_MASK;
+        tarFrame.bucketStates[bT][tarWriteIdx[bT]++] = t;
+        tarFrame.bucketIndices[bT][tarWriteIdx[bT]-1] = i;
+    }
+}
+
+// Allocating variant: builds two fresh SoAFrames per call, bucketed by state & 0xFF.
+std::pair<SoAFrame, SoAFrame> Forest::prepareSoAFrames(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    // Both frames are built inside the returned pair, so returning it does
+    // not copy every bucket vector a second time.
+    std::pair<SoAFrame, SoAFrame> frames;
+    frames.first.reserve(srcStates.size());
+    frames.second.reserve(tarStates.size());
+
+    const uint64_t MASK = 0xFF;
+
+    // Distribute into buckets based on the last 8 bits of the state
+    auto scatter = [MASK](const std::vector<ndb::Descriptor>& in, SoAFrame& out) {
+        for (uint32_t i = 0; i < in.size(); ++i) {
+            const uint64_t s = in[i].state;
+            out.states[s & MASK].push_back(s);
+            out.indices[s & MASK].push_back(i);
+        }
+    };
+    scatter(srcStates, frames.first);
+    scatter(tarStates, frames.second);
+
+    return frames;
+}
+void Forest::matchPipelinedBranchlessPreallocateSingleSlab(
+    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
+    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT) {
+    // Per-bucket hash join: appends (source index, target index) pairs to outS/outT; neither vector is cleared first.
+    struct Slot { 
+        uint64_t key;   // The 64-bit Descriptor/State ID
+        uint32_t idx;   // The index carried by the source entry (sData[i].index)
+        uint32_t gen;   // The "Generation" ID (replaces memset/clear)
+        uint32_t count; // Source occurrences (1=unique, >1=dup); 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+    static std::vector<Slot> table(16384, {0, 0, 0, 0}); // function-local static: reused by every call, no locking visible here
+    static uint32_t currentGen = 1; // a slot counts as occupied only while its gen equals this value
+
+    for (int b = 0; b < 256; ++b) {
+        StateIdx* sData = src.bucketData[b];
+        uint32_t  sSize = src.bucketSizes[b];
+        if (sSize == 0) continue; // empty source bucket: the target bucket is not inspected either
+
+        const uint32_t mask = (sSize < 1000) ? 2047 : 16383; // 2048 slots for small buckets, all 16384 otherwise
+        const uint32_t shift = (sSize < 1000) ? 53 : 50;     // keeps the top 11 or 14 bits of the 64-bit product
+        currentGen++; // makes every slot written for an earlier bucket count as empty
+
+        for (uint32_t i = 0; i < sSize; ++i) { // pass 1: insert source states and count repeats
+            uint64_t k = sData[i].state;
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; // linear probing; NOTE(review): spins forever if all mask+1 slots hold other live keys
+            if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1};
+            else table[h].count++;
+        }
+        // pass 2: look up every target state of the same bucket
+        StateIdx* tData = tar.bucketData[b];
+        uint32_t  tSize = tar.bucketSizes[b];
+        for (uint32_t i = 0; i < tSize; ++i) {
+            uint64_t k = tData[i].state;
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
+            // The probe stopped either on the slot holding k or on an empty slot.
+            if (table[h].gen == currentGen && table[h].key == k) {
+                if (table[h].count == 1) { // unique in source, first hit in target: emit the pair
+                    outS.push_back(table[h].idx);
+                    outT.push_back(tData[i].index);
+                    table[h].count = 0xFFFFFFFF;
+                } else if (table[h].count == 0xFFFFFFFF) { // second hit in target: roll the pair back
+                    outS.pop_back(); outT.pop_back(); // NOTE(review): drops the LAST appended pair; that is this key's pair only if nothing was appended since its first hit (e.g. equal target states adjacent) - confirm
+                    table[h].count = 0xEEEEEEEE;
+                }
+            }
+        }
+    }
+}
+/*
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptiveNeon(
+    SoAFrame& src, 
+    SoAFrame& tar) {
+
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    // Slot is exactly 32 bytes. 2 Slots = 64 bytes (1 Cache Line).
+    struct alignas(16) Slot { 
+        uint64_t key; 
+        uint32_t idx; 
+        uint32_t gen;   
+        uint32_t count; 
+        uint32_t padding; 
+    };
+
+    static uint32_t currentGen = 1;
+    static std::vector<Slot> table(8192, {0, 0, 0, 0, 0});
+
+    for (int b = 0; b < 256; ++b) {
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        if (sStates.empty()) continue;
+
+        const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191;
+        currentGen++;
+
+        // --- PART 1: SOURCE FILL (Keep Scalar as it's usually not the bottleneck) ---
+        for (size_t i = 0; i < sStates.size(); ++i) {
+            uint64_t k = sStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= mask;
+
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            
+            if (table[h].gen != currentGen) {
+                table[h] = {k, sIdxs[i], currentGen, 1, 0};
+            } else {
+                table[h].count++;
+            }
+        }
+
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+
+        // --- PART 2: TARGET MATCH (NEON Vectorized Window) ---
+        uint64x2_t genVec = vdupq_n_u64((uint64_t)currentGen << 32); // Gen is at offset 12 in slot
+        
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= mask;
+
+            uint64x2_t targetKeyV = vdupq_n_u64(k);
+            bool found = false;
+
+            // Check 2 slots at a time (One Cache Line)
+            // This loop usually terminates in the first iteration (h and h+1)
+            while (true) {
+                // Load keys from Slot H and Slot H+1
+                // We use vld2 to pick the 'key' field which is the first 8 bytes of each 32-byte slot
+                // For simplicity and speed on M3, we'll just do direct pointer access:
+                uint64_t k0 = table[h].key;
+                uint64_t k1 = table[(h + 1) & mask].key;
+                uint32_t g0 = table[h].gen;
+                uint32_t g1 = table[(h + 1) & mask].gen;
+
+                uint64x2_t keysV = {k0, k1};
+                uint32x2_t gensV = {g0, g1};
+
+                // Compare keys
+                uint64x2_t keyMatch = vceqq_u64(keysV, targetKeyV);
+                // Compare generations
+                uint32x2_t genMatch = vceq_u32(gensV, vdup_n_u32(currentGen));
+
+                // Check lane 0
+                if (vgetq_lane_u64(keyMatch, 0) && vget_lane_u32(genMatch, 0)) {
+                    if (table[h].count == 1) {
+                        result.first.push_back(table[h].idx);
+                        result.second.push_back(tIdxs[i]);
+                        table[h].count = 0xFFFFFFFF;
+                    } else if (table[h].count == 0xFFFFFFFF) {
+                        result.first.pop_back(); result.second.pop_back();
+                        table[h].count = 0xEEEEEEEE;
+                    }
+                    found = true; break;
+                }
+                
+                // Check lane 1
+                uint32_t nextH = (h + 1) & mask;
+                if (vgetq_lane_u64(keyMatch, 1) && vget_lane_u32(genMatch, 1)) {
+                    if (table[nextH].count == 1) {
+                        result.first.push_back(table[nextH].idx);
+                        result.second.push_back(tIdxs[i]);
+                        table[nextH].count = 0xFFFFFFFF;
+                    } else if (table[nextH].count == 0xFFFFFFFF) {
+                        result.first.pop_back(); result.second.pop_back();
+                        table[nextH].count = 0xEEEEEEEE;
+                    }
+                    found = true; break;
+                }
+
+                // If neither matches and both are "current", we must keep probing
+                if (g0 == currentGen && g1 == currentGen) {
+                    h = (h + 2) & mask;
+                } else {
+                    // One of them is an empty slot (gen != currentGen), stop searching
+                    break;
+                }
+            }
+        }
+    }
+    return result;
+}
+*/
+void Forest::matchPipelinedBranchlessPreallocate(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar,
+    std::vector<uint32_t>& resultSrc,
+    std::vector<uint32_t>& resultTar) {
+    // Per-bucket hash join: appends (source index, target index) pairs to resultSrc/resultTar; neither vector is cleared or reserved here.
+    //std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    // For 100M items, we might find more matches; 
+    // adjusting reserve to prevent mid-run reallocations.
+    //result.first.reserve(src.statesSlab.size() / 100); 
+    //result.second.reserve(src.statesSlab.size() / 100);
+
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the first source occurrence
+        uint32_t gen;   // generation that wrote the slot; any other value means "empty"
+        uint32_t count; // source occurrences (1=unique, >1=dup); 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    static uint32_t currentGen = 1; // function-local static: keeps counting across calls
+    // Increased table size slightly to 16k to further reduce Pareto collisions
+    static std::vector<Slot> table(16384, {0, 0, 0, 0}); // reused by every call; no locking visible here
+
+    for (int b = 0; b < 256; ++b) {
+        uint64_t* sStates = src.bucketStates[b];
+        uint32_t* sIdxs   = src.bucketIndices[b];
+        uint32_t  sSize   = src.bucketSizes[b];
+        
+        if (sSize == 0) continue; // empty source bucket: the target bucket is not inspected either
+
+        // Adaptive Mask: 2k for small, 16k for large
+        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
+        const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); // keeps the top 11 or 14 bits of the product
+        currentGen++; // makes every slot written for an earlier bucket count as empty
+
+        // 1. Fill Table (Source)
+        for (size_t i = 0; i < sSize; ++i) {
+            uint64_t k = sStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+
+            // Linear probe: advance while the slot is live and holds a different key.
+            // NOTE(review): never terminates if all mask+1 slots hold other live keys.
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            
+            if (table[h].gen != currentGen) {
+                table[h] = {k, sIdxs[i], currentGen, 1}; // first occurrence claims the slot
+            } else {
+                table[h].count++; // repeated source state
+            }
+        }
+
+        // 2. Intersect (Target); the only look-ahead is the prefetch below
+        uint64_t* tStates = tar.bucketStates[b];
+        uint32_t* tIdxs   = tar.bucketIndices[b];
+        uint32_t  tSize   = tar.bucketSizes[b];
+
+        for (size_t i = 0; i < tSize; ++i) {
+            // Prefetch hint for the target state 16 elements ahead (read access, highest locality)
+            if (i + 16 < tSize) {
+                __builtin_prefetch(&tStates[i + 16], 0, 3);
+            }
+
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+
+            // Probe logic
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            // The probe stopped either on the slot holding k or on an empty slot.
+            if (table[h].gen == currentGen && table[h].key == k) {
+                const uint32_t cnt = table[h].count;
+                if (cnt == 1) { // unique in source, first hit in target: emit the pair
+                    resultSrc.push_back(table[h].idx);
+                    resultTar.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; 
+                } else if (cnt == 0xFFFFFFFF) {
+                    // Second target hit: roll back. NOTE(review): pop_back drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    resultSrc.pop_back();
+                    resultTar.pop_back();
+                    table[h].count = 0xEEEEEEEE; 
+                }
+            }
+        }
+    }
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPipelinedBranchless(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar) {
+    // Per-bucket hash join; returns parallel vectors {source indices, target indices}, one entry per emitted pair.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    // For 100M items, we might find more matches; 
+    // adjusting reserve to prevent mid-run reallocations.
+    result.first.reserve(src.statesSlab.size() / 100); 
+    result.second.reserve(src.statesSlab.size() / 100);
+
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the first source occurrence
+        uint32_t gen;   // generation that wrote the slot; any other value means "empty"
+        uint32_t count; // source occurrences (1=unique, >1=dup); 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    static uint32_t currentGen = 1; // function-local static: keeps counting across calls
+    // Increased table size slightly to 16k to further reduce Pareto collisions
+    static std::vector<Slot> table(16384, {0, 0, 0, 0}); // reused by every call; no locking visible here
+
+    for (int b = 0; b < 256; ++b) {
+        uint64_t* sStates = src.bucketStates[b];
+        uint32_t* sIdxs   = src.bucketIndices[b];
+        uint32_t  sSize   = src.bucketSizes[b];
+        
+        if (sSize == 0) continue; // empty source bucket: the target bucket is not inspected either
+
+        // Adaptive Mask: 2k for small, 16k for large
+        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
+        const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); // keeps the top 11 or 14 bits of the product
+        currentGen++; // makes every slot written for an earlier bucket count as empty
+
+        // 1. Fill Table (Source)
+        for (size_t i = 0; i < sSize; ++i) {
+            uint64_t k = sStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+
+            // Linear probe: advance while the slot is live and holds a different key.
+            // NOTE(review): never terminates if all mask+1 slots hold other live keys.
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            
+            if (table[h].gen != currentGen) {
+                table[h] = {k, sIdxs[i], currentGen, 1}; // first occurrence claims the slot
+            } else {
+                table[h].count++; // repeated source state
+            }
+        }
+
+        // 2. Intersect (Target); the only look-ahead is the prefetch below
+        uint64_t* tStates = tar.bucketStates[b];
+        uint32_t* tIdxs   = tar.bucketIndices[b];
+        uint32_t  tSize   = tar.bucketSizes[b];
+
+        for (size_t i = 0; i < tSize; ++i) {
+            // Prefetch hint for the target state 16 elements ahead (read access, highest locality)
+            if (i + 16 < tSize) {
+                __builtin_prefetch(&tStates[i + 16], 0, 3);
+            }
+
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+
+            // Probe logic
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            // The probe stopped either on the slot holding k or on an empty slot.
+            if (table[h].gen == currentGen && table[h].key == k) {
+                const uint32_t cnt = table[h].count;
+                if (cnt == 1) { // unique in source, first hit in target: emit the pair
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; 
+                } else if (cnt == 0xFFFFFFFF) {
+                    // Second target hit: roll back. NOTE(review): pop_back drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.first.pop_back();
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; 
+                }
+            }
+        }
+    }
+    return result;
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptivePersistent(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar) {
+    // Per-bucket hash join over raw bucket arrays; returns parallel vectors {source indices, target indices}.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the first source occurrence
+        uint32_t gen;   // Generation counter
+        uint32_t count; // 1=SrcUnique, >1=SrcDuplicate, 0xFFFFFFFF=Matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    static uint32_t currentGen = 1; // function-local static: keeps counting across calls
+    static std::vector<Slot> table(8192, {0, 0, 0, 0}); // reused by every call; no locking visible here
+
+    for (int b = 0; b < 256; ++b) {
+        uint64_t* sStates = src.bucketStates[b];
+        uint32_t* sIdxs   = src.bucketIndices[b];
+        uint32_t  sSize   = src.bucketSizes[b];
+        
+        if (sSize == 0) continue; // empty source bucket: the target bucket is not inspected either
+
+        const uint32_t mask = (sSize < 500) ? 1023 : 8191; // 1024 slots for small buckets, all 8192 otherwise
+        currentGen++; // makes every slot written for an earlier bucket count as empty
+
+        // 1. Fill Table
+        for (size_t i = 0; i < sSize; ++i) {
+            uint64_t k = sStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); // top 13 bits of the product
+            h &= mask; // with mask 1023 only the low 10 of those 13 bits are kept
+            // Linear probe; NOTE(review): never terminates if all mask+1 slots hold other live keys.
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            
+            if (table[h].gen != currentGen) {
+                table[h] = {k, sIdxs[i], currentGen, 1}; // first occurrence claims the slot
+            } else {
+                table[h].count++; // repeated source state
+            }
+        }
+
+        // 2. Intersect
+        uint64_t* tStates = tar.bucketStates[b];
+        uint32_t* tIdxs   = tar.bucketIndices[b];
+        uint32_t  tSize   = tar.bucketSizes[b];
+
+        for (size_t i = 0; i < tSize; ++i) {
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= mask;
+
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            // The probe stopped either on the slot holding k or on an empty slot.
+            if (table[h].gen == currentGen && table[h].key == k) {
+                if (table[h].count == 1) { // unique in source, first hit in target: emit the pair
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; 
+                } else if (table[h].count == 0xFFFFFFFF) { // second target hit: roll back
+                    result.first.pop_back(); // NOTE(review): drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; 
+                }
+            }
+        }
+    }
+    return result;
+}
+
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptive(
+    SoAFrame& src, 
+    SoAFrame& tar) {
+    // Per-bucket hash join over src.states/indices and tar.states/indices; returns parallel vectors {source indices, target indices}.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the first source occurrence
+        uint32_t gen;   // Generation counter
+        uint32_t count; // 1=SrcUnique, >1=SrcDuplicate, 0xFFFFFFFF=Matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    // Generation counter and table are locals: both are rebuilt on every call
+    uint32_t currentGen = 1; 
+    std::vector<Slot> table(8192, {0, 0, 0, 0}); 
+
+    for (int b = 0; b < 256; ++b) {
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        if (sStates.empty()) continue; // empty source bucket: the target bucket is not inspected either
+
+        // Adaptive Table Mask: Use smaller range for tiny buckets
+        const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191;
+        currentGen++; // makes every slot written for an earlier bucket count as empty
+
+        // 1. Fill Table
+        for (size_t i = 0; i < sStates.size(); ++i) {
+            // Prefetch an element roughly 16 iterations ahead (adjust based on testing)
+            /*
+             * This didn't help anymore. So either compiler already optimized this or 
+             * we are compute bound.
+             * if (i + 16 < sStates.size()) {
+                __builtin_prefetch(&sStates[i + 16], 0, 3);
+                __builtin_prefetch(&sIdxs[i + 16], 0, 3);
+            }*/
+            uint64_t k = sStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); // top 13 bits of the product
+            h &= mask; // with mask 1023 only the low 10 of those 13 bits are kept
+
+            // Probe: keep walking while the slot is live (gen matches) AND holds a different key. NOTE(review): never terminates if all mask+1 slots hold other live keys.
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            
+            if (table[h].gen != currentGen) {
+                table[h] = {k, sIdxs[i], currentGen, 1}; // first occurrence claims the slot
+            } else {
+                table[h].count++; // Duplicate in Source
+            }
+        }
+
+        // 2. Intersect
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= mask;
+
+            while (table[h].gen == currentGen && table[h].key != k) {
+                h = (h + 1) & mask;
+            }
+            // The probe stopped either on the slot holding k or on an empty slot.
+            if (table[h].gen == currentGen && table[h].key == k) {
+                if (table[h].count == 1) { // unique in source, first hit in target: emit the pair
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; 
+                } else if (table[h].count == 0xFFFFFFFF) { // second target hit: roll back
+                    result.first.pop_back(); // NOTE(review): drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; 
+                }
+            }
+        }
+    }
+    return result;
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchBlockedBloom(
+    SoAFrame& src, 
+    SoAFrame& tar) {
+    // Per-bucket hash join with a 512-bit filter in front of the target probe; returns parallel vectors {source indices, target indices}.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the most recent source occurrence (overwritten on repeats)
+        uint32_t count; // 0=empty, 1=unique in source, >1=dup; 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    const uint32_t TABLE_SIZE = 8192;
+    const uint32_t HASH_MASK = TABLE_SIZE - 1;
+    std::vector<Slot> table(TABLE_SIZE); 
+
+    // The filter is 8 x 64-bit words = 512 bits (64 bytes).
+    // One bit is set per source state; the same bit is tested per target state.
+    uint64_t bloom[8];
+
+    for (int b = 0; b < 256; ++b) {
+        std::fill(table.begin(), table.end(), Slot{0, 0, 0}); // table and filter are fully reset for every bucket
+        std::memset(bloom, 0, sizeof(bloom));
+
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+
+        // 1. Fill Table + Bloom Filter
+        for (size_t i = 0; i < sStates.size(); ++i) {
+            uint64_t k = sStates[i];
+            
+            // Filter index: fold the high 32 bits of the key onto the low 32 bits,
+            // then use bits 6..8 to pick the word and bits 0..5 to pick the bit.
+            uint32_t bHash = (k ^ (k >> 32));
+            bloom[(bHash >> 6) & 7] |= (1ull << (bHash & 63));
+
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); 
+            h &= HASH_MASK;
+            // Linear probe; a slot is empty when count == 0. NOTE(review): never terminates if all 8192 slots hold other keys.
+            while (table[h].count > 0 && table[h].key != k) {
+                h = (h + 1) & HASH_MASK;
+            }
+            
+            table[h].key = k;
+            table[h].idx = sIdxs[i];
+            table[h].count++; 
+        }
+
+        // 2. Intersection with Bloom Filter Gate
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            uint64_t k = tStates[i];
+            
+            // --- BLOOM FILTER GATE ---
+            uint32_t bHash = (k ^ (k >> 32));
+            if (!(bloom[(bHash >> 6) & 7] & (1ull << (bHash & 63)))) {
+                continue; // bit not set: no source state of this bucket has this key, so skip the hash probe
+            }
+            // -------------------------
+
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= HASH_MASK;
+
+            while (table[h].count > 0 && table[h].key != k) {
+                h = (h + 1) & HASH_MASK;
+            }
+            // An empty slot can also satisfy key == k (k == 0), but its count of 0 enters neither branch below.
+            if (table[h].key == k) {
+                if (table[h].count == 1) { // unique in source, first hit in target: emit the pair
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; 
+                } else if (table[h].count == 0xFFFFFFFF) { // second target hit: roll back
+                    result.first.pop_back(); // NOTE(review): drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; 
+                }
+            }
+        }
+    }
+    return result;
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchParallelRadixPartitioning(
+    SoAFrame& src, 
+    SoAFrame& tar) {
+    // Per-bucket hash join; returns parallel vectors {source indices, target indices}. No threading is visible in this function despite the name.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    const uint32_t TABLE_SIZE = 8192;
+    const uint32_t HASH_MASK = TABLE_SIZE - 1;
+    
+    // alignas(64) makes every slot 64 bytes wide, so the 8192-slot table is 512 KiB
+    struct alignas(64) Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the most recent source occurrence (overwritten on repeats)
+        uint32_t count; // 0=empty, 1=unique in source, >1=dup; 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+    std::vector<Slot> table(TABLE_SIZE);
+
+    for (int b = 0; b < 256; ++b) {
+        // 1. CLEAR
+        // Only 'count' is zeroed; stale keys stay behind but a slot with count == 0 is treated as empty
+        for(auto& s : table) s.count = 0;
+
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        const size_t sSize  = sStates.size();
+
+        // 2. FILL in groups of 4
+        // The inner loop has a constant trip count of 4; each insertion is still carried out one after another
+        size_t i = 0;
+        for (; i + 3 < sSize; i += 4) {
+            for (int k = 0; k < 4; ++k) {
+                uint64_t key = sStates[i + k];
+                uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
+                h &= HASH_MASK;
+                // Linear probe. NOTE(review): never terminates if all 8192 slots hold other keys.
+                while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
+                
+                table[h].key = key;
+                table[h].idx = sIdxs[i + k];
+                table[h].count++;
+            }
+        }
+        // Handle remainder
+        for (; i < sSize; ++i) {
+            uint64_t key = sStates[i];
+            uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
+            h &= HASH_MASK;
+            while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
+            table[h].key = key; table[h].idx = sIdxs[i]; table[h].count++;
+        }
+
+        // 3. INTERSECTION: emit on the first target hit, roll back on the second
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+        const size_t tSize  = tStates.size();
+
+        for (size_t j = 0; j < tSize; ++j) {
+            uint64_t key = tStates[j];
+            uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
+            h &= HASH_MASK;
+
+            while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
+            // An empty slot may still carry a stale equal key, but its count of 0 enters neither branch below.
+            if (table[h].key == key) {
+                if (table[h].count == 1) {
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[j]);
+                    table[h].count = 0xFFFFFFFF; // Mark as Matched
+                } else if (table[h].count == 0xFFFFFFFF) {
+                    // Second target hit: roll back. NOTE(review): pop_back drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.first.pop_back();
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; // Mark as Permanent Duplicate
+                }
+            }
+        }
+    }
+
+    return result;
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPreparedFramesFaster(
+    SoAFrame& src, 
+    SoAFrame& tar) {
+    // Per-bucket hash join on a flat open-addressing table; returns parallel vectors {source indices, target indices}.
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    result.first.reserve(10000); 
+    result.second.reserve(10000);
+
+    // Flat slot structure (8 + 4 + 4 bytes; no explicit alignment is requested)
+    struct Slot { 
+        uint64_t key;   // state value stored in this slot
+        uint32_t idx;   // sIdxs entry of the most recent source occurrence (overwritten on repeats)
+        uint32_t count; // 0=empty, 1=unique in source, >1=dup; 0xFFFFFFFF=matched once, 0xEEEEEEEE=seen again in target
+    };
+
+    // 8192 slots of 16 bytes = 128 KiB (presumably sized for the author's L2 cache - TODO confirm).
+    // We use a power-of-two size to use bitwise AND instead of modulo %.
+    const uint32_t TABLE_SIZE = 8192;
+    const uint32_t HASH_MASK = TABLE_SIZE - 1;
+    std::vector<Slot> table(TABLE_SIZE); 
+
+    for (int b = 0; b < 256; ++b) {
+        // The whole table is reset for every bucket.
+        std::fill(table.begin(), table.end(), Slot{0, 0, 0});
+
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+
+        // 1. Fill Table from Source
+        for (size_t i = 0; i < sStates.size(); ++i) {
+            uint64_t k = sStates[i];
+            // Fibonacci Hashing (very fast for 64-bit keys)
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); 
+            h &= HASH_MASK;
+            // Linear probe; a slot is empty when count == 0. NOTE(review): never terminates if all 8192 slots hold other keys.
+            while (table[h].count > 0 && table[h].key != k) {
+                h = (h + 1) & HASH_MASK;
+            }
+            
+            table[h].key = k;
+            table[h].idx = sIdxs[i];
+            table[h].count++; 
+        }
+
+        // 2. Secondary Uniqueness Check + Intersection
+        // We reuse the 'count' field: 
+        // 1 = Unique in Src
+        // >1 = Duplicate in Src
+        // 0xFFFFFFFF = Matched once, 0xEEEEEEEE = seen again in Target (0 still means empty slot)
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            uint64_t k = tStates[i];
+            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
+            h &= HASH_MASK;
+
+            while (table[h].count > 0 && table[h].key != k) {
+                h = (h + 1) & HASH_MASK;
+            }
+
+            // We need to know if 'k' is unique in Target too.
+            // This is tracked through the slot: the first hit marks it, a second hit undoes the match.
+            // An empty slot can also satisfy key == k (k == 0), but its count of 0 enters neither branch.
+            if (table[h].key == k) {
+                if (table[h].count == 1) {
+                    // This is the first time we see it in Target
+                    result.first.push_back(table[h].idx);
+                    result.second.push_back(tIdxs[i]);
+                    table[h].count = 0xFFFFFFFF; // Mark as "Matched once"
+                } else if (table[h].count == 0xFFFFFFFF) {
+                    // Target duplicate: roll back.
+                    // NOTE(review): pop_back drops the LAST appended pair, which is this key's pair only if nothing was appended since its first hit - confirm
+                    result.first.pop_back();
+                    result.second.pop_back();
+                    table[h].count = 0xEEEEEEEE; // Mark as "Permanent Duplicate"
+                }
+            }
+        }
+    }
+    return result;
+}
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPreparedFrames( SoAFrame& src, SoAFrame& tar) {
+    // Per-bucket join using std::unordered_map; pairs up states that occur exactly once in the source bucket and exactly once in the target bucket.
+    // Returns parallel vectors {source indices, target indices}
+    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
+    
+    // Heuristic: reserve 6x the combined size of bucket 0 of both frames (presumably a stand-in for the average bucket size - TODO confirm)
+    size_t initialReserve = (src.states[0].size() + tar.states[0].size()) * 6; 
+    result.first.reserve(initialReserve);
+    result.second.reserve(initialReserve);
+
+    // Local structures for bucket-level uniqueness
+    struct SrcInfo { uint32_t idx; bool isDup; }; // idx: sIdxs entry of the first occurrence; isDup: state seen more than once
+    std::unordered_map<uint64_t, SrcInfo> bucketSrc;
+    std::unordered_map<uint64_t, bool> bucketTar; // value is true once the state was seen more than once
+
+    for (int b = 0; b < 256; ++b) {
+        bucketSrc.clear(); // both maps are reused across buckets
+        bucketTar.clear();
+
+        const auto& sStates = src.states[b];
+        const auto& sIdxs   = src.indices[b];
+        const auto& tStates = tar.states[b];
+        const auto& tIdxs   = tar.indices[b];
+
+        // 1. Process Source: Mark unique vs duplicates
+        for (size_t i = 0; i < sStates.size(); ++i) {
+            auto [it, inserted] = bucketSrc.try_emplace(sStates[i], SrcInfo{sIdxs[i], false});
+            if (!inserted) it->second.isDup = true; // key already present: flag it, keep the first idx
+        }
+
+        // 2. Process Target: Mark unique vs duplicates
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            auto [it, inserted] = bucketTar.try_emplace(tStates[i], false);
+            if (!inserted) it->second = true; // Mark as duplicate
+        }
+
+        // 3. Intersect unique-only IDs
+        for (size_t i = 0; i < tStates.size(); ++i) {
+            uint64_t id = tStates[i];
+            
+            // Check if unique in Target (id was inserted in step 2, so operator[] only looks it up)
+            if (bucketTar[id] == false) {
+                auto it = bucketSrc.find(id);
+                // Check if exists in Source AND is unique there
+                if (it != bucketSrc.end() && it->second.isDup == false) {
+                    result.first.push_back(it->second.idx);
+                    result.second.push_back(tIdxs[i]);
+                }
+            }
+        }
+    }
+
+    return result;
+}
 
     /**
      * @brief Computes sparse matches on a pair of rectified and smoothed
@@ -117,7 +1113,6 @@ std::vector<ndb::Correspondence> Forest::findCorrespondences(
     int numStates = std::min(srcStates.size(), tarStates.size());
     // Limit search to rectified epipolar case.
     std::sort(srcStates.begin(), srcStates.end());
-
     std::sort(tarStates.begin(), tarStates.end());
     std::vector<ndb::Correspondence> corr;
     uint32_t j = 0;
@@ -141,7 +1136,254 @@ std::vector<ndb::Correspondence> Forest::findCorrespondences(
     }
     return corr;
 }
+#include <unordered_map>
+
+// State machine for our IDs
+enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
+
+#include <vector>
+#include <unordered_map>
+#include <cstdint>
+
+std::vector<ndb::Correspondence> Forest::findCorrespondencesHash(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+
+    // Tracking states: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate
+    enum class Occurence : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
+
+    // 1. Map Source IDs: State -> {OccurenceLevel, OriginalIndex}
+    // Pre-allocating prevents expensive rehashes during the loop
+    std::unordered_map<uint64_t, std::pair<Occurence, uint32_t>> srcMap;
+    srcMap.reserve(srcStates.size());
+
+    for (uint32_t i = 0; i < srcStates.size(); ++i) {
+        auto& entry = srcMap[srcStates[i].state];
+        if (entry.first == Occurence::Unseen) {
+            entry = {Occurence::SeenOnce, i};
+        } else {
+            entry.first = Occurence::Duplicate;
+        }
+    }
+
+    // 2. Map Target IDs: State -> OccurenceLevel
+    std::unordered_map<uint64_t, Occurence> tarMap;
+    tarMap.reserve(tarStates.size());
+
+    for (uint32_t j = 0; j < tarStates.size(); ++j) {
+        auto& occ = tarMap[tarStates[j].state];
+        if (occ == Occurence::Unseen) {
+            occ = Occurence::SeenOnce;
+        } else {
+            occ = Occurence::Duplicate;
+        }
+    }
+
+    // 3. Intersect unique pairs
+    std::vector<ndb::Correspondence> corr;
+    // Heuristic: Reserve 20% of the smaller set size for the results
+    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5);
+
+    for (uint32_t j = 0; j < tarStates.size(); ++j) {
+        uint64_t currentID = tarStates[j].state;
+
+        // Condition: Must be unique in Target AND unique in Source
+        if (tarMap[currentID] == Occurence::SeenOnce) {
+            auto it = srcMap.find(currentID);
+            if (it != srcMap.end() && it->second.first == Occurence::SeenOnce) {
+                // Correspondence(Point from Source, Point from Target)
+                corr.push_back(ndb::Correspondence(
+                    srcStates[it->second.second].point, 
+                    tarStates[j].point
+                ));
+            }
+        }
+    }
+
+    return corr;
+}
+#include <vector>
+#include <cstdint>
+#include <algorithm>
+
+// A lightweight structure to avoid moving heavy Descriptor objects
+struct KeyIndex {
+    uint64_t state;
+    uint32_t index;
+};
+#include <vector>
+#include <cstdint>
+#include <array>
+
+std::vector<ndb::Correspondence> Forest::findCorrespondencesTurbo(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    // Pairs up states that occur exactly once in src AND exactly once in tar.
+    const int BUCKETS = 256;
+    const uint64_t MASK = 0xFF;
+
+    // --- STEP 1: Linear Partitioning (Radix Pass) ---
+    // Counting sort on the low 8 bits into one flat buffer per side
+    std::vector<KeyIndex> srcBuffer(srcStates.size());
+    std::vector<KeyIndex> tarBuffer(tarStates.size());
+    std::array<size_t, BUCKETS> srcCounts = {0}, tarCounts = {0};
+    std::array<size_t, BUCKETS> srcOffsets, tarOffsets;
+
+    for (const auto& s : srcStates) srcCounts[s.state & MASK]++;
+    for (const auto& t : tarStates) tarCounts[t.state & MASK]++;
+
+    srcOffsets[0] = tarOffsets[0] = 0;
+    for (int i = 1; i < BUCKETS; ++i) {
+        srcOffsets[i] = srcOffsets[i - 1] + srcCounts[i - 1];
+        tarOffsets[i] = tarOffsets[i - 1] + tarCounts[i - 1];
+    }
+
+    auto srcCursors = srcOffsets;
+    auto tarCursors = tarOffsets;
+
+    for (uint32_t i = 0; i < srcStates.size(); ++i) {
+        srcBuffer[srcCursors[srcStates[i].state & MASK]++] = {srcStates[i].state, i};
+    }
+    for (uint32_t i = 0; i < tarStates.size(); ++i) {
+        tarBuffer[tarCursors[tarStates[i].state & MASK]++] = {tarStates[i].state, i};
+    }
 
+    // --- STEP 2: Per-Bucket Hashing ---
+    std::vector<ndb::Correspondence> corr;
+    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 8);
+
+    // Only one bucket's keys are hashed at a time to keep the working set small
+    // count: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate
+    struct LocalVal { uint32_t index; uint8_t count; };
+
+    // Both maps are created once and cleared per bucket instead of being rebuilt
+    std::unordered_map<uint64_t, uint8_t> tarUniqueness;
+    std::unordered_map<uint64_t, LocalVal> bucketMap;
+    bucketMap.reserve(srcStates.size() / BUCKETS * 2);
+
+    for (int b = 0; b < BUCKETS; ++b) {
+        bucketMap.clear();
+
+        // Load Source bucket: operator[] value-initializes count to 0 (Unseen)
+        size_t srcStart = srcOffsets[b];
+        size_t srcEnd = srcStart + srcCounts[b];
+        for (size_t i = srcStart; i < srcEnd; ++i) {
+            auto& entry = bucketMap[srcBuffer[i].state];
+            entry.index = srcBuffer[i].index;
+            entry.count = (entry.count == 0) ? 1 : 2;
+        }
+
+        // Intersect with Target bucket
+        size_t tarStart = tarOffsets[b];
+        size_t tarEnd = tarStart + tarCounts[b];
+
+        // Secondary map to ensure target-side uniqueness
+        tarUniqueness.clear();
+        for (size_t i = tarStart; i < tarEnd; ++i) {
+            auto& count = tarUniqueness[tarBuffer[i].state];
+            count = (count == 0) ? 1 : 2;
+        }
+
+        for (size_t i = tarStart; i < tarEnd; ++i) {
+            uint64_t id = tarBuffer[i].state;
+            if (tarUniqueness[id] == 1) {
+                auto it = bucketMap.find(id);
+                if (it != bucketMap.end() && it->second.count == 1) {
+                    corr.push_back(ndb::Correspondence(
+                        srcStates[it->second.index].point, 
+                        tarStates[tarBuffer[i].index].point
+                    ));
+                }
+            }
+        }
+    }
+
+    return corr;
+}
+std::vector<ndb::Correspondence> Forest::findCorrespondencesHashingRadix(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    // Pairs up states that occur exactly once in src AND exactly once in tar.
+    const int NUM_BUCKETS = 256;
+    const uint64_t MASK = 0xFF;
+
+    // 1. Partition Source into Buckets (keyed by the low 8 bits of the state)
+    std::vector<KeyIndex> srcBuckets[NUM_BUCKETS];
+    for (int i = 0; i < NUM_BUCKETS; ++i) srcBuckets[i].reserve(srcStates.size() / NUM_BUCKETS * 1.2);
+
+    for (uint32_t i = 0; i < srcStates.size(); ++i) {
+        srcBuckets[srcStates[i].state & MASK].push_back({srcStates[i].state, i});
+    }
+
+    // 2. Partition Target into Buckets, keeping each entry's original index
+    std::vector<KeyIndex> tarBuckets[NUM_BUCKETS];
+    for (int i = 0; i < NUM_BUCKETS; ++i) tarBuckets[i].reserve(tarStates.size() / NUM_BUCKETS * 1.2);
+
+    for (uint32_t i = 0; i < tarStates.size(); ++i) {
+        tarBuckets[tarStates[i].state & MASK].push_back({tarStates[i].state, i});
+    }
+
+    std::vector<ndb::Correspondence> corr;
+    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5);
+
+    // 3. Process each bucket pair
+    // NOTE: all buckets append to the shared `corr`, so parallelizing needs per-thread outputs
+    for (int b = 0; b < NUM_BUCKETS; ++b) {
+        if (srcBuckets[b].empty() || tarBuckets[b].empty()) continue;
+
+        // Per-bucket maps only hold this bucket's keys
+        // Occurrence level of a state within the current bucket
+        enum class Occ : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
+
+        struct LocalEntry {
+            Occ occ = Occ::Unseen;
+            uint32_t idx = 0;
+        };
+
+        // State -> {occurrence level, index of first occurrence in srcStates}.
+        // operator[] default-constructs a LocalEntry (Occ::Unseen) on first
+        // sight of a state.
+        std::unordered_map<uint64_t, LocalEntry> localSrc;
+        localSrc.reserve(srcBuckets[b].size());
+
+        for (auto& ki : srcBuckets[b]) {
+            auto& entry = localSrc[ki.state];
+            if (entry.occ == Occ::Unseen) {
+                entry = {Occ::SeenOnce, ki.index};
+            } else {
+                entry.occ = Occ::Duplicate;
+            }
+        }
+
+        std::unordered_map<uint64_t, Occ> localTar;
+        localTar.reserve(tarBuckets[b].size());
+        for (const auto& tk : tarBuckets[b]) {
+            auto& occ = localTar[tk.state];
+            occ = (occ == Occ::Unseen) ? Occ::SeenOnce : Occ::Duplicate;
+        }
+
+        // Intersect within the bucket.
+        // Every target bucket entry carries its original index, so the
+        // target point is fetched directly. Entries were appended in
+        // ascending index order, so matches are emitted in the same order
+        // as a scan of tarStates restricted to this bucket would give,
+        // without re-reading all of tarStates once per bucket
+        // (which cost O(NUM_BUCKETS * tarStates.size())).
+        for (const auto& tk : tarBuckets[b]) {
+            if (localTar[tk.state] == Occ::SeenOnce) {
+                auto it = localSrc.find(tk.state);
+                if (it != localSrc.end() && it->second.occ == Occ::SeenOnce) {
+                    corr.push_back(ndb::Correspondence(
+                        srcStates[it->second.idx].point,
+                        tarStates[tk.index].point
+                    ));
+                }
+            }
+        }
+    }
+
+    return corr;
+}
 /**
  * @brief Evaluates a given forest mask on an image and returns the
  * descriptors
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index d0b5c32..9f48c87 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -178,7 +178,47 @@ struct MatchStats {
     double prec, rec, timeProp, timeMatch;
     int numInlier, numStates, numMatches;
 };
+struct SoAFrame {
+    // Per-bucket state values and their matching original indices (256 buckets)
+    std::vector<uint64_t> states[256];
+    std::vector<uint32_t> indices[256];
+    // Reserve the average bucket load (total_size / 256) plus 20% headroom
+    void reserve(size_t total_size) {
+        for(int i=0; i<256; ++i) {
+            states[i].reserve(total_size * 6 / (256 * 5));
+            indices[i].reserve(total_size * 6 / (256 * 5));
+        }
+    }
+};
+struct SoAFramePersistent {
+    // Persistent memory blocks
+    std::vector<uint64_t> statesSlab;
+    std::vector<uint32_t> indicesSlab;
+    
+    // Pointers into the slab for each bucket
+    uint64_t* bucketStates[256];
+    uint32_t* bucketIndices[256];
+    uint32_t bucketSizes[256];
+
+    void preallocate(size_t total_size) {
+        statesSlab.assign(total_size, 0);
+        indicesSlab.assign(total_size, 0);
+    }
+};
+struct StateIdx {
+    uint64_t state;
+    uint32_t index;
+};
 
+struct SoAFramePersistentSingleSlab {
+    std::vector<StateIdx> slab; 
+    StateIdx* bucketData[256];
+    uint32_t bucketSizes[256];
+
+    void preallocate(size_t total_size) {
+        slab.assign(total_size, {0, 0});
+    }
+};
 
 class Forest {
    public:
@@ -198,9 +238,75 @@ class Forest {
         PreprocessedImage& tar,
         FilterMask& fastmask,
         InferenceSettings& settings);
-    std::vector<ndb::Correspondence> findCorrespondences(
+    static std::vector<ndb::Correspondence> findCorrespondences(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
+    static std::vector<ndb::Correspondence> findCorrespondencesHash(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
+
+    static std::vector<ndb::Correspondence> findCorrespondencesHashingRadix(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
+
+    static std::vector<ndb::Correspondence> findCorrespondencesTurbo(
         std::vector<ndb::Descriptor>& srcStates,
         std::vector<ndb::Descriptor>& tarStates);
+
+
+    static std::pair<SoAFrame, SoAFrame> prepareSoAFrames(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates);
+
+    static void prepareSoAFramesPersistent(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates,
+        SoAFramePersistent& srcFrame, 
+        SoAFramePersistent& tarFrame);
+static void prepareSoAFramesPersistentSingleSlab(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates,
+    SoAFramePersistentSingleSlab& srcFrame, 
+    SoAFramePersistentSingleSlab& tarFrame);
+
+
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPreparedFrames( SoAFrame& src, SoAFrame& tar);
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPreparedFramesFaster( SoAFrame& src, SoAFrame& tar);
+
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchParallelRadixPartitioning(
+    SoAFrame& src, 
+    SoAFrame& tar) ;
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchBlockedBloom(
+    SoAFrame& src, 
+    SoAFrame& tar) ;
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptive(
+    SoAFrame& src, 
+    SoAFrame& tar);
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptivePersistent(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar);
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPipelinedBranchless(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar);
+static void matchPipelinedBranchlessPreallocate(
+    SoAFramePersistent& src, 
+    SoAFramePersistent& tar,
+    std::vector<uint32_t>& resultSrc,
+    std::vector<uint32_t>& resultTar);
+
+/*
+static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptiveNeon(
+    SoAFrame& src, 
+    SoAFrame& tar);
+*/
+static void matchPipelinedBranchlessPreallocateSingleSlab(
+    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
+    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT);
+
+
+
+
+
     /**
      * @brief Evaluates a given forest mask on an image and returns the
      * descriptors
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index be0015a..85070a7 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -3,29 +3,11 @@
 
 #include "gpc/forest.hpp"
 using namespace std;
-void test_hwy_neon() {
-    namespace hn = hwy::HWY_NAMESPACE;
-    
-    // d is a "descriptor" for a vector of 8-bit unsigned ints
-    const hn::ScalableTag<uint8_t> d;
-    
-    // If this is NEON, hn::Lanes(d) will be 16
-    size_t lanes = hn::Lanes(d);
-    
-    auto v1 = hn::Set(d, 10);
-    auto v2 = hn::Set(d, 20);
-    auto res = hn::Add(v1, v2); // res lanes all contain 30
-    
-    std::cout << "--- Highway Status ---" << std::endl;
-    std::cout << "Target: " << hwy::TargetName(hwy::SupportedTargets()) << std::endl;
-    std::cout << "Vector lanes (uint8): " << lanes << std::endl;
-    std::cout << "----------------------" << std::endl;
-}
+
 int main(int argc, char** argv) {
-    std::string forestPath = "../../forests/defaultZeroForest.txt";
-    std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png";
-    std::string rightImgPath =
-        "../../data/kitti/training/image_1/000000_10.png";
+    std::string forestPath = "../forests/defaultZeroForest.txt";
+    std::string leftImgPath = "../data/middlebury/im0.png";
+    std::string rightImgPath = "../data/middlebury/im1.png";
 
     if (argc == 4) {
         forestPath = argv[1];
@@ -51,7 +33,7 @@ int main(int argc, char** argv) {
     gpc::inference::InferenceSettings inferencesettings =
         gpc::inference::InferenceSettings()
             .builder()
-            .gradientThreshold(2) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. 
+            .gradientThreshold(1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. 
             .verticalTolerance(
                 0)               // 0px tolerance for rectified epipolar matches
             .dispHigh(128)       // limit disparities to 128
@@ -68,9 +50,6 @@ int main(int argc, char** argv) {
     gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
-    for(int i = 0; i<10000; i++) {
-    // Preprocess images (box filter, sobel filter, indices of high gradient
-    // pixels)
 
     gpc::inference::time_point t0 = gpc::inference::sysTick();
 
@@ -84,8 +63,8 @@ int main(int argc, char** argv) {
     std::vector<ndb::Support> supp =
         forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
     gpc::inference::time_point t2 = gpc::inference::sysTick();
+    std::cout << "Number of features(s,t): " << simgP.mask.size() << "," << timgP.mask.size() << std::endl;
+    std::cout << "Number of matches: " << supp.size() << std::endl;
     std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
     std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl;
-    }
-    test_hwy_neon();
 }

From 55994f5592bb1b8f2b922d2a1e3a3ce5b87a486b Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 8 Mar 2026 15:22:13 +0100
Subject: [PATCH 28/36] add correspondence bench and(de)serialization

---
 benchmarks/correspondence_bench.cpp | 353 ++++++++++++++++++++++++++++
 lib/gpc/buffer.hpp                  |  55 +++++
 lib/gpc/forest.cpp                  |  10 +-
 samples/sparsematch.cpp             |   7 +
 4 files changed, 421 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/correspondence_bench.cpp

diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
new file mode 100644
index 0000000..e16eb9b
--- /dev/null
+++ b/benchmarks/correspondence_bench.cpp
@@ -0,0 +1,353 @@
+#include <benchmark/benchmark.h>
+#include "gpc/forest.hpp"
+#include "gpc/inference.hpp"
+#include <vector>
+#include <random>
+#include <cmath>
+#include <cstdint>
+
+#define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024
+
+/* Remaining ideas
+ * -USE ILP: Parallel Radix Partitioning (Even on one core, using a single-pass shuffle).
+ *      - Didn't speed up. was same as matchPreparedFramesFaster
+ *      - Assuming that the bottleneck is the hash table probes, hence: look into bloom filters...
+ * -Blocked Bloom Filter to discard non-matches in L1.
+ *      - Faster at 1M, slower at 100K and 10M 
+ * -SIMD-Probed Flat Table (checking 4 slots at once).
+ * -Manual Prefetching of the next bucket's data.
+ * */
+/**
+ * Generates a reproducible Pareto-distributed vector.
+ * @param count Number of IDs to generate.
+ * @param target_mean The theoretical mean (requires alpha > 1).
+ * @param seed A fixed value (e.g., 42) for deterministic benchmarks.
+ */
+std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mean, uint32_t seed = 42) {
+    std::vector<ndb::Descriptor> ids;
+    ids.reserve(count);
+
+    // Using a fixed seed for benchmark consistency
+    std::mt19937 gen(seed); 
+    
+    // 1e-9 epsilon prevents division by zero/infinity
+    std::uniform_real_distribution<double> dist(1e-9, 1.0);
+
+    // Alpha = 1.16 provides a classic "80/20" Pareto distribution
+    const double alpha = 1.16; 
+    const double xm = target_mean * (alpha - 1.0) / alpha;
+
+    for (size_t i = 0; i < count; ++i) {
+        // Inverse Transform Sampling
+        double val = xm / std::pow(dist(gen), 1.0 / alpha);
+        
+        // Reduce modulo 2^32 before the cast: converting a double above
+        // UINT32_MAX straight to uint32_t is undefined behaviour, not a wrap.
+        ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast<uint32_t>(std::fmod(val, 4294967296.0))));
+    }
+
+    return ids;
+}
+std::vector<ndb::Descriptor> getSrcDescriptors() {
+    return ndb::Descriptor::deserialize("statesSrc.txt", true);
+    //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
+}
+
+std::vector<ndb::Descriptor> getTarDescriptors() {
+    return ndb::Descriptor::deserialize("statesTar.txt", false);
+    //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
+}
+std::vector<ndb::Descriptor> generate_unique_ids(size_t count) {
+    std::vector<ndb::Descriptor> ids;
+    ids.reserve(count);
+
+
+    for (size_t i = 0; i < count; ++i) {
+        ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast<uint32_t>(i)));
+    }
+    return ids;
+}
+static void matchBySorting(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        std::vector<ndb::Correspondence> 
+            matches = gpc::inference::Forest::findCorrespondences(src, tar);
+
+        state.counters["matches"] = matches.size();
+        //state.counters["candidates_t"] = timgP.mask.size();
+        //state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchByHashing(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        std::vector<ndb::Correspondence> 
+            matches = gpc::inference::Forest::findCorrespondencesTurbo(src, tar);
+
+        state.counters["matches"] = matches.size();
+        //state.counters["candidates_t"] = timgP.mask.size();
+        //state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchPreparedFrames(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
+        auto matches = gpc::inference::Forest::matchPreparedFrames(v.first, v.second);
+
+        state.counters["matches"] = matches.first.size();
+        //state.counters["candidates_t"] = timgP.mask.size();
+        //state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchPreparedFramesFaster(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
+        //auto matches = gpc::inference::Forest::matchPreparedFramesFaster(v.first, v.second);
+        auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first,v.second);
+        state.counters["matches"] = matches.first.size();
+        // NOTE(review): despite its name, this benchmark times matchParallelRadixPartitioning
+        // (the matchPreparedFramesFaster call above is commented out) - confirm this is intended.
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchParallelRadixPartitioning(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
+        auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first, v.second);
+
+        state.counters["matches"] = matches.first.size();
+        //state.counters["candidates_t"] = timgP.mask.size();
+        //state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchBlockedBloom(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+                                                    
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
+        auto matches = gpc::inference::Forest::matchBlockedBloom(v.first, v.second);
+
+        state.counters["matches"] = matches.first.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchAdaptive(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
+        auto matches = gpc::inference::Forest::matchAdaptive(v.first, v.second);
+
+        state.counters["matches"] = matches.first.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+/*
+static void matchAdaptiveNeon(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> src, tar;
+    src = generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
+    tar = generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
+                                                    
+    for (auto _ : state) {
+        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
+        auto matches = gpc::inference::Forest::matchAdaptiveNeon(v.first, v.second);
+
+        state.counters["matches"] = matches.first.size();
+        //state.counters["candidates_t"] = timgP.mask.size();
+        //state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+*/
+static void matchAdaptivePersistent(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+
+        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
+        auto matches = gpc::inference::Forest::matchAdaptivePersistent(srcFrame, tarFrame);
+
+        state.counters["matches"] = matches.first.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchPipelinedBranchless(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+
+        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
+        auto matches = gpc::inference::Forest::matchPipelinedBranchless(srcFrame, tarFrame);
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+static void matchPipelinedBranchlessPreallocate(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+    std::vector<uint32_t> resultSrc, resultTar;
+    resultSrc.reserve(srcOriginal.size()/100);
+    resultTar.reserve(tarOriginal.size()/100);
+    for (auto _ : state) {
+        state.PauseTiming();
+        resultSrc.clear();
+        resultTar.clear();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+
+        // 1. Measure Prepare
+        // 2M: 5.7ms, 20M: 57ms
+        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
+
+        // 2. Measure Match
+        // 2M: 5.3ms , 20M: 53ms
+        gpc::inference::Forest::matchPipelinedBranchlessPreallocate(srcFrame, tarFrame, resultSrc, resultTar);
+
+        state.counters["matches"] = resultSrc.size();
+        
+        benchmark::DoNotOptimize(resultSrc);
+        benchmark::DoNotOptimize(resultTar);
+        benchmark::ClobberMemory();
+    }
+}
+
+static void matchPipelinedBranchlessPreallocateSingleSlab(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+                                                    
+    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+    std::vector<uint32_t> resultSrc, resultTar;
+    resultSrc.reserve(srcOriginal.size()/10);
+    resultTar.reserve(tarOriginal.size()/10);
+    for (auto _ : state) {
+        state.PauseTiming();
+        resultSrc.clear();
+        resultTar.clear();
+        // 1. Measure Prepare
+        // 2M: 5.7ms, 20M: 57ms
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        gpc::inference::Forest::prepareSoAFramesPersistentSingleSlab(src, tar, srcFrame, tarFrame);
+
+        // 2. Measure Match
+        // 2M: 5.3ms , 20M: 53ms
+        gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlab(srcFrame, tarFrame, resultSrc, resultTar);
+
+        state.counters["matches"] = resultSrc.size();
+        
+        benchmark::DoNotOptimize(resultSrc);
+        benchmark::DoNotOptimize(resultTar);
+        benchmark::ClobberMemory();
+    }
+}
+BENCHMARK(matchBySorting)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchByHashing)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchPreparedFrames)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchPreparedFramesFaster)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchParallelRadixPartitioning)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchBlockedBloom)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchAdaptive)
+    ->Unit(benchmark::kMillisecond);
+/*
+BENCHMARK(matchAdaptiveNeon)
+    ->Unit(benchmark::kMillisecond);
+*/
+BENCHMARK(matchAdaptivePersistent)
+    ->Unit(benchmark::kMillisecond);
+
+BENCHMARK(matchPipelinedBranchless)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchPipelinedBranchlessPreallocate)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_MAIN();
diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp
index 453ecaa..26e261a 100644
--- a/lib/gpc/buffer.hpp
+++ b/lib/gpc/buffer.hpp
@@ -35,6 +35,10 @@
 #include <Eigen/Dense>
 #include <type_traits>
 #include <vector>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <sstream>
 
 using namespace std;
 
@@ -80,6 +84,57 @@ struct Descriptor {
     bool operator<(const Descriptor& d) const { return state < d.state; }
     bool operator<=(const Descriptor& d) const { return state <= d.state; }
     int operator%(const int& d) const { return state % d; }
+    static void serialize(const std::string& filename, const std::vector<Descriptor>& data) {
+        std::ofstream outFile(filename);
+        if (!outFile.is_open()) {
+            std::cerr << "Error opening file for writing: " << filename << std::endl;
+            return;
+        }
+
+        for (const auto& desc : data) {
+            outFile << desc.point.x << "," 
+                    << desc.point.y << "," 
+                    << desc.state << "\n";
+        }
+        outFile.close();
+    }
+
+    /**
+     * Deserializes a CSV file back into a vector of Descriptors.
+     */
+    static std::vector<Descriptor> deserialize(const std::string& filename, bool srcDescr) {
+        std::vector<Descriptor> result;
+        std::ifstream inFile(filename);
+        if (!inFile.is_open()) {
+            std::cerr << "Error opening file for reading: " << filename << std::endl;
+            return result;
+        }
+
+        std::string line;
+        while (std::getline(inFile, line)) {
+            if (line.empty()) continue;
+
+            std::stringstream ss(line);
+            std::string x_str, y_str, state_str;
+
+            // Split by comma
+            if (std::getline(ss, x_str, ',') &&
+                std::getline(ss, y_str, ',') &&
+                std::getline(ss, state_str, ',')) {
+                
+                Descriptor d;
+                d.point.x = std::stod(x_str);
+                d.point.y = std::stod(y_str);
+                d.state = std::stoull(state_str);
+                d.srcDescr = srcDescr; 
+                
+                if (d.point.y > 200 && d.point.y < 400)
+                    result.push_back(d);
+            }
+        }
+        inFile.close();
+        return result;
+    }
 };
 // Keeps support points with associated disparity
 // Support points are only used in the left image
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 876fe0b..026733d 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -701,8 +701,7 @@ std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptive(
         for (size_t i = 0; i < sStates.size(); ++i) {
             // Prefetch an element roughly 16 iterations ahead (adjust based on testing)
             /*
-             * This didn't help anymore. So either compiler already optimized this or 
-             * we are compute bound.
+             * This didn't help anymore. 
              * if (i + 16 < sStates.size()) {
                 __builtin_prefetch(&sStates[i + 16], 0, 3);
                 __builtin_prefetch(&sIdxs[i + 16], 0, 3);
@@ -1069,6 +1068,7 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
     PreprocessedImage& tar,
     FilterMask& fastmask,
     InferenceSettings& settings) {
+    std::chrono::high_resolution_clock::time_point t0, t1;
     std::vector<ndb::Descriptor> statesSrc = evalFastMaskOnSubsetSSE(
         src.smooth, src.grad, src.mask, fastmask, settings);
     std::vector<ndb::Descriptor> statesTar = evalFastMaskOnSubsetSSE(
@@ -1081,8 +1081,12 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
     }
     // Use sort method for matching
     if (settings.useHashtable_ == false) {
+    t0 = sysTick();
         std::vector<ndb::Correspondence> corr =
             findCorrespondences(statesSrc, statesTar);
+    t1 = sysTick();
+    std::cout << "findCorrespondences (without allocation): " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
+    std::cout << "length src: " << statesSrc.size() << std::endl;
         return corr;
     }
     // Use hashtable matching
@@ -1519,11 +1523,9 @@ std::vector<ndb::Correspondence> Forest::stereoMatch(PreprocessedImage& simg,
         "Targe Image: dimension does not fit dimension of supplied forest "
         "mask");
     bool m_debug = false;
-    std::chrono::high_resolution_clock::time_point t0, t1;
     // Match
     std::vector<ndb::Correspondence> corr =
         depthPriorFast(simg, timg, forestmask, settings);
-    t1 = sysTick();
 
     return corr;
 }
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 85070a7..3b94d9f 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -67,4 +67,11 @@ int main(int argc, char** argv) {
     std::cout << "Number of matches: " << supp.size() << std::endl;
     std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
     std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl;
+    std::vector<ndb::Descriptor> statesSrc = forest.evalFastMaskOnSubsetSSE(
+        simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings);
+    std::vector<ndb::Descriptor> statesTar = forest.evalFastMaskOnSubsetSSE(
+        timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings);
+    ndb::Descriptor::serialize("statesSrc.txt", statesSrc);
+    ndb::Descriptor::serialize("statesTar.txt", statesTar);
+
 }

From 31bd8a227987fcaf46059861ead5b2b807e26628 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 8 Mar 2026 16:12:40 +0100
Subject: [PATCH 29/36] add naive hash match version

---
 benchmarks/correspondence_bench.cpp | 19 ++++++++++
 lib/gpc/buffer.hpp                  |  2 +-
 lib/gpc/forest.cpp                  | 54 +++++++++++++++++++++++++----
 lib/gpc/forest.hpp                  |  3 ++
 4 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
index e16eb9b..fe4139f 100644
--- a/benchmarks/correspondence_bench.cpp
+++ b/benchmarks/correspondence_bench.cpp
@@ -86,6 +86,23 @@ static void matchBySorting(
         benchmark::ClobberMemory();
     }
 }
+static void matchByHashingNaive(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        std::vector<ndb::Correspondence> 
+            matches = gpc::inference::Forest::findCorrespondencesHashNaive(src, tar);
+
+        state.counters["matches"] = matches.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
 static void matchByHashing(
         benchmark::State& state) {
     std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
@@ -325,6 +342,8 @@ static void matchPipelinedBranchlessPreallocateSingleSlab(
 }
 BENCHMARK(matchBySorting)
     ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchByHashingNaive)
+    ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchByHashing)
     ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchPreparedFrames)
diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp
index 26e261a..903a507 100644
--- a/lib/gpc/buffer.hpp
+++ b/lib/gpc/buffer.hpp
@@ -128,7 +128,7 @@ struct Descriptor {
                 d.state = std::stoull(state_str);
                 d.srcDescr = srcDescr; 
                 
-                if (d.point.y > 200 && d.point.y < 400)
+                //if (d.point.y > 200 && d.point.y < 400)
                     result.push_back(d);
             }
         }
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 026733d..e95fea9 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -53,6 +53,7 @@
 #include "gpc/kernels/utils.hpp"
 #include "gpc/hashmatch.hpp"
 #include "gpc/forest.hpp"
+#include <unordered_map>
 
 
 namespace gpc {
@@ -421,12 +422,6 @@ void Forest::matchPipelinedBranchlessPreallocate(
     std::vector<uint32_t>& resultSrc,
     std::vector<uint32_t>& resultTar) {
 
-    //std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    // For 100M items, we might find more matches; 
-    // adjusting reserve to prevent mid-run reallocations.
-    //result.first.reserve(src.statesSlab.size() / 100); 
-    //result.second.reserve(src.statesSlab.size() / 100);
-
     struct Slot { 
         uint64_t key; 
         uint32_t idx; 
@@ -1140,7 +1135,52 @@ std::vector<ndb::Correspondence> Forest::findCorrespondences(
     }
     return corr;
 }
-#include <unordered_map>
+std::vector<ndb::Correspondence> Forest::findCorrespondencesHashNaive(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    
+    std::vector<ndb::Correspondence> corr;
+    struct DescriptorHasher {
+        std::size_t operator()(const ndb::Descriptor& d) const {
+            // Just return the state since it's already a unique-ish 64-bit int
+            return static_cast<std::size_t>(d.state);
+        }
+    };
+    // 1. Count frequencies in Source
+    std::unordered_map<ndb::Descriptor, int, DescriptorHasher> srcCounts;
+    std::unordered_map<ndb::Descriptor, int, DescriptorHasher> tarCounts;
+    for (const auto& d : srcStates) {
+        srcCounts[d]++;
+    }
+
+    // 2. Count frequencies in Target
+    for (const auto& d : tarStates) {
+        tarCounts[d]++;
+    }
+
+    // 3. Match only if the descriptor is unique in both (count == 1)
+    // We iterate through srcStates to maintain a similar "order" or 
+    // simply to find potential matches.
+    for (const auto& srcDesc : srcStates) {
+        // Is it unique in Source?
+        if (srcCounts[srcDesc] == 1) {
+            // Does it exist and is it unique in Target?
+            if (tarCounts.count(srcDesc) && tarCounts[srcDesc] == 1) {
+                
+                // We need the actual target object to get the 'point' 
+                // In a naive way, we just go find it.
+                for (const auto& tarDesc : tarStates) {
+                    if (tarDesc == srcDesc) {
+                        corr.push_back(ndb::Correspondence(srcDesc.point, tarDesc.point));
+                        break; 
+                    }
+                }
+            }
+        }
+    }
+
+    return corr;
+}
 
 // State machine for our IDs
 enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index 9f48c87..559a5c3 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -241,6 +241,9 @@ class Forest {
     static std::vector<ndb::Correspondence> findCorrespondences(
         std::vector<ndb::Descriptor>& srcStates,
         std::vector<ndb::Descriptor>& tarStates);
+    static std::vector<ndb::Correspondence> findCorrespondencesHashNaive(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
     static std::vector<ndb::Correspondence> findCorrespondencesHash(
         std::vector<ndb::Descriptor>& srcStates,
         std::vector<ndb::Descriptor>& tarStates);

From 46b8d5d6067eb95d453eddd71eff52210f96c7d4 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Sun, 15 Mar 2026 20:14:09 +0100
Subject: [PATCH 30/36] add additional matching method

---
 benchmarks/correspondence_bench.cpp |  47 +++++++++++--
 lib/gpc/forest.cpp                  | 104 ++++++++++++++++++++++++++++
 lib/gpc/forest.hpp                  |   8 +++
 tests/test_single_matching.cpp      |  37 ++++++++++
 4 files changed, 190 insertions(+), 6 deletions(-)

diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
index fe4139f..d7655f2 100644
--- a/benchmarks/correspondence_bench.cpp
+++ b/benchmarks/correspondence_bench.cpp
@@ -49,13 +49,13 @@ std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mea
     return ids;
 }
 std::vector<ndb::Descriptor> getSrcDescriptors() {
-    return ndb::Descriptor::deserialize("statesSrc.txt", true);
-    //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
+    //return ndb::Descriptor::deserialize("statesSrc.txt", true);
+    return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
 }
 
 std::vector<ndb::Descriptor> getTarDescriptors() {
-    return ndb::Descriptor::deserialize("statesTar.txt", false);
-    //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
+    //return ndb::Descriptor::deserialize("statesTar.txt", false);
+    return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
 }
 std::vector<ndb::Descriptor> generate_unique_ids(size_t count) {
     std::vector<ndb::Descriptor> ids;
@@ -340,12 +340,45 @@ static void matchPipelinedBranchlessPreallocateSingleSlab(
         benchmark::ClobberMemory();
     }
 }
+static void matchPipelinedBranchlessPreallocateSingleSlabUnordered(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+                                                    
+    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+    std::vector<uint32_t> resultSrc, resultTar;
+    resultSrc.reserve(srcOriginal.size()/10);
+    resultTar.reserve(tarOriginal.size()/10);
+    for (auto _ : state) {
+        state.PauseTiming();
+        resultSrc.clear();
+        resultTar.clear();
+        // 1. Measure Prepare
+        // 2M: 5.7ms, 20M: 57ms
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(src, tar, srcFrame, tarFrame);
+
+        // 2. Measure Match
+        // 2M: 5.3ms , 20M: 53ms
+        gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar);
+
+        state.counters["matches"] = resultSrc.size();
+        
+        benchmark::DoNotOptimize(resultSrc);
+        benchmark::DoNotOptimize(resultTar);
+        benchmark::ClobberMemory();
+    }
+}
 BENCHMARK(matchBySorting)
     ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchByHashingNaive)
-    ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchByHashing)
     ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchByHashingNaive)
+    ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchPreparedFrames)
     ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchPreparedFramesFaster)
@@ -369,4 +402,6 @@ BENCHMARK(matchPipelinedBranchlessPreallocate)
     ->Unit(benchmark::kMillisecond);
 BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab)
     ->Unit(benchmark::kMillisecond);
+BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlabUnordered)
+    ->Unit(benchmark::kMillisecond);
 BENCHMARK_MAIN();
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index e95fea9..5ebcbf0 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -58,6 +58,110 @@
 
 namespace gpc {
 namespace inference {
+void Forest::prepareSoAFramesPersistentSingleSlabUnordered(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates,
+    SoAFramePersistentSingleSlab& srcFrame, 
+    SoAFramePersistentSingleSlab& tarFrame) {
+
+    uint32_t srcCounts[256] = {0}, tarCounts[256] = {0};
+    for (const auto& s : srcStates) {
+        srcCounts[s.state & 0xFF]++;
+    }
+    for (const auto& t : tarStates) {
+        tarCounts[t.state & 0xFF]++;
+    }
+
+    StateIdx* sP = srcFrame.slab.data();
+    StateIdx* tP = tarFrame.slab.data();
+    for (int i = 0; i < 256; ++i) {
+        srcFrame.bucketData[i] = sP;
+        srcFrame.bucketSizes[i] = srcCounts[i];
+        tarFrame.bucketData[i] = tP;
+        tarFrame.bucketSizes[i] = tarCounts[i];
+        sP += srcCounts[i]; tP += tarCounts[i];
+    }
+
+    uint32_t sW[256] = {0}, tW[256] = {0};
+    // FIX: Split into two independent loops
+    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
+        uint64_t sv = srcStates[i].state;
+        srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i};
+    }
+    for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) {
+        uint64_t tv = tarStates[i].state;
+        tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i};
+    }
+}
+void Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(
+    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
+    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT) {
+
+    struct Slot { 
+        uint64_t key;   
+        uint32_t idx;   
+        uint32_t gen;   
+        uint32_t count; 
+        uint32_t outIdx; // FIX: Track where the match was written in the output vectors
+    };
+    static std::vector<Slot> table(16384, {0, 0, 0, 0, 0});
+    static uint32_t currentGen = 1;
+
+    for (int b = 0; b < 256; ++b) {
+        StateIdx* sData = src.bucketData[b];
+        uint32_t  sSize = src.bucketSizes[b];
+        if (sSize == 0) continue;
+
+        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
+        const uint32_t shift = (sSize < 1000) ? 53 : 50;
+        currentGen++;
+
+        for (uint32_t i = 0; i < sSize; ++i) {
+            uint64_t k = sData[i].state;
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
+            if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1, 0};
+            else table[h].count++;
+        }
+
+        StateIdx* tData = tar.bucketData[b];
+        uint32_t  tSize = tar.bucketSizes[b];
+        for (uint32_t i = 0; i < tSize; ++i) {
+            uint64_t k = tData[i].state;
+            uint32_t h = (k * 11400714819323198485llu) >> shift;
+            h &= mask;
+            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
+
+            if (table[h].gen == currentGen && table[h].key == k) {
+                if (table[h].count == 1) {
+                    // Unique source, first target match
+                    table[h].outIdx = outS.size(); // Remember the index
+                    outS.push_back(table[h].idx);
+                    outT.push_back(tData[i].index);
+                    table[h].count = 0xFFFFFFFF; // Mark as matched once
+                } else if (table[h].count == 0xFFFFFFFF) {
+                    // Duplicate target found! Invalidate the previously written match.
+                    outS[table[h].outIdx] = 0xFFFFFFFF; 
+                    outT[table[h].outIdx] = 0xFFFFFFFF;
+                    table[h].count = 0xEEEEEEEE; // Mark as ruined
+                }
+            }
+        }
+    }
+
+    // FIX: Final compaction pass to remove invalidated matches (sentinels)
+    uint32_t validCount = 0;
+    for (size_t i = 0; i < outS.size(); ++i) {
+        if (outS[i] != 0xFFFFFFFF) {
+            outS[validCount] = outS[i];
+            outT[validCount] = outT[i];
+            validCount++;
+        }
+    }
+    outS.resize(validCount);
+    outT.resize(validCount);
+}
 void Forest::prepareSoAFramesPersistentSingleSlab(
     std::vector<ndb::Descriptor>& srcStates,
     std::vector<ndb::Descriptor>& tarStates,
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index 559a5c3..9e45600 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -308,6 +308,14 @@ static void matchPipelinedBranchlessPreallocateSingleSlab(
 
 
 
+static void prepareSoAFramesPersistentSingleSlabUnordered(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates,
+    SoAFramePersistentSingleSlab& srcFrame, 
+    SoAFramePersistentSingleSlab& tarFrame);
+static void matchPipelinedBranchlessPreallocateSingleSlabUnordered(
+    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
+    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT);
 
 
     /**
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index e675a7c..2893b01 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -52,5 +52,42 @@ TEST(Approval, Inference)
     EXPECT_EQ(866, supp.size());
     ApprovalTests::Approvals::verify(ss.str());
 }
+std::vector<ndb::Descriptor> getSrcDescriptors() {
+    return ndb::Descriptor::deserialize("statesSrc.txt", true);
+}
+
+std::vector<ndb::Descriptor> getTarDescriptors() {
+    return ndb::Descriptor::deserialize("statesTar.txt", false);
+}
 
 
+TEST(A,B) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    std::vector<ndb::Descriptor> srcBaseline = srcOriginal;
+    std::vector<ndb::Descriptor> tarBaseline = tarOriginal;
+    std::vector<ndb::Descriptor> srcAlt = srcOriginal;
+    std::vector<ndb::Descriptor> tarAlt = tarOriginal;
+    
+    // Baseline
+    // To write a test for this we'd actually need to get the ids of the sources back, not just the final matches.
+    std::vector<ndb::Correspondence> 
+        matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline);
+
+
+    // Alternative method
+    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+
+    std::vector<uint32_t> resultSrc, resultTar;
+    resultSrc.reserve(srcOriginal.size()/10);
+    resultTar.reserve(tarOriginal.size()/10);
+    gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(srcAlt, tarAlt, srcFrame, tarFrame);
+    gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar);
+
+    // Ensure ID pairings of (resultSrc, resultTar) match the naive version. 
+    // We ignore exact matching for now and just expect the count to be the same
+    EXPECT_EQ(matches.size(), resultSrc.size());
+    EXPECT_EQ(matches.size(), resultTar.size());
+}

From 62807c6c5aaa1d8236ed91a34d89c87e2c74aba0 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 07:40:49 +0200
Subject: [PATCH 31/36] move individual HT benchmarks to separate repo

---
 benchmarks/correspondence_bench.cpp |  358 +------
 benchmarks/sobel_bench.cpp          |    2 +-
 lib/gpc/forest.cpp                  | 1388 +--------------------------
 lib/gpc/forest.hpp                  |  119 ---
 lib/gpc/kernels/box.cpp             |    1 -
 lib/gpc/kernels/sobel.cpp           |    1 -
 samples/sparsematch.cpp             |   35 +-
 tests/test_single_matching.cpp      |    4 +-
 8 files changed, 54 insertions(+), 1854 deletions(-)

diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
index d7655f2..737d4dc 100644
--- a/benchmarks/correspondence_bench.cpp
+++ b/benchmarks/correspondence_bench.cpp
@@ -8,15 +8,6 @@
 
 #define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024
 
-/* Remaining ideas
- * -USE ILP: Parallel Radix Partitioning (Even on one core, using a single-pass shuffle).
- *      - Didn't speed up. was same as matchPreparedFramesFaster
- *      - Assuming that the bottleneck is the hash table probes, hence: look into bloom filters...
- * -Blocked Bloom Filter to discard non-matches in L1.
- *      - Faster at 1M, slower at 100K and 10M 
- * -SIMD-Probed Flat Table (checking 4 slots at once).
- * -Manual Prefetching of the next bucket's data.
- * */
 /**
  * Generates a reproducible Pareto-distributed vector.
  * @param count Number of IDs to generate.
@@ -49,23 +40,28 @@ std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mea
     return ids;
 }
 std::vector<ndb::Descriptor> getSrcDescriptors() {
-    //return ndb::Descriptor::deserialize("statesSrc.txt", true);
-    return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
+    std::vector<ndb::Descriptor> v =  ndb::Descriptor::deserialize("statesSrcLarge.txt", true);
+    std::vector<ndb::Descriptor> out;
+    for (size_t i = 0; i < v.size(); i++) {
+        if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { 
+            out.push_back(v[i]);
+        }
+    }
+    return out;
+    //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
 }
 
 std::vector<ndb::Descriptor> getTarDescriptors() {
-    //return ndb::Descriptor::deserialize("statesTar.txt", false);
-    return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
-}
-std::vector<ndb::Descriptor> generate_unique_ids(size_t count) {
-    std::vector<ndb::Descriptor> ids;
-    ids.reserve(count);
-
-
-    for (size_t i = 0; i < count; ++i) {
-        ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast<uint32_t>(i)));
+    std::vector<ndb::Descriptor> v = ndb::Descriptor::deserialize("statesTarLarge.txt", false);
+    std::vector<ndb::Descriptor> out;
+    for (size_t i = 0; i < v.size(); i++) {
+        if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { 
+            out.push_back(v[i]);
+        }
     }
-    return ids;
+    return out;
+
+    //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
 }
 static void matchBySorting(
         benchmark::State& state) {
@@ -80,328 +76,10 @@ static void matchBySorting(
             matches = gpc::inference::Forest::findCorrespondences(src, tar);
 
         state.counters["matches"] = matches.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchByHashingNaive(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        std::vector<ndb::Correspondence> 
-            matches = gpc::inference::Forest::findCorrespondencesHashNaive(src, tar);
-
-        state.counters["matches"] = matches.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchByHashing(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        std::vector<ndb::Correspondence> 
-            matches = gpc::inference::Forest::findCorrespondencesTurbo(src, tar);
-
-        state.counters["matches"] = matches.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchPreparedFrames(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
-        auto matches = gpc::inference::Forest::matchPreparedFrames(v.first, v.second);
-
-        state.counters["matches"] = matches.first.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
         benchmark::DoNotOptimize(matches);
         benchmark::ClobberMemory();
     }
 }
-static void matchPreparedFramesFaster(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
-        //auto matches = gpc::inference::Forest::matchPreparedFramesFaster(v.first, v.second);
-        auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first,v.second);
-        state.counters["matches"] = matches.first.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchParallelRadixPartitioning(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
-        auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first, v.second);
-
-        state.counters["matches"] = matches.first.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchBlockedBloom(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-                                                    
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
-        auto matches = gpc::inference::Forest::matchBlockedBloom(v.first, v.second);
-
-        state.counters["matches"] = matches.first.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchAdaptive(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        auto v = gpc::inference::Forest::prepareSoAFrames(src,tar);
-        auto matches = gpc::inference::Forest::matchAdaptive(v.first, v.second);
-
-        state.counters["matches"] = matches.first.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-/*
-static void matchAdaptiveNeon(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> src, tar;
-    src = generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
-    tar = generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
-                                                    
-    for (auto _ : state) {
-        auto v = gpc::inference::Forest::prepareSoAFrames(src, tar);
-        auto matches = gpc::inference::Forest::matchAdaptiveNeon(v.first, v.second);
-
-        state.counters["matches"] = matches.first.size();
-        //state.counters["candidates_t"] = timgP.mask.size();
-        //state.counters["matches"] = supp.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-*/
-static void matchAdaptivePersistent(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
-    srcFrame.preallocate(srcOriginal.size()); // size known
-    tarFrame.preallocate(tarOriginal.size());
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-
-        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
-        auto matches = gpc::inference::Forest::matchAdaptivePersistent(srcFrame, tarFrame);
-
-        state.counters["matches"] = matches.first.size();
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchPipelinedBranchless(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
-    srcFrame.preallocate(srcOriginal.size()); // size known
-    tarFrame.preallocate(tarOriginal.size());
-    for (auto _ : state) {
-        state.PauseTiming();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-
-        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
-        auto matches = gpc::inference::Forest::matchPipelinedBranchless(srcFrame, tarFrame);
-        benchmark::DoNotOptimize(matches);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchPipelinedBranchlessPreallocate(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-    gpc::inference::SoAFramePersistent srcFrame, tarFrame;
-    srcFrame.preallocate(srcOriginal.size()); // size known
-    tarFrame.preallocate(tarOriginal.size());
-    std::vector<uint32_t> resultSrc, resultTar;
-    resultSrc.reserve(srcOriginal.size()/100);
-    resultTar.reserve(tarOriginal.size()/100);
-    for (auto _ : state) {
-        state.PauseTiming();
-        resultSrc.clear();
-        resultTar.clear();
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-
-        // 1. Measure Prepare
-        // 2M: 5.7ms, 20M: 57ms
-        gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame);
-
-        // 2. Measure Match
-        // 2M: 5.3ms , 20M: 53ms
-        gpc::inference::Forest::matchPipelinedBranchlessPreallocate(srcFrame, tarFrame, resultSrc, resultTar);
-
-        state.counters["matches"] = resultSrc.size();
-        
-        benchmark::DoNotOptimize(resultSrc);
-        benchmark::DoNotOptimize(resultTar);
-        benchmark::ClobberMemory();
-    }
-}
-
-static void matchPipelinedBranchlessPreallocateSingleSlab(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-                                                    
-    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
-    srcFrame.preallocate(srcOriginal.size()); // size known
-    tarFrame.preallocate(tarOriginal.size());
-    std::vector<uint32_t> resultSrc, resultTar;
-    resultSrc.reserve(srcOriginal.size()/10);
-    resultTar.reserve(tarOriginal.size()/10);
-    for (auto _ : state) {
-        state.PauseTiming();
-        resultSrc.clear();
-        resultTar.clear();
-        // 1. Measure Prepare
-        // 2M: 5.7ms, 20M: 57ms
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        gpc::inference::Forest::prepareSoAFramesPersistentSingleSlab(src, tar, srcFrame, tarFrame);
-
-        // 2. Measure Match
-        // 2M: 5.3ms , 20M: 53ms
-        gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlab(srcFrame, tarFrame, resultSrc, resultTar);
-
-        state.counters["matches"] = resultSrc.size();
-        
-        benchmark::DoNotOptimize(resultSrc);
-        benchmark::DoNotOptimize(resultTar);
-        benchmark::ClobberMemory();
-    }
-}
-static void matchPipelinedBranchlessPreallocateSingleSlabUnordered(
-        benchmark::State& state) {
-    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
-    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
-                                                    
-    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
-    srcFrame.preallocate(srcOriginal.size()); // size known
-    tarFrame.preallocate(tarOriginal.size());
-    std::vector<uint32_t> resultSrc, resultTar;
-    resultSrc.reserve(srcOriginal.size()/10);
-    resultTar.reserve(tarOriginal.size()/10);
-    for (auto _ : state) {
-        state.PauseTiming();
-        resultSrc.clear();
-        resultTar.clear();
-        // 1. Measure Prepare
-        // 2M: 5.7ms, 20M: 57ms
-        std::vector<ndb::Descriptor> src = srcOriginal;
-        std::vector<ndb::Descriptor> tar = tarOriginal;
-        state.ResumeTiming();
-        gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(src, tar, srcFrame, tarFrame);
-
-        // 2. Measure Match
-        // 2M: 5.3ms , 20M: 53ms
-        gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar);
-
-        state.counters["matches"] = resultSrc.size();
-        
-        benchmark::DoNotOptimize(resultSrc);
-        benchmark::DoNotOptimize(resultTar);
-        benchmark::ClobberMemory();
-    }
-}
 BENCHMARK(matchBySorting)
     ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchByHashing)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchByHashingNaive)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchPreparedFrames)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchPreparedFramesFaster)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchParallelRadixPartitioning)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchBlockedBloom)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchAdaptive)
-    ->Unit(benchmark::kMillisecond);
-/*
-BENCHMARK(matchAdaptiveNeon)
-    ->Unit(benchmark::kMillisecond);
-*/
-BENCHMARK(matchAdaptivePersistent)
-    ->Unit(benchmark::kMillisecond);
-
-BENCHMARK(matchPipelinedBranchless)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchPipelinedBranchlessPreallocate)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab)
-    ->Unit(benchmark::kMillisecond);
-BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlabUnordered)
-    ->Unit(benchmark::kMillisecond);
 BENCHMARK_MAIN();
diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp
index 5c26d89..490a861 100644
--- a/benchmarks/sobel_bench.cpp
+++ b/benchmarks/sobel_bench.cpp
@@ -40,7 +40,7 @@ static void BM_SobelNaive(benchmark::State& state) {
 
     state.SetLabel("naive");    
     for (auto _ : state) {
-        ndb::sobelNaive(in.data(), out.data(), w, h, 1);
+        ndb::sobelNaive(in.data(), out.data(), w, h, 50);
         
         // Ensure the compiler doesn't skip the work
         benchmark::DoNotOptimize(out.data());
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 5ebcbf0..87e0582 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -58,1099 +58,6 @@
 
 namespace gpc {
 namespace inference {
-void Forest::prepareSoAFramesPersistentSingleSlabUnordered(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates,
-    SoAFramePersistentSingleSlab& srcFrame, 
-    SoAFramePersistentSingleSlab& tarFrame) {
-
-    uint32_t srcCounts[256] = {0}, tarCounts[256] = {0};
-    for (const auto& s : srcStates) {
-        srcCounts[s.state & 0xFF]++;
-    }
-    for (const auto& t : tarStates) {
-        tarCounts[t.state & 0xFF]++;
-    }
-
-    StateIdx* sP = srcFrame.slab.data();
-    StateIdx* tP = tarFrame.slab.data();
-    for (int i = 0; i < 256; ++i) {
-        srcFrame.bucketData[i] = sP;
-        srcFrame.bucketSizes[i] = srcCounts[i];
-        tarFrame.bucketData[i] = tP;
-        tarFrame.bucketSizes[i] = tarCounts[i];
-        sP += srcCounts[i]; tP += tarCounts[i];
-    }
-
-    uint32_t sW[256] = {0}, tW[256] = {0};
-    // FIX: Split into two independent loops
-    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
-        uint64_t sv = srcStates[i].state;
-        srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i};
-    }
-    for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) {
-        uint64_t tv = tarStates[i].state;
-        tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i};
-    }
-}
-void Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(
-    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
-    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT) {
-
-    struct Slot { 
-        uint64_t key;   
-        uint32_t idx;   
-        uint32_t gen;   
-        uint32_t count; 
-        uint32_t outIdx; // FIX: Track where the match was written in the output vectors
-    };
-    static std::vector<Slot> table(16384, {0, 0, 0, 0, 0});
-    static uint32_t currentGen = 1;
-
-    for (int b = 0; b < 256; ++b) {
-        StateIdx* sData = src.bucketData[b];
-        uint32_t  sSize = src.bucketSizes[b];
-        if (sSize == 0) continue;
-
-        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
-        const uint32_t shift = (sSize < 1000) ? 53 : 50;
-        currentGen++;
-
-        for (uint32_t i = 0; i < sSize; ++i) {
-            uint64_t k = sData[i].state;
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
-            if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1, 0};
-            else table[h].count++;
-        }
-
-        StateIdx* tData = tar.bucketData[b];
-        uint32_t  tSize = tar.bucketSizes[b];
-        for (uint32_t i = 0; i < tSize; ++i) {
-            uint64_t k = tData[i].state;
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                if (table[h].count == 1) {
-                    // Unique source, first target match
-                    table[h].outIdx = outS.size(); // Remember the index
-                    outS.push_back(table[h].idx);
-                    outT.push_back(tData[i].index);
-                    table[h].count = 0xFFFFFFFF; // Mark as matched once
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    // Duplicate target found! Invalidate the previously written match.
-                    outS[table[h].outIdx] = 0xFFFFFFFF; 
-                    outT[table[h].outIdx] = 0xFFFFFFFF;
-                    table[h].count = 0xEEEEEEEE; // Mark as ruined
-                }
-            }
-        }
-    }
-
-    // FIX: Final compaction pass to remove invalidated matches (sentinels)
-    uint32_t validCount = 0;
-    for (size_t i = 0; i < outS.size(); ++i) {
-        if (outS[i] != 0xFFFFFFFF) {
-            outS[validCount] = outS[i];
-            outT[validCount] = outT[i];
-            validCount++;
-        }
-    }
-    outS.resize(validCount);
-    outT.resize(validCount);
-}
-void Forest::prepareSoAFramesPersistentSingleSlab(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates,
-    SoAFramePersistentSingleSlab& srcFrame, 
-    SoAFramePersistentSingleSlab& tarFrame) {
-
-    uint32_t srcCounts[256] = {0}, tarCounts[256] = {0};
-    for (const auto& s : srcStates) srcCounts[s.state & 0xFF]++;
-    for (const auto& t : tarStates) tarCounts[t.state & 0xFF]++;
-
-    StateIdx* sP = srcFrame.slab.data();
-    StateIdx* tP = tarFrame.slab.data();
-    for (int i = 0; i < 256; ++i) {
-        srcFrame.bucketData[i] = sP;
-        srcFrame.bucketSizes[i] = srcCounts[i];
-        tarFrame.bucketData[i] = tP;
-        tarFrame.bucketSizes[i] = tarCounts[i];
-        sP += srcCounts[i]; tP += tarCounts[i];
-    }
-
-    uint32_t sW[256] = {0}, tW[256] = {0};
-    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
-        uint64_t sv = srcStates[i].state;
-        uint64_t tv = tarStates[i].state;
-        srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i};
-        tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i};
-    }
-}
-void Forest::prepareSoAFramesPersistent(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates,
-    SoAFramePersistent& srcFrame, 
-    SoAFramePersistent& tarFrame) {
-    assert(srcStates.size() == tarStates.size());
-    assert(srcStates.size() <= 256 * 16384); // limit for max unique items in our table design
-/*
- // This is only slightly slower than the bit below.
-    const uint32_t BUCKET_COUNT = 256;
-    const uint64_t BUCKET_MASK = 0xFF;
-
-    // 1. Histogram (To find bucket boundaries)
-    uint32_t srcCounts[BUCKET_COUNT] = {0};
-    uint32_t tarCounts[BUCKET_COUNT] = {0};
-    for (const auto& s : srcStates) srcCounts[s.state & BUCKET_MASK]++;
-    for (const auto& t : tarStates) tarCounts[t.state & BUCKET_MASK]++;
-
-    // 2. Setup Bucket Pointers into the Slab
-    // We treat the slab like a custom allocator
-    uint64_t* srcPtr = srcFrame.statesSlab.data();
-    uint32_t* srcIdxPtr = srcFrame.indicesSlab.data();
-    uint64_t* tarPtr = tarFrame.statesSlab.data();
-    uint32_t* tarIdxPtr = tarFrame.indicesSlab.data();
-
-    for (uint32_t i = 0; i < BUCKET_COUNT; ++i) {
-        srcFrame.bucketStates[i] = srcPtr;
-        srcFrame.bucketIndices[i] = srcIdxPtr;
-        srcFrame.bucketSizes[i] = srcCounts[i];
-        
-        tarFrame.bucketStates[i] = tarPtr;
-        tarFrame.bucketIndices[i] = tarIdxPtr;
-        tarFrame.bucketSizes[i] = tarCounts[i];
-
-        srcPtr += srcCounts[i];
-        srcIdxPtr += srcCounts[i];
-        tarPtr += tarCounts[i];
-        tarIdxPtr += tarCounts[i];
-    }
-
-    // 3. The "Pure Scatter" (No push_back, no resize, no zeroing)
-    uint32_t srcWriteIdx[BUCKET_COUNT] = {0};
-    uint32_t tarWriteIdx[BUCKET_COUNT] = {0};
-
-    for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) {
-        uint64_t s = srcStates[i].state;
-        uint32_t b = s & BUCKET_MASK;
-        uint32_t pos = srcWriteIdx[b]++;
-        srcFrame.bucketStates[b][pos] = s;
-        srcFrame.bucketIndices[b][pos] = i;
-    }
-
-    for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) {
-        uint64_t s = tarStates[i].state;
-        uint32_t b = s & BUCKET_MASK;
-        uint32_t pos = tarWriteIdx[b]++;
-        tarFrame.bucketStates[b][pos] = s;
-        tarFrame.bucketIndices[b][pos] = i;
-    }
-    */
-    const uint32_t BUCKET_COUNT = 256;
-    const uint64_t BUCKET_MASK = 0xFF;
-
-    uint32_t srcCounts[BUCKET_COUNT] = {0};
-    uint32_t tarCounts[BUCKET_COUNT] = {0};
-
-    // 1. Fused Histogram Pass (Assuming equal sizes as per your note)
-    const uint32_t totalSize = (uint32_t)srcStates.size();
-    for (uint32_t i = 0; i < totalSize; ++i) {
-        srcCounts[srcStates[i].state & BUCKET_MASK]++;
-        tarCounts[tarStates[i].state & BUCKET_MASK]++;
-    }
-
-    // 2. Setup Bucket Pointers (Unchanged, this is fast)
-    uint64_t* sP = srcFrame.statesSlab.data();
-    uint32_t* sI = srcFrame.indicesSlab.data();
-    uint64_t* tP = tarFrame.statesSlab.data();
-    uint32_t* tI = tarFrame.indicesSlab.data();
-
-    for (uint32_t i = 0; i < BUCKET_COUNT; ++i) {
-        srcFrame.bucketStates[i] = sP;
-        srcFrame.bucketIndices[i] = sI;
-        srcFrame.bucketSizes[i] = srcCounts[i];
-        tarFrame.bucketStates[i] = tP;
-        tarFrame.bucketIndices[i] = tI;
-        tarFrame.bucketSizes[i] = tarCounts[i];
-        sP += srcCounts[i]; sI += srcCounts[i];
-        tP += tarCounts[i]; tI += tarCounts[i];
-    }
-
-    // 3. Optimized Fused Scatter
-    uint32_t srcWriteIdx[BUCKET_COUNT] = {0};
-    uint32_t tarWriteIdx[BUCKET_COUNT] = {0};
-
-    // Unroll by 2 to keep the M3's execution ports saturated
-    uint32_t i = 0;
-    for (; i + 1 < totalSize; i += 2) {
-        // Source pair
-        uint64_t s0 = srcStates[i].state;
-        uint64_t s1 = srcStates[i+1].state;
-        uint32_t bS0 = s0 & BUCKET_MASK;
-        uint32_t bS1 = s1 & BUCKET_MASK;
-
-        srcFrame.bucketStates[bS0][srcWriteIdx[bS0]++] = s0;
-        srcFrame.bucketIndices[bS0][srcWriteIdx[bS0]-1] = i;
-        srcFrame.bucketStates[bS1][srcWriteIdx[bS1]++] = s1;
-        srcFrame.bucketIndices[bS1][srcWriteIdx[bS1]-1] = i+1;
-
-        // Target pair
-        uint64_t t0 = tarStates[i].state;
-        uint64_t t1 = tarStates[i+1].state;
-        uint32_t bT0 = t0 & BUCKET_MASK;
-        uint32_t bT1 = t1 & BUCKET_MASK;
-
-        tarFrame.bucketStates[bT0][tarWriteIdx[bT0]++] = t0;
-        tarFrame.bucketIndices[bT0][tarWriteIdx[bT0]-1] = i;
-        tarFrame.bucketStates[bT1][tarWriteIdx[bT1]++] = t1;
-        tarFrame.bucketIndices[bT1][tarWriteIdx[bT1]-1] = i+1;
-    }
-
-    // Handle remainder
-    for (; i < totalSize; ++i) {
-        uint64_t s = srcStates[i].state;
-        uint32_t bS = s & BUCKET_MASK;
-        srcFrame.bucketStates[bS][srcWriteIdx[bS]++] = s;
-        srcFrame.bucketIndices[bS][srcWriteIdx[bS]-1] = i;
-
-        uint64_t t = tarStates[i].state;
-        uint32_t bT = t & BUCKET_MASK;
-        tarFrame.bucketStates[bT][tarWriteIdx[bT]++] = t;
-        tarFrame.bucketIndices[bT][tarWriteIdx[bT]-1] = i;
-    }
-}
-
- // Here we did allocation within the prepare. we can move that part out
-std::pair<SoAFrame, SoAFrame> Forest::prepareSoAFrames(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates) {
-    SoAFrame srcFrame, tarFrame;
-    srcFrame.reserve(srcStates.size());
-    tarFrame.reserve(tarStates.size());
-
-    const uint64_t MASK = 0xFF;
-
-    // Distribute into buckets based on the last 8 bits of the state
-    for (uint32_t i = 0; i < srcStates.size(); ++i) {
-        uint64_t s = srcStates[i].state;
-        srcFrame.states[s & MASK].push_back(s);
-        srcFrame.indices[s & MASK].push_back(i);
-    }
-
-    for (uint32_t i = 0; i < tarStates.size(); ++i) {
-        uint64_t s = tarStates[i].state;
-        tarFrame.states[s & MASK].push_back(s);
-        tarFrame.indices[s & MASK].push_back(i);
-    }
-
-    return {srcFrame, tarFrame};
-}
-void Forest::matchPipelinedBranchlessPreallocateSingleSlab(
-    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
-    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT) {
-
-    struct Slot { 
-        uint64_t key;   // The 64-bit Descriptor/State ID
-        uint32_t idx;   // The original global index in the Source array
-        uint32_t gen;   // The "Generation" ID (replaces memset/clear)
-        uint32_t count; // The match state (0=empty, 1=unique, >1=dup, 0xFF..=matched)
-    };
-    static std::vector<Slot> table(16384, {0, 0, 0, 0});
-    static uint32_t currentGen = 1;
-
-    for (int b = 0; b < 256; ++b) {
-        StateIdx* sData = src.bucketData[b];
-        uint32_t  sSize = src.bucketSizes[b];
-        if (sSize == 0) continue;
-
-        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
-        const uint32_t shift = (sSize < 1000) ? 53 : 50;
-        currentGen++;
-
-        for (uint32_t i = 0; i < sSize; ++i) {
-            uint64_t k = sData[i].state;
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
-            if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1};
-            else table[h].count++;
-        }
-
-        StateIdx* tData = tar.bucketData[b];
-        uint32_t  tSize = tar.bucketSizes[b];
-        for (uint32_t i = 0; i < tSize; ++i) {
-            uint64_t k = tData[i].state;
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-            while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask;
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                if (table[h].count == 1) {
-                    outS.push_back(table[h].idx);
-                    outT.push_back(tData[i].index);
-                    table[h].count = 0xFFFFFFFF;
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    outS.pop_back(); outT.pop_back();
-                    table[h].count = 0xEEEEEEEE;
-                }
-            }
-        }
-    }
-}
-/*
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptiveNeon(
-    SoAFrame& src, 
-    SoAFrame& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    // Slot is exactly 32 bytes. 2 Slots = 64 bytes (1 Cache Line).
-    struct alignas(16) Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t gen;   
-        uint32_t count; 
-        uint32_t padding; 
-    };
-
-    static uint32_t currentGen = 1;
-    static std::vector<Slot> table(8192, {0, 0, 0, 0, 0});
-
-    for (int b = 0; b < 256; ++b) {
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        if (sStates.empty()) continue;
-
-        const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191;
-        currentGen++;
-
-        // --- PART 1: SOURCE FILL (Keep Scalar as it's usually not the bottleneck) ---
-        for (size_t i = 0; i < sStates.size(); ++i) {
-            uint64_t k = sStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-            
-            if (table[h].gen != currentGen) {
-                table[h] = {k, sIdxs[i], currentGen, 1, 0};
-            } else {
-                table[h].count++;
-            }
-        }
-
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-
-        // --- PART 2: TARGET MATCH (NEON Vectorized Window) ---
-        uint64x2_t genVec = vdupq_n_u64((uint64_t)currentGen << 32); // Gen is at offset 12 in slot
-        
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            uint64x2_t targetKeyV = vdupq_n_u64(k);
-            bool found = false;
-
-            // Check 2 slots at a time (One Cache Line)
-            // This loop usually terminates in the first iteration (h and h+1)
-            while (true) {
-                // Load keys from Slot H and Slot H+1
-                // We use vld2 to pick the 'key' field which is the first 8 bytes of each 32-byte slot
-                // For simplicity and speed on M3, we'll just do direct pointer access:
-                uint64_t k0 = table[h].key;
-                uint64_t k1 = table[(h + 1) & mask].key;
-                uint32_t g0 = table[h].gen;
-                uint32_t g1 = table[(h + 1) & mask].gen;
-
-                uint64x2_t keysV = {k0, k1};
-                uint32x2_t gensV = {g0, g1};
-
-                // Compare keys
-                uint64x2_t keyMatch = vceqq_u64(keysV, targetKeyV);
-                // Compare generations
-                uint32x2_t genMatch = vceq_u32(gensV, vdup_n_u32(currentGen));
-
-                // Check lane 0
-                if (vgetq_lane_u64(keyMatch, 0) && vget_lane_u32(genMatch, 0)) {
-                    if (table[h].count == 1) {
-                        result.first.push_back(table[h].idx);
-                        result.second.push_back(tIdxs[i]);
-                        table[h].count = 0xFFFFFFFF;
-                    } else if (table[h].count == 0xFFFFFFFF) {
-                        result.first.pop_back(); result.second.pop_back();
-                        table[h].count = 0xEEEEEEEE;
-                    }
-                    found = true; break;
-                }
-                
-                // Check lane 1
-                uint32_t nextH = (h + 1) & mask;
-                if (vgetq_lane_u64(keyMatch, 1) && vget_lane_u32(genMatch, 1)) {
-                    if (table[nextH].count == 1) {
-                        result.first.push_back(table[nextH].idx);
-                        result.second.push_back(tIdxs[i]);
-                        table[nextH].count = 0xFFFFFFFF;
-                    } else if (table[nextH].count == 0xFFFFFFFF) {
-                        result.first.pop_back(); result.second.pop_back();
-                        table[nextH].count = 0xEEEEEEEE;
-                    }
-                    found = true; break;
-                }
-
-                // If neither matches and both are "current", we must keep probing
-                if (g0 == currentGen && g1 == currentGen) {
-                    h = (h + 2) & mask;
-                } else {
-                    // One of them is an empty slot (gen != currentGen), stop searching
-                    break;
-                }
-            }
-        }
-    }
-    return result;
-}
-*/
-void Forest::matchPipelinedBranchlessPreallocate(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar,
-    std::vector<uint32_t>& resultSrc,
-    std::vector<uint32_t>& resultTar) {
-
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t gen;   
-        uint32_t count; 
-    };
-
-    static uint32_t currentGen = 1; 
-    // Increased table size slightly to 16k to further reduce Pareto collisions
-    static std::vector<Slot> table(16384, {0, 0, 0, 0}); 
-
-    for (int b = 0; b < 256; ++b) {
-        uint64_t* sStates = src.bucketStates[b];
-        uint32_t* sIdxs   = src.bucketIndices[b];
-        uint32_t  sSize   = src.bucketSizes[b];
-        
-        if (sSize == 0) continue;
-
-        // Adaptive Mask: 2k for small, 16k for large
-        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
-        const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14);
-        currentGen++; 
-
-        // 1. Fill Table (Source)
-        for (size_t i = 0; i < sSize; ++i) {
-            uint64_t k = sStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-
-            // Branchless-ish Probe: Most IDs are unique, so this loop
-            // is predicted "not taken" after the first check.
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-            
-            if (table[h].gen != currentGen) {
-                table[h] = {k, sIdxs[i], currentGen, 1};
-            } else {
-                table[h].count++; 
-            }
-        }
-
-        // 2. Intersect (Target) with Software Pipelining
-        uint64_t* tStates = tar.bucketStates[b];
-        uint32_t* tIdxs   = tar.bucketIndices[b];
-        uint32_t  tSize   = tar.bucketSizes[b];
-
-        for (size_t i = 0; i < tSize; ++i) {
-            // Manual prefetch of the state 16 elements ahead to stay in L1
-            if (i + 16 < tSize) {
-                __builtin_prefetch(&tStates[i + 16], 0, 3);
-            }
-
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-
-            // Probe logic
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                const uint32_t cnt = table[h].count;
-                if (cnt == 1) {
-                    resultSrc.push_back(table[h].idx);
-                    resultTar.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; 
-                } else if (cnt == 0xFFFFFFFF) {
-                    // Pareto multi-match removal logic
-                    resultSrc.pop_back();
-                    resultTar.pop_back();
-                    table[h].count = 0xEEEEEEEE; 
-                }
-            }
-        }
-    }
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPipelinedBranchless(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    // For 100M items, we might find more matches; 
-    // adjusting reserve to prevent mid-run reallocations.
-    result.first.reserve(src.statesSlab.size() / 100); 
-    result.second.reserve(src.statesSlab.size() / 100);
-
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t gen;   
-        uint32_t count; 
-    };
-
-    static uint32_t currentGen = 1; 
-    // Increased table size slightly to 16k to further reduce Pareto collisions
-    static std::vector<Slot> table(16384, {0, 0, 0, 0}); 
-
-    for (int b = 0; b < 256; ++b) {
-        uint64_t* sStates = src.bucketStates[b];
-        uint32_t* sIdxs   = src.bucketIndices[b];
-        uint32_t  sSize   = src.bucketSizes[b];
-        
-        if (sSize == 0) continue;
-
-        // Adaptive Mask: 2k for small, 16k for large
-        const uint32_t mask = (sSize < 1000) ? 2047 : 16383;
-        const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14);
-        currentGen++; 
-
-        // 1. Fill Table (Source)
-        for (size_t i = 0; i < sSize; ++i) {
-            uint64_t k = sStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-
-            // Branchless-ish Probe: Most IDs are unique, so this loop
-            // is predicted "not taken" after the first check.
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-            
-            if (table[h].gen != currentGen) {
-                table[h] = {k, sIdxs[i], currentGen, 1};
-            } else {
-                table[h].count++; 
-            }
-        }
-
-        // 2. Intersect (Target) with Software Pipelining
-        uint64_t* tStates = tar.bucketStates[b];
-        uint32_t* tIdxs   = tar.bucketIndices[b];
-        uint32_t  tSize   = tar.bucketSizes[b];
-
-        for (size_t i = 0; i < tSize; ++i) {
-            // Manual prefetch of the state 16 elements ahead to stay in L1
-            if (i + 16 < tSize) {
-                __builtin_prefetch(&tStates[i + 16], 0, 3);
-            }
-
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> shift;
-            h &= mask;
-
-            // Probe logic
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                const uint32_t cnt = table[h].count;
-                if (cnt == 1) {
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; 
-                } else if (cnt == 0xFFFFFFFF) {
-                    // Pareto multi-match removal logic
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; 
-                }
-            }
-        }
-    }
-    return result;
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptivePersistent(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t gen;   // Generation counter
-        uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc.
-    };
-
-    static uint32_t currentGen = 1; 
-    static std::vector<Slot> table(8192, {0, 0, 0, 0}); 
-
-    for (int b = 0; b < 256; ++b) {
-        uint64_t* sStates = src.bucketStates[b];
-        uint32_t* sIdxs   = src.bucketIndices[b];
-        uint32_t  sSize   = src.bucketSizes[b];
-        
-        if (sSize == 0) continue;
-
-        const uint32_t mask = (sSize < 500) ? 1023 : 8191;
-        currentGen++; 
-
-        // 1. Fill Table
-        for (size_t i = 0; i < sSize; ++i) {
-            uint64_t k = sStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-            
-            if (table[h].gen != currentGen) {
-                table[h] = {k, sIdxs[i], currentGen, 1};
-            } else {
-                table[h].count++; 
-            }
-        }
-
-        // 2. Intersect
-        uint64_t* tStates = tar.bucketStates[b];
-        uint32_t* tIdxs   = tar.bucketIndices[b];
-        uint32_t  tSize   = tar.bucketSizes[b];
-
-        for (size_t i = 0; i < tSize; ++i) {
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                if (table[h].count == 1) {
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; 
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; 
-                }
-            }
-        }
-    }
-    return result;
-}
-
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchAdaptive(
-    SoAFrame& src, 
-    SoAFrame& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t gen;   // Generation counter
-        uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc.
-    };
-
-    // Global generation for this call
-    uint32_t currentGen = 1; 
-    std::vector<Slot> table(8192, {0, 0, 0, 0}); 
-
-    for (int b = 0; b < 256; ++b) {
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        if (sStates.empty()) continue;
-
-        // Adaptive Table Mask: Use smaller range for tiny buckets
-        const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191;
-        currentGen++; 
-
-        // 1. Fill Table
-        for (size_t i = 0; i < sStates.size(); ++i) {
-            // Prefetch an element roughly 16 iterations ahead (adjust based on testing)
-            /*
-             * This didn't help anymore. 
-             * if (i + 16 < sStates.size()) {
-                __builtin_prefetch(&sStates[i + 16], 0, 3);
-                __builtin_prefetch(&sIdxs[i + 16], 0, 3);
-            }*/
-            uint64_t k = sStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            // Probe: Valid if gen matches AND key is different
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-            
-            if (table[h].gen != currentGen) {
-                table[h] = {k, sIdxs[i], currentGen, 1};
-            } else {
-                table[h].count++; // Duplicate in Source
-            }
-        }
-
-        // 2. Intersect
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= mask;
-
-            while (table[h].gen == currentGen && table[h].key != k) {
-                h = (h + 1) & mask;
-            }
-
-            if (table[h].gen == currentGen && table[h].key == k) {
-                if (table[h].count == 1) {
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; 
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; 
-                }
-            }
-        }
-    }
-    return result;
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchBlockedBloom(
-    SoAFrame& src, 
-    SoAFrame& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t count; 
-    };
-
-    const uint32_t TABLE_SIZE = 8192;
-    const uint32_t HASH_MASK = TABLE_SIZE - 1;
-    std::vector<Slot> table(TABLE_SIZE); 
-
-    // A 512-bit Bloom Filter fits in exactly one Cache Line (64 bytes).
-    // We use 8 x 64-bit integers to represent the 512 bits.
-    uint64_t bloom[8];
-
-    for (int b = 0; b < 256; ++b) {
-        std::fill(table.begin(), table.end(), Slot{0, 0, 0});
-        std::memset(bloom, 0, sizeof(bloom));
-
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-
-        // 1. Fill Table + Bloom Filter
-        for (size_t i = 0; i < sStates.size(); ++i) {
-            uint64_t k = sStates[i];
-            
-            // Set Bloom bit: use a different hash or shift for the bloom index
-            // We'll use bits from the key to pick one of 512 bits
-            uint32_t bHash = (k ^ (k >> 32));
-            bloom[(bHash >> 6) & 7] |= (1ull << (bHash & 63));
-
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); 
-            h &= HASH_MASK;
-
-            while (table[h].count > 0 && table[h].key != k) {
-                h = (h + 1) & HASH_MASK;
-            }
-            
-            table[h].key = k;
-            table[h].idx = sIdxs[i];
-            table[h].count++; 
-        }
-
-        // 2. Intersection with Bloom Filter Gate
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            uint64_t k = tStates[i];
-            
-            // --- BLOOM FILTER GATE ---
-            uint32_t bHash = (k ^ (k >> 32));
-            if (!(bloom[(bHash >> 6) & 7] & (1ull << (bHash & 63)))) {
-                continue; // 100% certainly not in Source. Skip hash probe!
-            }
-            // -------------------------
-
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= HASH_MASK;
-
-            while (table[h].count > 0 && table[h].key != k) {
-                h = (h + 1) & HASH_MASK;
-            }
-
-            if (table[h].key == k) {
-                if (table[h].count == 1) {
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; 
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; 
-                }
-            }
-        }
-    }
-    return result;
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchParallelRadixPartitioning(
-    SoAFrame& src, 
-    SoAFrame& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    const uint32_t TABLE_SIZE = 8192;
-    const uint32_t HASH_MASK = TABLE_SIZE - 1;
-    
-    // Aligned scratchpad to maximize L1/L2 cache efficiency
-    struct alignas(64) Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t count; 
-    };
-    std::vector<Slot> table(TABLE_SIZE);
-
-    for (int b = 0; b < 256; ++b) {
-        // 1. FAST CLEAR
-        // std::fill is optimized, but we only zero the 'count' to save cycles
-        for(auto& s : table) s.count = 0;
-
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        const size_t sSize  = sStates.size();
-
-        // 2. PIPELINED FILL (Unrolled x4 for ILP)
-        // We process 4 items at once to hide memory latency
-        size_t i = 0;
-        for (; i + 3 < sSize; i += 4) {
-            for (int k = 0; k < 4; ++k) {
-                uint64_t key = sStates[i + k];
-                uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
-                h &= HASH_MASK;
-
-                while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
-                
-                table[h].key = key;
-                table[h].idx = sIdxs[i + k];
-                table[h].count++;
-            }
-        }
-        // Handle remainder
-        for (; i < sSize; ++i) {
-            uint64_t key = sStates[i];
-            uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
-            h &= HASH_MASK;
-            while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
-            table[h].key = key; table[h].idx = sIdxs[i]; table[h].count++;
-        }
-
-        // 3. OPTIMISTIC INTERSECTION
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-        const size_t tSize  = tStates.size();
-
-        for (size_t j = 0; j < tSize; ++j) {
-            uint64_t key = tStates[j];
-            uint32_t h = (key * 11400714819323198485llu) >> (64 - 13);
-            h &= HASH_MASK;
-
-            while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK;
-
-            if (table[h].key == key) {
-                if (table[h].count == 1) {
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[j]);
-                    table[h].count = 0xFFFFFFFF; // Mark as Matched
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    // Pareto duplicate found in Target: Roll back
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; // Mark as Permanent Duplicate
-                }
-            }
-        }
-    }
-
-    return result;
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPreparedFramesFaster(
-    SoAFrame& src, 
-    SoAFrame& tar) {
-
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    result.first.reserve(10000); 
-    result.second.reserve(10000);
-
-    // Flat, cache-aligned slot structure
-    struct Slot { 
-        uint64_t key; 
-        uint32_t idx; 
-        uint32_t count; 
-    };
-
-    // 8192 slots = 128KB. This fits perfectly in your 4MB L2.
-    // We use a power-of-two size to use bitwise AND instead of modulo %.
-    const uint32_t TABLE_SIZE = 8192;
-    const uint32_t HASH_MASK = TABLE_SIZE - 1;
-    std::vector<Slot> table(TABLE_SIZE); 
-
-    for (int b = 0; b < 256; ++b) {
-        // FAST: std::fill is usually a vectorized memset.
-        std::fill(table.begin(), table.end(), Slot{0, 0, 0});
-
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-
-        // 1. Fill Table from Source
-        for (size_t i = 0; i < sStates.size(); ++i) {
-            uint64_t k = sStates[i];
-            // Fibonacci Hashing (very fast for 64-bit keys)
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); 
-            h &= HASH_MASK;
-
-            while (table[h].count > 0 && table[h].key != k) {
-                h = (h + 1) & HASH_MASK;
-            }
-            
-            table[h].key = k;
-            table[h].idx = sIdxs[i];
-            table[h].count++; 
-        }
-
-        // 2. Secondary Uniqueness Check + Intersection
-        // We reuse the 'count' field: 
-        // 1 = Unique in Src
-        // >1 = Duplicate in Src
-        // 0 = Already Matched (prevents Target duplicates)
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            uint64_t k = tStates[i];
-            uint32_t h = (k * 11400714819323198485llu) >> (64 - 13);
-            h &= HASH_MASK;
-
-            while (table[h].count > 0 && table[h].key != k) {
-                h = (h + 1) & HASH_MASK;
-            }
-
-            // We need to know if 'k' is unique in Target too.
-            // A quick way is to check if it appears again in the target bucket.
-            // For Pareto, we can use a "tombstone" logic:
-            if (table[h].key == k) {
-                if (table[h].count == 1) {
-                    // This is the first time we see it in Target
-                    result.first.push_back(table[h].idx);
-                    result.second.push_back(tIdxs[i]);
-                    table[h].count = 0xFFFFFFFF; // Mark as "Matched once"
-                } else if (table[h].count == 0xFFFFFFFF) {
-                    // Oh no, this is a Target duplicate! 
-                    // We must remove the last added match.
-                    result.first.pop_back();
-                    result.second.pop_back();
-                    table[h].count = 0xEEEEEEEE; // Mark as "Permanent Duplicate"
-                }
-            }
-        }
-    }
-    return result;
-}
-std::pair<std::vector<uint32_t>, std::vector<uint32_t>> Forest::matchPreparedFrames( SoAFrame& src, SoAFrame& tar) {
-
-    // Initialize the pair of vectors
-    std::pair<std::vector<uint32_t>, std::vector<uint32_t>> result;
-    
-    // Heuristic: start with a reasonable reserve (e.g., 5% of average bucket size * 256)
-    size_t initialReserve = (src.states[0].size() + tar.states[0].size()) * 6; 
-    result.first.reserve(initialReserve);
-    result.second.reserve(initialReserve);
-
-    // Local structures for bucket-level uniqueness
-    struct SrcInfo { uint32_t idx; bool isDup; };
-    std::unordered_map<uint64_t, SrcInfo> bucketSrc;
-    std::unordered_map<uint64_t, bool> bucketTar;
-
-    for (int b = 0; b < 256; ++b) {
-        bucketSrc.clear();
-        bucketTar.clear();
-
-        const auto& sStates = src.states[b];
-        const auto& sIdxs   = src.indices[b];
-        const auto& tStates = tar.states[b];
-        const auto& tIdxs   = tar.indices[b];
-
-        // 1. Process Source: Mark unique vs duplicates
-        for (size_t i = 0; i < sStates.size(); ++i) {
-            auto [it, inserted] = bucketSrc.try_emplace(sStates[i], SrcInfo{sIdxs[i], false});
-            if (!inserted) it->second.isDup = true;
-        }
-
-        // 2. Process Target: Mark unique vs duplicates
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            auto [it, inserted] = bucketTar.try_emplace(tStates[i], false);
-            if (!inserted) it->second = true; // Mark as duplicate
-        }
-
-        // 3. Intersect unique-only IDs
-        for (size_t i = 0; i < tStates.size(); ++i) {
-            uint64_t id = tStates[i];
-            
-            // Check if unique in Target
-            if (bucketTar[id] == false) {
-                auto it = bucketSrc.find(id);
-                // Check if exists in Source AND is unique there
-                if (it != bucketSrc.end() && it->second.isDup == false) {
-                    result.first.push_back(it->second.idx);
-                    result.second.push_back(tIdxs[i]);
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
     /**
      * @brief Computes sparse matches on a pair of rectified and smoothed
      * images. Here the src and tar images refer to the left and right images,
@@ -1194,7 +101,7 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
         for (auto& q : statesTar) q.srcDescr = false;
 
         ndb::Hashmatch<ndb::Descriptor> hm(
-            214673,  // statesSrc.size() + statesTar.size() ,
+            214673,  
             statesSrc.size() + statesTar.size());
         std::vector<std::pair<ndb::Descriptor, ndb::Descriptor>> corr;
         for (auto& q : statesSrc) hm.insert(q);
@@ -1225,7 +132,6 @@ std::vector<ndb::Correspondence> Forest::findCorrespondences(
             ++i, unique = false;
 
         if (unique) {
-            // emulates std::lowerbound behavior for arrays
             for (; j < tarStates.size() - 1; ++j) {
                 if (!(tarStates[j] < srcStates[i])) break;
             }
@@ -1239,299 +145,7 @@ std::vector<ndb::Correspondence> Forest::findCorrespondences(
     }
     return corr;
 }
-std::vector<ndb::Correspondence> Forest::findCorrespondencesHashNaive(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates) {
-    
-    std::vector<ndb::Correspondence> corr;
-    struct DescriptorHasher {
-        std::size_t operator()(const ndb::Descriptor& d) const {
-            // Just return the state since it's already a unique-ish 64-bit int
-            return static_cast<std::size_t>(d.state);
-        }
-    };
-    // 1. Count frequencies in Source
-    std::unordered_map<ndb::Descriptor, int, DescriptorHasher> srcCounts;
-    std::unordered_map<ndb::Descriptor, int, DescriptorHasher> tarCounts;
-    for (const auto& d : srcStates) {
-        srcCounts[d]++;
-    }
-
-    // 2. Count frequencies in Target
-    for (const auto& d : tarStates) {
-        tarCounts[d]++;
-    }
-
-    // 3. Match only if the descriptor is unique in both (count == 1)
-    // We iterate through srcStates to maintain a similar "order" or 
-    // simply to find potential matches.
-    for (const auto& srcDesc : srcStates) {
-        // Is it unique in Source?
-        if (srcCounts[srcDesc] == 1) {
-            // Does it exist and is it unique in Target?
-            if (tarCounts.count(srcDesc) && tarCounts[srcDesc] == 1) {
-                
-                // We need the actual target object to get the 'point' 
-                // In a naive way, we just go find it.
-                for (const auto& tarDesc : tarStates) {
-                    if (tarDesc == srcDesc) {
-                        corr.push_back(ndb::Correspondence(srcDesc.point, tarDesc.point));
-                        break; 
-                    }
-                }
-            }
-        }
-    }
-
-    return corr;
-}
-
-// State machine for our IDs
-enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
-
-#include <vector>
-#include <unordered_map>
-#include <cstdint>
-
-std::vector<ndb::Correspondence> Forest::findCorrespondencesHash(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates) {
-
-    // Tracking states: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate
-    enum class Occurence : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
-
-    // 1. Map Source IDs: State -> {OccurenceLevel, OriginalIndex}
-    // Pre-allocating prevents expensive rehashes during the loop
-    std::unordered_map<uint64_t, std::pair<Occurence, uint32_t>> srcMap;
-    srcMap.reserve(srcStates.size());
-
-    for (uint32_t i = 0; i < srcStates.size(); ++i) {
-        auto& entry = srcMap[srcStates[i].state];
-        if (entry.first == Occurence::Unseen) {
-            entry = {Occurence::SeenOnce, i};
-        } else {
-            entry.first = Occurence::Duplicate;
-        }
-    }
-
-    // 2. Map Target IDs: State -> OccurenceLevel
-    std::unordered_map<uint64_t, Occurence> tarMap;
-    tarMap.reserve(tarStates.size());
-
-    for (uint32_t j = 0; j < tarStates.size(); ++j) {
-        auto& occ = tarMap[tarStates[j].state];
-        if (occ == Occurence::Unseen) {
-            occ = Occurence::SeenOnce;
-        } else {
-            occ = Occurence::Duplicate;
-        }
-    }
-
-    // 3. Intersect unique pairs
-    std::vector<ndb::Correspondence> corr;
-    // Heuristic: Reserve 20% of the smaller set size for the results
-    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5);
-
-    for (uint32_t j = 0; j < tarStates.size(); ++j) {
-        uint64_t currentID = tarStates[j].state;
-
-        // Condition: Must be unique in Target AND unique in Source
-        if (tarMap[currentID] == Occurence::SeenOnce) {
-            auto it = srcMap.find(currentID);
-            if (it != srcMap.end() && it->second.first == Occurence::SeenOnce) {
-                // Correspondence(Point from Source, Point from Target)
-                corr.push_back(ndb::Correspondence(
-                    srcStates[it->second.second].point, 
-                    tarStates[j].point
-                ));
-            }
-        }
-    }
-
-    return corr;
-}
-#include <vector>
-#include <cstdint>
-#include <algorithm>
-
-// A lightweight structure to avoid moving heavy Descriptor objects
-struct KeyIndex {
-    uint64_t state;
-    uint32_t index;
-};
-#include <vector>
-#include <cstdint>
-#include <array>
-
-std::vector<ndb::Correspondence> Forest::findCorrespondencesTurbo(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates) {
-
-    const int BUCKETS = 256;
-    const uint64_t MASK = 0xFF;
-
-    // --- STEP 1: Linear Partitioning (Radix Pass) ---
-    // We use a single flat buffer to avoid 256 separate vector allocations
-    std::vector<KeyIndex> srcBuffer(srcStates.size());
-    std::vector<KeyIndex> tarBuffer(tarStates.size());
-    std::array<size_t, BUCKETS> srcCounts = {0}, tarCounts = {0};
-    std::array<size_t, BUCKETS> srcOffsets, tarOffsets;
-
-    for (const auto& s : srcStates) srcCounts[s.state & MASK]++;
-    for (const auto& t : tarStates) tarCounts[t.state & MASK]++;
-
-    srcOffsets[0] = tarOffsets[0] = 0;
-    for (int i = 1; i < BUCKETS; ++i) {
-        srcOffsets[i] = srcOffsets[i - 1] + srcCounts[i - 1];
-        tarOffsets[i] = tarOffsets[i - 1] + tarCounts[i - 1];
-    }
 
-    auto srcCursors = srcOffsets;
-    auto tarCursors = tarOffsets;
-
-    for (uint32_t i = 0; i < srcStates.size(); ++i) {
-        srcBuffer[srcCursors[srcStates[i].state & MASK]++] = {srcStates[i].state, i};
-    }
-    for (uint32_t i = 0; i < tarStates.size(); ++i) {
-        tarBuffer[tarCursors[tarStates[i].state & MASK]++] = {tarStates[i].state, i};
-    }
-
-    // --- STEP 2: In-Cache Hashing ---
-    std::vector<ndb::Correspondence> corr;
-    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 8);
-
-    // Using a tiny fixed-size hash table for each bucket to stay in L1/L2 cache
-    // State: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate
-    struct LocalVal { uint32_t index; uint8_t count; };
-    
-    // We reuse this map across buckets to avoid reallocating
-    // A simple open-addressed hash map for the bucket
-    std::unordered_map<uint64_t, LocalVal> bucketMap;
-    bucketMap.reserve(srcStates.size() / BUCKETS * 2);
-
-    for (int b = 0; b < BUCKETS; ++b) {
-        bucketMap.clear();
-
-        // Load Source bucket into local cache-friendly map
-        size_t srcStart = srcOffsets[b];
-        size_t srcEnd = srcStart + srcCounts[b];
-        for (size_t i = srcStart; i < srcEnd; ++i) {
-            auto& entry = bucketMap[srcBuffer[i].state];
-            entry.index = srcBuffer[i].index;
-            entry.count = (entry.count == 0) ? 1 : 2;
-        }
-
-        // Intersect with Target bucket
-        size_t tarStart = tarOffsets[b];
-        size_t tarEnd = tarStart + tarCounts[b];
-        
-        // Secondary map to ensure target-side uniqueness
-        std::unordered_map<uint64_t, uint8_t> tarUniqueness;
-        for (size_t i = tarStart; i < tarEnd; ++i) {
-            auto& count = tarUniqueness[tarBuffer[i].state];
-            count = (count == 0) ? 1 : 2;
-        }
-
-        for (size_t i = tarStart; i < tarEnd; ++i) {
-            uint64_t id = tarBuffer[i].state;
-            if (tarUniqueness[id] == 1) {
-                auto it = bucketMap.find(id);
-                if (it != bucketMap.end() && it->second.count == 1) {
-                    corr.push_back(ndb::Correspondence(
-                        srcStates[it->second.index].point, 
-                        tarStates[tarBuffer[i].index].point
-                    ));
-                }
-            }
-        }
-    }
-
-    return corr;
-}
-std::vector<ndb::Correspondence> Forest::findCorrespondencesHashingRadix(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates) {
-
-    const int NUM_BUCKETS = 256;
-    const uint64_t MASK = 0xFF;
-
-    // 1. Partition Source into Buckets
-    std::vector<KeyIndex> srcBuckets[NUM_BUCKETS];
-    for (int i = 0; i < NUM_BUCKETS; ++i) srcBuckets[i].reserve(srcStates.size() / NUM_BUCKETS * 1.2);
-    
-    for (uint32_t i = 0; i < srcStates.size(); ++i) {
-        srcBuckets[srcStates[i].state & MASK].push_back({srcStates[i].state, i});
-    }
-
-    // 2. Partition Target into Buckets
-    std::vector<uint64_t> tarBuckets[NUM_BUCKETS];
-    for (int i = 0; i < NUM_BUCKETS; ++i) tarBuckets[i].reserve(tarStates.size() / NUM_BUCKETS * 1.2);
-    
-    for (uint32_t i = 0; i < tarStates.size(); ++i) {
-        tarBuckets[tarStates[i].state & MASK].push_back(tarStates[i].state);
-    }
-
-    std::vector<ndb::Correspondence> corr;
-    corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5);
-
-    // 3. Process each bucket pair
-    // This part can be easily parallelized with #pragma omp parallel for
-    for (int b = 0; b < NUM_BUCKETS; ++b) {
-        if (srcBuckets[b].empty() || tarBuckets[b].empty()) continue;
-
-        // Small local maps fit in L1/L2 Cache
-        // Using a simple frequency map for the local bucket
-        enum class Occ : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 };
-        
-        struct LocalEntry {
-            Occ occ = Occ::Unseen;
-            uint32_t idx = 0;
-        };
-
-        // We use a flat hash map here. For simplicity in standard C++, 
-        // std::unordered_map is used, but even it is faster here 
-        // because it stays in cache.
-        std::unordered_map<uint64_t, LocalEntry> localSrc;
-        localSrc.reserve(srcBuckets[b].size());
-
-        for (auto& ki : srcBuckets[b]) {
-            auto& entry = localSrc[ki.state];
-            if (entry.occ == Occ::Unseen) {
-                entry = {Occ::SeenOnce, ki.index};
-            } else {
-                entry.occ = Occ::Duplicate;
-            }
-        }
-
-        std::unordered_map<uint64_t, Occ> localTar;
-        localTar.reserve(tarBuckets[b].size());
-        for (uint64_t state : tarBuckets[b]) {
-            auto& occ = localTar[state];
-            occ = (occ == Occ::Unseen) ? Occ::SeenOnce : Occ::Duplicate;
-        }
-
-        // Intersect within the bucket
-        // Since we are inside a bucket, we iterate the target indices
-        // but we need to find the target point. 
-        // To be fast, we'll re-scan the original tarStates for this bucket's IDs
-        for (uint32_t j = 0; j < tarStates.size(); ++j) {
-            uint64_t s = tarStates[j].state;
-            if ((s & MASK) == b) { // Only process IDs belonging to this bucket
-                if (localTar[s] == Occ::SeenOnce) {
-                    auto it = localSrc.find(s);
-                    if (it != localSrc.end() && it->second.occ == Occ::SeenOnce) {
-                        corr.push_back(ndb::Correspondence(
-                            srcStates[it->second.idx].point,
-                            tarStates[j].point
-                        ));
-                    }
-                }
-            }
-        }
-    }
-
-    return corr;
-}
 /**
  * @brief Evaluates a given forest mask on an image and returns the
  * descriptors
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index 9e45600..6463aec 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -178,48 +178,6 @@ struct MatchStats {
     double prec, rec, timeProp, timeMatch;
     int numInlier, numStates, numMatches;
 };
-struct SoAFrame {
-    // 256 Buckets to ensure each chunk fits in L2/L3 cache
-    std::vector<uint64_t> states[256];
-    std::vector<uint32_t> indices[256];
-    
-    void reserve(size_t total_size) {
-        for(int i=0; i<256; ++i) {
-            states[i].reserve(total_size / size_t(256 * 1.2));
-            indices[i].reserve(total_size / size_t(256 * 1.2));
-        }
-    }
-};
-struct SoAFramePersistent {
-    // Persistent memory blocks
-    std::vector<uint64_t> statesSlab;
-    std::vector<uint32_t> indicesSlab;
-    
-    // Pointers into the slab for each bucket
-    uint64_t* bucketStates[256];
-    uint32_t* bucketIndices[256];
-    uint32_t bucketSizes[256];
-
-    void preallocate(size_t total_size) {
-        statesSlab.assign(total_size, 0);
-        indicesSlab.assign(total_size, 0);
-    }
-};
-struct StateIdx {
-    uint64_t state;
-    uint32_t index;
-};
-
-struct SoAFramePersistentSingleSlab {
-    std::vector<StateIdx> slab; 
-    StateIdx* bucketData[256];
-    uint32_t bucketSizes[256];
-
-    void preallocate(size_t total_size) {
-        slab.assign(total_size, {0, 0});
-    }
-};
-
 class Forest {
    public:
     /**
@@ -241,83 +199,6 @@ class Forest {
     static std::vector<ndb::Correspondence> findCorrespondences(
         std::vector<ndb::Descriptor>& srcStates,
         std::vector<ndb::Descriptor>& tarStates);
-    static std::vector<ndb::Correspondence> findCorrespondencesHashNaive(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates);
-    static std::vector<ndb::Correspondence> findCorrespondencesHash(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates);
-
-    static std::vector<ndb::Correspondence> findCorrespondencesHashingRadix(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates);
-
-    static std::vector<ndb::Correspondence> findCorrespondencesTurbo(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates);
-
-
-    static std::pair<SoAFrame, SoAFrame> prepareSoAFrames(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates);
-
-    static void prepareSoAFramesPersistent(
-        std::vector<ndb::Descriptor>& srcStates,
-        std::vector<ndb::Descriptor>& tarStates,
-        SoAFramePersistent& srcFrame, 
-        SoAFramePersistent& tarFrame);
-static void prepareSoAFramesPersistentSingleSlab(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates,
-    SoAFramePersistentSingleSlab& srcFrame, 
-    SoAFramePersistentSingleSlab& tarFrame);
-
-
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPreparedFrames( SoAFrame& src, SoAFrame& tar);
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPreparedFramesFaster( SoAFrame& src, SoAFrame& tar);
-
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchParallelRadixPartitioning(
-    SoAFrame& src, 
-    SoAFrame& tar) ;
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchBlockedBloom(
-    SoAFrame& src, 
-    SoAFrame& tar) ;
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptive(
-    SoAFrame& src, 
-    SoAFrame& tar);
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptivePersistent(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar);
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchPipelinedBranchless(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar);
-static void matchPipelinedBranchlessPreallocate(
-    SoAFramePersistent& src, 
-    SoAFramePersistent& tar,
-    std::vector<uint32_t>& resultSrc,
-    std::vector<uint32_t>& resultTar);
-
-/*
-static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> matchAdaptiveNeon(
-    SoAFrame& src, 
-    SoAFrame& tar);
-*/
-static void matchPipelinedBranchlessPreallocateSingleSlab(
-    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
-    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT);
-
-
-
-static void prepareSoAFramesPersistentSingleSlabUnordered(
-    std::vector<ndb::Descriptor>& srcStates,
-    std::vector<ndb::Descriptor>& tarStates,
-    SoAFramePersistentSingleSlab& srcFrame, 
-    SoAFramePersistentSingleSlab& tarFrame);
-static void matchPipelinedBranchlessPreallocateSingleSlabUnordered(
-    SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar,
-    std::vector<uint32_t>& outS, std::vector<uint32_t>& outT);
-
-
     /**
      * @brief Evaluates a given forest mask on an image and returns the
      * descriptors
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index 605daa2..714fbb6 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -166,7 +166,6 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
 void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #if defined(__ARM_NEON) || defined(__aarch64__)
-    // Force use of our new Highway kernel on Mac
     testing::box_hwy(in, blurred, width, height);
 #else
     #if HWY_TARGET == HWY_AVX2
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index 2817622..531b3ed 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -161,7 +161,6 @@ void sobel(uint8_t* in,
            int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #if defined(__ARM_NEON) || defined(__aarch64__)
-    // Force use of our new Highway kernel on Mac
         sobelNaive(in, blurred, width, height, threshold);
     //testing::sobel_hwy(in, blurred, width, height, threshold); // not exact!
 #else
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 3b94d9f..57864e8 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -3,7 +3,30 @@
 
 #include "gpc/forest.hpp"
 using namespace std;
-
+std::vector<ndb::Descriptor> gpcFilterDense(uint8_t* in,
+                    const std::vector<int32_t>& fastmask,
+                    int width,
+                    int height) {
+    uint32_t tmp;
+    uint32_t usableW = width - 26;
+    uint32_t usableH = height - 26;
+    std::vector<ndb::Descriptor> out(usableW * usableH);
+    int j = 0;
+    for (int y=13;y<height-13;y++) {
+        for (int x=13;x<width-13;x++) {
+            tmp = 0;
+            int idx = y * width + x; 
+            for (size_t i = 0; i < fastmask.size(); i += 2) {
+                tmp <<= 1;  // shift by one
+                if (*(in + idx + fastmask[i]) > *(in + idx + fastmask[i + 1]))
+                    tmp++;  // set this test's result to 1
+            }
+            out[j] = ndb::Descriptor(ndb::Point(x, y), tmp);
+            j++;
+        }
+    }
+    return out;
+}
 int main(int argc, char** argv) {
     std::string forestPath = "../forests/defaultZeroForest.txt";
     std::string leftImgPath = "../data/middlebury/im0.png";
@@ -67,11 +90,17 @@ int main(int argc, char** argv) {
     std::cout << "Number of matches: " << supp.size() << std::endl;
     std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
     std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl;
+    /*
     std::vector<ndb::Descriptor> statesSrc = forest.evalFastMaskOnSubsetSSE(
         simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings);
     std::vector<ndb::Descriptor> statesTar = forest.evalFastMaskOnSubsetSSE(
         timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings);
-    ndb::Descriptor::serialize("statesSrc.txt", statesSrc);
-    ndb::Descriptor::serialize("statesTar.txt", statesTar);
+    */
+
+    std::vector<ndb::Descriptor> statesSrc = gpcFilterDense(simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows());
+    std::vector<ndb::Descriptor> statesTar = gpcFilterDense(timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows());
+
+    ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc);
+    ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar);
 
 }
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index 2893b01..f7724b9 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -61,6 +61,7 @@ std::vector<ndb::Descriptor> getTarDescriptors() {
 }
 
 
+/*
 TEST(A,B) {
     std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
     std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
@@ -69,8 +70,6 @@ TEST(A,B) {
     std::vector<ndb::Descriptor> srcAlt = srcOriginal;
     std::vector<ndb::Descriptor> tarAlt = tarOriginal;
     
-    // Baseline
-    // To write a test for this we'd actually need to get the ids of the sources back, not just the final matches.
     std::vector<ndb::Correspondence> 
         matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline);
 
@@ -91,3 +90,4 @@ TEST(A,B) {
     EXPECT_EQ(matches.size(), resultSrc.size());
     EXPECT_EQ(matches.size(), resultTar.size());
 }
+*/

From e8ed2f3c3797deed9789ef33289882d44c1cf0b3 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 07:45:34 +0200
Subject: [PATCH 32/36] add samples/target.cpp to print the compiled Highway target

---
 samples/target.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 samples/target.cpp

diff --git a/samples/target.cpp b/samples/target.cpp
new file mode 100644
index 0000000..4f9a1bc
--- /dev/null
+++ b/samples/target.cpp
@@ -0,0 +1,17 @@
+#include <hwy/highway.h>
+#include <iostream>
+int main() {
+    // This is evaluated at compile-time
+    std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl;
+
+    // If you need logic based on the arch:
+#if HWY_TARGET == HWY_AVX2
+    std::cout << "Logic: Using 256-bit AVX2 paths." << std::endl;
+#elif HWY_TARGET == HWY_NEON
+    std::cout << "Logic: Using 128-bit NEON paths." << std::endl;
+#elif HWY_TARGET == HWY_SSE4
+    std::cout << "Logic: Using 128-bit SSE4 paths." << std::endl;
+#else
+    std::cout << "Logic: Using Scalar fallback." << std::endl;
+#endif
+}

From ee46a5b9c5fe1841c8fb12747b93756470a85934 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 07:49:09 +0200
Subject: [PATCH 33/36] sobel_hwy: dispatch to SobelKernelNoDiv instead of SobelKernel

---
 lib/gpc/kernels/sobel_hwy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index d1b331f..fe93031 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -182,7 +182,7 @@ namespace testing {
 //#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
     void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
         //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
-        HWY_STATIC_DISPATCH(SobelKernel)(in, blurred, width, height, threshold);
+        HWY_STATIC_DISPATCH(SobelKernelNoDiv)(in, blurred, width, height, threshold);
     }
 //#endif  
 }

From cbddf6b986183b71cfcbd636d35beea8fbf4ca82 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 08:46:39 +0200
Subject: [PATCH 34/36] approximate division by 9 with fixed-point
 multiplication for comparison with Hwy implementation.

---
 CMakeLists.txt            |  1 +
 lib/gpc/kernels/sobel.cpp |  7 +++++--
 samples/target.cpp        | 11 ++++-------
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57b44ae..72f760c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ project(openGPC CXX)
 set (REQ_CPP11_FEATURES  cxx_strong_enums cxx_auto_type)
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+  add_compile_options(-O3 -funroll-loops)
 endif()
 
 set(CMAKE_CXX_STANDARD 17) 
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index 531b3ed..7867cd3 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -59,9 +59,12 @@ void sobelNaive(
     // boundary) (unoptimized)
     for (int iy = 1; iy < height - 1; iy++) {
         for (int ix = 0; ix < width; ix++) {
-            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
+            // Approximate division by 9 with fixed-point multiplication (2^16/9 = 7282)
+            int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33);
+            int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33);
 
+            int sx = (static_cast<int32_t>(sum_x) * 7282) >> 16;
+            int sy = (static_cast<int32_t>(sum_y) * 7282) >> 16;
             int val = sx * sx + sy * sy;
 
             *optr = val > thresholdSq ? 255 : 0;
diff --git a/samples/target.cpp b/samples/target.cpp
index 4f9a1bc..6c03ab7 100644
--- a/samples/target.cpp
+++ b/samples/target.cpp
@@ -1,17 +1,14 @@
 #include <hwy/highway.h>
 #include <iostream>
 int main() {
-    // This is evaluated at compile-time
     std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl;
-
-    // If you need logic based on the arch:
 #if HWY_TARGET == HWY_AVX2
-    std::cout << "Logic: Using 256-bit AVX2 paths." << std::endl;
+    std::cout << "Using 256-bit AVX2 paths." << std::endl;
 #elif HWY_TARGET == HWY_NEON
-    std::cout << "Logic: Using 128-bit NEON paths." << std::endl;
+    std::cout << "Using 128-bit NEON paths." << std::endl;
 #elif HWY_TARGET == HWY_SSE4
-    std::cout << "Logic: Using 128-bit SSE4 paths." << std::endl;
+    std::cout << "Using 128-bit SSE4 paths." << std::endl;
 #else
-    std::cout << "Logic: Using Scalar fallback." << std::endl;
+    std::cout << "Using Scalar fallback." << std::endl;
 #endif
 }

From bd27dfaaf9c5b1cd402c9af0287828072d129e6d Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 08:57:22 +0200
Subject: [PATCH 35/36] formatting

---
 format_code.sh                |   2 +-
 lib/gpc/Feature.hpp           |  11 +-
 lib/gpc/Fern.hpp              |  17 +--
 lib/gpc/buffer.hpp            |  41 +++----
 lib/gpc/feature.cpp           |  49 ++++----
 lib/gpc/fern.cpp              |  36 +++---
 lib/gpc/forest.cpp            |  92 +++++++-------
 lib/gpc/forest.hpp            |   8 +-
 lib/gpc/inference.hpp         |   4 +-
 lib/gpc/kernels/box.cpp       |  69 +++++++----
 lib/gpc/kernels/box.hpp       |  28 ++---
 lib/gpc/kernels/box_hwy.cpp   | 108 ++++++++++++-----
 lib/gpc/kernels/box_hwy.hpp   |  12 +-
 lib/gpc/kernels/census.cpp    |   5 +-
 lib/gpc/kernels/census.hpp    |   3 +-
 lib/gpc/kernels/gpc.cpp       |  66 +++++-----
 lib/gpc/kernels/gpc.hpp       |  21 ++--
 lib/gpc/kernels/gpc_hwy.cpp   | 113 ++++++++++--------
 lib/gpc/kernels/gpc_hwy.hpp   |  12 +-
 lib/gpc/kernels/sobel.cpp     |  86 ++++++++-----
 lib/gpc/kernels/sobel.hpp     |  13 +-
 lib/gpc/kernels/sobel_hwy.cpp | 219 +++++++++++++++++++++++-----------
 lib/gpc/kernels/sobel_hwy.hpp |  13 +-
 lib/gpc/kernels/utils.cpp     |  14 +--
 lib/gpc/kernels/utils.hpp     |   8 +-
 samples/sparsematch.cpp       |  35 +++---
 samples/target.cpp            |   1 +
 27 files changed, 622 insertions(+), 464 deletions(-)

diff --git a/format_code.sh b/format_code.sh
index 0eaf149..16d9cb5 100755
--- a/format_code.sh
+++ b/format_code.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
-EXPECTED_VERSION="21.1.5"
+EXPECTED_VERSION="21.1.8"
 root_folder=$(git rev-parse --show-toplevel)
 
 change_in_place=false
diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp
index 8a8b55c..c2064b4 100644
--- a/lib/gpc/Feature.hpp
+++ b/lib/gpc/Feature.hpp
@@ -96,10 +96,10 @@ class Feature {
      * @param[in]  trip    The triplet
      */
     void getDecisions(bool& ref,
-                             bool& pos,
-                             bool& neg,
-                             params& params,
-                             const GPCPatchTriplet& trip);
+                      bool& pos,
+                      bool& neg,
+                      params& params,
+                      const GPCPatchTriplet& trip);
 
     Feature();
     /**
@@ -138,8 +138,7 @@ class Feature {
      * @param path The path where we'd like to store the training data
      *             in binary form.
      */
-    void storeAllTriplets(std::vector<GPCPatchTriplet>& data,
-                          std::string path);
+    void storeAllTriplets(std::vector<GPCPatchTriplet>& data, std::string path);
     /**
      * @brief Read triplets of training data from a binary file
      *        written by the storeAllTriplets method.
diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp
index d9554fd..68c6a1f 100644
--- a/lib/gpc/Fern.hpp
+++ b/lib/gpc/Fern.hpp
@@ -181,7 +181,7 @@ OptimizerSettings TauOptimizer(int taulo,
  */
 OptimizerSettings ZeroOptimizer(int numResamples,
                                 bool onlyScoreNonSplitSamples,
-                                double w1) ;
+                                double w1);
 struct FernSettings {
     const int maxDepth;
     const int scale;
@@ -227,7 +227,7 @@ class Fern {
                    OptimizerSettings optsetting,
                    int scoreUntilLevel,
                    splitStats& s);
-     /**
+    /**
      * @brief      Mark those samples in the set as "split" if they have been
      *             correctly classified(ref=pos and pos!=neg) with the parameter
      * set in params
@@ -238,7 +238,7 @@ class Fern {
      */
     void markSplitSamples(std::vector<GPCTriplet_t>& data,
                           std::vector<SplitParams_t>& params,
-                          int numParams) ;
+                          int numParams);
     /**
      * @brief Reset the mark on the training samples on whether they have been
      * split correctly or not Since we do not operate on copies of the training
@@ -247,7 +247,7 @@ class Fern {
      * @param data
      */
     void resetMarkOnSamples(std::vector<GPCTriplet_t>& data);
-   
+
     /**
      * @brief Train a fern given a set of training data and some optimizer
      * settings
@@ -256,8 +256,8 @@ class Fern {
      * @param optsetting      the optimizer settings
      */
     void train(std::vector<GPCTriplet_t>& trainingSamples,
-               OptimizerSettings optsetting) ;
-   
+               OptimizerSettings optsetting);
+
     /**
      * @brief      Returns the decision of the first five levels of the ferns
      *
@@ -284,7 +284,10 @@ class Fern {
  *
  * @return
  */
-inline std::vector<Fern> FernFactory(int num_S, int num_M, int num_L, int maxDepth) {
+inline std::vector<Fern> FernFactory(int num_S,
+                                     int num_M,
+                                     int num_L,
+                                     int maxDepth) {
     std::vector<Fern> ferns;
     for (int i = 0; i < num_S; i++)
         ferns.push_back(Fern(FernSettings(maxDepth, 2)));
diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp
index 903a507..0b58119 100644
--- a/lib/gpc/buffer.hpp
+++ b/lib/gpc/buffer.hpp
@@ -33,12 +33,12 @@
 #include <png.h>
 
 #include <Eigen/Dense>
-#include <type_traits>
-#include <vector>
-#include <iostream>
 #include <fstream>
-#include <string>
+#include <iostream>
 #include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
 
 using namespace std;
 
@@ -84,17 +84,18 @@ struct Descriptor {
     bool operator<(const Descriptor& d) const { return state < d.state; }
     bool operator<=(const Descriptor& d) const { return state <= d.state; }
     int operator%(const int& d) const { return state % d; }
-    static void serialize(const std::string& filename, const std::vector<Descriptor>& data) {
+    static void serialize(const std::string& filename,
+                          const std::vector<Descriptor>& data) {
         std::ofstream outFile(filename);
         if (!outFile.is_open()) {
-            std::cerr << "Error opening file for writing: " << filename << std::endl;
+            std::cerr << "Error opening file for writing: " << filename
+                      << std::endl;
             return;
         }
 
         for (const auto& desc : data) {
-            outFile << desc.point.x << "," 
-                    << desc.point.y << "," 
-                    << desc.state << "\n";
+            outFile << desc.point.x << "," << desc.point.y << "," << desc.state
+                    << "\n";
         }
         outFile.close();
     }
@@ -102,11 +103,13 @@ struct Descriptor {
     /**
      * Deserializes a CSV file back into a vector of Descriptors.
      */
-    static std::vector<Descriptor> deserialize(const std::string& filename, bool srcDescr) {
+    static std::vector<Descriptor> deserialize(const std::string& filename,
+                                               bool srcDescr) {
         std::vector<Descriptor> result;
         std::ifstream inFile(filename);
         if (!inFile.is_open()) {
-            std::cerr << "Error opening file for reading: " << filename << std::endl;
+            std::cerr << "Error opening file for reading: " << filename
+                      << std::endl;
             return result;
         }
 
@@ -118,18 +121,16 @@ struct Descriptor {
             std::string x_str, y_str, state_str;
 
             // Split by comma
-            if (std::getline(ss, x_str, ',') &&
-                std::getline(ss, y_str, ',') &&
+            if (std::getline(ss, x_str, ',') && std::getline(ss, y_str, ',') &&
                 std::getline(ss, state_str, ',')) {
-                
                 Descriptor d;
                 d.point.x = std::stod(x_str);
                 d.point.y = std::stod(y_str);
                 d.state = std::stoull(state_str);
-                d.srcDescr = srcDescr; 
-                
-                //if (d.point.y > 200 && d.point.y < 400)
-                    result.push_back(d);
+                d.srcDescr = srcDescr;
+
+                // if (d.point.y > 200 && d.point.y < 400)
+                result.push_back(d);
             }
         }
         inFile.close();
@@ -1024,8 +1025,8 @@ inline Buffer<RGBColor> getDisparityVisualization(
     }
     return dispVis;
 }
-inline Buffer<RGBColor> getDisparityVisualization(ndb::Buffer<uint8_t>& srcImg,
-                                           std::vector<Support>& support) {
+inline Buffer<RGBColor> getDisparityVisualization(
+    ndb::Buffer<uint8_t>& srcImg, std::vector<Support>& support) {
     float min_disparity = 0;
     float max_disparity = 128;
     Buffer<RGBColor> dispVis(Eigen::Vector2i(srcImg.width, srcImg.rows()));
diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp
index 529970c..9e1322f 100644
--- a/lib/gpc/feature.cpp
+++ b/lib/gpc/feature.cpp
@@ -36,9 +36,9 @@
 #include <algorithm>
 #include <cmath>  //for log2
 #include <fstream>
+#include <gpc/Feature.hpp>
 #include <gpc/buffer.hpp>
 #include <gpc/kernels/box.hpp>
-#include <gpc/Feature.hpp>
 #include <iostream>
 #include <iterator>
 #include <random>
@@ -51,19 +51,16 @@ using namespace std;
 namespace gpc {
 namespace training {
 void Feature::getDecisions(bool& ref,
-                         bool& pos,
-                         bool& neg,
-                         params& params,
-                         const GPCPatchTriplet& trip) {
-    ref =
-        ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) <
-         params.tau);
-    pos =
-        ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) <
-         params.tau);
-    neg =
-        ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) <
-         params.tau);
+                           bool& pos,
+                           bool& neg,
+                           params& params,
+                           const GPCPatchTriplet& trip) {
+    ref = ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) <
+           params.tau);
+    pos = ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) <
+           params.tau);
+    neg = ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) <
+           params.tau);
 }
 
 Feature::Feature() {
@@ -77,7 +74,7 @@ Feature::Feature() {
 }
 void Feature::sampleHyperplane(int scale, params& params) {
     if (scale == 2) {
-        params.i = params.j;  // s.t. they regenerate each iteration
+        params.i = params.j;            // s.t. they regenerate each iteration
         while (params.i == params.j) {  // i and j need to be distinct
             int i = randIJ7(rng);
             int j = randIJ7(rng);
@@ -90,7 +87,7 @@ void Feature::sampleHyperplane(int scale, params& params) {
             params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3);
         }
     } else if (scale == 1) {
-        params.i = params.j;  // s.t. they regenerate each iteration
+        params.i = params.j;            // s.t. they regenerate each iteration
         while (params.i == params.j) {  // i and j need to be distinct
             int i = randIJ17(rng);
             int j = randIJ17(rng);
@@ -103,7 +100,7 @@ void Feature::sampleHyperplane(int scale, params& params) {
             params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8);
         }
     } else if (scale == 0) {
-        params.i = params.j;  // s.t. they regenerate each iteration
+        params.i = params.j;            // s.t. they regenerate each iteration
         while (params.i == params.j) {  // i and j need to be distinct
             params.i = randIJ27(rng);
             params.j = randIJ27(rng);
@@ -119,11 +116,11 @@ void Feature::sampleHyperplane(int scale, params& params) {
     params.tau = randTAU(rng);
 }
 void Feature::extractAllTriplets(ndb::Buffer<uint8_t>& bwL,
-                        ndb::Buffer<uint8_t>& bwR,
-                        std::vector<ndb::Point>& ref,
-                        std::vector<ndb::Point>& pos,
-                        std::vector<ndb::Point>& neg,
-                        std::vector<GPCPatchTriplet>& triplets) {
+                                 ndb::Buffer<uint8_t>& bwR,
+                                 std::vector<ndb::Point>& ref,
+                                 std::vector<ndb::Point>& pos,
+                                 std::vector<ndb::Point>& neg,
+                                 std::vector<GPCPatchTriplet>& triplets) {
     ndb::Buffer<uint8_t> LL(bwL.rows(), bwL.cols());
     LL.width = bwL.width;
     ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1);
@@ -174,7 +171,7 @@ void Feature::extractAllTriplets(ndb::Buffer<uint8_t>& bwL,
 }
 
 void Feature::storeAllTriplets(std::vector<GPCPatchTriplet>& data,
-                      std::string path) {
+                               std::string path) {
     ofstream fout;
     fout.open(path, ios::binary | ios::out);
     for (auto& triplet : data) {
@@ -184,13 +181,13 @@ void Feature::storeAllTriplets(std::vector<GPCPatchTriplet>& data,
     }
     fout.close();
 }
-std::vector<Feature::GPCPatchTriplet> Feature::loadAllTriplets(std::string path) {
+std::vector<Feature::GPCPatchTriplet> Feature::loadAllTriplets(
+    std::string path) {
     std::vector<Feature::GPCPatchTriplet> data;
     std::ifstream in(path, std::ifstream::ate | std::ifstream::binary);
     uint32_t filesize = in.tellg();
     if (filesize % ((27 * 27) * 3)) {
-        cout << "ERR: File is not a training set of this feature type"
-             << endl;
+        cout << "ERR: File is not a training set of this feature type" << endl;
         cout << "FS: " << filesize << endl;
         return data;
     }
diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp
index a171218..5a2632c 100644
--- a/lib/gpc/fern.cpp
+++ b/lib/gpc/fern.cpp
@@ -31,6 +31,8 @@
 // The Global Patch Collider
 // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
 // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/Fern.hpp"
+
 #include <Eigen/Dense>
 #include <iomanip>
 #include <iostream>
@@ -38,7 +40,6 @@
 #include <vector>
 
 #include "gpc/Feature.hpp"
-#include "gpc/Fern.hpp"
 
 using namespace std;
 namespace gpc {
@@ -57,11 +58,11 @@ OptimizerSettings ZeroOptimizer(int numResamples,
     return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1);
 }
 void Fern::evalSplit(std::vector<GPCTriplet_t>& data,
-               std::vector<SplitParams_t>& params,
-               FernSettings fernsetting,
-               OptimizerSettings optsetting,
-               int scoreUntilLevel,
-               splitStats& s) {
+                     std::vector<SplitParams_t>& params,
+                     FernSettings fernsetting,
+                     OptimizerSettings optsetting,
+                     int scoreUntilLevel,
+                     splitStats& s) {
     s.tp = 0;
     s.fn = 0;
     s.fp = 0;
@@ -80,8 +81,7 @@ void Fern::evalSplit(std::vector<GPCTriplet_t>& data,
             bool refDec, posDec, negDec;
 
             // Decisions need to be added into a codeword
-            Feature.getDecisions(
-                refDec, posDec, negDec, params[i], triplet);
+            Feature.getDecisions(refDec, posDec, negDec, params[i], triplet);
             if (refDec) ref++;
             if (posDec) pos++;
             if (negDec) neg++;
@@ -118,8 +118,8 @@ void Fern::evalSplit(std::vector<GPCTriplet_t>& data,
     s.convcomb = (1. - w2) * s.prec + w2 * s.rec;
 }
 void Fern::markSplitSamples(std::vector<GPCTriplet_t>& data,
-                      std::vector<SplitParams_t>& params,
-                      int numParams) {
+                            std::vector<SplitParams_t>& params,
+                            int numParams) {
     for (auto& triplet : data) {
         // Evaluate triplet on all given parameters
         uint64_t ref = 0, pos = 0, neg = 0;
@@ -129,8 +129,7 @@ void Fern::markSplitSamples(std::vector<GPCTriplet_t>& data,
             neg <<= 1;  // shift by one
             bool refDec, posDec, negDec;
 
-            Feature.getDecisions(
-                refDec, posDec, negDec, params[i], triplet);
+            Feature.getDecisions(refDec, posDec, negDec, params[i], triplet);
             if (refDec) ref++;
             if (posDec) pos++;
             if (negDec) neg++;
@@ -147,7 +146,7 @@ void Fern::resetMarkOnSamples(std::vector<GPCTriplet_t>& data) {
 }
 
 void Fern::train(std::vector<GPCTriplet_t>& trainingSamples,
-           OptimizerSettings optsetting) {
+                 OptimizerSettings optsetting) {
     splitStats stats;
     float maxScore = 0.f;
     SplitParams_t bestParams;
@@ -155,9 +154,9 @@ void Fern::train(std::vector<GPCTriplet_t>& trainingSamples,
     fernparams.resize(fernsettings.maxDepth);
 
     cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec"
-         << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP"
-         << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale"
-         << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl;
+         << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" << setw(8)
+         << "FP" << setw(8) << "FN" << setw(6) << "scale" << setw(5) << "tau"
+         << setw(5) << "i" << setw(5) << "j" << endl;
     if (optsetting.onlyScoreNonSplitSamples_)
         resetMarkOnSamples(trainingSamples);
     for (int level = 0; level < fernsettings.maxDepth; level++) {
@@ -166,8 +165,7 @@ void Fern::train(std::vector<GPCTriplet_t>& trainingSamples,
             // Samples a hyperplane in the requested scale
             Feature.sampleHyperplane(fernsettings.scale, fernparams[level]);
             // Iterates over a small range of tau (intercept)
-            for (int tau = optsetting.taulo_; tau < optsetting.tauhi_;
-                 tau++) {
+            for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; tau++) {
                 fernparams[level].tau = tau;
                 // Score hyperplane set we have so far
                 evalSplit(trainingSamples,
@@ -202,7 +200,5 @@ std::vector<Fern::SplitParams_t> Fern::getParameters() { return fernparams; }
 
 int Fern::getScale() { return fernsettings.scale; }
 
-
-
 }  // namespace training
 }  // namespace gpc
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
index 87e0582..940eb33 100644
--- a/lib/gpc/forest.cpp
+++ b/lib/gpc/forest.cpp
@@ -32,7 +32,7 @@
 // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
 // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
 #include <Eigen/Dense>
-//#include <arm_neon.h>
+// #include <arm_neon.h>
 #include <chrono>
 #include <cstring>
 #include <fstream>
@@ -43,32 +43,32 @@
 #include <vector>
 
 // GPC includes
+#include <unordered_map>
+
 #include "gpc/Feature.hpp"
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/kernels/sobel.hpp"
+#include "gpc/forest.hpp"
+#include "gpc/hashmatch.hpp"
 #include "gpc/kernels/box.hpp"
 #include "gpc/kernels/gpc.hpp"
+#include "gpc/kernels/sobel.hpp"
 #include "gpc/kernels/utils.hpp"
-#include "gpc/hashmatch.hpp"
-#include "gpc/forest.hpp"
-#include <unordered_map>
-
 
 namespace gpc {
 namespace inference {
-    /**
-     * @brief Computes sparse matches on a pair of rectified and smoothed
-     * images. Here the src and tar images refer to the left and right images,
-     * respectively.
-     *
-     * @param src    Preprocessed source(left) image
-     * @param tar    Preprocessed target(right) image
-     * @param fastmask    forest mask of relative integer offsets.
-     *
-     * @return
-     */
+/**
+ * @brief Computes sparse matches on a pair of rectified and smoothed
+ * images. Here the src and tar images refer to the left and right images,
+ * respectively.
+ *
+ * @param src    Preprocessed source(left) image
+ * @param tar    Preprocessed target(right) image
+ * @param fastmask    forest mask of relative integer offsets.
+ *
+ * @return Sparse correspondences between source and target image points.
+ */
 std::vector<ndb::Correspondence> Forest::depthPriorFast(
     PreprocessedImage& src,
     PreprocessedImage& tar,
@@ -87,12 +87,13 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
     }
     // Use sort method for matching
     if (settings.useHashtable_ == false) {
-    t0 = sysTick();
+        t0 = sysTick();
         std::vector<ndb::Correspondence> corr =
             findCorrespondences(statesSrc, statesTar);
-    t1 = sysTick();
-    std::cout << "findCorrespondences (without allocation): " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
-    std::cout << "length src: " << statesSrc.size() << std::endl;
+        t1 = sysTick();
+        std::cout << "findCorrespondences (without allocation): "
+                  << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
+        std::cout << "length src: " << statesSrc.size() << std::endl;
         return corr;
     }
     // Use hashtable matching
@@ -100,9 +101,8 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
         for (auto& q : statesSrc) q.srcDescr = true;
         for (auto& q : statesTar) q.srcDescr = false;
 
-        ndb::Hashmatch<ndb::Descriptor> hm(
-            214673,  
-            statesSrc.size() + statesTar.size());
+        ndb::Hashmatch<ndb::Descriptor> hm(214673,
+                                           statesSrc.size() + statesTar.size());
         std::vector<std::pair<ndb::Descriptor, ndb::Descriptor>> corr;
         for (auto& q : statesSrc) hm.insert(q);
         for (auto& q : statesTar) hm.insert(q);
@@ -110,8 +110,7 @@ std::vector<ndb::Correspondence> Forest::depthPriorFast(
         // Store vertices in a format that is more convenient for us:
         std::vector<ndb::Correspondence> corr2;
         for (auto& e : corr) {
-            corr2.push_back(
-                ndb::Correspondence(e.first.point, e.second.point));
+            corr2.push_back(ndb::Correspondence(e.first.point, e.second.point));
         }
 
         return corr2;
@@ -207,7 +206,7 @@ std::vector<ndb::Descriptor> Forest::evalFastMaskOnSubsetSSE(
  * @return the preprocessed image
  */
 PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
-                                  InferenceSettings settings) {
+                                          InferenceSettings settings) {
     assert((settings.gradientThreshold_ >= 0 &&
             settings.gradientThreshold_ <= 255) &&
            "gradientThreshold needs to be within 0...255");
@@ -221,11 +220,11 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
              img.cols(),
              img.rows(),
              settings.numThreads_);
-    //4.2 *10^-5 ms
+    // 4.2 *10^-5 ms
     smooth.clearBoundary();
     ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
     grad.width = img.width;
-    //4.2*10-5ms (unclear how)
+    // 4.2*10-5ms (unclear how)
     ndb::sobel(img.data(),
                grad.data(),
                img.cols(),
@@ -265,21 +264,20 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
  * @return                  Set of correspondences (ptSrc, ptTar) where
  * ptSrc and ptTar are points in the source and target images, respectively.
  */
-std::vector<ndb::Correspondence> Forest::stereoMatch(PreprocessedImage& simg,
-                                             PreprocessedImage& timg,
-                                             FilterMask& forestmask,
-                                             InferenceSettings settings) {
+std::vector<ndb::Correspondence> Forest::stereoMatch(
+    PreprocessedImage& simg,
+    PreprocessedImage& timg,
+    FilterMask& forestmask,
+    InferenceSettings settings) {
     // make sure the delivered mask matches the image dimensions
-    assert(
-        (forestmask.width == simg.smooth.cols() &&
-         forestmask.height == simg.smooth.rows()) &&
-        "Source Image: dimension does not fit dimension of supplied forest "
-        "mask");
-    assert(
-        (forestmask.width == timg.smooth.cols() &&
-         forestmask.height == simg.smooth.rows()) &&
-        "Targe Image: dimension does not fit dimension of supplied forest "
-        "mask");
+    assert((forestmask.width == simg.smooth.cols() &&
+            forestmask.height == simg.smooth.rows()) &&
+           "Source Image: dimension does not fit dimension of supplied forest "
+           "mask");
+    assert((forestmask.width == timg.smooth.cols() &&
+            forestmask.height == timg.smooth.rows()) &&
+           "Target Image: dimension does not fit dimension of supplied forest "
+           "mask");
     bool m_debug = false;
     // Match
     std::vector<ndb::Correspondence> corr =
@@ -303,9 +301,9 @@ std::vector<ndb::Correspondence> Forest::stereoMatch(PreprocessedImage& simg,
  * of a point in the left image and d the disparity.
  */
 std::vector<ndb::Support> Forest::rectifiedMatch(PreprocessedImage& simg,
-                                         PreprocessedImage& timg,
-                                         FilterMask& forestmask,
-                                         InferenceSettings settings) {
+                                                 PreprocessedImage& timg,
+                                                 FilterMask& forestmask,
+                                                 InferenceSettings settings) {
     // Do matching
     std::vector<ndb::Correspondence> corr =
         stereoMatch(simg, timg, forestmask, settings);
@@ -379,4 +377,4 @@ FilterMask Forest::readForest(std::string path, int width, int height) {
 }
 
 }  // namespace inference
-}
+}  // namespace gpc
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
index 6463aec..6226b83 100644
--- a/lib/gpc/forest.hpp
+++ b/lib/gpc/forest.hpp
@@ -61,7 +61,7 @@ inline std::chrono::high_resolution_clock::time_point sysTick() {
     return std::chrono::high_resolution_clock::now();
 }
 inline float tickToMs(std::chrono::high_resolution_clock::time_point t0,
-               std::chrono::high_resolution_clock::time_point t1) {
+                      std::chrono::high_resolution_clock::time_point t1) {
     return std::abs(
         1000. *
         std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0)
@@ -216,7 +216,7 @@ class Forest {
         std::vector<int>& idx,
         FilterMask& fastmask,
         InferenceSettings& settings);
-        
+
     /**
      * @brief Preprocesses an image. (smooth, binary sobel image and gradient
      * pixel indices)
@@ -228,7 +228,7 @@ class Forest {
      */
     PreprocessedImage preprocessImage(ndb::Buffer<uint8_t>& img,
                                       InferenceSettings settings);
-     /**
+    /**
      * @brief Finds matches between two stereo images based on a given forest
      * mask.
      *
@@ -261,7 +261,7 @@ class Forest {
                                              PreprocessedImage& timg,
                                              FilterMask& forestmask,
                                              InferenceSettings settings);
-                                            
+
     /**
      * @brief Reads text-based forest format and returns a mask for a given
      * image size.
diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp
index e074290..df1c840 100644
--- a/lib/gpc/inference.hpp
+++ b/lib/gpc/inference.hpp
@@ -48,11 +48,11 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
+#include "gpc/hashmatch.hpp"
 #include "gpc/kernels/box.hpp"
-#include "gpc/kernels/sobel.hpp"
 #include "gpc/kernels/gpc.hpp"
+#include "gpc/kernels/sobel.hpp"
 #include "gpc/kernels/utils.hpp"
-#include "gpc/hashmatch.hpp"
 
 /**
  * @brief      The inference class of the GPC forest
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
index 714fbb6..5b51b60 100644
--- a/lib/gpc/kernels/box.cpp
+++ b/lib/gpc/kernels/box.cpp
@@ -30,11 +30,13 @@
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
 
 #include "gpc/kernels/box.hpp"
-#include "gpc/kernels/utils.hpp"
+
 #include <cassert>
+
+#include "gpc/kernels/utils.hpp"
 namespace ndb {
-namespace testing { 
-    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); 
+namespace testing {
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
 }
 void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
@@ -83,12 +85,12 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
 void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
     int start = 1;
     int end = height - 3;
-    
+
     int x, y;
-    __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1
-    
-    __m128i *dst0 = (__m128i*)(blurred + width * start);
-    __m128i *dst1 = (__m128i*)(blurred + width * (start + 1));
+    __m128i one_third = _mm_set1_epi16(21846);  // 2^16/3 + 1
+
+    __m128i* dst0 = (__m128i*)(blurred + width * start);
+    __m128i* dst1 = (__m128i*)(blurred + width * (start + 1));
 
     for (y = start; y < end; y += 2) {
         const uint8_t *row0, *row1, *row2, *row3;
@@ -111,8 +113,10 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
             unpack8to16(s00, a00, b00);
             unpack8to16(s01, a01, b01);
             unpack8to16(s02, a02, b02);
-            ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-            rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+            ra00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
 
             // Row 1 Processing
             s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
@@ -121,8 +125,10 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
             unpack8to16(s00, a00, b00);
             unpack8to16(s01, a01, b01);
             unpack8to16(s02, a02, b02);
-            ra01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-            rb01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+            ra01 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb01 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
 
             // Row 2 Processing
             s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
@@ -131,12 +137,16 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
             unpack8to16(s00, a00, b00);
             unpack8to16(s01, a01, b01);
             unpack8to16(s02, a02, b02);
-            ra02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-            rb02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+            ra02 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb02 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
 
             // Accumulate rows 0, 1, 2 for dst0
-            tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third);
-            tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third);
+            tmp0 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third);
+            tmp1 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third);
             pack16to8(tmp0, tmp1, res);
             _mm_store_si128(dst0++, res);
 
@@ -147,16 +157,23 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
             unpack8to16(s00, a00, b00);
             unpack8to16(s01, a01, b01);
             unpack8to16(s02, a02, b02);
-            ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-            rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+            ra00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
 
             // Accumulate rows 1, 2, 3 for dst1
-            tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third);
-            tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third);
+            tmp0 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third);
+            tmp1 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third);
             pack16to8(tmp0, tmp1, res);
             _mm_store_si128(dst1++, res);
 
-            row0 += 16; row1 += 16; row2 += 16; row3 += 16;
+            row0 += 16;
+            row1 += 16;
+            row2 += 16;
+            row3 += 16;
         }
         dst0 += width / 16;
         dst1 += width / 16;
@@ -168,11 +185,11 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
 #if defined(__ARM_NEON) || defined(__aarch64__)
     testing::box_hwy(in, blurred, width, height);
 #else
-    #if HWY_TARGET == HWY_AVX2
-        boxSSE(in, blurred, width, height);
-    #else
-        boxNaive(in, blurred, width, height);
-    #endif
+#if HWY_TARGET == HWY_AVX2
+    boxSSE(in, blurred, width, height);
+#else
+    boxNaive(in, blurred, width, height);
+#endif
 #endif
 }
 }  // namespace ndb
diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp
index eef0b3d..b00dc88 100644
--- a/lib/gpc/kernels/box.hpp
+++ b/lib/gpc/kernels/box.hpp
@@ -48,25 +48,21 @@ namespace ndb {
 void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
 
 /**
-   * @brief      boxfilter using SSE2 instructions. Loosely based on
-   *             https://www.ignorantus.com/box_sse2/, published under
-   *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
-   *
-   * @param      in       input image
-   * @param      blurred  The blurred
-   * @param[in]  width    The width
-   * @param[in]  height   The height
-   * @param[in]  numThreads number of threads to use
-   */
+ * @brief      boxfilter using SSE2 instructions. Loosely based on
+ *             https://www.ignorantus.com/box_sse2/, published under
+ *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
+ *
+ * @param      in       input image
+ * @param      blurred  The blurred output image
+ * @param[in]  width    The width
+ * @param[in]  height   The height
+ * @param[in]  numThreads number of threads to use
+ */
 void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
 
 #if HWY_TARGET == HWY_AVX2
-void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); 
+void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height);
 #endif
 
-}
+}  // namespace ndb
 #endif
-
-
-
-
diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
index 3cc2736..ba2f5dd 100644
--- a/lib/gpc/kernels/box_hwy.cpp
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -1,17 +1,19 @@
-#define HWY_TARGET HWY_NEON 
+#define HWY_TARGET HWY_NEON
 #include <hwy/highway.h>
 
-HWY_BEFORE_NAMESPACE(); 
+HWY_BEFORE_NAMESPACE();
 namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-
-void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) {
+void BoxKernel(const uint8_t* HWY_RESTRICT in,
+               uint8_t* HWY_RESTRICT blurred,
+               int width,
+               int height) {
     const hn::ScalableTag<uint8_t> d8;
     const hn::Half<decltype(d8)> d8_h;
     const hn::Rebind<uint16_t, decltype(d8_h)> d16;
-    
+
     const size_t N = hn::Lanes(d8);
     const auto divisor = hn::Set(d16, (uint16_t)7282);
 
@@ -20,46 +22,90 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in
         const uint8_t* r1 = in + y * width;
         const uint8_t* r2 = in + (y + 1) * width;
         const uint8_t* r3 = in + (y + 2) * width;
-        
+
         uint8_t* out0 = blurred + y * width + 1;
         uint8_t* out1 = blurred + (y + 1) * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            auto v00 = hn::LoadU(d8, r0+x); auto v01 = hn::LoadU(d8, r0+x+1); auto v02 = hn::LoadU(d8, r0+x+2);
-            auto v10 = hn::LoadU(d8, r1+x); auto v11 = hn::LoadU(d8, r1+x+1); auto v12 = hn::LoadU(d8, r1+x+2);
-            auto v20 = hn::LoadU(d8, r2+x); auto v21 = hn::LoadU(d8, r2+x+1); auto v22 = hn::LoadU(d8, r2+x+2);
-            auto v30 = hn::LoadU(d8, r3+x); auto v31 = hn::LoadU(d8, r3+x+1); auto v32 = hn::LoadU(d8, r3+x+2);
+            auto v00 = hn::LoadU(d8, r0 + x);
+            auto v01 = hn::LoadU(d8, r0 + x + 1);
+            auto v02 = hn::LoadU(d8, r0 + x + 2);
+            auto v10 = hn::LoadU(d8, r1 + x);
+            auto v11 = hn::LoadU(d8, r1 + x + 1);
+            auto v12 = hn::LoadU(d8, r1 + x + 2);
+            auto v20 = hn::LoadU(d8, r2 + x);
+            auto v21 = hn::LoadU(d8, r2 + x + 1);
+            auto v22 = hn::LoadU(d8, r2 + x + 2);
+            auto v30 = hn::LoadU(d8, r3 + x);
+            auto v31 = hn::LoadU(d8, r3 + x + 1);
+            auto v32 = hn::LoadU(d8, r3 + x + 2);
 
             // Lower Half Math
-            auto s1_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), hn::PromoteTo(d16, hn::LowerHalf(v12))));
-            auto s2_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), hn::PromoteTo(d16, hn::LowerHalf(v22))));
-            
-            auto row0_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), hn::PromoteTo(d16, hn::LowerHalf(v02)))), hn::Add(s1_lo, s2_lo));
-            auto row1_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), hn::PromoteTo(d16, hn::LowerHalf(v32)))), hn::Add(s1_lo, s2_lo));
+            auto s1_lo =
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v12))));
+            auto s2_lo =
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v22))));
+
+            auto row0_lo = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v02)))),
+                hn::Add(s1_lo, s2_lo));
+            auto row1_lo = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v32)))),
+                hn::Add(s1_lo, s2_lo));
 
             // Upper Half Math
-            auto s1_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12))));
-            auto s2_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22))));
-            
-            auto row0_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), hn::Add(s1_hi, s2_hi));
-            auto row1_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), hn::Add(s1_hi, s2_hi));
+            auto s1_hi =
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12))));
+            auto s2_hi =
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22))));
 
-            hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x);
-            hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x);
+            auto row0_hi = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))),
+                hn::Add(s1_hi, s2_hi));
+            auto row1_hi = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))),
+                hn::Add(s1_hi, s2_hi));
+
+            hn::StoreU(hn::OrderedDemote2To(d8,
+                                            hn::MulHigh(row0_lo, divisor),
+                                            hn::MulHigh(row0_hi, divisor)),
+                       d8,
+                       out0 + x);
+            hn::StoreU(hn::OrderedDemote2To(d8,
+                                            hn::MulHigh(row1_lo, divisor),
+                                            hn::MulHigh(row1_hi, divisor)),
+                       d8,
+                       out1 + x);
         }
     }
 }
-} // namespace HWY_NAMESPACE
-} // namespace ndb
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
-//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
-        //ndb::N_NEON::BoxKernel(in, blurred, width, height);
-        HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height);
-    }
-//#endif
-}
+// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
+    // ndb::N_NEON::BoxKernel(in, blurred, width, height);
+    HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height);
 }
+// #endif
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp
index 6c256b0..ae39d71 100644
--- a/lib/gpc/kernels/box_hwy.hpp
+++ b/lib/gpc/kernels/box_hwy.hpp
@@ -1,4 +1,4 @@
-#ifndef  __NDB__KERNEL_BOX_HWY
+#ifndef __NDB__KERNEL_BOX_HWY
 #define __NDB__KERNEL_BOX_HWY
 
 #include <cstdint>
@@ -6,12 +6,12 @@
 namespace ndb {
 
 namespace testing {
-    /**
-     * Entry point for benchmarking the MulHigh (approximate) version.
-     */
-    void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
+/**
+ * Entry point for benchmarking the MulHigh (approximate) version.
+ */
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
 
-}
+}  // namespace testing
 
 }  // namespace ndb
 
diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp
index 6235b06..8b265cc 100644
--- a/lib/gpc/kernels/census.cpp
+++ b/lib/gpc/kernels/census.cpp
@@ -28,8 +28,9 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#include <cassert>
 #include "gpc/kernels/census.hpp"
+
+#include <cassert>
 void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
     uint32_t val;
     uint32_t* dst;
@@ -200,5 +201,3 @@ void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
 
 #endif
 }  // census5x5
-
-
diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp
index 8353a4e..054a45f 100644
--- a/lib/gpc/kernels/census.hpp
+++ b/lib/gpc/kernels/census.hpp
@@ -45,7 +45,6 @@ namespace ndb {
  */
 void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
 
-
 /**
  * @brief 5x5 dense census transform of input image. binary codes are returned
  * as a 32bit image
@@ -57,5 +56,5 @@ void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
  */
 void census5x5(uint8_t* in, uint32_t* census, int width, int height);
 
-}
+}  // namespace ndb
 #endif
diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp
index 62ffa3e..7bdf97b 100644
--- a/lib/gpc/kernels/gpc.cpp
+++ b/lib/gpc/kernels/gpc.cpp
@@ -28,8 +28,9 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#include <cassert>
 #include "gpc/kernels/gpc.hpp"
+
+#include <cassert>
 namespace ndb {
 void gpcFilterNaive(uint8_t* in,
                     const uint8_t* grad,
@@ -76,8 +77,7 @@ void gpcFilterTauNaive(uint8_t* in,
         gpc[k] = tmp;
         j++;
     }
-} 
-
+}
 
 #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
 bool isAllZeros(__m128i xmm) {
@@ -85,13 +85,13 @@ bool isAllZeros(__m128i xmm) {
            0xFFFF;
 }
 void gpcFilterSSE(uint8_t* in,
-               const uint8_t* grad,
-               uint32_t* gpc,
-               std::vector<int32_t> fastmask,
-               std::vector<int>& idx,
-               int width,
-               int height) {
-    const int start  = 13; 
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int>& idx,
+                  int width,
+                  int height) {
+    const int start = 13;
     const int end = height - 15;
     __m128i zero = _mm_set1_epi8(0);
     __m128i one = _mm_set1_epi8(1);
@@ -118,8 +118,7 @@ void gpcFilterSSE(uint8_t* in,
                 for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
                     out[k] |= _mm_and_si128(
                         _mm_cmpgt_epu8(
-                            _mm_lddqu_si128(
-                                (__m128i*)(center + fastmask[i])),
+                            _mm_lddqu_si128((__m128i*)(center + fastmask[i])),
                             _mm_lddqu_si128(
                                 (__m128i*)(center + fastmask[i + 1]))),
                         bitMask);
@@ -153,30 +152,30 @@ void gpcFilter(uint8_t* in,
                std::vector<int32_t> fastmask,
                std::vector<int>& idx,
                int width,
-               int height){
+               int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #if defined(__ARM_NEON) || defined(__aarch64__)
     // Replace with call to highway
     gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
 #else
-    #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
-        gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height);
-    #else 
-        gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+    gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height);
+#else
+    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
 #endif
 #endif
 }
 
 #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
 void gpcFilterTauSSE(uint8_t* in,
-                  const uint8_t* grad,
-                  uint32_t* gpc,
-                  std::vector<int32_t> fastmask,
-                  std::vector<int> tau,
-                  std::vector<int>& idx,
-                  int width,
-                  int height){
-    const int start  = 13; 
+                     const uint8_t* grad,
+                     uint32_t* gpc,
+                     std::vector<int32_t> fastmask,
+                     std::vector<int> tau,
+                     std::vector<int>& idx,
+                     int width,
+                     int height) {
+    const int start = 13;
     const int end = height - 15;
     __m128i zero = _mm_set1_epi8(0);
     __m128i one = _mm_set1_epi8(1);
@@ -203,8 +202,7 @@ void gpcFilterTauSSE(uint8_t* in,
                 for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
                     out[k] |= _mm_and_si128(
                         _mm_cmpgt_epu8(
-                            _mm_lddqu_si128(
-                                (__m128i*)(center + fastmask[i])),
+                            _mm_lddqu_si128((__m128i*)(center + fastmask[i])),
                             _mm_subs_epi8(
                                 _mm_lddqu_si128(
                                     (__m128i*)(center + fastmask[i + 1])),
@@ -243,19 +241,17 @@ void gpcFilterTau(uint8_t* in,
                   std::vector<int> tau,
                   std::vector<int>& idx,
                   int width,
-                  int height){
+                  int height) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #if defined(__ARM_NEON) || defined(__aarch64__)
     // Replace with call to highway
     gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
 #else
-    #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
-        gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height);
-    #else 
-        gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+    gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height);
+#else
+    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
 #endif
 #endif
-
 }
-} // namespace ndb
-
+}  // namespace ndb
diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp
index 49db7ae..bdd4bf4 100644
--- a/lib/gpc/kernels/gpc.hpp
+++ b/lib/gpc/kernels/gpc.hpp
@@ -58,7 +58,6 @@ void gpcFilter(uint8_t* in,
                int width,
                int height);
 
-
 /**
  * @brief Applies a gpc filter defined by the pixel-difference tests in
  * fastmask. Naive implementation
@@ -131,24 +130,22 @@ void gpcFilterTauNaive(uint8_t* in,
 #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
 bool isAllZeros(__m128i xmm);
 void gpcFilterTauSSE(uint8_t* in,
+                     const uint8_t* grad,
+                     uint32_t* gpc,
+                     std::vector<int32_t> fastmask,
+                     std::vector<int> tau,
+                     std::vector<int>& idx,
+                     int width,
+                     int height);
+void gpcFilterSSE(uint8_t* in,
                   const uint8_t* grad,
                   uint32_t* gpc,
                   std::vector<int32_t> fastmask,
-                  std::vector<int> tau,
                   std::vector<int>& idx,
                   int width,
                   int height);
-void gpcFilterSSE(uint8_t* in,
-               const uint8_t* grad,
-               uint32_t* gpc,
-               std::vector<int32_t> fastmask,
-               std::vector<int>& idx,
-               int width,
-               int height);
-
 
 #endif
 
-
-}
+}  // namespace ndb
 #endif
diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp
index 21dae2d..87b0f20 100644
--- a/lib/gpc/kernels/gpc_hwy.cpp
+++ b/lib/gpc/kernels/gpc_hwy.cpp
@@ -1,27 +1,27 @@
-//#define HWY_TARGET HWY_NEON 
+// #define HWY_TARGET HWY_NEON
 #include "gpc_hwy.hpp"
-HWY_BEFORE_NAMESPACE(); 
+HWY_BEFORE_NAMESPACE();
 namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-//dense!
+// dense!
 #include <hwy/highway.h>
 
 namespace hn = hwy::HWY_NAMESPACE;
 
 // Dense Version
 void GPCKernel(const uint8_t* HWY_RESTRICT in,
-               const uint8_t* HWY_RESTRICT grad, 
+               const uint8_t* HWY_RESTRICT grad,
                uint32_t* HWY_RESTRICT gpc,
                const std::vector<int32_t>& fastmask,
-               const std::vector<int32_t>& tau,  
-               int width, int height) {
-    
+               const std::vector<int32_t>& tau,
+               int width,
+               int height) {
     const hn::ScalableTag<uint8_t> d8;
     const hn::ScalableTag<uint32_t> d32;
     const size_t N = hn::Lanes(d8);
-    
+
     const int border = 13;
     const auto v_zero8 = hn::Zero(d8);
     const auto v_one8 = hn::Set(d8, 1);
@@ -36,76 +36,80 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in,
 
             // We use four 8-bit registers to build the 32 bits.
             // This keeps the entire hot-loop in 8-bit space.
-            auto v_acc0 = hn::Zero(d8); // Bits 0-7
-            auto v_acc1 = hn::Zero(d8); // Bits 8-15
-            auto v_acc2 = hn::Zero(d8); // Bits 16-23
-            auto v_acc3 = hn::Zero(d8); // Bits 24-31
+            auto v_acc0 = hn::Zero(d8);  // Bits 0-7
+            auto v_acc1 = hn::Zero(d8);  // Bits 8-15
+            auto v_acc2 = hn::Zero(d8);  // Bits 16-23
+            auto v_acc3 = hn::Zero(d8);  // Bits 24-31
 
             // Pass 1: Bits 0-7
             for (int i = 0; i < 16; i += 2) {
                 v_acc0 = hn::Add(v_acc0, v_acc0);
-                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), 
-                                   hn::LoadU(d8, in + k + fm[i+1]));
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
                 v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8));
             }
 
             // Pass 2: Bits 8-15
             for (int i = 16; i < 32; i += 2) {
                 v_acc1 = hn::Add(v_acc1, v_acc1);
-                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), 
-                                   hn::LoadU(d8, in + k + fm[i+1]));
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
                 v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8));
             }
 
             // Pass 3: Bits 16-23
             for (int i = 32; i < 48; i += 2) {
                 v_acc2 = hn::Add(v_acc2, v_acc2);
-                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), 
-                                   hn::LoadU(d8, in + k + fm[i+1]));
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
                 v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8));
             }
 
             // Pass 4: Bits 24-31
             for (int i = 48; i < 64; i += 2) {
                 v_acc3 = hn::Add(v_acc3, v_acc3);
-                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), 
-                                   hn::LoadU(d8, in + k + fm[i+1]));
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
                 v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8));
             }
 
-            // Final Assembly: Promote the four 8-bit chunks into 32-bit results.
-            // We use PromoteUpper/Lower to widen the data.
-            // N is the number of 8-bit lanes. We need to store N/4 results in d32.
-            
-            // To be perfectly safe across all Highway targets, we extract and combine:
+            // Final Assembly: Promote the four 8-bit chunks into 32-bit
+            // results. We use PromoteUpper/Lower to widen the data. N is the
+            // number of 8-bit lanes. We need to store N/4 results in d32.
+
+            // To be perfectly safe across all Highway targets, we extract and
+            // combine:
             for (size_t lane = 0; lane < N; ++lane) {
-                uint32_t final_val = (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) |
-                                     (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) |
-                                     (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8)  |
-                                     (uint32_t(hn::ExtractLane(v_acc3, lane)));
+                uint32_t final_val =
+                    (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) |
+                    (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) |
+                    (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) |
+                    (uint32_t(hn::ExtractLane(v_acc3, lane)));
                 row_out[x + lane] = final_val;
             }
         }
     }
 }
 void GPCKerneli(const uint8_t* HWY_RESTRICT in,
-                      const uint8_t* HWY_RESTRICT grad,
-                      uint32_t* HWY_RESTRICT gpc,
-                      const std::vector<int32_t>& fastmask,
-                      const std::vector<int32_t>& tau,
-                      int width, int height) {
+                const uint8_t* HWY_RESTRICT grad,
+                uint32_t* HWY_RESTRICT gpc,
+                const std::vector<int32_t>& fastmask,
+                const std::vector<int32_t>& tau,
+                int width,
+                int height) {
     // We use the ScalableTag, but we will "Narrow" our view manually
     const hn::ScalableTag<uint32_t> d32;
-    const hn::Rebind<uint8_t, decltype(d32)> d8_n; // Same number of lanes as d32
-    
-    const size_t N = hn::Lanes(d32); 
+    const hn::Rebind<uint8_t, decltype(d32)>
+        d8_n;  // Same number of lanes as d32
+
+    const size_t N = hn::Lanes(d32);
     const auto v_zero = hn::Zero(d32);
     const bool use_tau = !tau.empty();
 
     for (int y = 0; y < height; ++y) {
         for (int x = 0; x < width; x += N) {
             const uint8_t* centerGrad = grad + y * width + x;
-            
+
             // 1. Load the gradient bytes for the current N lanes
             auto v_grad = hn::LoadU(d8_n, centerGrad);
 
@@ -117,12 +121,14 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in,
             auto v_tmp = hn::Zero(d32);
 
             for (size_t i = 0; i < fastmask.size(); i += 2) {
-                v_tmp = hn::ShiftLeft<1>(v_tmp); 
+                v_tmp = hn::ShiftLeft<1>(v_tmp);
 
                 // 3. The "Promotion" that actually works on all platforms:
                 // Promote N lanes of uint8 to N lanes of uint32
-                auto v1 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i]));
-                auto v2 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1]));
+                auto v1 = hn::PromoteTo(
+                    d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i]));
+                auto v2 = hn::PromoteTo(
+                    d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1]));
 
                 hn::Mask<decltype(d32)> mask;
                 if (use_tau) {
@@ -132,7 +138,8 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in,
                     mask = hn::Gt(v1, v2);
                 }
 
-                v_tmp = hn::Add(v_tmp, hn::IfThenElse(mask, hn::Set(d32, 1), v_zero));
+                v_tmp = hn::Add(v_tmp,
+                                hn::IfThenElse(mask, hn::Set(d32, 1), v_zero));
             }
 
             hn::StoreU(v_tmp, d32, gpc + y * width + x);
@@ -140,18 +147,20 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in,
     }
 }
 
-} // namespace HWY_NAMESPACE
-} // namespace ndb
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
-    void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc,
-            const std::vector<int32_t>& fastmask, 
-            const std::vector<int32_t>& tau, int width, int height) {
-
-        HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height);
-
-    }
-}
+void gpc_hwy(uint8_t* in,
+             uint8_t* grad,
+             uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau,
+             int width,
+             int height) {
+    HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height);
 }
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp
index f49d05a..8a83751 100644
--- a/lib/gpc/kernels/gpc_hwy.hpp
+++ b/lib/gpc/kernels/gpc_hwy.hpp
@@ -1,14 +1,20 @@
-#ifndef  __NDB__KERNEL_GPC_HWY
+#ifndef __NDB__KERNEL_GPC_HWY
 #define __NDB__KERNEL_GPC_HWY
 
 #include <hwy/highway.h>
+
 #include <cstdint>
 
 namespace ndb {
 
 namespace testing {
-    void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc, const std::vector<int32_t>& fastmask, const std::vector<int32_t>& tau, int width, int height);
-
+void gpc_hwy(uint8_t* in,
+             uint8_t* grad,
+             uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau,
+             int width,
+             int height);
 
 }
 
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
index 7867cd3..becaf50 100644
--- a/lib/gpc/kernels/sobel.cpp
+++ b/lib/gpc/kernels/sobel.cpp
@@ -28,12 +28,15 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#include <cassert>
 #include "gpc/kernels/sobel.hpp"
+
+#include <cassert>
+
 #include "gpc/kernels/utils.hpp"
 namespace ndb {
-namespace testing { 
-    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); 
+namespace testing {
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
 }
 void sobelNaive(
     uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
@@ -59,7 +62,8 @@ void sobelNaive(
     // boundary) (unoptimized)
     for (int iy = 1; iy < height - 1; iy++) {
         for (int ix = 0; ix < width; ix++) {
-            // Approximate division by 9 with fixed-point multiplication (2^16/9 = 7282)
+            // Approximate division by 9 with fixed-point multiplication
+            // (2^16/9 = 7282)
             int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33);
             int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33);
 
@@ -81,23 +85,25 @@ void sobelNaive(
         }
     }
 }
-//#ifdef _INTRINSICS_SSE
+// #ifdef _INTRINSICS_SSE
 #if HWY_TARGET == HWY_AVX2
 #include <immintrin.h>
 
-void sobelSSE(const uint8_t* in, uint8_t* blurred, 
-                            int width, int start, int end, 
-                            uint8_t threshold) {
-    
+void sobelSSE(const uint8_t* in,
+              uint8_t* blurred,
+              int width,
+              int start,
+              int end,
+              uint8_t threshold) {
     __m128i zero = _mm_setzero_si128();
-    __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9
+    __m128i one_ninth = _mm_set1_epi16(7282);  // 2^16/9
     __m128i binThres = _mm_set1_epi16(threshold * threshold);
 
     for (int y = start; y < end; y++) {
         const uint8_t* row1 = in + y * width;
         const uint8_t* row0 = row1 - width;
         const uint8_t* row2 = row1 + width;
-        
+
         // Output destination for this specific row
         __m128i* dst = (__m128i*)(blurred + y * width + 1);
 
@@ -109,23 +115,31 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred,
 
             // Load and unpack 3x3 neighborhood (excluding center a11/b11)
             unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00);
-            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)),     a01, b01);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01);
             unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02);
 
             unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10);
             unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12);
 
             unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20);
-            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)),     a21, b21);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21);
             unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22);
 
             // --- SX Calculation ---
             // Left col (1,2,1)
-            raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), one_ninth);
-            rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), one_ninth);
+            raA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)),
+                one_ninth);
+            rbA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)),
+                one_ninth);
             // Right col (-1,-2,-1)
-            raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)), one_ninth);
-            rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), one_ninth);
+            raB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)),
+                one_ninth);
+            rbB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)),
+                one_ninth);
 
             tmpa = _mm_sub_epi16(raA, raB);
             tmpb = _mm_sub_epi16(rbA, rbB);
@@ -134,11 +148,19 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred,
 
             // --- SY Calculation ---
             // Top row (1,2,1)
-            raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), one_ninth);
-            rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), one_ninth);
+            raA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)),
+                one_ninth);
+            rbA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)),
+                one_ninth);
             // Bottom row (-1,-2,-1)
-            raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), one_ninth);
-            rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), one_ninth);
+            raB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)),
+                one_ninth);
+            rbB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)),
+                one_ninth);
 
             tmpa = _mm_sub_epi16(raA, raB);
             tmpb = _mm_sub_epi16(rbA, rbB);
@@ -147,8 +169,10 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred,
 
             // --- Thresholding and Packing ---
             pack16to8(
-                _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero),
-                _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero),
+                _mm_unpacklo_epi8(
+                    _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero),
+                _mm_unpacklo_epi8(
+                    _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero),
                 res);
 
             _mm_storeu_si128(dst++, res);
@@ -164,14 +188,14 @@ void sobel(uint8_t* in,
            int numThreads) {
     assert(width % 16 == 0 && "width must be multiple of 16!");
 #if defined(__ARM_NEON) || defined(__aarch64__)
-        sobelNaive(in, blurred, width, height, threshold);
-    //testing::sobel_hwy(in, blurred, width, height, threshold); // not exact!
+    sobelNaive(in, blurred, width, height, threshold);
+    // testing::sobel_hwy(in, blurred, width, height, threshold); // not exact!
+#else
+#ifndef _INTRINSICS_SSE
+    sobelNaive(in, blurred, width, height, threshold);
 #else
-    #ifndef _INTRINSICS_SSE
-        sobelNaive(in, blurred, width, height, threshold);
-    #else
-        sobelSSE(in, blurred, width, 1, height - 1, threshold);
-    #endif
+    sobelSSE(in, blurred, width, 1, height - 1, threshold);
+#endif
 #endif
 }
-} // namespace ndb
+}  // namespace ndb
diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp
index c14b950..312a70c 100644
--- a/lib/gpc/kernels/sobel.hpp
+++ b/lib/gpc/kernels/sobel.hpp
@@ -35,10 +35,13 @@
 
 namespace ndb {
 #if HWY_TARGET == HWY_AVX2
-void sobelSSE(const uint8_t* in, uint8_t* blurred, 
-                            int width, int start, int end, 
-                            uint8_t threshold);
- 
+void sobelSSE(const uint8_t* in,
+              uint8_t* blurred,
+              int width,
+              int start,
+              int end,
+              uint8_t threshold);
+
 #endif
 /**
  * @brief Naive 3x3 sobel filter implementation
@@ -70,5 +73,5 @@ void sobel(uint8_t* in,
            int height,
            uint8_t threshold,
            int numThreads);
-}
+}  // namespace ndb
 #endif
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index fe93031..14ad593 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -1,20 +1,23 @@
-//#define HWY_TARGET HWY_NEON 
+// #define HWY_TARGET HWY_NEON
 #include <hwy/highway.h>
 
-HWY_BEFORE_NAMESPACE(); 
+HWY_BEFORE_NAMESPACE();
 namespace ndb {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
-void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
-                 int width, int height, uint8_t threshold) {
+void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in,
+                      uint8_t* HWY_RESTRICT gradient,
+                      int width,
+                      int height,
+                      uint8_t threshold) {
     const hn::ScalableTag<uint8_t> d8;
-    const hn::Half<decltype(d8)> d8_h; 
+    const hn::Half<decltype(d8)> d8_h;
     const hn::Rebind<int16_t, decltype(d8_h)> d16;
     // d32 has half the lanes of d16
     const hn::Rebind<int32_t, hn::Half<decltype(d16)>> d32;
 
     const size_t N = hn::Lanes(d8);
-    const auto vDivMult = hn::Set(d16, (int16_t)7282); 
+    const auto vDivMult = hn::Set(d16, (int16_t)7282);
     const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold);
     const auto v255_16 = hn::Set(d16, (int16_t)255);
     const auto v255_8 = hn::Set(d8, (uint8_t)255);
@@ -27,44 +30,75 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT grad
         uint8_t* out = gradient + y * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
-            auto v21 = hn::LoadU(d8, r1 + x);                                       auto v23 = hn::LoadU(d8, r1 + x + 2);
-            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
+            auto v11 = hn::LoadU(d8, r0 + x);
+            auto v12 = hn::LoadU(d8, r0 + x + 1);
+            auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);
+            auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x);
+            auto v32 = hn::LoadU(d8, r2 + x + 1);
+            auto v33 = hn::LoadU(d8, r2 + x + 2);
 
             // Helper to process 8 pixels into a 16-bit mask-like result
-            auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
+            auto process_half = [&](auto p11,
+                                    auto p12,
+                                    auto p13,
+                                    auto p21,
+                                    auto p23,
+                                    auto p31,
+                                    auto p32,
+                                    auto p33) {
                 // Sobel derivatives in 16-bit
-                auto sx16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
-                                                hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDivMult);
-                auto sy16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
-                                                hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDivMult);
+                auto sx16 = hn::MulHigh(
+                    hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
+                            hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))),
+                    vDivMult);
+                auto sy16 = hn::MulHigh(
+                    hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
+                            hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))),
+                    vDivMult);
 
                 // Magnitude squared in 32-bit
                 auto sx_lo = hn::PromoteLowerTo(d32, sx16);
                 auto sy_lo = hn::PromoteLowerTo(d32, sy16);
-                auto mag_lo = hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo));
+                auto mag_lo =
+                    hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo));
 
                 auto sx_hi = hn::PromoteUpperTo(d32, sx16);
                 auto sy_hi = hn::PromoteUpperTo(d32, sy16);
-                auto mag_hi = hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi));
+                auto mag_hi =
+                    hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi));
 
-                // Comparison in 32-bit, returning 16-bit values (0 or 255) to avoid mask issues
-                auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), hn::Set(d32, 255), hn::Zero(d32));
-                auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), hn::Set(d32, 255), hn::Zero(d32));
+                // Comparison in 32-bit, returning 16-bit values (0 or 255) to
+                // avoid mask issues
+                auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq),
+                                           hn::Set(d32, 255),
+                                           hn::Zero(d32));
+                auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq),
+                                           hn::Set(d32, 255),
+                                           hn::Zero(d32));
 
                 return hn::OrderedDemote2To(d16, m_lo, m_hi);
             };
 
             // Process halves using standard Highway promotion
-            auto res_lo = process_half(
-                hn::PromoteLowerTo(d16, v11), hn::PromoteLowerTo(d16, v12), hn::PromoteLowerTo(d16, v13),
-                hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v23),
-                hn::PromoteLowerTo(d16, v31), hn::PromoteLowerTo(d16, v32), hn::PromoteLowerTo(d16, v33));
+            auto res_lo = process_half(hn::PromoteLowerTo(d16, v11),
+                                       hn::PromoteLowerTo(d16, v12),
+                                       hn::PromoteLowerTo(d16, v13),
+                                       hn::PromoteLowerTo(d16, v21),
+                                       hn::PromoteLowerTo(d16, v23),
+                                       hn::PromoteLowerTo(d16, v31),
+                                       hn::PromoteLowerTo(d16, v32),
+                                       hn::PromoteLowerTo(d16, v33));
 
-            auto res_hi = process_half(
-                hn::PromoteUpperTo(d16, v11), hn::PromoteUpperTo(d16, v12), hn::PromoteUpperTo(d16, v13),
-                hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v23),
-                hn::PromoteUpperTo(d16, v31), hn::PromoteUpperTo(d16, v32), hn::PromoteUpperTo(d16, v33));
+            auto res_hi = process_half(hn::PromoteUpperTo(d16, v11),
+                                       hn::PromoteUpperTo(d16, v12),
+                                       hn::PromoteUpperTo(d16, v13),
+                                       hn::PromoteUpperTo(d16, v21),
+                                       hn::PromoteUpperTo(d16, v23),
+                                       hn::PromoteUpperTo(d16, v31),
+                                       hn::PromoteUpperTo(d16, v32),
+                                       hn::PromoteUpperTo(d16, v33));
 
             // Final store: 16-bit to 8-bit demotion
             auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi);
@@ -72,10 +106,14 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT grad
         }
     }
 }
-void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
-                 int width, int height, uint8_t threshold) {
+void SobelKernel(const uint8_t* HWY_RESTRICT in,
+                 uint8_t* HWY_RESTRICT gradient,
+                 int width,
+                 int height,
+                 uint8_t threshold) {
     // We target 4 pixels at a time as our base 'Scalable' unit.
-    // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane counts identical.
+    // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane
+    // counts identical.
     const hn::FixedTag<uint8_t, 4> d8;
     const hn::FixedTag<int16_t, 4> d16;
     const hn::FixedTag<int32_t, 4> d32;
@@ -97,18 +135,28 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
                 return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p)));
             };
 
-            auto p11 = load32(r0 + x);     auto p12 = load32(r0 + x + 1); auto p13 = load32(r0 + x + 2);
-            auto p21 = load32(r1 + x);                                    auto p23 = load32(r1 + x + 2);
-            auto p31 = load32(r2 + x);     auto p32 = load32(r2 + x + 1); auto p33 = load32(r2 + x + 2);
+            auto p11 = load32(r0 + x);
+            auto p12 = load32(r0 + x + 1);
+            auto p13 = load32(r0 + x + 2);
+            auto p21 = load32(r1 + x);
+            auto p23 = load32(r1 + x + 2);
+            auto p31 = load32(r2 + x);
+            auto p32 = load32(r2 + x + 1);
+            auto p33 = load32(r2 + x + 2);
+
+            // Note: Division is very slow - we use it for now to match exactly
+            // with the naive non-SIMD implementation:
+            // sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
+            auto sx =
+                hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
+                                hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))),
+                        vDiv);
 
-            // Note:: Division is very slow - we use it for now to match exactly with the naive non simd-implementation
-            // sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            auto sx = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
-                                      hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDiv);
-            
             // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
-            auto sy = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
-                                      hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDiv);
+            auto sy =
+                hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
+                                hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))),
+                        vDiv);
 
             // int val = sx * sx + sy * sy;
             auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
@@ -116,21 +164,24 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient,
             // *optr = val > thresholdSq ? 255 : 0;
             auto mask = hn::Gt(magSq, vThreshSq);
             auto res32 = hn::IfThenElse(mask, v255, v0);
-            
+
             // Demote 32 -> 16 -> 8
             auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32));
             hn::StoreU(res8, d8, out + x);
         }
     }
 }
-void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, 
-                 int width, int height, uint8_t threshold) {
+void SobelKerneli(const uint8_t* HWY_RESTRICT in,
+                  uint8_t* HWY_RESTRICT gradient,
+                  int width,
+                  int height,
+                  uint8_t threshold) {
     const hn::ScalableTag<uint8_t> d8;
-    const hn::Half<decltype(d8)> d8_h; 
-    const hn::Rebind<int16_t, decltype(d8_h)> d16; 
+    const hn::Half<decltype(d8)> d8_h;
+    const hn::Rebind<int16_t, decltype(d8_h)> d16;
 
     const size_t N = hn::Lanes(d8);
-    const auto divisor = hn::Set(d16, (int16_t)7282); 
+    const auto divisor = hn::Set(d16, (int16_t)7282);
     const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold));
     const auto v255 = hn::Set(d16, 255);
     const auto v0 = hn::Zero(d16);
@@ -142,48 +193,72 @@ void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient
         uint8_t* out = gradient + y * width + 1;
 
         for (int x = 0; x < width; x += N) {
-            auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2);
-            auto v21 = hn::LoadU(d8, r1 + x);                                      auto v23 = hn::LoadU(d8, r1 + x + 2);
-            auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2);
+            auto v11 = hn::LoadU(d8, r0 + x);
+            auto v12 = hn::LoadU(d8, r0 + x + 1);
+            auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);
+            auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x);
+            auto v32 = hn::LoadU(d8, r2 + x + 1);
+            auto v33 = hn::LoadU(d8, r2 + x + 2);
 
-            auto process = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) {
-                auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), 
-                                  hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
+            auto process = [&](auto p11,
+                               auto p12,
+                               auto p13,
+                               auto p21,
+                               auto p23,
+                               auto p31,
+                               auto p32,
+                               auto p33) {
+                auto sx =
+                    hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
+                            hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
                 sx = hn::MulHigh(sx, divisor);
-                auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), 
-                                  hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
+                auto sy =
+                    hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
+                            hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
                 sy = hn::MulHigh(sy, divisor);
                 auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
                 return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0);
             };
 
             // Process Lower Half
-            auto res_lo = process(
-                hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)),
-                hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)),
-                hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33)));
+            auto res_lo = process(hn::PromoteTo(d16, hn::LowerHalf(v11)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v12)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v13)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v21)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v23)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v31)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v32)),
+                                  hn::PromoteTo(d16, hn::LowerHalf(v33)));
 
-            // Process Upper Half 
-            auto res_hi = process(
-                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)),
-                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)),
-                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33)));
+            // Process Upper Half
+            auto res_hi = process(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)),
+                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33)));
 
             hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x);
         }
     }
 }
-} // namespace HWY_NAMESPACE
-} // namespace ndb
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
 HWY_AFTER_NAMESPACE();
 
 namespace ndb {
 namespace testing {
-//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
-    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
-        //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
-        HWY_STATIC_DISPATCH(SobelKernelNoDiv)(in, blurred, width, height, threshold);
-    }
-//#endif  
-}
+// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
+    // ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
+    HWY_STATIC_DISPATCH(SobelKernelNoDiv)(
+        in, blurred, width, height, threshold);
 }
+// #endif
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp
index bc99199..7c5d4ce 100644
--- a/lib/gpc/kernels/sobel_hwy.hpp
+++ b/lib/gpc/kernels/sobel_hwy.hpp
@@ -1,4 +1,4 @@
-#ifndef  __NDB__KERNEL_SOBEL_HWY
+#ifndef __NDB__KERNEL_SOBEL_HWY
 #define __NDB__KERNEL_SOBEL_HWY
 
 #include <cstdint>
@@ -6,11 +6,12 @@
 namespace ndb {
 
 namespace testing {
-    /**
-     * Entry point for benchmarking the MulHigh (approximate) version.
-     */
-    void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
-}
+/**
+ * Entry point for benchmarking the MulHigh (approximate) version.
+ */
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
+}  // namespace testing
 
 }  // namespace ndb
 
diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp
index ce920e8..1b1ab4b 100644
--- a/lib/gpc/kernels/utils.cpp
+++ b/lib/gpc/kernels/utils.cpp
@@ -28,18 +28,16 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/kernels/utils.hpp"
+
 #include <cassert>
-#include <thread>
 #include <functional>
-#include "gpc/kernels/utils.hpp"
+#include <thread>
 
 using namespace std;
 
 namespace ndb {
-void arr2ind(const unsigned char* a,
-                                       int n,
-                                       int* ind,
-                                       int* m) {
+void arr2ind(const unsigned char* a, int n, int* ind, int* m) {
 #if HWY_TARGET == HWY_AVX2
     int i, m0, k;
     __m256i msk;
@@ -104,8 +102,4 @@ void parFor(std::function<void(int, int)> const& f,
     for (auto& t : threads) t.join();
 }
 
-
-
-
-
 }  // namespace ndb
diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp
index 18227ba..3985c1e 100644
--- a/lib/gpc/kernels/utils.hpp
+++ b/lib/gpc/kernels/utils.hpp
@@ -31,9 +31,10 @@
 #ifndef __NDB__KERNEL_UTILS
 #define __NDB__KERNEL_UTILS
 
+#include <hwy/highway.h>
+
 #include <cassert>
 #include <thread>
-#include <hwy/highway.h>
 
 #include "gpc/buffer.hpp"
 using namespace std;
@@ -59,10 +60,7 @@ namespace ndb {
  * @param ind output array (indices into n of nonzero elements)
  * @param m   number of elements in output
  */
-void arr2ind(const unsigned char* a,
-                                       int n,
-                                       int* ind,
-                                       int* m);
+void arr2ind(const unsigned char* a, int n, int* ind, int* m);
 
 #if HWY_TARGET == HWY_AVX2
 /**
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index 57864e8..b25a96f 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -1,21 +1,20 @@
-#include <iostream>
 #include <hwy/highway.h>
 
+#include <iostream>
+
 #include "gpc/forest.hpp"
 using namespace std;
-std::vector<ndb::Descriptor> gpcFilterDense(uint8_t* in,
-                    const std::vector<int32_t>& fastmask,
-                    int width,
-                    int height) {
+std::vector<ndb::Descriptor> gpcFilterDense(
+    uint8_t* in, const std::vector<int32_t>& fastmask, int width, int height) {
     uint32_t tmp;
     uint32_t usableW = width - 26;
     uint32_t usableH = height - 26;
     std::vector<ndb::Descriptor> out(usableW * usableH);
     int j = 0;
-    for (int y=13;y<height-13;y++) {
-        for (int x=13;x<width-13;x++) {
+    for (int y = 13; y < height - 13; y++) {
+        for (int x = 13; x < width - 13; x++) {
             tmp = 0;
-            int idx = y * width + x; 
+            int idx = y * width + x;
             for (size_t i = 0; i < fastmask.size(); i += 2) {
                 tmp <<= 1;  // shift by one
                 if (*(in + idx + fastmask[i]) > *(in + idx + fastmask[i + 1]))
@@ -56,7 +55,8 @@ int main(int argc, char** argv) {
     gpc::inference::InferenceSettings inferencesettings =
         gpc::inference::InferenceSettings()
             .builder()
-            .gradientThreshold(1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. 
+            .gradientThreshold(
+                1)  // gradientthres 20: matching ~3ms, 2: matching: ~30ms.
             .verticalTolerance(
                 0)               // 0px tolerance for rectified epipolar matches
             .dispHigh(128)       // limit disparities to 128
@@ -73,7 +73,6 @@ int main(int argc, char** argv) {
     gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
-
     gpc::inference::time_point t0 = gpc::inference::sysTick();
 
     gpc::inference::PreprocessedImage simgP =
@@ -86,10 +85,13 @@ int main(int argc, char** argv) {
     std::vector<ndb::Support> supp =
         forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
     gpc::inference::time_point t2 = gpc::inference::sysTick();
-    std::cout << "Number of features(s,t): " << simgP.mask.size() << "," << timgP.mask.size() << std::endl;
+    std::cout << "Number of features(s,t): " << simgP.mask.size() << ","
+              << timgP.mask.size() << std::endl;
     std::cout << "Number of matches: " << supp.size() << std::endl;
-    std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
-    std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl;
+    std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0)
+              << " ms" << std::endl;
+    std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms"
+              << std::endl;
     /*
     std::vector<ndb::Descriptor> statesSrc = forest.evalFastMaskOnSubsetSSE(
         simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings);
@@ -97,10 +99,11 @@ int main(int argc, char** argv) {
         timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings);
     */
 
-    std::vector<ndb::Descriptor> statesSrc = gpcFilterDense(simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows());
-    std::vector<ndb::Descriptor> statesTar = gpcFilterDense(timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows());
+    std::vector<ndb::Descriptor> statesSrc = gpcFilterDense(
+        simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows());
+    std::vector<ndb::Descriptor> statesTar = gpcFilterDense(
+        timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows());
 
     ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc);
     ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar);
-
 }
diff --git a/samples/target.cpp b/samples/target.cpp
index 6c03ab7..f457f9d 100644
--- a/samples/target.cpp
+++ b/samples/target.cpp
@@ -1,4 +1,5 @@
 #include <hwy/highway.h>
+
 #include <iostream>
 int main() {
     std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl;

From 432cc9238937b6afa354e33c22b6045508d413c5 Mon Sep 17 00:00:00 2001
From: Nik Bamert <github@nikbamert.com>
Date: Tue, 7 Apr 2026 09:17:14 +0200
Subject: [PATCH 36/36] update approval test blob due to div /9 approximation
 in sobel filter

---
 CMakeLists.txt                                |   2 -
 benchmarks/box_bench.cpp                      |   4 -
 benchmarks/correspondence_bench.cpp           |   1 -
 lib/gpc/kernels/gpc_hwy.cpp                   |  14 +-
 lib/gpc/kernels/sobel_hwy.cpp                 | 139 ------------------
 ...e_matching.Approval.Inference.approved.txt |   2 +-
 tests/test_single_matching.cpp                |   2 +-
 7 files changed, 5 insertions(+), 159 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72f760c..6d3a39e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,8 +48,6 @@ set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
 # Force the library itself to build in Release mode
 set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
 
-#add_definitions(-DNDEBUG) 
-
 FetchContent_MakeAvailable(google_benchmark)
 FetchContent_MakeAvailable(google_benchmark)
 add_library(gpc_core 
diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp
index 7b7ff5c..f0aeeb8 100644
--- a/benchmarks/box_bench.cpp
+++ b/benchmarks/box_bench.cpp
@@ -7,11 +7,9 @@ static void BM_BoxHighway(benchmark::State& state) {
     std::vector<uint8_t> in(w * h, 128);
     std::vector<uint8_t> out(w * h, 0);
     state.SetLabel(hwy::TargetName(HWY_TARGET));    
-    // Warmup is handled automatically by the library
     for (auto _ : state) {
         ndb::testing::box_hwy(in.data(), out.data(), w, h);
         
-        // Ensure the compiler doesn't skip the work
         benchmark::DoNotOptimize(out.data());
         benchmark::ClobberMemory();
     }
@@ -27,7 +25,6 @@ static void BM_BoxLegacySIMD(benchmark::State& state) {
     for (auto _ : state) {
         ndb::boxSSE(in.data(), out.data(), w, h);
         
-        // Ensure the compiler doesn't skip the work
         benchmark::DoNotOptimize(out.data());
         benchmark::ClobberMemory();
     }
@@ -42,7 +39,6 @@ static void BM_BoxNaive(benchmark::State& state) {
     for (auto _ : state) {
         ndb::boxNaive(in.data(), out.data(), w, h);
         
-        // Ensure the compiler doesn't skip the work
         benchmark::DoNotOptimize(out.data());
         benchmark::ClobberMemory();
     }
diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
index 737d4dc..b847fb1 100644
--- a/benchmarks/correspondence_bench.cpp
+++ b/benchmarks/correspondence_bench.cpp
@@ -24,7 +24,6 @@ std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mea
     // 1e-9 epsilon prevents division by zero/infinity
     std::uniform_real_distribution<double> dist(1e-9, 1.0);
 
-    // Alpha = 1.16 provides a classic "80/20" Pareto distribution
     const double alpha = 1.16; 
     const double xm = target_mean * (alpha - 1.0) / alpha;
 
diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp
index 87b0f20..d84710a 100644
--- a/lib/gpc/kernels/gpc_hwy.cpp
+++ b/lib/gpc/kernels/gpc_hwy.cpp
@@ -34,8 +34,6 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in,
         for (int x = border; x <= width - border - (int)N; x += N) {
             const int k = row_base + x;
 
-            // We use four 8-bit registers to build the 32 bits.
-            // This keeps the entire hot-loop in 8-bit space.
             auto v_acc0 = hn::Zero(d8);  // Bits 0-7
             auto v_acc1 = hn::Zero(d8);  // Bits 8-15
             auto v_acc2 = hn::Zero(d8);  // Bits 16-23
@@ -73,12 +71,7 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in,
                 v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8));
             }
 
-            // Final Assembly: Promote the four 8-bit chunks into 32-bit
-            // results. We use PromoteUpper/Lower to widen the data. N is the
-            // number of 8-bit lanes. We need to store N/4 results in d32.
-
-            // To be perfectly safe across all Highway targets, we extract and
-            // combine:
+            //extract and combine:
             for (size_t lane = 0; lane < N; ++lane) {
                 uint32_t final_val =
                     (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) |
@@ -110,10 +103,10 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in,
         for (int x = 0; x < width; x += N) {
             const uint8_t* centerGrad = grad + y * width + x;
 
-            // 1. Load the gradient bytes for the current N lanes
+            // Load the gradient bytes for the current N lanes
             auto v_grad = hn::LoadU(d8_n, centerGrad);
 
-            // 2. Promotion-free zero check
+            // Promotion-free zero check
             if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) {
                 continue;
             }
@@ -123,7 +116,6 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in,
             for (size_t i = 0; i < fastmask.size(); i += 2) {
                 v_tmp = hn::ShiftLeft<1>(v_tmp);
 
-                // 3. The "Promotion" that actually works on all platforms:
                 // Promote N lanes of uint8 to N lanes of uint32
                 auto v1 = hn::PromoteTo(
                     d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i]));
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
index 14ad593..493b52d 100644
--- a/lib/gpc/kernels/sobel_hwy.cpp
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -106,146 +106,7 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in,
         }
     }
 }
-void SobelKernel(const uint8_t* HWY_RESTRICT in,
-                 uint8_t* HWY_RESTRICT gradient,
-                 int width,
-                 int height,
-                 uint8_t threshold) {
-    // We target 4 pixels at a time as our base 'Scalable' unit.
-    // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane
-    // counts identical.
-    const hn::FixedTag<uint8_t, 4> d8;
-    const hn::FixedTag<int16_t, 4> d16;
-    const hn::FixedTag<int32_t, 4> d32;
 
-    const auto vDiv = hn::Set(d32, 9);
-    const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold);
-    const auto v255 = hn::Set(d32, 255);
-    const auto v0 = hn::Zero(d32);
-
-    for (int y = 1; y < height - 1; ++y) {
-        const uint8_t* r0 = in + (y - 1) * width;
-        const uint8_t* r1 = in + y * width;
-        const uint8_t* r2 = in + (y + 1) * width;
-        uint8_t* out = gradient + y * width + 1;
-
-        for (int x = 0; x < width; x += 4) {
-            // Load and promote immediately to 32-bit to match naive 'int' math
-            auto load32 = [&](const uint8_t* p) {
-                return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p)));
-            };
-
-            auto p11 = load32(r0 + x);
-            auto p12 = load32(r0 + x + 1);
-            auto p13 = load32(r0 + x + 2);
-            auto p21 = load32(r1 + x);
-            auto p23 = load32(r1 + x + 2);
-            auto p31 = load32(r2 + x);
-            auto p32 = load32(r2 + x + 1);
-            auto p33 = load32(r2 + x + 2);
-
-            // Note:: Division is very slow - we use it for now to match exactly
-            // with the naive non simd-implementation sx = (*p11 + *p31 + 2 *
-            // *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            auto sx =
-                hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
-                                hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))),
-                        vDiv);
-
-            // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
-            auto sy =
-                hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
-                                hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))),
-                        vDiv);
-
-            // int val = sx * sx + sy * sy;
-            auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
-
-            // *optr = val > thresholdSq ? 255 : 0;
-            auto mask = hn::Gt(magSq, vThreshSq);
-            auto res32 = hn::IfThenElse(mask, v255, v0);
-
-            // Demote 32 -> 16 -> 8
-            auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32));
-            hn::StoreU(res8, d8, out + x);
-        }
-    }
-}
-void SobelKerneli(const uint8_t* HWY_RESTRICT in,
-                  uint8_t* HWY_RESTRICT gradient,
-                  int width,
-                  int height,
-                  uint8_t threshold) {
-    const hn::ScalableTag<uint8_t> d8;
-    const hn::Half<decltype(d8)> d8_h;
-    const hn::Rebind<int16_t, decltype(d8_h)> d16;
-
-    const size_t N = hn::Lanes(d8);
-    const auto divisor = hn::Set(d16, (int16_t)7282);
-    const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold));
-    const auto v255 = hn::Set(d16, 255);
-    const auto v0 = hn::Zero(d16);
-
-    for (int y = 1; y < height - 1; ++y) {
-        const uint8_t* r0 = in + (y - 1) * width;
-        const uint8_t* r1 = in + y * width;
-        const uint8_t* r2 = in + (y + 1) * width;
-        uint8_t* out = gradient + y * width + 1;
-
-        for (int x = 0; x < width; x += N) {
-            auto v11 = hn::LoadU(d8, r0 + x);
-            auto v12 = hn::LoadU(d8, r0 + x + 1);
-            auto v13 = hn::LoadU(d8, r0 + x + 2);
-            auto v21 = hn::LoadU(d8, r1 + x);
-            auto v23 = hn::LoadU(d8, r1 + x + 2);
-            auto v31 = hn::LoadU(d8, r2 + x);
-            auto v32 = hn::LoadU(d8, r2 + x + 1);
-            auto v33 = hn::LoadU(d8, r2 + x + 2);
-
-            auto process = [&](auto p11,
-                               auto p12,
-                               auto p13,
-                               auto p21,
-                               auto p23,
-                               auto p31,
-                               auto p32,
-                               auto p33) {
-                auto sx =
-                    hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
-                            hn::Add(hn::Add(p13, p33), hn::Add(p23, p23)));
-                sx = hn::MulHigh(sx, divisor);
-                auto sy =
-                    hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
-                            hn::Add(hn::Add(p31, p33), hn::Add(p32, p32)));
-                sy = hn::MulHigh(sy, divisor);
-                auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy));
-                return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0);
-            };
-
-            // Process Lower Half
-            auto res_lo = process(hn::PromoteTo(d16, hn::LowerHalf(v11)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v12)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v13)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v21)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v23)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v31)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v32)),
-                                  hn::PromoteTo(d16, hn::LowerHalf(v33)));
-
-            // Process Upper Half
-            auto res_hi = process(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)),
-                                  hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33)));
-
-            hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x);
-        }
-    }
-}
 }  // namespace HWY_NAMESPACE
 }  // namespace ndb
 HWY_AFTER_NAMESPACE();
diff --git a/tests/test_single_matching.Approval.Inference.approved.txt b/tests/test_single_matching.Approval.Inference.approved.txt
index 294072e..d481bd2 100644
--- a/tests/test_single_matching.Approval.Inference.approved.txt
+++ b/tests/test_single_matching.Approval.Inference.approved.txt
@@ -1 +1 @@
-[(13, 569, 0), (13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 671, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 793, -5), (14, 794, -5), (14, 857, -5), (15, 566, 2), (15, 828, -5), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (215, 753, 102), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 268, 69), (244, 377, 127), (244, 378, 127), (244, 380, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (247, 442, 128), (247, 448, 128), (248, 374, 128), (248, 446, 128), (249, 276, 69), (250, 277, 69), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (252, 932, 115), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (262, 419, 127), (264, 418, 126), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (270, 938, 119), (272, 582, 120), (272, 583, 120), (273, 843, 113), (275, 934, 124), (277, 560, 127), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (290, 557, -38), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 715, 116), (296, 716, 116), (297, 674, 119), (298, 672, 120), (298, 673, 120), (299, 682, 119), (299, 723, 117), (300, 687, 
119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 587, -43), (304, 953, 125), (310, 600, -46), (310, 986, 128), (311, 506, 62), (312, 205, 127), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 838, 113), (325, 844, 112), (325, 845, 112), (326, 842, 113), (328, 228, 127), (329, 228, 127), (332, 845, 113), (333, 844, 114), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (339, 718, 39), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (354, 834, 125), (355, 831, 126), (359, 223, 124), (368, 218, 126), (370, 392, -97), (377, 58, 124), (385, 315, 123), (387, 316, 124), (387, 384, 94), (388, 313, 119), (391, 312, 122), (395, 299, 122), (396, 54, 109), (396, 475, -24), (400, 918, -107), (404, 180, 123), (407, 453, -109), (408, 286, 119), (409, 208, 123), (412, 504, -44), (414, 340, 121), (415, 220, 123), (415, 221, 123), (417, 561, 87), (418, 218, 123), (420, 1035, -94), (425, 338, 118), (433, 53, 124), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 205, -47), (437, 379, 118), (439, 378, 109), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (452, 330, 119), (452, 719, 64), (454, 199, 122), (455, 52, 126), (455, 723, 65), (456, 377, 118), (457, 199, 122), (463, 325, 120), (465, 330, 112), (470, 192, 121), (472, 330, 120), (473, 327, 118), (473, 372, 115), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (482, 169, 120), (483, 185, 121), (486, 823, 63), (489, 181, 120), (492, 370, 115), (500, 312, 118), (504, 179, 119), (505, 172, 122), (508, 171, 120), (509, 171, 120), (510, 961, 126), (517, 169, 121), (517, 217, 118), (518, 401, 124), (519, 400, 122), (520, 400, 124), (523, 321, 115), (523, 407, 126), (525, 171, 118), (526, 272, 117), (527, 363, 127), (529, 315, 116), (532, 171, 119), (534, 360, 110), (539, 360, 
112), (543, 297, 117), (552, 588, 27), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (558, 594, 27), (565, 161, 115), (570, 169, 116), (570, 170, 120), (572, 354, 111), (572, 398, 122), (573, 313, 114), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (591, 262, 113), (593, 352, 113), (595, 204, 115), (597, 68, 117), (597, 69, 117), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (615, 347, 111), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 138, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), 
(619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 130, 117), (624, 164, 116), (624, 205, 115), (624, 305, 112), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 199, 115), (626, 200, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 130, 117), (627, 141, 116), (627, 254, 115), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 97, 117), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 193, 116), (629, 255, 114), (629, 341, 111), (630, 144, 117), (630, 193, 115), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (631, 134, 118), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 109, 119), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (634, 53, 118), (634, 55, 118), (634, 64, 117), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 97, 117), (636, 301, 113), (637, 75, 118), (637, 84, 115), (637, 305, 
113), (637, 347, 112), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 277, 113), (640, 278, 113), (641, 85, 117), (641, 111, 119), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (643, 346, 111), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 166, 115), (646, 167, 115), (646, 347, 102), (647, 118, 121), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (647, 364, 115), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 170, 115), (650, 294, 113), (651, 93, 116), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 125, 66), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 265, 113), (660, 261, 113), (661, 99, 116), (662, 126, 121), (662, 127, 121), (664, 152, 120), (665, 126, 123), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (671, 343, 111), (672, 344, 111), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 426, -43), (674, 432, 11), (675, 135, 122), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 
138, 123), (686, 346, 116), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 164, 126), (695, 356, 114), (697, 119, 119), (698, 130, 118), (698, 390, 121), (698, 955, 88), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (703, 122, 120), (705, 123, 120), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (715, 128, 120), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 134, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 638, -73), (744, 140, 121), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (760, 172, 128), (763, 174, 128), (764, 176, 65), (768, 369, 118), (772, 154, 125), (775, 443, -4), (777, 448, -3), (778, 457, 108), (778, 458, 108), (784, 185, 63), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (802, 937, 94), (816, 224, 126), (816, 389, -93), (816, 393, -93), (816, 876, -77), (820, 934, -91), (823, 653, 95), (824, 423, -25), (824, 965, 66), (830, 936, 86), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (851, 424, 2), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (873, 476, 7), (884, 337, -6), (885, 371, -87), (885, 372, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 
561, -120), (940, 695, 54), (944, 840, -85), (957, 691, 52), (963, 269, 54), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 397, 118), (990, 1000, -97), (997, 813, -85), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1057, 665, -59), (1085, 971, -84), (1087, 382, 112), (1119, 787, 40), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1157, 635, 48), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1281, 1000, 112), (1288, 928, 120), (1288, 929, 120), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 752, 38), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1403, 715, 83), (1408, 820, 110), (1412, 825, 114), (1417, 856, 116), (1418, 570, 107), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1437, 942, 125), (1437, 943, 125), (1443, 963, 124), (1453, 896, 123), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1457, 306, 100), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1464, 410, 98), (1467, 334, 103), (1469, 
407, 98), (1469, 440, 104), (1469, 441, 103), (1469, 443, 103), (1514, 460, 104), (1519, 897, 124), (1519, 899, 124), (1520, 897, 124), (1526, 929, 124), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1561, 1000, 128), (1584, 863, 49), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1766, 971, 128), (1769, 966, 128), (1778, 834, -14), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)]
+[(13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 857, -5), (15, 566, 2), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (150, 736, 102), (165, 543, 81), (165, 546, 80), (175, 982, 128), (178, 546, 87), (183, 546, 81), (183, 743, 99), (188, 742, 102), (191, 749, 105), (195, 547, 77), (203, 756, 103), (215, 753, 102), (224, 690, 95), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 376, 127), (244, 377, 127), (244, 378, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (246, 741, 102), (247, 362, 128), (247, 442, 128), (247, 448, 128), (248, 373, 128), (248, 374, 128), (248, 376, 128), (248, 446, 128), (249, 276, 69), (249, 404, 127), (250, 277, 69), (250, 411, 127), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (261, 521, 94), (262, 419, 127), (264, 418, 126), (264, 853, 114), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (272, 582, 120), (272, 583, 120), (273, 324, 116), (273, 843, 113), (275, 934, 124), (276, 839, 112), (277, 560, 127), (278, 585, 125), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 582, 127), (282, 623, -1), (283, 644, 118), (284, 637, 92), 
(285, 635, -2), (285, 642, 119), (287, 401, 40), (288, 554, -37), (290, 557, -38), (290, 558, -39), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 716, 116), (297, 674, 119), (298, 673, 120), (299, 677, 120), (299, 682, 119), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 953, 125), (309, 952, 125), (309, 953, 125), (310, 600, -46), (312, 205, 127), (319, 236, 128), (320, 619, -51), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 238, 126), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 231, 128), (325, 838, 113), (325, 844, 112), (325, 845, 112), (325, 983, 128), (326, 842, 113), (328, 228, 127), (329, 228, 127), (329, 240, 125), (329, 706, 34), (332, 231, 126), (333, 844, 114), (336, 718, -52), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (340, 748, -65), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (347, 229, 125), (347, 969, 125), (348, 229, 123), (355, 241, 119), (355, 830, 126), (355, 831, 126), (356, 830, 126), (358, 222, 126), (361, 639, 75), (366, 217, 124), (368, 218, 126), (368, 689, 3), (377, 58, 124), (385, 314, 123), (385, 315, 123), (390, 314, 123), (391, 308, 123), (391, 312, 122), (393, 303, 121), (393, 307, 122), (394, 303, 121), (394, 305, 122), (395, 299, 122), (396, 294, 121), (396, 301, 122), (400, 918, -107), (404, 180, 123), (404, 293, 121), (407, 453, -109), (408, 286, 119), (409, 208, 123), (414, 340, 121), (414, 602, 57), (415, 220, 123), (415, 221, 123), (415, 222, 123), (418, 218, 123), (419, 224, 122), (420, 1035, -94), (420, 1036, -94), (421, 221, 122), (423, 226, 122), (423, 227, 122), (425, 338, 118), (433, 53, 124), (433, 384, 119), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 379, 118), (439, 378, 109), (439, 380, 128), (440, 204, 122), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (449, 200, 121), 
(449, 202, 121), (452, 330, 119), (452, 719, 64), (453, 201, 119), (453, 705, 74), (453, 720, 64), (454, 199, 122), (455, 52, 126), (456, 226, 121), (456, 377, 118), (458, 327, 119), (459, 197, 121), (463, 325, 120), (465, 330, 112), (472, 330, 120), (473, 372, 115), (474, 188, 120), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (481, 882, 118), (482, 169, 120), (483, 185, 121), (486, 823, 63), (488, 221, 115), (492, 370, 115), (493, 180, 114), (499, 313, 118), (502, 172, 120), (506, 173, 119), (508, 175, 121), (510, 961, 126), (511, 397, 123), (512, 961, 124), (513, 323, 120), (513, 988, -79), (517, 167, 123), (517, 169, 121), (517, 217, 118), (518, 401, 124), (518, 801, 111), (518, 960, 126), (519, 400, 122), (519, 957, 125), (520, 400, 124), (520, 801, 112), (523, 407, 126), (524, 961, 128), (525, 171, 118), (525, 306, 117), (526, 272, 117), (527, 363, 127), (529, 313, 117), (530, 313, 117), (530, 639, -38), (532, 171, 119), (532, 312, 118), (532, 641, -41), (533, 300, 116), (534, 360, 110), (534, 384, 121), (539, 360, 112), (541, 315, 115), (543, 297, 117), (551, 388, 121), (552, 588, 27), (555, 169, 118), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (557, 386, 122), (558, 594, 27), (561, 290, 116), (565, 161, 115), (565, 293, 116), (570, 161, 119), (570, 169, 116), (572, 354, 111), (572, 398, 122), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (589, 353, 112), (591, 262, 113), (593, 352, 113), (593, 895, 119), (594, 381, 115), (597, 68, 117), (597, 69, 117), (597, 165, 115), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 72, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 
85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 91, 117), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (604, 203, 114), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (609, 350, 112), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (616, 151, 116), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 349, 111), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 204, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 107, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 132, 117), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 169, 117), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 102, 118), (624, 130, 117), (624, 133, 117), (624, 164, 116), (624, 305, 112), (624, 306, 111), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 
303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 167, 117), (626, 199, 115), (626, 200, 115), (626, 203, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 50, 118), (627, 130, 117), (627, 141, 116), (627, 202, 116), (627, 204, 115), (627, 254, 115), (627, 322, 111), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 255, 114), (629, 341, 111), (630, 107, 118), (630, 144, 117), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (630, 410, 128), (631, 50, 117), (631, 134, 118), (631, 272, 114), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (633, 343, 111), (634, 47, 117), (634, 53, 118), (634, 55, 118), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 85, 117), (636, 97, 117), (636, 301, 113), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 92, 117), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 406, 124), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 111, 120), (640, 277, 113), (640, 278, 113), (640, 279, 113), (641, 85, 117), (641, 92, 117), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 
112), (646, 90, 115), (646, 111, 119), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 347, 102), (647, 118, 121), (647, 144, 116), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 145, 116), (650, 112, 120), (650, 293, 113), (650, 294, 113), (650, 295, 113), (651, 93, 116), (651, 302, 112), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 118, 120), (657, 125, 66), (657, 236, 114), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 381, 118), (660, 261, 113), (660, 869, -42), (661, 99, 116), (662, 126, 121), (662, 127, 121), (663, 120, 122), (664, 152, 120), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (667, 344, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (670, 380, 117), (671, 343, 111), (672, 344, 111), (672, 410, 11), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 146, 122), (674, 426, -43), (674, 432, 11), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 352, 113), (686, 126, 121), (686, 138, 123), (686, 346, 116), (686, 353, 113), (687, 160, 122), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 129, 121), (694, 164, 126), (697, 119, 119), (698, 390, 
121), (698, 955, 88), (699, 130, 123), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (701, 130, 119), (701, 372, 116), (702, 130, 119), (703, 122, 120), (703, 934, 90), (705, 123, 120), (705, 360, 114), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (714, 363, 113), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 133, 120), (719, 134, 120), (720, 133, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (740, 144, 122), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 142, 122), (743, 638, -73), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (757, 818, 59), (758, 686, 112), (760, 172, 128), (763, 174, 128), (764, 176, 65), (764, 961, -75), (768, 369, 118), (772, 154, 125), (775, 443, -4), (778, 383, 121), (778, 457, 108), (779, 456, -5), (784, 185, 63), (784, 387, 122), (785, 210, 128), (793, 1029, 66), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (816, 224, 126), (816, 389, -93), (816, 393, -93), (819, 409, 104), (823, 653, 95), (824, 423, -25), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (855, 763, -120), (855, 1017, -109), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (869, 930, 84), (883, 350, 48), (884, 337, -6), (885, 371, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 388, 37), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 
561, -120), (940, 695, 54), (944, 840, -85), (947, 354, 72), (957, 691, 52), (961, 901, 34), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 356, 84), (975, 397, 118), (981, 1018, 58), (989, 413, -70), (990, 1000, -97), (997, 813, -85), (1006, 832, 83), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1033, 777, 92), (1057, 665, -59), (1084, 974, -4), (1119, 787, 40), (1125, 850, 28), (1127, 574, 44), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1150, 777, 126), (1157, 635, 48), (1161, 843, -14), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1308, 60, 108), (1308, 74, 108), (1320, 691, 122), (1320, 692, 124), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 344, 97), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1388, 771, 98), (1408, 820, 110), (1412, 825, 114), (1412, 886, 120), (1413, 827, 114), (1417, 856, 116), (1418, 570, 107), (1421, 568, 103), (1422, 912, 19), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1433, 895, 120), (1437, 942, 125), 
(1437, 943, 125), (1441, 436, 101), (1443, 963, 124), (1447, 554, 109), (1452, 472, 113), (1453, 896, 123), (1454, 337, 102), (1454, 418, 102), (1454, 421, 100), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1456, 425, 102), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1461, 478, 115), (1461, 915, 127), (1464, 410, 98), (1467, 92, 125), (1467, 334, 103), (1467, 409, 99), (1468, 407, 98), (1468, 441, 103), (1469, 440, 104), (1469, 443, 103), (1470, 405, 98), (1472, 404, 98), (1502, 401, 111), (1504, 471, 122), (1511, 413, 59), (1514, 460, 104), (1514, 810, 126), (1519, 897, 124), (1519, 899, 124), (1520, 896, 122), (1526, 929, 124), (1527, 931, 125), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1544, 975, 120), (1556, 535, 126), (1584, 863, 49), (1634, 1013, 86), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1758, 833, -38), (1763, 833, -22), (1766, 964, 127), (1766, 971, 128), (1768, 1013, 121), (1769, 966, 128), (1778, 834, -14), (1850, 981, 38), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1903, 299, 1), (1903, 300, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)]
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index f7724b9..9443a33 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -49,7 +49,7 @@ TEST(Approval, Inference)
 
     std::stringstream ss;
     ss << supp;
-    EXPECT_EQ(866, supp.size());
+    EXPECT_EQ(1024, supp.size());
     ApprovalTests::Approvals::verify(ss.str());
 }
 std::vector<ndb::Descriptor> getSrcDescriptors() {