diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55e3851..6d3a39e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,13 +1,19 @@
 cmake_minimum_required(VERSION 3.10)
 include(CheckCXXCompilerFlag)
 include(CheckCXXSourceRuns)
+include(CMakePushCheckState)
 project(openGPC CXX)
 set (REQ_CPP11_FEATURES  cxx_strong_enums cxx_auto_type)
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+  add_compile_options(-O3 -funroll-loops)
+endif()
 
 set(CMAKE_CXX_STANDARD 17) 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+include(FetchContent)
 find_package(Eigen3 REQUIRED)
 find_package(PNG REQUIRED)
 find_package(Threads REQUIRED)
@@ -15,29 +21,66 @@ find_package(Threads REQUIRED)
 include_directories(${EIGEN3_INCLUDE_DIR})
 include_directories(${PNG_INCLUDE_DIRS})
 include_directories(lib)
+include(FetchContent)
+set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE)
+set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE)
+FetchContent_Declare(
+  highway
+  GIT_REPOSITORY https://github.com/google/highway.git
+  GIT_TAG        1.3.0  
+)
+FetchContent_MakeAvailable(highway)
+
+include(FetchContent)
+set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_GTEST_LIB OFF CACHE BOOL "" FORCE)
+
+FetchContent_Declare(
+  google_benchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG        v1.9.5  
+)
+# MUST go before FetchContent_MakeAvailable
+set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
+set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+
+# Force the library itself to build in Release mode
+set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
 
-#By default, use SSE intrinsics
-option(SSE "Enable SSE/AVX optimizations if available" ON)
-
-add_compile_options(-O3 -funroll-loops)
-if(SSE)
-    message(STATUS "Checking if target CPU supports AVX2 instructions...")
-    check_cxx_source_runs("
-      #include <immintrin.h>
-      int main() {
-        __m256i x = _mm256_set1_epi32(1);
-        return _mm256_extract_epi32(x, 0);
-      }
-    " CPU_HAS_AVX2)
-
-    if(CPU_HAS_AVX2)
-      message(STATUS "AVX2: supported and enabled")
-      add_compile_definitions(_INTRINSICS_SSE)
-      add_compile_options(-mavx2 -march=core-avx2)
-    endif()
+FetchContent_MakeAvailable(google_benchmark)
+FetchContent_MakeAvailable(google_benchmark)
+add_library(gpc_core 
+    lib/gpc/forest.cpp 
+    lib/gpc/fern.cpp 
+    lib/gpc/feature.cpp 
+    lib/gpc/kernels/sobel.cpp
+    lib/gpc/kernels/box.cpp
+    lib/gpc/kernels/census.cpp
+    lib/gpc/kernels/gpc.cpp
+    lib/gpc/kernels/utils.cpp
+    lib/gpc/kernels/box_hwy.cpp
+    lib/gpc/kernels/sobel_hwy.cpp
+    lib/gpc/kernels/gpc_hwy.cpp
+)
+if(MSVC)
+    target_compile_options(gpc_core PUBLIC /arch:AVX2)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+    target_compile_options(gpc_core PUBLIC -march=native)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    target_compile_options(gpc_core PUBLIC -mcpu=native)
 endif()
+target_link_libraries(gpc_core 
+    PUBLIC 
+        Eigen3::Eigen 
+        ${PNG_LIBRARIES} 
+        Threads::Threads
+        hwy
+)
+target_include_directories(gpc_core PUBLIC lib)
 
 enable_testing()
 add_subdirectory(samples)
 add_subdirectory(tests)
+add_subdirectory(benchmarks)
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..7942fa6
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_executable(sobel_bench sobel_bench.cpp)
+
+target_link_libraries(sobel_bench
+    PRIVATE 
+        gpc_core 
+        benchmark::benchmark 
+        hwy
+)
+
+set_target_properties(sobel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+
+add_executable(kernel_bench kernel_bench.cpp)
+target_link_libraries(kernel_bench 
+    PRIVATE 
+        gpc_core
+        benchmark::benchmark 
+)
+add_executable(box_bench box_bench.cpp)
+
+target_link_libraries(box_bench
+    PRIVATE 
+        gpc_core
+        benchmark::benchmark 
+)
+add_executable(correspondence_bench correspondence_bench.cpp)
+
+target_link_libraries(correspondence_bench
+    PRIVATE 
+        gpc_core
+        benchmark::benchmark 
+)
diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp
new file mode 100644
index 0000000..f0aeeb8
--- /dev/null
+++ b/benchmarks/box_bench.cpp
@@ -0,0 +1,52 @@
+#include <benchmark/benchmark.h>
+#include <hwy/highway.h>
+#include "gpc/kernels/box.hpp" 
+#include "gpc/kernels/box_hwy.hpp" 
+static void BM_BoxHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+    state.SetLabel(hwy::TargetName(HWY_TARGET));    
+    for (auto _ : state) {
+        ndb::testing::box_hwy(in.data(), out.data(), w, h);
+        
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+
+#if HWY_TARGET == HWY_AVX2
+static void BM_BoxLegacySIMD(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("AVX2_legacy");    
+    for (auto _ : state) {
+        ndb::boxSSE(in.data(), out.data(), w, h);
+        
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+#endif
+static void BM_BoxNaive(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("naive");    
+    for (auto _ : state) {
+        ndb::boxNaive(in.data(), out.data(), w, h);
+        
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond);
+#if HWY_TARGET == HWY_AVX2
+BENCHMARK(BM_BoxLegacySIMD)->Unit(benchmark::kMillisecond);
+#endif
+BENCHMARK(BM_BoxNaive)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp
new file mode 100644
index 0000000..b847fb1
--- /dev/null
+++ b/benchmarks/correspondence_bench.cpp
@@ -0,0 +1,84 @@
+#include <benchmark/benchmark.h>
+#include "gpc/forest.hpp"
+#include "gpc/inference.hpp"
+#include <vector>
+#include <random>
+#include <cmath>
+#include <cstdint>
+
+#define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024
+
+/**
+ * Generates a reproducible Pareto-distributed vector.
+ * @param count Number of IDs to generate.
+ * @param target_mean The theoretical mean (requires alpha > 1).
+ * @param seed A fixed value (e.g., 42) for deterministic benchmarks.
+ */
+std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mean, uint32_t seed = 42) {
+    std::vector<ndb::Descriptor> ids;
+    ids.reserve(count);
+
+    // Using a fixed seed for benchmark consistency
+    std::mt19937 gen(seed); 
+    
+    // 1e-9 epsilon prevents division by zero/infinity
+    std::uniform_real_distribution<double> dist(1e-9, 1.0);
+
+    const double alpha = 1.16; 
+    const double xm = target_mean * (alpha - 1.0) / alpha;
+
+    for (size_t i = 0; i < count; ++i) {
+        // Inverse Transform Sampling
+        double val = xm / std::pow(dist(gen), 1.0 / alpha);
+        
+        // Casting to uint32_t will handle the Pareto "tail" by wrapping 
+        // values that exceed 2^32-1, simulating a dense ID space.
+        ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast<uint32_t>(val)));
+    }
+
+    return ids;
+}
+std::vector<ndb::Descriptor> getSrcDescriptors() {
+    std::vector<ndb::Descriptor> v =  ndb::Descriptor::deserialize("statesSrcLarge.txt", true);
+    std::vector<ndb::Descriptor> out;
+    for (size_t i = 0; i < v.size(); i++) {
+        if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { 
+            out.push_back(v[i]);
+        }
+    }
+    return out;
+    //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
+}
+
+std::vector<ndb::Descriptor> getTarDescriptors() {
+    std::vector<ndb::Descriptor> v = ndb::Descriptor::deserialize("statesTarLarge.txt", false);
+    std::vector<ndb::Descriptor> out;
+    for (size_t i = 0; i < v.size(); i++) {
+        if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { 
+            out.push_back(v[i]);
+        }
+    }
+    return out;
+
+    //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
+}
+static void matchBySorting(
+        benchmark::State& state) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    for (auto _ : state) {
+        state.PauseTiming();
+        std::vector<ndb::Descriptor> src = srcOriginal;
+        std::vector<ndb::Descriptor> tar = tarOriginal;
+        state.ResumeTiming();
+        std::vector<ndb::Correspondence> 
+            matches = gpc::inference::Forest::findCorrespondences(src, tar);
+
+        state.counters["matches"] = matches.size();
+        benchmark::DoNotOptimize(matches);
+        benchmark::ClobberMemory();
+    }
+}
+BENCHMARK(matchBySorting)
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_MAIN();
diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp
new file mode 100644
index 0000000..d86cfe8
--- /dev/null
+++ b/benchmarks/kernel_bench.cpp
@@ -0,0 +1,62 @@
+#include <benchmark/benchmark.h>
+#include "gpc/inference.hpp"
+
+typedef gpc::inference::Forest GPCForest_t;
+GPCForest_t forest;
+
+static void fullInference(
+        benchmark::State& state){
+
+    std::string forestPath = "../forests/defaultZeroForest.txt";
+    std::string leftImgPath = "../data/middlebury/im0.png";
+    std::string rightImgPath = "../data/middlebury/im1.png";
+    gpc::inference::InferenceSettings inferencesettings =
+        gpc::inference::InferenceSettings()
+            .builder()
+            .gradientThreshold(state.range(0)) // 0...255 gradient threshold for sobel filter
+            .verticalTolerance(
+                0)               // 0px tolerance for rectified epipolar matches
+            .dispHigh(128)       // limit disparities to 128
+            .epipolarMode(true)  // match GPC states in epipolar mode. more
+                                 // matches, lower accuracy than global
+            .useHashtable(false);  // use sort method for matching. faster for
+                                   // <100K descriptors
+
+    ndb::Buffer<uint8_t> simg, timg;
+    // Load images
+    simg.readPNG(leftImgPath);
+    timg.readPNG(rightImgPath);
+
+    // Get learned filter for the given image dimensions.
+    GPCForest_t::FilterMask fm =
+        forest.readForest(forestPath, simg.cols(), simg.rows());
+
+
+
+    for (auto _ : state) {
+        GPCForest_t::PreprocessedImage simgP =
+            forest.preprocessImage(simg, inferencesettings);
+        GPCForest_t::PreprocessedImage timgP =
+            forest.preprocessImage(timg, inferencesettings);
+        std::vector<ndb::Support> supp =
+            forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
+        state.counters["candidates_s"] = simgP.mask.size();
+        state.counters["candidates_t"] = timgP.mask.size();
+        state.counters["matches"] = supp.size();
+        benchmark::DoNotOptimize(supp);
+        benchmark::ClobberMemory();
+    }
+ 
+}
+
+BENCHMARK(fullInference)
+    ->Unit(benchmark::kMillisecond)
+    ->Args({0})
+    ->Args({5})
+    ->Args({10})
+    ->Args({20})
+    ->Args({50})
+    ->Args({100});
+
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp
new file mode 100644
index 0000000..490a861
--- /dev/null
+++ b/benchmarks/sobel_bench.cpp
@@ -0,0 +1,56 @@
+#include <benchmark/benchmark.h>
+#include <hwy/highway.h>
+#include "gpc/kernels/sobel.hpp" 
+#include "gpc/kernels/sobel_hwy.hpp" 
+static void BM_SobelHighway(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+    state.SetLabel(hwy::TargetName(HWY_TARGET));    
+    // Warmup is handled automatically by the library
+    for (auto _ : state) {
+        ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+
+#if HWY_TARGET == HWY_AVX2
+static void BM_SobelLegacySIMD(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("AVX2_legacy");    
+    for (auto _ : state) {
+        ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+#endif
+static void BM_SobelNaive(benchmark::State& state) {
+    int w = 1920, h = 1080;
+    std::vector<uint8_t> in(w * h, 128);
+    std::vector<uint8_t> out(w * h, 0);
+
+    state.SetLabel("naive");    
+    for (auto _ : state) {
+        ndb::sobelNaive(in.data(), out.data(), w, h, 50);
+        
+        // Ensure the compiler doesn't skip the work
+        benchmark::DoNotOptimize(out.data());
+        benchmark::ClobberMemory();
+    }
+}
+BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
+#if HWY_TARGET == HWY_AVX2
+BENCHMARK(BM_SobelLegacySIMD)->Unit(benchmark::kMillisecond);
+#endif
+BENCHMARK(BM_SobelNaive)->Unit(benchmark::kMillisecond);
+
+BENCHMARK_MAIN();
diff --git a/format_code.sh b/format_code.sh
index 0eaf149..16d9cb5 100755
--- a/format_code.sh
+++ b/format_code.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
-EXPECTED_VERSION="21.1.5"
+EXPECTED_VERSION="21.1.8"
 root_folder=$(git rev-parse --show-toplevel)
 
 change_in_place=false
diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp
index f0f7072..c2064b4 100644
--- a/lib/gpc/Feature.hpp
+++ b/lib/gpc/Feature.hpp
@@ -39,7 +39,6 @@
 #include <cmath>  //for log2
 #include <fstream>
 #include <gpc/buffer.hpp>
-#include <gpc/filter.hpp>
 #include <iostream>
 #include <iterator>
 #include <random>
@@ -96,32 +95,13 @@ class Feature {
      * @param      params  The parameters for this split
      * @param[in]  trip    The triplet
      */
-    inline void getDecisions(bool& ref,
-                             bool& pos,
-                             bool& neg,
-                             params& params,
-                             const GPCPatchTriplet& trip) {
-        ref =
-            ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) <
-             params.tau);
-        pos =
-            ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) <
-             params.tau);
-        neg =
-            ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) <
-             params.tau);
-    }
-
-    Feature() {
-        std::random_device rd2;
-        rng = std::mt19937(rd2());
-        randIJ7 = std::uniform_int_distribution<int>(0, 48);
-        randIJ17 = std::uniform_int_distribution<int>(0, 17 * 17 - 1);
-        randIJ27 = std::uniform_int_distribution<int>(0, 27 * 27 - 1);
-
-        randTAU = std::uniform_int_distribution<int>(-15, 15);
-    }
+    void getDecisions(bool& ref,
+                      bool& pos,
+                      bool& neg,
+                      params& params,
+                      const GPCPatchTriplet& trip);
 
+    Feature();
     /**
      * @brief Returns a random hyperplane within a 27 x 27
      *        pixel-sized patch. depending on the scale
@@ -132,50 +112,7 @@ class Feature {
      * @param scale Determines which patch size is used
      * @param params returns the parameters
      */
-    void inline sampleHyperplane(int scale, params& params) {
-        if (scale == 2) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                int i = randIJ7(rng);
-                int j = randIJ7(rng);
-                params.ix = i % 7 - 3;
-                params.iy = i / 7 - 3;
-                params.jx = j % 7 - 3;
-                params.jy = j / 7 - 3;
-
-                params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3);
-                params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3);
-            }
-        } else if (scale == 1) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                int i = randIJ17(rng);
-                int j = randIJ17(rng);
-                params.ix = i % 17 - 8;
-                params.iy = i / 17 - 8;
-                params.jx = j % 17 - 8;
-                params.jy = j / 17 - 8;
-
-                params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8);
-                params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8);
-            }
-        } else if (scale == 0) {
-            params.i = params.j;  // s.t. they regenerate each iteration
-            while (params.i == params.j) {  // i and j need to be distinct
-                params.i = randIJ27(rng);
-                params.j = randIJ27(rng);
-                params.ix = params.i % 27 - 13;
-                params.iy = params.i / 27 - 13;
-                params.jx = params.j % 27 - 13;
-                params.jy = params.j / 27 - 13;
-
-                params.i = (params.ix + 13) + 27 * (params.iy + 13);
-                params.j = (params.jx + 13) + 27 * (params.jy + 13);
-            }
-        }
-        params.tau = randTAU(rng);
-    }
-
+    void sampleHyperplane(int scale, params& params);
     /**
      * @brief      Gets all descriptors (triplets) for an image pair for
      * training given the three keypoint vectors.
@@ -193,56 +130,7 @@ class Feature {
                             std::vector<ndb::Point>& ref,
                             std::vector<ndb::Point>& pos,
                             std::vector<ndb::Point>& neg,
-                            std::vector<GPCPatchTriplet>& triplets) {
-        ndb::Buffer<uint8_t> LL(bwL.rows(), bwL.cols());
-        LL.width = bwL.width;
-        ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1);
-        LL.clearBoundary();
-
-        ndb::Buffer<uint8_t> RR(bwL.rows(), bwL.cols());
-        RR.width = bwR.width;
-        ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1);
-        RR.clearBoundary();
-
-        auto f = [=](ndb::Point& kp) {
-            if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 &&
-                kp.y < bwL.rows() - 20)
-                return false;
-            else
-                return true;
-        };
-
-        for (std::vector<ndb::Point>::size_type i = 0; i != ref.size(); i++) {
-            if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) {
-                // Get all descriptors:
-                GPCPatchTriplet newPatch;
-
-                // Reference patch
-                //====================================
-                newPatch.ref.x = ref[i].x;
-                newPatch.ref.y = ref[i].y;
-
-                LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27);
-
-                // Extract a positive match in the right image
-                //====================================
-                newPatch.pos.x = pos[i].x;
-                newPatch.pos.y = pos[i].y;
-
-                RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27);
-
-                // Extract negative patch
-                //====================================
-                newPatch.neg.x = neg[i].x;
-                newPatch.neg.y = neg[i].y;
-
-                RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27);
-
-                triplets.push_back(std::move(newPatch));
-            }
-        }
-    }
-
+                            std::vector<GPCPatchTriplet>& triplets);
     /**
      * @brief Store a vector of triplets of training data to file
      *
@@ -250,17 +138,7 @@ class Feature {
      * @param path The path where we'd like to store the training data
      *             in binary form.
      */
-    void storeAllTriplets(std::vector<GPCPatchTriplet>& data,
-                          std::string path) {
-        ofstream fout;
-        fout.open(path, ios::binary | ios::out);
-        for (auto& triplet : data) {
-            fout.write((char*)triplet.ref.feature.data(), 27 * 27);
-            fout.write((char*)triplet.pos.feature.data(), 27 * 27);
-            fout.write((char*)triplet.neg.feature.data(), 27 * 27);
-        }
-        fout.close();
-    }
+    void storeAllTriplets(std::vector<GPCPatchTriplet>& data, std::string path);
     /**
      * @brief Read triplets of training data from a binary file
      *        written by the storeAllTriplets method.
@@ -269,33 +147,7 @@ class Feature {
      *
      * @return The training set
      */
-    std::vector<GPCPatchTriplet> loadAllTriplets(std::string path) {
-        std::vector<GPCPatchTriplet> data;
-        std::ifstream in(path, std::ifstream::ate | std::ifstream::binary);
-        uint32_t filesize = in.tellg();
-        if (filesize % ((27 * 27) * 3)) {
-            cout << "ERR: File is not a training set of this feature type"
-                 << endl;
-            cout << "FS: " << filesize << endl;
-            return data;
-        }
-        int numSamples = filesize / ((27 * 27) * 3);
-        data.resize(numSamples);
-        ifstream fin;
-        fin.open(path, ios::binary | ios::in);
-        for (auto& datum : data) {
-            datum.ref.feature.resize(27, 27);
-            datum.pos.feature.resize(27, 27);
-            datum.neg.feature.resize(27, 27);
-
-            fin.read((char*)datum.ref.feature.data(), 27 * 27);
-            fin.read((char*)datum.pos.feature.data(), 27 * 27);
-            fin.read((char*)datum.neg.feature.data(), 27 * 27);
-        }
-        fin.close();
-        return data;
-    }
-
+    std::vector<GPCPatchTriplet> loadAllTriplets(std::string path);
 };  // Feature
 }  // namespace training
 }  // namespace gpc
diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp
index 5f5f533..68c6a1f 100644
--- a/lib/gpc/Fern.hpp
+++ b/lib/gpc/Fern.hpp
@@ -167,10 +167,7 @@ OptimizerSettings TauOptimizer(int taulo,
                                int tauhi,
                                int numResamples,
                                bool onlyScoreNonSplitSamples,
-                               double w1) {
-    return OptimizerSettings(
-        taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1);
-}
+                               double w1);
 /**
  * @brief Optimzer setting factory for a zero fern
  *
@@ -184,9 +181,7 @@ OptimizerSettings TauOptimizer(int taulo,
  */
 OptimizerSettings ZeroOptimizer(int numResamples,
                                 bool onlyScoreNonSplitSamples,
-                                double w1) {
-    return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1);
-}
+                                double w1);
 struct FernSettings {
     const int maxDepth;
     const int scale;
@@ -231,62 +226,7 @@ class Fern {
                    FernSettings fernsetting,
                    OptimizerSettings optsetting,
                    int scoreUntilLevel,
-                   splitStats& s) {
-        s.tp = 0;
-        s.fn = 0;
-        s.fp = 0;
-        s.prec = 0.;
-        s.rec = 0.;
-        s.hmean = 0.;
-        s.convcomb = 0.;
-        s.tot = 0;
-        for (auto& triplet : data) {
-            uint64_t ref = 0, pos = 0, neg = 0;
-            // Score the first scoreUntilLevel levels of a given fern
-            for (int i = 0; i < scoreUntilLevel + 1; i++) {
-                ref <<= 1;
-                pos <<= 1;
-                neg <<= 1;
-                bool refDec, posDec, negDec;
-
-                // Decisions need to be added into a codeword
-                Feature.getDecisions(
-                    refDec, posDec, negDec, params[i], triplet);
-                if (refDec) ref++;
-                if (posDec) pos++;
-                if (negDec) neg++;
-            }
-            // Only count those that haven't been true positives yet
-            // Ignore samples previously classified as True positive
-            if (!(triplet.pos.split == true && triplet.neg.split == true)) {
-                s.tot++;
-                // Decide which are equal (i.e. set the split indicators)
-                if (ref == pos) {      // 110(TP), 111, 001(TP), 000
-                    if (ref != neg) {  // 110 (TP), 001(TP)
-                        s.tp++;
-                    } else {  // 111(FN), 000(FN)
-                        s.fn++;
-                    }
-                } else {               // 100, 101, 011, 010
-                    if (ref != neg) {  // 100(FN), 011(FN) FN
-                        s.fn++;
-                    } else {  //  101(FP), 010(FP)
-                        s.fp++;
-                    }
-                }
-            }
-        }
-
-        // Compute statistics of this split
-        double w2 = 1. - optsetting.w1_;
-        s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp);
-        s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn);
-
-        s.hmean = (s.prec + s.rec == 0.)
-                      ? 0.
-                      : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec);
-        s.convcomb = (1. - w2) * s.prec + w2 * s.rec;
-    }
+                   splitStats& s);
     /**
      * @brief      Mark those samples in the set as "split" if they have been
      *             correctly classified(ref=pos and pos!=neg) with the parameter
@@ -298,26 +238,7 @@ class Fern {
      */
     void markSplitSamples(std::vector<GPCTriplet_t>& data,
                           std::vector<SplitParams_t>& params,
-                          int numParams) {
-        for (auto& triplet : data) {
-            // Evaluate triplet on all given parameters
-            uint64_t ref = 0, pos = 0, neg = 0;
-            for (int i = 0; i < numParams; i++) {
-                ref <<= 1;  // shift by one
-                pos <<= 1;  // shift by one
-                neg <<= 1;  // shift by one
-                bool refDec, posDec, negDec;
-
-                Feature.getDecisions(
-                    refDec, posDec, negDec, params[i], triplet);
-                if (refDec) ref++;
-                if (posDec) pos++;
-                if (negDec) neg++;
-            }
-            if (ref == pos) triplet.pos.split = true;
-            if (ref != neg) triplet.neg.split = true;
-        }
-    }
+                          int numParams);
     /**
      * @brief Reset the mark on the training samples on whether they have been
      * split correctly or not Since we do not operate on copies of the training
@@ -325,12 +246,7 @@ class Fern {
      *
      * @param data
      */
-    void resetMarkOnSamples(std::vector<GPCTriplet_t>& data) {
-        for (auto& triplet : data) {
-            triplet.pos.split = false;
-            triplet.neg.split = false;
-        }
-    }
+    void resetMarkOnSamples(std::vector<GPCTriplet_t>& data);
 
     /**
      * @brief Train a fern given a set of training data and some optimizer
@@ -340,70 +256,21 @@ class Fern {
      * @param optsetting      the optimizer settings
      */
     void train(std::vector<GPCTriplet_t>& trainingSamples,
-               OptimizerSettings optsetting) {
-        splitStats stats;
-        float maxScore = 0.f;
-        SplitParams_t bestParams;
-
-        fernparams.resize(fernsettings.maxDepth);
-
-        cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec"
-             << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP"
-             << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale"
-             << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl;
-        if (optsetting.onlyScoreNonSplitSamples_)
-            resetMarkOnSamples(trainingSamples);
-        for (int level = 0; level < fernsettings.maxDepth; level++) {
-            maxScore = 0.f;
-            for (int k = 0; k < optsetting.numResamples_; k++) {
-                // Samples a hyperplane in the requested scale
-                Feature.sampleHyperplane(fernsettings.scale, fernparams[level]);
-                // Iterates over a small range of tau (intercept)
-                for (int tau = optsetting.taulo_; tau < optsetting.tauhi_;
-                     tau++) {
-                    fernparams[level].tau = tau;
-                    // Score hyperplane set we have so far
-                    evalSplit(trainingSamples,
-                              fernparams,
-                              fernsettings,
-                              optsetting,
-                              level,
-                              stats);
-                    // If score exceeds previously best, replace paramset
-                    if (stats.hmean > maxScore) {
-                        bestParams = fernparams[level];
-                        maxScore = stats.hmean;
-                    }
-                }  // tau loop
-            }  // k loop
-            // Store best performing parameters
-            fernparams[level] = bestParams;
-
-            // Mark samples as split if they were labeled true positive
-            if (optsetting.onlyScoreNonSplitSamples_)
-                markSplitSamples(trainingSamples, fernparams, level);
-            cout << setw(7) << level << setw(10) << stats.prec << setw(10)
-                 << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot
-                 << setw(8) << stats.tp << setw(8) << stats.fp << setw(8)
-                 << stats.fn << setw(6) << fernsettings.scale << setw(5)
-                 << fernparams[level].tau << setw(5) << fernparams[level].i
-                 << setw(5) << fernparams[level].j << endl;
-        }  // level loop
-    }  // train
+               OptimizerSettings optsetting);
 
     /**
      * @brief      Returns the decision of the first five levels of the ferns
      *
      * @return     The parameters.
      */
-    std::vector<SplitParams_t> getParameters() { return fernparams; }
+    std::vector<SplitParams_t> getParameters();
 
     /**
      * @brief Return the scale that this fern uses
      *
      * @return The scale
      */
-    int getScale() { return fernsettings.scale; }
+    int getScale();
 
 };  // Fern
 
@@ -417,7 +284,10 @@ class Fern {
  *
  * @return
  */
-std::vector<Fern> FernFactory(int num_S, int num_M, int num_L, int maxDepth) {
+inline std::vector<Fern> FernFactory(int num_S,
+                                     int num_M,
+                                     int num_L,
+                                     int maxDepth) {
     std::vector<Fern> ferns;
     for (int i = 0; i < num_S; i++)
         ferns.push_back(Fern(FernSettings(maxDepth, 2)));
diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp
index dd6dce6..0b58119 100644
--- a/lib/gpc/buffer.hpp
+++ b/lib/gpc/buffer.hpp
@@ -33,6 +33,10 @@
 #include <png.h>
 
 #include <Eigen/Dense>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
 #include <type_traits>
 #include <vector>
 
@@ -80,6 +84,58 @@ struct Descriptor {
     bool operator<(const Descriptor& d) const { return state < d.state; }
     bool operator<=(const Descriptor& d) const { return state <= d.state; }
     int operator%(const int& d) const { return state % d; }
+    static void serialize(const std::string& filename,
+                          const std::vector<Descriptor>& data) {
+        std::ofstream outFile(filename);
+        if (!outFile.is_open()) {
+            std::cerr << "Error opening file for writing: " << filename
+                      << std::endl;
+            return;
+        }
+
+        for (const auto& desc : data) {
+            outFile << desc.point.x << "," << desc.point.y << "," << desc.state
+                    << "\n";
+        }
+        outFile.close();
+    }
+
+    /**
+     * Deserializes a CSV file back into a vector of Descriptors.
+     */
+    static std::vector<Descriptor> deserialize(const std::string& filename,
+                                               bool srcDescr) {
+        std::vector<Descriptor> result;
+        std::ifstream inFile(filename);
+        if (!inFile.is_open()) {
+            std::cerr << "Error opening file for reading: " << filename
+                      << std::endl;
+            return result;
+        }
+
+        std::string line;
+        while (std::getline(inFile, line)) {
+            if (line.empty()) continue;
+
+            std::stringstream ss(line);
+            std::string x_str, y_str, state_str;
+
+            // Split by comma
+            if (std::getline(ss, x_str, ',') && std::getline(ss, y_str, ',') &&
+                std::getline(ss, state_str, ',')) {
+                Descriptor d;
+                d.point.x = std::stod(x_str);
+                d.point.y = std::stod(y_str);
+                d.state = std::stoull(state_str);
+                d.srcDescr = srcDescr;
+
+                // if (d.point.y > 200 && d.point.y < 400)
+                result.push_back(d);
+            }
+        }
+        inFile.close();
+        return result;
+    }
 };
 // Keeps support points with associated disparity
 // Support points are only used in the left image
@@ -896,7 +952,7 @@ class RGBBuffer : public Buffer<RGBColor> {
         free(rowPointers);
     }
 };
-Buffer<RGBColor> getDisparityVisualization(
+inline Buffer<RGBColor> getDisparityVisualization(
     ndb::Buffer<uint8_t>& srcImg,
     std::vector<int>& validEstimateIndices,
     ndb::Buffer<float>& disparity) {
@@ -969,8 +1025,8 @@ Buffer<RGBColor> getDisparityVisualization(
     }
     return dispVis;
 }
-Buffer<RGBColor> getDisparityVisualization(ndb::Buffer<uint8_t>& srcImg,
-                                           std::vector<Support>& support) {
+inline Buffer<RGBColor> getDisparityVisualization(
+    ndb::Buffer<uint8_t>& srcImg, std::vector<Support>& support) {
     float min_disparity = 0;
     float max_disparity = 128;
     Buffer<RGBColor> dispVis(Eigen::Vector2i(srcImg.width, srcImg.rows()));
diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp
new file mode 100644
index 0000000..9e1322f
--- /dev/null
+++ b/lib/gpc/feature.cpp
@@ -0,0 +1,212 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include <Eigen/Dense>
+#include <algorithm>
+#include <cmath>  //for log2
+#include <fstream>
+#include <gpc/Feature.hpp>
+#include <gpc/buffer.hpp>
+#include <gpc/kernels/box.hpp>
+#include <iostream>
+#include <iterator>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace gpc {
+namespace training {
+void Feature::getDecisions(bool& ref,
+                           bool& pos,
+                           bool& neg,
+                           params& params,
+                           const GPCPatchTriplet& trip) {
+    ref = ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) <
+           params.tau);
+    pos = ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) <
+           params.tau);
+    neg = ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) <
+           params.tau);
+}
+
+Feature::Feature() {
+    std::random_device rd2;
+    rng = std::mt19937(rd2());
+    randIJ7 = std::uniform_int_distribution<int>(0, 48);
+    randIJ17 = std::uniform_int_distribution<int>(0, 17 * 17 - 1);
+    randIJ27 = std::uniform_int_distribution<int>(0, 27 * 27 - 1);
+
+    randTAU = std::uniform_int_distribution<int>(-15, 15);
+}
+void Feature::sampleHyperplane(int scale, params& params) {
+    if (scale == 2) {
+        params.i = params.j;            // s.t. they regenerate each iteration
+        while (params.i == params.j) {  // i and j need to be distinct
+            int i = randIJ7(rng);
+            int j = randIJ7(rng);
+            params.ix = i % 7 - 3;
+            params.iy = i / 7 - 3;
+            params.jx = j % 7 - 3;
+            params.jy = j / 7 - 3;
+
+            params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3);
+            params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3);
+        }
+    } else if (scale == 1) {
+        params.i = params.j;            // s.t. they regenerate each iteration
+        while (params.i == params.j) {  // i and j need to be distinct
+            int i = randIJ17(rng);
+            int j = randIJ17(rng);
+            params.ix = i % 17 - 8;
+            params.iy = i / 17 - 8;
+            params.jx = j % 17 - 8;
+            params.jy = j / 17 - 8;
+
+            params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8);
+            params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8);
+        }
+    } else if (scale == 0) {
+        params.i = params.j;            // s.t. they regenerate each iteration
+        while (params.i == params.j) {  // i and j need to be distinct
+            params.i = randIJ27(rng);
+            params.j = randIJ27(rng);
+            params.ix = params.i % 27 - 13;
+            params.iy = params.i / 27 - 13;
+            params.jx = params.j % 27 - 13;
+            params.jy = params.j / 27 - 13;
+
+            params.i = (params.ix + 13) + 27 * (params.iy + 13);
+            params.j = (params.jx + 13) + 27 * (params.jy + 13);
+        }
+    }
+    params.tau = randTAU(rng);
+}
+void Feature::extractAllTriplets(ndb::Buffer<uint8_t>& bwL,
+                                 ndb::Buffer<uint8_t>& bwR,
+                                 std::vector<ndb::Point>& ref,
+                                 std::vector<ndb::Point>& pos,
+                                 std::vector<ndb::Point>& neg,
+                                 std::vector<GPCPatchTriplet>& triplets) {
+    ndb::Buffer<uint8_t> LL(bwL.rows(), bwL.cols());
+    LL.width = bwL.width;
+    ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1);
+    LL.clearBoundary();
+
+    ndb::Buffer<uint8_t> RR(bwL.rows(), bwL.cols());
+    RR.width = bwR.width;
+    ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1);
+    RR.clearBoundary();
+
+    auto f = [=](ndb::Point& kp) {
+        if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 &&
+            kp.y < bwL.rows() - 20)
+            return false;
+        else
+            return true;
+    };
+
+    for (std::vector<ndb::Point>::size_type i = 0; i != ref.size(); i++) {
+        if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) {
+            // Get all descriptors:
+            GPCPatchTriplet newPatch;
+
+            // Reference patch
+            //====================================
+            newPatch.ref.x = ref[i].x;
+            newPatch.ref.y = ref[i].y;
+
+            LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27);
+
+            // Extract a positive match in the right image
+            //====================================
+            newPatch.pos.x = pos[i].x;
+            newPatch.pos.y = pos[i].y;
+
+            RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27);
+
+            // Extract negative patch
+            //====================================
+            newPatch.neg.x = neg[i].x;
+            newPatch.neg.y = neg[i].y;
+
+            RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27);
+
+            triplets.push_back(std::move(newPatch));
+        }
+    }
+}
+
+void Feature::storeAllTriplets(std::vector<GPCPatchTriplet>& data,
+                               std::string path) {
+    ofstream fout;
+    fout.open(path, ios::binary | ios::out);
+    for (auto& triplet : data) {
+        fout.write((char*)triplet.ref.feature.data(), 27 * 27);
+        fout.write((char*)triplet.pos.feature.data(), 27 * 27);
+        fout.write((char*)triplet.neg.feature.data(), 27 * 27);
+    }
+    fout.close();
+}
+std::vector<Feature::GPCPatchTriplet> Feature::loadAllTriplets(
+    std::string path) {
+    std::vector<Feature::GPCPatchTriplet> data;
+    std::ifstream in(path, std::ifstream::ate | std::ifstream::binary);
+    uint32_t filesize = in.tellg();
+    if (filesize % ((27 * 27) * 3)) {
+        cout << "ERR: File is not a training set of this feature type" << endl;
+        cout << "FS: " << filesize << endl;
+        return data;
+    }
+    int numSamples = filesize / ((27 * 27) * 3);
+    data.resize(numSamples);
+    ifstream fin;
+    fin.open(path, ios::binary | ios::in);
+    for (auto& datum : data) {
+        datum.ref.feature.resize(27, 27);
+        datum.pos.feature.resize(27, 27);
+        datum.neg.feature.resize(27, 27);
+
+        fin.read((char*)datum.ref.feature.data(), 27 * 27);
+        fin.read((char*)datum.pos.feature.data(), 27 * 27);
+        fin.read((char*)datum.neg.feature.data(), 27 * 27);
+    }
+    fin.close();
+    return data;
+}
+
+}  // namespace training
+}  // namespace gpc
diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp
new file mode 100644
index 0000000..5a2632c
--- /dev/null
+++ b/lib/gpc/fern.cpp
@@ -0,0 +1,204 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/Fern.hpp"
+
+#include <Eigen/Dense>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gpc/Feature.hpp"
+
+using namespace std;
+namespace gpc {
+namespace training {
+OptimizerSettings TauOptimizer(int taulo,
+                               int tauhi,
+                               int numResamples,
+                               bool onlyScoreNonSplitSamples,
+                               double w1) {
+    return OptimizerSettings(
+        taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1);
+}
+OptimizerSettings ZeroOptimizer(int numResamples,
+                                bool onlyScoreNonSplitSamples,
+                                double w1) {
+    return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1);
+}
+void Fern::evalSplit(std::vector<GPCTriplet_t>& data,
+                     std::vector<SplitParams_t>& params,
+                     FernSettings fernsetting,
+                     OptimizerSettings optsetting,
+                     int scoreUntilLevel,
+                     splitStats& s) {
+    s.tp = 0;
+    s.fn = 0;
+    s.fp = 0;
+    s.prec = 0.;
+    s.rec = 0.;
+    s.hmean = 0.;
+    s.convcomb = 0.;
+    s.tot = 0;
+    for (auto& triplet : data) {
+        uint64_t ref = 0, pos = 0, neg = 0;
+        // Score the first scoreUntilLevel levels of a given fern
+        for (int i = 0; i < scoreUntilLevel + 1; i++) {
+            ref <<= 1;
+            pos <<= 1;
+            neg <<= 1;
+            bool refDec, posDec, negDec;
+
+            // Decisions need to be added into a codeword
+            Feature.getDecisions(refDec, posDec, negDec, params[i], triplet);
+            if (refDec) ref++;
+            if (posDec) pos++;
+            if (negDec) neg++;
+        }
+        // Only count those that haven't been true positives yet
+        // Ignore samples previously classified as True positive
+        if (!(triplet.pos.split == true && triplet.neg.split == true)) {
+            s.tot++;
+            // Decide which are equal (i.e. set the split indicators)
+            if (ref == pos) {      // 110(TP), 111, 001(TP), 000
+                if (ref != neg) {  // 110 (TP), 001(TP)
+                    s.tp++;
+                } else {  // 111(FN), 000(FN)
+                    s.fn++;
+                }
+            } else {               // 100, 101, 011, 010
+                if (ref != neg) {  // 100(FN), 011(FN) FN
+                    s.fn++;
+                } else {  //  101(FP), 010(FP)
+                    s.fp++;
+                }
+            }
+        }
+    }
+
+    // Compute statistics of this split
+    double w2 = 1. - optsetting.w1_;
+    s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp);
+    s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn);
+
+    s.hmean = (s.prec + s.rec == 0.)
+                  ? 0.
+                  : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec);
+    s.convcomb = (1. - w2) * s.prec + w2 * s.rec;
+}
+void Fern::markSplitSamples(std::vector<GPCTriplet_t>& data,
+                            std::vector<SplitParams_t>& params,
+                            int numParams) {
+    for (auto& triplet : data) {
+        // Evaluate triplet on all given parameters
+        uint64_t ref = 0, pos = 0, neg = 0;
+        for (int i = 0; i < numParams; i++) {
+            ref <<= 1;  // shift by one
+            pos <<= 1;  // shift by one
+            neg <<= 1;  // shift by one
+            bool refDec, posDec, negDec;
+
+            Feature.getDecisions(refDec, posDec, negDec, params[i], triplet);
+            if (refDec) ref++;
+            if (posDec) pos++;
+            if (negDec) neg++;
+        }
+        if (ref == pos) triplet.pos.split = true;
+        if (ref != neg) triplet.neg.split = true;
+    }
+}
+void Fern::resetMarkOnSamples(std::vector<GPCTriplet_t>& data) {
+    for (auto& triplet : data) {
+        triplet.pos.split = false;
+        triplet.neg.split = false;
+    }
+}
+
+void Fern::train(std::vector<GPCTriplet_t>& trainingSamples,
+                 OptimizerSettings optsetting) {
+    splitStats stats;
+    float maxScore = 0.f;
+    SplitParams_t bestParams;
+
+    fernparams.resize(fernsettings.maxDepth);
+
+    cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec"
+         << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" << setw(8)
+         << "FP" << setw(8) << "FN" << setw(6) << "scale" << setw(5) << "tau"
+         << setw(5) << "i" << setw(5) << "j" << endl;
+    if (optsetting.onlyScoreNonSplitSamples_)
+        resetMarkOnSamples(trainingSamples);
+    for (int level = 0; level < fernsettings.maxDepth; level++) {
+        maxScore = 0.f;
+        for (int k = 0; k < optsetting.numResamples_; k++) {
+            // Samples a hyperplane in the requested scale
+            Feature.sampleHyperplane(fernsettings.scale, fernparams[level]);
+            // Iterates over a small range of tau (intercept)
+            for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; tau++) {
+                fernparams[level].tau = tau;
+                // Score hyperplane set we have so far
+                evalSplit(trainingSamples,
+                          fernparams,
+                          fernsettings,
+                          optsetting,
+                          level,
+                          stats);
+                // If score exceeds previously best, replace paramset
+                if (stats.hmean > maxScore) {
+                    bestParams = fernparams[level];
+                    maxScore = stats.hmean;
+                }
+            }  // tau loop
+        }  // k loop
+        // Store best performing parameters
+        fernparams[level] = bestParams;
+
+        // Mark samples as split if they were labeled true positive
+        if (optsetting.onlyScoreNonSplitSamples_)
+            markSplitSamples(trainingSamples, fernparams, level);
+        cout << setw(7) << level << setw(10) << stats.prec << setw(10)
+             << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot
+             << setw(8) << stats.tp << setw(8) << stats.fp << setw(8)
+             << stats.fn << setw(6) << fernsettings.scale << setw(5)
+             << fernparams[level].tau << setw(5) << fernparams[level].i
+             << setw(5) << fernparams[level].j << endl;
+    }  // level loop
+}  // train
+
+std::vector<Fern::SplitParams_t> Fern::getParameters() { return fernparams; }
+
+int Fern::getScale() { return fernsettings.scale; }
+
+}  // namespace training
+}  // namespace gpc
diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp
deleted file mode 100644
index 49384f0..0000000
--- a/lib/gpc/filter.hpp
+++ /dev/null
@@ -1,1000 +0,0 @@
-// Copyright (c) 2018, ETH Zurich
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the copyright holder nor the names of its contributors
-// may be used to endorse or promote products derived from this software without
-// specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Code Author: Niklaus Bamert (bamertn@ethz.ch)
-#ifndef __NDB__FILTER
-#define __NDB__FILTER
-
-#include <cassert>
-#include <thread>
-
-#include "gpc/buffer.hpp"
-using namespace std;
-
-#ifdef _INTRINSICS_SSE
-#include <immintrin.h>
-// greater and lesser than simd ops for unsigned 8bit integer (epu8)
-#define _mm_cmpgt_epu8(v0, v1)                             \
-    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
-                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
-#define _mm_cmplt_epu8(v1, v0)                             \
-    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
-                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
-#endif
-namespace ndb {
-/**
- * @brief Gets indices of non-zero values in array  a.
- *    Credits:
- *    https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528
- *
- * @param     input array
- * @param n   number of input elements
- * @param ind output array (indices into n of nonzero elements)
- * @param m   number of elements in output
- */
-__attribute__((noinline)) void arr2ind(const unsigned char* a,
-                                       int n,
-                                       int* ind,
-                                       int* m) {
-#ifdef _INTRINSICS_SSE
-    int i, m0, k;
-    __m256i msk;
-    m0 = 0;
-    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
-        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
-                                _mm256_setzero_si256());
-        k = _mm256_movemask_epi8(msk);
-        k = ~k; /* Search for nonzero bits instead of zero bits.  */
-        while (k) {
-            ind[m0] =
-                i + _tzcnt_u32(
-                        k); /* Count the number of trailing zero bits in k. */
-            m0++;
-            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
-        }
-    }
-    *m = m0;
-#else
-    int nnz = 0;
-    for (int i = 0; i < n; i++) {
-        if (a[i] != 0) {
-            nnz++;
-            *ind = i;
-            ind++;
-        }
-    }
-    *m = nnz;
-#endif
-}
-/**
- * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
- *             (8x16bit)
- *
- * @param[in]  x     the 128 bit vector to be unpacked
- * @param      y0    The y 0
- * @param      y1    The y 1
- */
-#ifdef _INTRINSICS_SSE
-void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
-    __m128i zero = _mm_setzero_si128();
-    y0 = _mm_unpacklo_epi8(x, zero);
-    y1 = _mm_unpackhi_epi8(x, zero);
-}
-/**
- * @brief      Packs 2x128bit vars with 16bit values(where 8 upper bits are
- *             zero) into 1x128bit with 8bit values
- *
- * @param[in]  x0    The x 0
- * @param[in]  x1    The x 1
- * @param      y     the packed vector
- */
-void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
-    y = _mm_packus_epi16(x0, x1);
-}
-
-#endif
-/**
- * @brief Calls a given functional f with subranges based on the given start
- *        and end indices. Here the functional is assumed to take two integer
- *        arguments indicating their respective start and end ranges.
- *        nThreads determines the number of threads the given range shall be
- * split into. The range is inclusive on the lower bound and exclusive on the
- * upper bound, i.e. [start,end)
- *
- * @param f        function object (e.g. a lambda functional)
- * @param start    start of the range
- * @param end      end of the range
- * @param nThreads number of threads to use
- */
-void parFor(std::function<void(int, int)> const& f,
-            int start,
-            int end,
-            int nThreads) {
-    // Range definition
-    // quantities derived from range
-    int segSize = (end - start) / nThreads;
-    int lastSeg = (end - start) % nThreads;
-
-    std::vector<std::thread> threads;
-    threads.reserve(nThreads);
-
-    // Spawn threads
-    for (int t = 0; t < nThreads - 1; t++) {
-        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
-    }
-    threads.emplace_back(f,
-                         start + (nThreads - 1) * segSize,
-                         start + (nThreads)*segSize + lastSeg);
-    // Join
-    for (auto& t : threads) t.join();
-}
-
-/**
- * @brief Naive 3x3 sobel filter implementation
- *
- * @param      in       input image
- * @param      blurred  The blurred output image
- * @param[in]  width    The width
- * @param[in]  height   The height
- * @param[in]  numThreads number of threads to use
- * @param      threshold  threshold to binarize sobel filter output
- */
-void sobelNaive(
-    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    int thresholdSq = threshold * threshold;
-    uint8_t* ptr = in;
-
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
-
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
-
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-
-    // output pointer
-    uint8_t* optr = gradient + 1 * width + 1;
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9;
-            int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9;
-
-            int val = sx * sx + sy * sy;
-
-            *optr = val > thresholdSq ? 255 : 0;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
-/**
- * @brief      Naive 3x3 box filter implementation
- *
- * @param      in       input image
- * @param      blurred  The blurred output image
- * @param[in]  width    The width
- * @param[in]  height   The height
- * @param[in]  numThreads number of threads to use
- */
-void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-    // allocate space for result
-    uint8_t* ptr = in;
-    uint8_t* p11 = ptr + 0 * width;
-    uint8_t* p12 = ptr + 0 * width + 1;
-    uint8_t* p13 = ptr + 0 * width + 2;
-
-    uint8_t* p21 = ptr + 1 * width;
-    uint8_t* p22 = ptr + 1 * width + 1;
-    uint8_t* p23 = ptr + 1 * width + 2;
-
-    uint8_t* p31 = ptr + 2 * width;
-    uint8_t* p32 = ptr + 2 * width + 1;
-    uint8_t* p33 = ptr + 2 * width + 2;
-    uint8_t* optr = blurred + 1 * width + 1;
-
-    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
-    // boundary) (unoptimized)
-    for (int iy = 1; iy < height - 1; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            int res =
-                (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) /
-                9;
-            *optr = res;
-            p11++;
-            p12++;
-            p13++;
-            p21++;
-            p22++;
-            p23++;
-            p31++;
-            p32++;
-            p33++;
-            optr++;
-        }
-    }
-}
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Naive implementation
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param idx       The gradient indices. Only used if no intrincs are available
- *                  and the call gets forwarded to the naive implementation.
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- */
-void gpcFilterNaive(uint8_t* in,
-                    const uint8_t* grad,
-                    uint32_t* gpc,
-                    std::vector<int32_t> fastmask,
-                    std::vector<int>& idx,
-                    int width,
-                    int height) {
-    // output buffer of same size
-    uint32_t tmp;
-
-    int j = 0;
-    for (auto k : idx) {
-        tmp = 0;
-        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
-            tmp <<= 1;  // shift by one
-            if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1]))
-                tmp++;  // set this test's result to 1
-        }
-        gpc[k] = tmp;
-        j++;
-    }
-}
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Additionally uses a threshold vector (tau) Naive implementation.
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- */
-void gpcFilterTauNaive(uint8_t* in,
-                       const uint8_t* grad,
-                       uint32_t* gpc,
-                       std::vector<int32_t> fastmask,
-                       std::vector<int> tau,
-                       std::vector<int>& idx,
-                       int width,
-                       int height) {
-    uint32_t tmp;
-
-    int j = 0;
-    for (auto k : idx) {
-        tmp = 0;
-        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
-            tmp <<= 1;  // shift by one
-            if (*(in + k + fastmask[i]) >
-                *(in + k + fastmask[i + 1]) - tau[i / 2])
-                tmp++;  // set this test's result to 1
-        }
-        gpc[k] = tmp;
-        j++;
-    }
-} /**
-   * @brief      boxfilter using SSE2 instructions. Loosely based on
-   *             https://www.ignorantus.com/box_sse2/, published under
-   *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
-   *
-   * @param      in       input image
-   * @param      blurred  The blurred
-   * @param[in]  width    The width
-   * @param[in]  height   The height
-   * @param[in]  numThreads number of threads to use
-   */
-void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    boxNaive(in, blurred, width, height);
-#else
-    auto boxFilterSegment = [&](int start, int end) {
-        int x, y;
-        __m128i one_third;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        dst0 = (__m128i*)(blurred + width * (start));
-        dst1 = (__m128i*)(blurred + width * (start + 1));
-        for (y = start; y < end;
-             y += 2) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2, *row3;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-            row3 = row2 + width;
-
-            for (x = 0; x < width; x += 16) {
-                __m128i s00, s01, s02;
-                __m128i r00, r01, r02;
-                __m128i ra00, ra01, ra02;
-                __m128i rb00, rb01, rb02;
-
-                __m128i a00, a01, a02, b00, b01, b02;
-
-                __m128i tmp0, tmp1, res;
-
-                s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
-                s02 = _mm_load_si128((__m128i*)(row0));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
-                s02 = _mm_load_si128((__m128i*)(row1));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb01 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
-                s02 = _mm_load_si128((__m128i*)(row2));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-
-                ra02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb02 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst0++, res);
-
-                s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
-                s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
-                s02 = _mm_load_si128((__m128i*)(row3));
-                unpack8to16(s00, a00, b00);
-                unpack8to16(s01, a01, b01);
-                unpack8to16(s02, a02, b02);
-                ra00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
-                rb00 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
-
-                tmp0 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02),
-                    one_third);
-                tmp1 = _mm_mulhi_epi16(
-                    _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02),
-                    one_third);
-
-                pack16to8(tmp0, tmp1, res);
-                _mm_store_si128(dst1++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-                row3 += 16;
-            }
-            // still storing 128bit, but now in 16 x 8bit format, so /16 instead
-            // of /8
-            dst0 += width / 16;
-            dst1 += width / 16;
-        }
-    };  // lambda
-
-    boxFilterSegment(1, height - 3);
-    // parFor(boxFilterSegment,1,height-3,4);
-#endif
-}
-/**
- * @brief      3x3 Sobel filter. Input dimension must be multiple of 16
- *
- * @param      in         { parameter_description }
- * @param      blurred    The blurred
- * @param[in]  width      The width
- * @param[in]  height     The height
- * @param[in]  threshold  The threshold
- * @param[in]  numThreads number of threads to use
- */
-
-void sobel(uint8_t* in,
-           uint8_t* blurred,
-           int width,
-           int height,
-           uint8_t threshold,
-           int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    sobelNaive(in, blurred, width, height, threshold);
-#else
-    auto sobelSSESegment = [&](int start, int end) {
-        __m128i one_third, one_ninth, one, two, mone, mtwo, binThres;
-        __m128i *dst0, *dst1;
-        __m128i zero = _mm_setzero_si128();
-
-        int x, y;
-        one_third = _mm_set1_epi16(
-            21846);  // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit
-        one_ninth = _mm_set1_epi16(7282);  // 2^16/9+1. For 16bit ints.
-
-        binThres = _mm_set1_epi16(threshold * threshold);
-
-        dst0 = (__m128i*)(blurred + width * 1);
-        // dst1 = (__m128i *)(blurred + width * 2);
-        for (y = start; y < end;
-             y++) {  // We compute results for two rows in one iteration
-            const uint8_t *row0, *row1, *row2;
-
-            row1 = in + y * width;
-            row0 = row1 - width;
-            row2 = row1 + width;
-
-            for (x = 0; x < width; x += 16) {
-                // Note: Center element not used in sobel kernels!!
-                // Kernel indices:
-                // 00 01 02
-                // 10 11 12
-                // 20 21 22
-
-                __m128i a00, a01, a02, a10, a12, a20, a21, a22;
-                __m128i b00, b01, b02, b10, b12, b20, b21, b22;
-
-                __m128i raA, raB, rbA, rbB;
-                __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00);
-                unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12);
-
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20);
-                unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21);
-                unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22);
-
-                // Sobel kernels for x and y direction.
-                //      1 0 -1       1 2 1
-                // sx = 2 0 -2 sy =  0 0 0
-                //      1 0 -1      -1-2-1
-                //      Note that neither kernel uses the center element)
-
-                // In the following, mullo is used to multiply intermediate
-                // results with -1 To divide by 3, 16bit overflow divide by
-                // multiply is used, which thus uses the upper 16bit(_mm_mulhi)
-                // of the 32bit temporary result.
-
-                // sx column kernel vectors (1,2,1)
-                // Two chained add/sub are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10),
-                                  a10),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10),
-                                  b10),
-                    one_ninth);
-
-                // sx column kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12),
-                                  a12),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12),
-                                  b12),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                sxa = _mm_mullo_epi16(tmpa, tmpa);
-                sxb = _mm_mullo_epi16(tmpb, tmpb);
-
-                // sy row kernel vector (1,2,1)
-                // Two chained add are used for 2 and -2
-                raA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01),
-                                  a01),
-                    one_ninth);
-                rbA = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01),
-                                  b01),
-                    one_ninth);
-
-                // sy row kernel vector (-1 -2 -1)
-                raB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21),
-                                  a21),
-                    one_ninth);
-                rbB = _mm_mulhi_epi16(
-                    _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21),
-                                  b21),
-                    one_ninth);
-
-                // Square of sx: Add squares of above temporaries into final sum
-                tmpa = _mm_sub_epi16(raA, raB);
-                tmpb = _mm_sub_epi16(rbA, rbB);
-
-                // watch out, can't overwrite this
-                sya = _mm_mullo_epi16(tmpa, tmpa);
-                syb = _mm_mullo_epi16(tmpb, tmpb);
-
-                __m128i zero = _mm_setzero_si128();
-
-                // The unpacklo is necessary because _mm_cmput_epi16 sets the
-                // output to 0xFFFF if the comparison is true. When packing
-                // 16bit to 8bit however, 0xFFFF will be interpreted (in a
-                // signed environment) as being negative, and hence set to 0,
-                // resulting in a 0 output everywhere. using unpacklo in between
-                // we get 0xFFFF->0xFF
-                pack16to8(
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres),
-                        zero),
-                    _mm_unpacklo_epi8(
-                        _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres),
-                        zero),
-                    res);
-
-                _mm_store_si128(dst0++, res);
-
-                row0 += 16;
-                row1 += 16;
-                row2 += 16;
-            }  // cols
-        }  // rows
-    };  // Lambda
-    sobelSSESegment(1, height - 3);
-#endif
-}
-
-/**
- * @brief Checks if the 128bits in xmm are all zero
- *
- * @param xmm
- *
- * @return true if all zeros, false otherwise
- */
-#ifdef _INTRINSICS_SSE
-inline bool isAllZeros(__m128i xmm) {
-    return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) ==
-           0xFFFF;
-}
-#endif
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Accelerated with SSE.
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param idx       The gradient indices. Only used if no intrincs are available
- *                  and the call gets forwarded to the naive implementation.
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- * @param numThreadsNumber of threads to use
- */
-void gpcFilter(uint8_t* in,
-               const uint8_t* grad,
-               uint32_t* gpc,
-               std::vector<int32_t> fastmask,
-               std::vector<int>& idx,
-               int width,
-               int height,
-               int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
-#else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
-
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i + 1]))),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
-                    }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-                }
-            }  // col iteration
-        }  // row iteration
-    };
-
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
-#endif
-}
-/**
- * @brief Applies a gpc filter defined by the pixel-difference tests in
- * fastmask. Additionally uses a threshold vector (tau)
- *
- * @param in        The input image.
- * @param grad      The gradient image, such that we can skip non-gradient
- * pixels
- * @param gpc       The output image of 32bit codes
- * @param fastmask  The fastmask containing the gpc filter
- * @param width     The width of the image at pointer *in
- * @param height    The height of the image at pointer *in
- * @param numThreads Number of threads to use
- */
-void gpcFilterTau(uint8_t* in,
-                  const uint8_t* grad,
-                  uint32_t* gpc,
-                  std::vector<int32_t> fastmask,
-                  std::vector<int> tau,
-                  std::vector<int>& idx,
-                  int width,
-                  int height,
-                  int numThreads) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
-#else
-    auto gpcFilterSegment = [&](int start, int end) {
-        __m128i zero = _mm_set1_epi8(0);
-        __m128i one = _mm_set1_epi8(1);
-        for (int y = start; y < end; y++) {
-            for (int x = 0; x < width; x += 16) {
-                uint8_t* rowPtr;
-                rowPtr = in + (y - 2) * width + x;
-                __m128i out[4];  // temporary output vector of 4 128bit words
-
-                const uint8_t* center = (in + y * width + x);
-                const uint8_t* centerGrad = (grad + y * width + x);
-                // We only process the current segment if there are any non-zero
-                // values (high gradient pixels)
-                if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
-                    __m128i* dst =
-                        (__m128i*)(gpc + y * width +
-                                   x);  // Set starting point to pixel (2,2)
-                    out[0] = zero;
-                    out[1] = zero;
-                    out[2] = zero;
-                    out[3] = zero;
-                    uint8_t k = 0;
-                    __m128i bitMask = one;
-                    for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
-                        out[k] |= _mm_and_si128(
-                            _mm_cmpgt_epu8(
-                                _mm_lddqu_si128(
-                                    (__m128i*)(center + fastmask[i])),
-                                _mm_subs_epi8(
-                                    _mm_lddqu_si128(
-                                        (__m128i*)(center + fastmask[i + 1])),
-                                    _mm_set1_epi8(tau[i / 2]))  // deduct tau
-                                ),
-                            bitMask);
-                        // Keeps index into output vector and updates bit mask
-                        if (i % 16 == 0 && i != 0) {
-                            bitMask = one;
-                            k++;
-                        } else {
-                            bitMask += bitMask;
-                        }
-                    }
-                    // 8bit to 16bit
-                    __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
-                    __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
-                    __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
-                    __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
-
-                    // 16bit to 32bit ints
-                    _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-                    _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-                    _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-                }
-            }  // col iteration
-        }  // row iteration
-    };
-
-    if (numThreads == 1)
-        gpcFilterSegment(13, height - 15);
-    else
-        parFor(gpcFilterSegment, 13, height - 15, 4);
-#endif
-}
-/**
- * @brief Naive version of 5x5 census transoform
- *
- * @param in      Input image
- * @param census  32bit census transform output
- * @param width   Width of the image at *in pointer
- * @param height  Heiht of the image at *in pointer
- */
-void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
-    uint32_t val;
-    uint32_t* dst;
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x++) {
-            val = 0;
-            dst = census + y * width + x;
-            int i = 0;
-            // patch loops
-            for (int px = -2; px <= 2; px++) {
-                for (int py = -2; py <= 2; py++) {
-                    if (!(px == 0 && py == 0)) {
-                        val |= (in[(y + py) * width + (x + px)] >
-                                in[y * width + x])
-                                   ? (1 << i)
-                                   : 0;
-                        i++;
-                    }
-                }
-            }  // End patch loops
-            *dst = val;
-        }
-    }  // End pixel loops
-}
-
-/**
- * @brief 5x5 dense census transform of input image. binary codes are returned
- * as a 32bit image
- *
- * @param in
- * @param census
- * @param width
- * @param height
- */
-void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
-    assert(width % 16 == 0 && "width must be multiple of 16!");
-#ifndef _INTRINSICS_SSE
-    census5x5Naive(in, census, width, height);
-#else
-    __m128i zero = _mm_set1_epi8(0);
-    __m128i one = _mm_set1_epi8(1);
-
-    for (int y = 2; y < height - 3; y++) {
-        for (int x = 0; x < width; x += 16) {
-            uint8_t* rowPtr;
-            rowPtr = in + (y - 2) * width + x;
-            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
-            __m128i* dst = (__m128i*)(census + y * width +
-                                      x);  // Set starting point to pixel (2,2)
-            // row 0
-            __m128i bitMask = one;
-            __m128i byte1 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 1
-            rowPtr += width;
-            bitMask += bitMask;  // 32
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte1 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte2 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 2
-            rowPtr += width;
-            bitMask += bitMask;  // 4
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 8
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 3
-            rowPtr += width;
-            bitMask += bitMask;  // 64
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte2 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask = one;  // 1
-            __m128i byte3 = _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 2
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 4
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // row 4
-            rowPtr += width;
-            bitMask += bitMask;  // 8
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
-                bitMask);
-            bitMask += bitMask;  // 16
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
-                bitMask);
-            bitMask += bitMask;  // 32
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
-                bitMask);
-            bitMask += bitMask;  // 64
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
-                bitMask);
-            bitMask += bitMask;  // 128
-            byte3 |= _mm_and_si128(
-                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
-                bitMask);
-
-            // 8bit to 16bit
-            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
-            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
-            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
-            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
-
-            // 16bit to 32bit ints
-            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
-            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
-            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
-            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
-
-        }  // col iteration
-    }  // row iteration
-    // if(numThreads == 1)
-    // gpcFilterSegment(13,height-15);
-    // else
-    // parFor(gpcFilterSegment,13,height-15,4);
-
-#endif
-}  // census5x5
-}  // namespace ndb
-#endif
diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp
new file mode 100644
index 0000000..940eb33
--- /dev/null
+++ b/lib/gpc/forest.cpp
@@ -0,0 +1,380 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include <Eigen/Dense>
+// #include <arm_neon.h>
+#include <chrono>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+// GPC includes
+#include <unordered_map>
+
+#include "gpc/Feature.hpp"
+#include "gpc/SintelOpticalFlow.hpp"
+#include "gpc/SintelStereo.hpp"
+#include "gpc/buffer.hpp"
+#include "gpc/forest.hpp"
+#include "gpc/hashmatch.hpp"
+#include "gpc/kernels/box.hpp"
+#include "gpc/kernels/gpc.hpp"
+#include "gpc/kernels/sobel.hpp"
+#include "gpc/kernels/utils.hpp"
+
+namespace gpc {
+namespace inference {
+/**
+ * @brief Computes sparse matches on a pair of rectified and smoothed
+ * images. Here the src and tar images refer to the left and right images,
+ * respectively.
+ *
+ * @param src    Preprocessed source(left) image
+ * @param tar    Preprocessed target(right) image
+ * @param fastmask    forest mask of relative integer offsets.
+ *
+ * @return
+ */
+std::vector<ndb::Correspondence> Forest::depthPriorFast(
+    PreprocessedImage& src,
+    PreprocessedImage& tar,
+    FilterMask& fastmask,
+    InferenceSettings& settings) {
+    std::chrono::high_resolution_clock::time_point t0, t1;
+    std::vector<ndb::Descriptor> statesSrc = evalFastMaskOnSubsetSSE(
+        src.smooth, src.grad, src.mask, fastmask, settings);
+    std::vector<ndb::Descriptor> statesTar = evalFastMaskOnSubsetSSE(
+        tar.smooth, tar.grad, tar.mask, fastmask, settings);
+    // Epipolar mode. Use upper 32bit of 64bit descriptor to store y
+    // coordinate
+    if (settings.epipolarMode_) {
+        for (auto& el : statesSrc) el.state |= uint64_t(el.point.y) << 32;
+        for (auto& el : statesTar) el.state |= uint64_t(el.point.y) << 32;
+    }
+    // Use sort method for matching
+    if (settings.useHashtable_ == false) {
+        t0 = sysTick();
+        std::vector<ndb::Correspondence> corr =
+            findCorrespondences(statesSrc, statesTar);
+        t1 = sysTick();
+        std::cout << "findCorrespondences (without allocation): "
+                  << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl;
+        std::cout << "length src: " << statesSrc.size() << std::endl;
+        return corr;
+    }
+    // Use hashtable matching
+    else {
+        for (auto& q : statesSrc) q.srcDescr = true;
+        for (auto& q : statesTar) q.srcDescr = false;
+
+        ndb::Hashmatch<ndb::Descriptor> hm(214673,
+                                           statesSrc.size() + statesTar.size());
+        std::vector<std::pair<ndb::Descriptor, ndb::Descriptor>> corr;
+        for (auto& q : statesSrc) hm.insert(q);
+        for (auto& q : statesTar) hm.insert(q);
+        hm.getDuplicates(corr);
+        // Store vertices in a format that is more convenient for us:
+        std::vector<ndb::Correspondence> corr2;
+        for (auto& e : corr) {
+            corr2.push_back(ndb::Correspondence(e.first.point, e.second.point));
+        }
+
+        return corr2;
+    }
+}
+std::vector<ndb::Correspondence> Forest::findCorrespondences(
+    std::vector<ndb::Descriptor>& srcStates,
+    std::vector<ndb::Descriptor>& tarStates) {
+    int numStates = std::min(srcStates.size(), tarStates.size());
+    // Limit search to rectified epipolar case.
+    std::sort(srcStates.begin(), srcStates.end());
+    std::sort(tarStates.begin(), tarStates.end());
+    std::vector<ndb::Correspondence> corr;
+    uint32_t j = 0;
+    for (uint32_t i = 0; i < srcStates.size(); ++i) {
+        bool unique = true;
+        while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1])
+            ++i, unique = false;
+
+        if (unique) {
+            for (; j < tarStates.size() - 1; ++j) {
+                if (!(tarStates[j] < srcStates[i])) break;
+            }
+
+            if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] &&
+                ((j + 1) == tarStates.size() - 1 ||
+                 !(tarStates[j] == tarStates[j + 1])))
+                corr.push_back(ndb::Correspondence(srcStates[i].point,
+                                                   tarStates[j].point));
+        }
+    }
+    return corr;
+}
+
+/**
+ * @brief Evaluates a given forest mask on an image and returns the
+ * descriptors
+ *
+ * @param img       The image
+ * @param grad      gradient image
+ * @param idx       offsets with high gradient pixels within the grad image
+ * @param fastmask  the forest mask
+ *
+ * @return
+ */
+std::vector<ndb::Descriptor> Forest::evalFastMaskOnSubsetSSE(
+    ndb::Buffer<uint8_t>& img,
+    ndb::Buffer<uint8_t>& grad,
+    std::vector<int>& idx,
+    FilterMask& fastmask,
+    InferenceSettings& settings) {
+    std::chrono::high_resolution_clock::time_point t0, t1;
+
+    // output buffer of same size
+    ndb::Buffer<uint32_t> gpcstates(img.rows(), img.cols(), 0);
+    if (fastmask.type == 0) {
+        ndb::gpcFilter(img.data(),
+                       grad.data(),
+                       gpcstates.data(),
+                       fastmask.mask,
+                       idx,
+                       img.cols(),
+                       img.rows());
+    } else {
+        ndb::gpcFilterTau(img.data(),
+                          grad.data(),
+                          gpcstates.data(),
+                          fastmask.mask,
+                          fastmask.tau,
+                          idx,
+                          img.cols(),
+                          img.rows());
+    }
+    std::vector<ndb::Descriptor> out(idx.size());
+    int j = 0;
+
+    for (auto k : idx) {
+        int x = k % img.cols();
+        int y = k / img.cols();
+        out[j] = ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]);
+        j++;
+    }
+    return out;
+}
+
+/**
+ * @brief Preprocesses an image. (smooth, binary sobel image and gradient
+ * pixel indices)
+ *
+ * @param img     The raw input image to be preprocessed
+ * @param InferenceSettings inference settings struct
+ *
+ * @return the preprocessed image
+ */
+PreprocessedImage Forest::preprocessImage(ndb::Buffer<uint8_t>& img,
+                                          InferenceSettings settings) {
+    assert((settings.gradientThreshold_ >= 0 &&
+            settings.gradientThreshold_ <= 255) &&
+           "gradientThreshold needs to be within 0...255");
+
+    ndb::Buffer<uint8_t> smooth(img.rows(), img.cols());
+
+    smooth.width = img.width;
+    // 0.2ms
+    ndb::box(img.data(),
+             smooth.data(),
+             img.cols(),
+             img.rows(),
+             settings.numThreads_);
+    // 4.2 *10^-5 ms
+    smooth.clearBoundary();
+    ndb::Buffer<uint8_t> grad(img.rows(), img.cols());
+    grad.width = img.width;
+    // 4.2*10-5ms (unclear how)
+    ndb::sobel(img.data(),
+               grad.data(),
+               img.cols(),
+               img.rows(),
+               settings.gradientThreshold_,
+               settings.numThreads_);
+    gpc::inference::time_point t0 = gpc::inference::sysTick();
+    ndb::Buffer<int> idx;
+    idx.resize(grad.rows(), grad.cols());
+    auto ff = [&](ndb::Buffer<int>& in, std::vector<int>& out, int m) {
+        for (int i = 0; i < m; i++) {
+            int x = in.data()[i] % grad.cols();
+            int y = in.data()[i] / grad.cols();
+            if (y >= 13 && y < grad.rows() - 13 && x >= 13 &&
+                x < grad.cols() - 13)
+                out.push_back(in.data()[i]);
+        }
+    };
+    int m;
+    // mask indexing gradient pixels
+    std::vector<int> mask;
+    ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m);
+    ff(idx, mask, m);
+
+    gpc::inference::time_point t1 = gpc::inference::sysTick();
+    // Our outputs are: smooth, grad, mask;
+    return PreprocessedImage(smooth, grad, mask);
+}
+/**
+ * @brief Finds matches between two stereo images based on a given forest
+ * mask.
+ *
+ * @param simg              source image (assumed to be the left image)
+ * @param timg              target image (assumed to be the right image)
+ * @param forestmask        forest mask, provided by readForest method
+ * @param InferenceSettings inference settings struct
+ * @return                  Set of correspondences (ptSrc, ptTar) where
+ * ptSrc and ptTar are points in the source and target images, respectively.
+ */
+std::vector<ndb::Correspondence> Forest::stereoMatch(
+    PreprocessedImage& simg,
+    PreprocessedImage& timg,
+    FilterMask& forestmask,
+    InferenceSettings settings) {
+    // make sure the delivered mask matches the image dimensions
+    assert((forestmask.width == simg.smooth.cols() &&
+            forestmask.height == simg.smooth.rows()) &&
+           "Source Image: dimension does not fit dimension of supplied forest "
+           "mask");
+    assert((forestmask.width == timg.smooth.cols() &&
+            forestmask.height == simg.smooth.rows()) &&
+           "Targe Image: dimension does not fit dimension of supplied forest "
+           "mask");
+    bool m_debug = false;
+    // Match
+    std::vector<ndb::Correspondence> corr =
+        depthPriorFast(simg, timg, forestmask, settings);
+
+    return corr;
+}
+
+/**
+ * @brief                   Returns support (set of x,y coordinates and
+ * disparity) of a pair of images that have been rectified.
+ *
+ * @@param simg             source image (assumed to be the left image)
+ * @param timg              target image (assumed to be the right image)
+ * @param forestmask        forest mask, provided by readForest method
+ * @param InferenceSettings inference settings struct
+ *                          In practice, values between 5...20 produce good
+ * results.
+ *
+ * @return                  Set of supports (x,y,d) with x,y the coordinate
+ * of a point in the left image and d the disparity.
+ */
+std::vector<ndb::Support> Forest::rectifiedMatch(PreprocessedImage& simg,
+                                                 PreprocessedImage& timg,
+                                                 FilterMask& forestmask,
+                                                 InferenceSettings settings) {
+    // Do matching
+    std::vector<ndb::Correspondence> corr =
+        stereoMatch(simg, timg, forestmask, settings);
+    // Filter epipolar matches
+    std::vector<ndb::Support> supp;
+    for (auto& e : corr) {
+        // epipolar constraint
+        if (std::abs(e.srcPt.y - e.tarPt.y) <= settings.verticalTolerance_
+            // disparity filter
+            && std::abs(e.srcPt.x - e.tarPt.x) <= settings.dispHigh_)
+            supp.push_back(
+                ndb::Support(e.srcPt.x, e.srcPt.y, e.srcPt.x - e.tarPt.x));
+    }
+    return supp;
+}
+
+/**
+ * @brief Reads text-based forest format and returns a mask for a given
+ * image size.
+ *
+ * @param path    Path to the file that contains the forest.
+ * @param width   16-Byte aligned width of the image in pixels
+ * @param height  height of the image in pixels
+ *
+ * @return
+ */
+FilterMask Forest::readForest(std::string path, int width, int height) {
+    std::ifstream ff(path);
+
+    std::vector<int32_t> fastmask;
+    std::vector<int> taus;
+    if (ff.fail()) {
+        cout << "Error opening forest file" << endl;
+        return FilterMask(fastmask, width, height, 0);
+    }
+    int numNonZeroTau = 0;
+    int numFerns;
+    int type;
+    ff >> numFerns;
+    cout << "number of ferns:" << numFerns << endl;
+    for (int i = 0; i < numFerns; i++) {
+        int fernID, numTests;
+        std::string fernScale;
+        ff >> fernID >> fernScale >> numTests;
+        for (int j = 0; j < numTests; j++) {
+            int levelID, ix, iy, jx, jy, tau;
+            ff >> levelID >> ix >> iy >> jx >> jy >> tau;
+            // Limit mask size to 32 binary tests
+            if (fastmask.size() < 64 && taus.size() < 32) {
+                fastmask.push_back(ix + iy * width);
+                fastmask.push_back(jx + jy * width);
+                taus.push_back(tau);
+            } else {
+                cout << "Note: A maximum of 32 fern features are allowed, "
+                        "discarding "
+                        "remainder of forest."
+                     << endl;
+            }
+            if (tau != 0) numNonZeroTau++;
+        }
+    }
+    if (numNonZeroTau == 0) {
+        type = 0;  // We have a zero forest (all tau=0)
+        FilterMask fm(fastmask, width, height, type);
+        return fm;
+    } else {
+        type = 1;  // We have a tau forest (some tau!=0)
+        FilterMask fm(fastmask, taus, width, height, type);
+        return fm;
+    }
+}
+
+}  // namespace inference
+}  // namespace gpc
diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp
new file mode 100644
index 0000000..6226b83
--- /dev/null
+++ b/lib/gpc/forest.hpp
@@ -0,0 +1,280 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements and extends the method proposed in
+// The Global Patch Collider
+// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet
+// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#ifndef _GPC_inference
+#define _GPC_inference
+#include <Eigen/Dense>
+#include <chrono>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+// GPC includes
+#include "gpc/Feature.hpp"
+#include "gpc/SintelOpticalFlow.hpp"
+#include "gpc/SintelStereo.hpp"
+#include "gpc/buffer.hpp"
+#include "gpc/hashmatch.hpp"
+
+/**
+ * @brief      The inference class of the GPC forest
+ *
+ */
+namespace gpc {
+namespace inference {
+typedef typename std::chrono::high_resolution_clock::time_point time_point;
+inline std::chrono::high_resolution_clock::time_point sysTick() {
+    return std::chrono::high_resolution_clock::now();
+}
+inline float tickToMs(std::chrono::high_resolution_clock::time_point t0,
+                      std::chrono::high_resolution_clock::time_point t1) {
+    return std::abs(
+        1000. *
+        std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0)
+            .count());
+}
+struct InferenceSettings {
+    // Threshold to be used for edge detection. Can be 0...255.
+    // In practice, values between 5...20 produce good results.  uint8_t
+    // gradientThreshold;
+    uint8_t gradientThreshold_ = 10;
+    // upper absolute limit for disparity in pixels. The lower (implied) limit
+    // is
+    // 0
+    int dispHigh_ = 128;
+    // vertical deviation tolerance in pixels for corresponding features in
+    // rectified stereo images.
+    int verticalTolerance_ = 1;
+    // Whether to use epipolar mode on matching or not.
+    bool epipolarMode_ = false;
+    // Use hashtable to match extracted descriptors. Usually only faster with a
+    // large number of descriptors (> 100k) or when using multiple threads. Note
+    // that the hashtable method does not return a slightly reduced amount of
+    // matches as a result of the hash table implementation (small bucket size)
+    // if false, the descriptors are sorted and matched by iterating
+    // alternatingly through both sets.
+    bool useHashtable_ = false;
+
+    // Number of threads to use for inference
+    int numThreads_ = 1;
+
+    // Default contructor defaults to using a single thread
+    InferenceSettings(uint8_t gradientThreshold,
+                      int dispHigh,
+                      int verticalTolerance,
+                      bool epipolarMode,
+                      bool useHashtable,
+                      int numThreads)
+        : gradientThreshold_(gradientThreshold),
+          dispHigh_(dispHigh),
+          verticalTolerance_(verticalTolerance),
+          epipolarMode_(epipolarMode),
+          useHashtable_(useHashtable),
+          numThreads_(numThreads) {}
+
+    InferenceSettings() {}
+    InferenceSettings& builder(void) { return *this; }
+    InferenceSettings& gradientThreshold(uint8_t gradientThreshold) {
+        this->gradientThreshold_ = gradientThreshold;
+        return *this;
+    }
+    InferenceSettings& dispHigh(int dispHigh) {
+        this->dispHigh_ = dispHigh;
+        return *this;
+    }
+    InferenceSettings& verticalTolerance(int verticalTolerance) {
+        this->verticalTolerance_ = verticalTolerance;
+        return *this;
+    }
+    InferenceSettings& epipolarMode(bool epipolarMode) {
+        this->epipolarMode_ = epipolarMode;
+        return *this;
+    }
+    InferenceSettings& useHashtable(bool useHashtable) {
+        this->useHashtable_ = useHashtable;
+        return *this;
+    }
+    InferenceSettings& numThreads(int numThreads) {
+        if (numThreads > std::thread::hardware_concurrency())
+            this->numThreads_ = std::thread::hardware_concurrency();
+        else
+            this->numThreads_ = numThreads;
+        return *this;
+    }
+};
+/**
+ * @brief FilterMask object that is returned by the forest reader
+ */
+struct FilterMask {
+    std::vector<int32_t> mask;
+    std::vector<int> tau;
+    int width;
+    int height;
+    int type;
+    FilterMask(std::vector<int32_t> mask, int width, int height, int type) {
+        this->mask = mask;
+        this->width = width;
+        this->height = height;
+        this->type = type;
+    }
+    FilterMask(std::vector<int32_t> mask,
+               std::vector<int> tau,
+               int width,
+               int height,
+               int type) {
+        this->mask = mask;
+        this->tau = tau;
+        this->width = width;
+        this->height = height;
+        this->type = type;
+    }
+};
+struct PreprocessedImage {
+    ndb::Buffer<uint8_t> smooth;
+    ndb::Buffer<uint8_t> grad;
+    std::vector<int> mask;
+    PreprocessedImage(ndb::Buffer<uint8_t>& smooth,
+                      ndb::Buffer<uint8_t>& grad,
+                      std::vector<int>& mask)
+        : smooth(smooth), grad(grad), mask(mask) {};
+};
+
+enum CorrMethod { sorting = 's', hashtable = 'h' };
+struct MatchStats {
+    double prec, rec, timeProp, timeMatch;
+    int numInlier, numStates, numMatches;
+};
+class Forest {
+   public:
+    /**
+     * @brief Computes sparse matches on a pair of rectified and smoothed
+     * images. Here the src and tar images refer to the left and right images,
+     * respectively.
+     *
+     * @param src    Preprocessed source(left) image
+     * @param tar    Preprocessed target(right) image
+     * @param fastmask    forest mask of relative integer offsets.
+     *
+     * @return
+     */
+    std::vector<ndb::Correspondence> depthPriorFast(
+        PreprocessedImage& src,
+        PreprocessedImage& tar,
+        FilterMask& fastmask,
+        InferenceSettings& settings);
+    static std::vector<ndb::Correspondence> findCorrespondences(
+        std::vector<ndb::Descriptor>& srcStates,
+        std::vector<ndb::Descriptor>& tarStates);
+    /**
+     * @brief Evaluates a given forest mask on an image and returns the
+     * descriptors
+     *
+     * @param img       The image
+     * @param grad      gradient image
+     * @param idx       offsets with high gradient pixels within the grad image
+     * @param fastmask  the forest mask
+     *
+     * @return
+     */
+    std::vector<ndb::Descriptor> evalFastMaskOnSubsetSSE(
+        ndb::Buffer<uint8_t>& img,
+        ndb::Buffer<uint8_t>& grad,
+        std::vector<int>& idx,
+        FilterMask& fastmask,
+        InferenceSettings& settings);
+
+    /**
+     * @brief Preprocesses an image. (smooth, binary sobel image and gradient
+     * pixel indices)
+     *
+     * @param img     The raw input image to be preprocessed
+     * @param InferenceSettings inference settings struct
+     *
+     * @return the preprocessed image
+     */
+    PreprocessedImage preprocessImage(ndb::Buffer<uint8_t>& img,
+                                      InferenceSettings settings);
+    /**
+     * @brief Finds matches between two stereo images based on a given forest
+     * mask.
+     *
+     * @param simg              source image (assumed to be the left image)
+     * @param timg              target image (assumed to be the right image)
+     * @param forestmask        forest mask, provided by readForest method
+     * @param InferenceSettings inference settings struct
+     * @return                  Set of correspondences (ptSrc, ptTar) where
+     * ptSrc and ptTar are points in the source and target images, respectively.
+     */
+    std::vector<ndb::Correspondence> stereoMatch(PreprocessedImage& simg,
+                                                 PreprocessedImage& timg,
+                                                 FilterMask& forestmask,
+                                                 InferenceSettings settings);
+    /**
+     * @brief                   Returns support (set of x,y coordinates and
+     * disparity) of a pair of images that have been rectified.
+     *
+     * @@param simg             source image (assumed to be the left image)
+     * @param timg              target image (assumed to be the right image)
+     * @param forestmask        forest mask, provided by readForest method
+     * @param InferenceSettings inference settings struct
+     *                          In practice, values between 5...20 produce good
+     * results.
+     *
+     * @return                  Set of supports (x,y,d) with x,y the coordinate
+     * of a point in the left image and d the disparity.
+     */
+    std::vector<ndb::Support> rectifiedMatch(PreprocessedImage& simg,
+                                             PreprocessedImage& timg,
+                                             FilterMask& forestmask,
+                                             InferenceSettings settings);
+
+    /**
+     * @brief Reads text-based forest format and returns a mask for a given
+     * image size.
+     *
+     * @param path    Path to the file that contains the forest.
+     * @param width   16-Byte aligned width of the image in pixels
+     * @param height  height of the image in pixels
+     *
+     * @return
+     */
+    FilterMask readForest(std::string path, int width, int height);
+};  // forest class
+}  // namespace inference
+}  // namespace gpc
+
+#endif
diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp
index e1a887a..df1c840 100644
--- a/lib/gpc/inference.hpp
+++ b/lib/gpc/inference.hpp
@@ -48,8 +48,11 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
 #include "gpc/hashmatch.hpp"
+#include "gpc/kernels/box.hpp"
+#include "gpc/kernels/gpc.hpp"
+#include "gpc/kernels/sobel.hpp"
+#include "gpc/kernels/utils.hpp"
 
 /**
  * @brief      The inference class of the GPC forest
@@ -295,8 +298,7 @@ class Forest {
                            fastmask.mask,
                            idx,
                            img.cols(),
-                           img.rows(),
-                           settings.numThreads_);
+                           img.rows());
         } else {
             ndb::gpcFilterTau(img.data(),
                               grad.data(),
@@ -305,8 +307,7 @@ class Forest {
                               fastmask.tau,
                               idx,
                               img.cols(),
-                              img.rows(),
-                              settings.numThreads_);
+                              img.rows());
         }
         std::vector<ndb::Descriptor> out(idx.size());
         int j = 0;
@@ -465,7 +466,6 @@ class Forest {
         int numFerns;
         int type;
         ff >> numFerns;
-        cout << "number of ferns:" << numFerns << endl;
         for (int i = 0; i < numFerns; i++) {
             int fernID, numTests;
             std::string fernScale;
diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp
new file mode 100644
index 0000000..5b51b60
--- /dev/null
+++ b/lib/gpc/kernels/box.cpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#include "gpc/kernels/box.hpp"
+
+#include <cassert>
+
+#include "gpc/kernels/utils.hpp"
+namespace ndb {
+namespace testing {
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
+}
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+    // allocate space for result
+    uint8_t* ptr = in;
+    uint8_t* p11 = ptr + 0 * width;
+    uint8_t* p12 = ptr + 0 * width + 1;
+    uint8_t* p13 = ptr + 0 * width + 2;
+
+    uint8_t* p21 = ptr + 1 * width;
+    uint8_t* p22 = ptr + 1 * width + 1;
+    uint8_t* p23 = ptr + 1 * width + 2;
+
+    uint8_t* p31 = ptr + 2 * width;
+    uint8_t* p32 = ptr + 2 * width + 1;
+    uint8_t* p33 = ptr + 2 * width + 2;
+    uint8_t* optr = blurred + 1 * width + 1;
+
+    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
+    // boundary) (unoptimized)
+    for (int iy = 1; iy < height - 1; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            int res =
+                (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) /
+                9;
+            *optr = res;
+            p11++;
+            p12++;
+            p13++;
+            p21++;
+            p22++;
+            p23++;
+            p31++;
+            p32++;
+            p33++;
+            optr++;
+        }
+    }
+}
+#if HWY_TARGET == HWY_AVX2
+/**
+ * @brief SSE implementation of the 3x3 box filter.
+ * Processed two rows at a time using fixed-point multiplication for division.
+ */
+#include <immintrin.h>
+void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) {
+    int start = 1;
+    int end = height - 3;
+
+    int x, y;
+    __m128i one_third = _mm_set1_epi16(21846);  // 2^16/3 + 1
+
+    __m128i* dst0 = (__m128i*)(blurred + width * start);
+    __m128i* dst1 = (__m128i*)(blurred + width * (start + 1));
+
+    for (y = start; y < end; y += 2) {
+        const uint8_t *row0, *row1, *row2, *row3;
+
+        row1 = in + y * width;
+        row0 = row1 - width;
+        row2 = row1 + width;
+        row3 = row2 + width;
+
+        for (x = 0; x < width; x += 16) {
+            __m128i s00, s01, s02;
+            __m128i ra00, ra01, ra02, rb00, rb01, rb02;
+            __m128i a00, a01, a02, b00, b01, b02;
+            __m128i tmp0, tmp1, res;
+
+            // Row 0 Processing
+            s00 = _mm_loadu_si128((__m128i*)(row0 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            s02 = _mm_load_si128((__m128i*)(row0));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Row 1 Processing
+            s00 = _mm_loadu_si128((__m128i*)(row1 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            s02 = _mm_load_si128((__m128i*)(row1));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra01 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb01 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Row 2 Processing
+            s00 = _mm_loadu_si128((__m128i*)(row2 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            s02 = _mm_load_si128((__m128i*)(row2));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra02 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb02 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Accumulate rows 0, 1, 2 for dst0
+            tmp0 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third);
+            tmp1 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third);
+            pack16to8(tmp0, tmp1, res);
+            _mm_store_si128(dst0++, res);
+
+            // Row 3 Processing
+            s00 = _mm_loadu_si128((__m128i*)(row3 - 1));
+            s01 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            s02 = _mm_load_si128((__m128i*)(row3));
+            unpack8to16(s00, a00, b00);
+            unpack8to16(s01, a01, b01);
+            unpack8to16(s02, a02, b02);
+            ra00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third);
+            rb00 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third);
+
+            // Accumulate rows 1, 2, 3 for dst1
+            tmp0 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third);
+            tmp1 = _mm_mulhi_epi16(
+                _mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third);
+            pack16to8(tmp0, tmp1, res);
+            _mm_store_si128(dst1++, res);
+
+            row0 += 16;
+            row1 += 16;
+            row2 += 16;
+            row3 += 16;
+        }
+        dst0 += width / 16;
+        dst1 += width / 16;
+    }
+}
+#endif
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    testing::box_hwy(in, blurred, width, height);
+#else
+#if HWY_TARGET == HWY_AVX2
+    boxSSE(in, blurred, width, height);
+#else
+    boxNaive(in, blurred, width, height);
+#endif
+#endif
+}
+}  // namespace ndb
diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp
new file mode 100644
index 0000000..b00dc88
--- /dev/null
+++ b/lib/gpc/kernels/box.hpp
@@ -0,0 +1,68 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_BOX
+#define __NDB__KERNEL_BOX
+using namespace std;
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief      Naive 3x3 box filter implementation
+ *
+ * @param      in       input image
+ * @param      blurred  The blurred output image
+ * @param[in]  width    The width
+ * @param[in]  height   The height
+ * @param[in]  numThreads number of threads to use
+ */
+void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height);
+
+/**
+ * @brief      boxfilter using SSE2 instructions. Loosely based on
+ *             https://www.ignorantus.com/box_sse2/, published under
+ *             the https://creativecommons.org/publicdomain/zero/1.0/ licence.
+ *
+ * @param      in       input image
+ * @param      blurred  The blurred
+ * @param[in]  width    The width
+ * @param[in]  height   The height
+ * @param[in]  numThreads number of threads to use
+ */
+void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads);
+
+#if HWY_TARGET == HWY_AVX2
+void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height);
+#endif
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp
new file mode 100644
index 0000000..ba2f5dd
--- /dev/null
+++ b/lib/gpc/kernels/box_hwy.cpp
@@ -0,0 +1,111 @@
+#define HWY_TARGET HWY_NEON
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+void BoxKernel(const uint8_t* HWY_RESTRICT in,
+               uint8_t* HWY_RESTRICT blurred,
+               int width,
+               int height) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::Half<decltype(d8)> d8_h;
+    const hn::Rebind<uint16_t, decltype(d8_h)> d16;
+
+    const size_t N = hn::Lanes(d8);
+    const auto divisor = hn::Set(d16, (uint16_t)7282);
+
+    for (int y = 1; y < height - 2; y += 2) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        const uint8_t* r3 = in + (y + 2) * width;
+
+        uint8_t* out0 = blurred + y * width + 1;
+        uint8_t* out1 = blurred + (y + 1) * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            auto v00 = hn::LoadU(d8, r0 + x);
+            auto v01 = hn::LoadU(d8, r0 + x + 1);
+            auto v02 = hn::LoadU(d8, r0 + x + 2);
+            auto v10 = hn::LoadU(d8, r1 + x);
+            auto v11 = hn::LoadU(d8, r1 + x + 1);
+            auto v12 = hn::LoadU(d8, r1 + x + 2);
+            auto v20 = hn::LoadU(d8, r2 + x);
+            auto v21 = hn::LoadU(d8, r2 + x + 1);
+            auto v22 = hn::LoadU(d8, r2 + x + 2);
+            auto v30 = hn::LoadU(d8, r3 + x);
+            auto v31 = hn::LoadU(d8, r3 + x + 1);
+            auto v32 = hn::LoadU(d8, r3 + x + 2);
+
+            // Lower Half Math
+            auto s1_lo =
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v12))));
+            auto s2_lo =
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v22))));
+
+            auto row0_lo = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v02)))),
+                hn::Add(s1_lo, s2_lo));
+            auto row1_lo = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)),
+                        hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)),
+                                hn::PromoteTo(d16, hn::LowerHalf(v32)))),
+                hn::Add(s1_lo, s2_lo));
+
+            // Upper Half Math
+            auto s1_hi =
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12))));
+            auto s2_hi =
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22))));
+
+            auto row0_hi = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))),
+                hn::Add(s1_hi, s2_hi));
+            auto row1_hi = hn::Add(
+                hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)),
+                        hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)),
+                                hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))),
+                hn::Add(s1_hi, s2_hi));
+
+            hn::StoreU(hn::OrderedDemote2To(d8,
+                                            hn::MulHigh(row0_lo, divisor),
+                                            hn::MulHigh(row0_hi, divisor)),
+                       d8,
+                       out0 + x);
+            hn::StoreU(hn::OrderedDemote2To(d8,
+                                            hn::MulHigh(row1_lo, divisor),
+                                            hn::MulHigh(row1_hi, divisor)),
+                       d8,
+                       out1 + x);
+        }
+    }
+}
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {
+// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) {
+    // ndb::N_NEON::BoxKernel(in, blurred, width, height);
+    HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height);
+}
+// #endif
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp
new file mode 100644
index 0000000..ae39d71
--- /dev/null
+++ b/lib/gpc/kernels/box_hwy.hpp
@@ -0,0 +1,18 @@
+#ifndef __NDB__KERNEL_BOX_HWY
+#define __NDB__KERNEL_BOX_HWY
+
+#include <cstdint>
+
+namespace ndb {
+
+namespace testing {
+/**
+ * Entry point for benchmarking the MulHigh (approximate) version.
+ */
+void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height);
+
+}  // namespace testing
+
+}  // namespace ndb
+
+#endif  // GPC_KERNELS_BOX_HWY_H_
diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp
new file mode 100644
index 0000000..8b265cc
--- /dev/null
+++ b/lib/gpc/kernels/census.cpp
@@ -0,0 +1,203 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/kernels/census.hpp"
+
+#include <cassert>
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) {
+    uint32_t val;
+    uint32_t* dst;
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x++) {
+            val = 0;
+            dst = census + y * width + x;
+            int i = 0;
+            // patch loops
+            for (int px = -2; px <= 2; px++) {
+                for (int py = -2; py <= 2; py++) {
+                    if (!(px == 0 && py == 0)) {
+                        val |= (in[(y + py) * width + (x + px)] >
+                                in[y * width + x])
+                                   ? (1 << i)
+                                   : 0;
+                        i++;
+                    }
+                }
+            }  // End patch loops
+            *dst = val;
+        }
+    }  // End pixel loops
+}
+void census5x5(uint8_t* in, uint32_t* census, int width, int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#ifndef _INTRINSICS_SSE
+    census5x5Naive(in, census, width, height);
+#else
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+
+    for (int y = 2; y < height - 3; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x));
+            __m128i* dst = (__m128i*)(census + y * width +
+                                      x);  // Set starting point to pixel (2,2)
+            // row 0
+            __m128i bitMask = one;
+            __m128i byte1 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 1
+            rowPtr += width;
+            bitMask += bitMask;  // 32
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte1 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte2 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 2
+            rowPtr += width;
+            bitMask += bitMask;  // 4
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 8
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 3
+            rowPtr += width;
+            bitMask += bitMask;  // 64
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte2 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask = one;  // 1
+            __m128i byte3 = _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 2
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 4
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // row 4
+            rowPtr += width;
+            bitMask += bitMask;  // 8
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))),
+                bitMask);
+            bitMask += bitMask;  // 16
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))),
+                bitMask);
+            bitMask += bitMask;  // 32
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))),
+                bitMask);
+            bitMask += bitMask;  // 64
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))),
+                bitMask);
+            bitMask += bitMask;  // 128
+            byte3 |= _mm_and_si128(
+                _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))),
+                bitMask);
+
+            // 8bit to 16bit
+            __m128i high1 = _mm_unpacklo_epi8(byte3, zero);
+            __m128i high2 = _mm_unpackhi_epi8(byte3, zero);
+            __m128i low1 = _mm_unpacklo_epi8(byte1, byte2);
+            __m128i low2 = _mm_unpackhi_epi8(byte1, byte2);
+
+            // 16bit to 32bit ints
+            _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+            _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+            _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+            _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+
+        }  // col iteration
+    }  // row iteration
+    // if(numThreads == 1)
+    // gpcFilterSegment(13,height-15);
+    // else
+    // parFor(gpcFilterSegment,13,height-15,4);
+
+#endif
+}  // census5x5
diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp
new file mode 100644
index 0000000..054a45f
--- /dev/null
+++ b/lib/gpc/kernels/census.hpp
@@ -0,0 +1,60 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_CENSUS
+#define __NDB__KERNEL_CENSUS
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief Naive version of 5x5 census transoform
+ *
+ * @param in      Input image
+ * @param census  32bit census transform output
+ * @param width   Width of the image at *in pointer
+ * @param height  Heiht of the image at *in pointer
+ */
+void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height);
+
+/**
+ * @brief 5x5 dense census transform of input image. binary codes are returned
+ * as a 32bit image
+ *
+ * @param in
+ * @param census
+ * @param width
+ * @param height
+ */
+void census5x5(uint8_t* in, uint32_t* census, int width, int height);
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp
new file mode 100644
index 0000000..7bdf97b
--- /dev/null
+++ b/lib/gpc/kernels/gpc.cpp
@@ -0,0 +1,257 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/kernels/gpc.hpp"
+
+#include <cassert>
+namespace ndb {
+void gpcFilterNaive(uint8_t* in,
+                    const uint8_t* grad,
+                    uint32_t* gpc,
+                    std::vector<int32_t> fastmask,
+                    std::vector<int>& idx,
+                    int width,
+                    int height) {
+    // output buffer of same size
+    uint32_t tmp;
+
+    int j = 0;
+    for (auto k : idx) {
+        tmp = 0;
+        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
+            tmp <<= 1;  // shift by one
+            if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1]))
+                tmp++;  // set this test's result to 1
+        }
+        gpc[k] = tmp;
+        j++;
+    }
+}
+
+void gpcFilterTauNaive(uint8_t* in,
+                       const uint8_t* grad,
+                       uint32_t* gpc,
+                       std::vector<int32_t> fastmask,
+                       std::vector<int> tau,
+                       std::vector<int>& idx,
+                       int width,
+                       int height) {
+    uint32_t tmp;
+
+    int j = 0;
+    for (auto k : idx) {
+        tmp = 0;
+        for (uint8_t i = 0; i < fastmask.size(); i += 2) {
+            tmp <<= 1;  // shift by one
+            if (*(in + k + fastmask[i]) >
+                *(in + k + fastmask[i + 1]) - tau[i / 2])
+                tmp++;  // set this test's result to 1
+        }
+        gpc[k] = tmp;
+        j++;
+    }
+}
+
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+bool isAllZeros(__m128i xmm) {
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) ==
+           0xFFFF;
+}
+void gpcFilterSSE(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int>& idx,
+                  int width,
+                  int height) {
+    const int start = 13;
+    const int end = height - 15;
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+    for (int y = start; y < end; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i out[4];  // temporary output vector of 4 128bit words
+
+            const uint8_t* center = (in + y * width + x);
+            const uint8_t* centerGrad = (grad + y * width + x);
+            // We only process the current segment if there are any non-zero
+            // values (high gradient pixels)
+            if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                __m128i* dst =
+                    (__m128i*)(gpc + y * width +
+                               x);  // Set starting point to pixel (2,2)
+                out[0] = zero;
+                out[1] = zero;
+                out[2] = zero;
+                out[3] = zero;
+                uint8_t k = 0;
+                __m128i bitMask = one;
+                for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                    out[k] |= _mm_and_si128(
+                        _mm_cmpgt_epu8(
+                            _mm_lddqu_si128((__m128i*)(center + fastmask[i])),
+                            _mm_lddqu_si128(
+                                (__m128i*)(center + fastmask[i + 1]))),
+                        bitMask);
+                    // Keeps index into output vector and updates bit mask
+                    if (i % 16 == 0 && i != 0) {
+                        bitMask = one;
+                        k++;
+                    } else {
+                        bitMask += bitMask;
+                    }
+                }
+                // 8bit to 16bit
+                __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
+
+                // 16bit to 32bit ints
+                _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+            }
+        }  // col iteration
+    }  // row iteration
+}
+#endif
+void gpcFilter(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Replace with call to highway
+    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
+#else
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+    gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height);
+#else
+    gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height);
+#endif
+#endif
+}
+
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+void gpcFilterTauSSE(uint8_t* in,
+                     const uint8_t* grad,
+                     uint32_t* gpc,
+                     std::vector<int32_t> fastmask,
+                     std::vector<int> tau,
+                     std::vector<int>& idx,
+                     int width,
+                     int height) {
+    const int start = 13;
+    const int end = height - 15;
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i one = _mm_set1_epi8(1);
+    for (int y = start; y < end; y++) {
+        for (int x = 0; x < width; x += 16) {
+            uint8_t* rowPtr;
+            rowPtr = in + (y - 2) * width + x;
+            __m128i out[4];  // temporary output vector of 4 128bit words
+
+            const uint8_t* center = (in + y * width + x);
+            const uint8_t* centerGrad = (grad + y * width + x);
+            // We only process the current segment if there are any non-zero
+            // values (high gradient pixels)
+            if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) {
+                __m128i* dst =
+                    (__m128i*)(gpc + y * width +
+                               x);  // Set starting point to pixel (2,2)
+                out[0] = zero;
+                out[1] = zero;
+                out[2] = zero;
+                out[3] = zero;
+                uint8_t k = 0;
+                __m128i bitMask = one;
+                for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) {
+                    out[k] |= _mm_and_si128(
+                        _mm_cmpgt_epu8(
+                            _mm_lddqu_si128((__m128i*)(center + fastmask[i])),
+                            _mm_subs_epi8(
+                                _mm_lddqu_si128(
+                                    (__m128i*)(center + fastmask[i + 1])),
+                                _mm_set1_epi8(tau[i / 2]))  // deduct tau
+                            ),
+                        bitMask);
+                    // Keeps index into output vector and updates bit mask
+                    if (i % 16 == 0 && i != 0) {
+                        bitMask = one;
+                        k++;
+                    } else {
+                        bitMask += bitMask;
+                    }
+                }
+                // 8bit to 16bit
+                __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]);
+                __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]);
+                __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]);
+                __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]);
+
+                // 16bit to 32bit ints
+                _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1));
+                _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1));
+                _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2));
+                _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2));
+            }
+        }  // col iteration
+    }  // row iteration
+}
+#endif
+
+void gpcFilterTau(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    // Replace with call to highway
+    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
+#else
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+    gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height);
+#else
+    gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height);
+#endif
+#endif
+}
+}  // namespace ndb
diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp
new file mode 100644
index 0000000..bdd4bf4
--- /dev/null
+++ b/lib/gpc/kernels/gpc.hpp
@@ -0,0 +1,151 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_GPC
+#define __NDB__KERNEL_GPC
+using namespace std;
+
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+/**
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Accelerated with SSE.
+ *
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param idx       The gradient indices. Only used if no intrincs are available
+ *                  and the call gets forwarded to the naive implementation.
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ */
+void gpcFilter(uint8_t* in,
+               const uint8_t* grad,
+               uint32_t* gpc,
+               std::vector<int32_t> fastmask,
+               std::vector<int>& idx,
+               int width,
+               int height);
+
+/**
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Naive implementation
+ *
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param idx       The gradient indices. Only used if no intrincs are available
+ *                  and the call gets forwarded to the naive implementation.
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ */
+void gpcFilterNaive(uint8_t* in,
+                    const uint8_t* grad,
+                    uint32_t* gpc,
+                    std::vector<int32_t> fastmask,
+                    std::vector<int>& idx,
+                    int width,
+                    int height);
+/**
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Additionally uses a threshold vector (tau)
+ *
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ */
+void gpcFilterTau(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int> tau,
+                  std::vector<int>& idx,
+                  int width,
+                  int height);
+
+/**
+ * @brief Applies a gpc filter defined by the pixel-difference tests in
+ * fastmask. Additionally uses a threshold vector (tau) Naive implementation.
+ *
+ * @param in        The input image.
+ * @param grad      The gradient image, such that we can skip non-gradient
+ * pixels
+ * @param gpc       The output image of 32bit codes
+ * @param fastmask  The fastmask containing the gpc filter
+ * @param width     The width of the image at pointer *in
+ * @param height    The height of the image at pointer *in
+ */
+void gpcFilterTauNaive(uint8_t* in,
+                       const uint8_t* grad,
+                       uint32_t* gpc,
+                       std::vector<int32_t> fastmask,
+                       std::vector<int> tau,
+                       std::vector<int>& idx,
+                       int width,
+                       int height);
+/**
+ * @brief Checks if the 128bits in xmm are all zero
+ *
+ * @param xmm
+ *
+ * @return true if all zeros, false otherwise
+ */
+#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2)
+bool isAllZeros(__m128i xmm);
+void gpcFilterTauSSE(uint8_t* in,
+                     const uint8_t* grad,
+                     uint32_t* gpc,
+                     std::vector<int32_t> fastmask,
+                     std::vector<int> tau,
+                     std::vector<int>& idx,
+                     int width,
+                     int height);
+void gpcFilterSSE(uint8_t* in,
+                  const uint8_t* grad,
+                  uint32_t* gpc,
+                  std::vector<int32_t> fastmask,
+                  std::vector<int>& idx,
+                  int width,
+                  int height);
+
+#endif
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp
new file mode 100644
index 0000000..d84710a
--- /dev/null
+++ b/lib/gpc/kernels/gpc_hwy.cpp
@@ -0,0 +1,158 @@
+// #define HWY_TARGET HWY_NEON
+#include "gpc_hwy.hpp"
+HWY_BEFORE_NAMESPACE();
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+// dense!
+#include <hwy/highway.h>
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Dense Version
+void GPCKernel(const uint8_t* HWY_RESTRICT in,
+               const uint8_t* HWY_RESTRICT grad,
+               uint32_t* HWY_RESTRICT gpc,
+               const std::vector<int32_t>& fastmask,
+               const std::vector<int32_t>& tau,
+               int width,
+               int height) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint32_t> d32;
+    const size_t N = hn::Lanes(d8);
+
+    const int border = 13;
+    const auto v_zero8 = hn::Zero(d8);
+    const auto v_one8 = hn::Set(d8, 1);
+    const int32_t* fm = fastmask.data();
+
+    for (int y = border; y < height - border; ++y) {
+        const int row_base = y * width;
+        uint32_t* HWY_RESTRICT row_out = gpc + row_base;
+
+        for (int x = border; x <= width - border - (int)N; x += N) {
+            const int k = row_base + x;
+
+            auto v_acc0 = hn::Zero(d8);  // Bits 0-7
+            auto v_acc1 = hn::Zero(d8);  // Bits 8-15
+            auto v_acc2 = hn::Zero(d8);  // Bits 16-23
+            auto v_acc3 = hn::Zero(d8);  // Bits 24-31
+
+            // Pass 1: Bits 0-7
+            for (int i = 0; i < 16; i += 2) {
+                v_acc0 = hn::Add(v_acc0, v_acc0);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
+                v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+
+            // Pass 2: Bits 8-15
+            for (int i = 16; i < 32; i += 2) {
+                v_acc1 = hn::Add(v_acc1, v_acc1);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
+                v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+
+            // Pass 3: Bits 16-23
+            for (int i = 32; i < 48; i += 2) {
+                v_acc2 = hn::Add(v_acc2, v_acc2);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
+                v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+
+            // Pass 4: Bits 24-31
+            for (int i = 48; i < 64; i += 2) {
+                v_acc3 = hn::Add(v_acc3, v_acc3);
+                auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]),
+                                   hn::LoadU(d8, in + k + fm[i + 1]));
+                v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8));
+            }
+
+            //extract and combine:
+            for (size_t lane = 0; lane < N; ++lane) {
+                uint32_t final_val =
+                    (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) |
+                    (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) |
+                    (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) |
+                    (uint32_t(hn::ExtractLane(v_acc3, lane)));
+                row_out[x + lane] = final_val;
+            }
+        }
+    }
+}
+void GPCKerneli(const uint8_t* HWY_RESTRICT in,
+                const uint8_t* HWY_RESTRICT grad,
+                uint32_t* HWY_RESTRICT gpc,
+                const std::vector<int32_t>& fastmask,
+                const std::vector<int32_t>& tau,
+                int width,
+                int height) {
+    // We use the ScalableTag, but we will "Narrow" our view manually
+    const hn::ScalableTag<uint32_t> d32;
+    const hn::Rebind<uint8_t, decltype(d32)>
+        d8_n;  // Same number of lanes as d32
+
+    const size_t N = hn::Lanes(d32);
+    const auto v_zero = hn::Zero(d32);
+    const bool use_tau = !tau.empty();
+
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; x += N) {
+            const uint8_t* centerGrad = grad + y * width + x;
+
+            // Load the gradient bytes for the current N lanes
+            auto v_grad = hn::LoadU(d8_n, centerGrad);
+
+            // Promotion-free zero check
+            if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) {
+                continue;
+            }
+
+            auto v_tmp = hn::Zero(d32);
+
+            for (size_t i = 0; i < fastmask.size(); i += 2) {
+                v_tmp = hn::ShiftLeft<1>(v_tmp);
+
+                // Promote N lanes of uint8 to N lanes of uint32
+                auto v1 = hn::PromoteTo(
+                    d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i]));
+                auto v2 = hn::PromoteTo(
+                    d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1]));
+
+                hn::Mask<decltype(d32)> mask;
+                if (use_tau) {
+                    auto v_tau = hn::Set(d32, tau[i / 2]);
+                    mask = hn::Gt(v1, hn::Sub(v2, v_tau));
+                } else {
+                    mask = hn::Gt(v1, v2);
+                }
+
+                v_tmp = hn::Add(v_tmp,
+                                hn::IfThenElse(mask, hn::Set(d32, 1), v_zero));
+            }
+
+            hn::StoreU(v_tmp, d32, gpc + y * width + x);
+        }
+    }
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {
+void gpc_hwy(uint8_t* in,
+             uint8_t* grad,
+             uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau,
+             int width,
+             int height) {
+    HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height);
+}
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp
new file mode 100644
index 0000000..8a83751
--- /dev/null
+++ b/lib/gpc/kernels/gpc_hwy.hpp
@@ -0,0 +1,23 @@
+#ifndef __NDB__KERNEL_GPC_HWY
+#define __NDB__KERNEL_GPC_HWY
+
+#include <hwy/highway.h>
+
+#include <cstdint>
+
+namespace ndb {
+
+namespace testing {
+void gpc_hwy(uint8_t* in,
+             uint8_t* grad,
+             uint32_t* HWY_RESTRICT gpc,
+             const std::vector<int32_t>& fastmask,
+             const std::vector<int32_t>& tau,
+             int width,
+             int height);
+
+}
+
+}  // namespace ndb
+
+#endif  // GPC_KERNELS_SOBEL_HWY_H_
diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp
new file mode 100644
index 0000000..becaf50
--- /dev/null
+++ b/lib/gpc/kernels/sobel.cpp
@@ -0,0 +1,201 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/kernels/sobel.hpp"
+
+#include <cassert>
+
+#include "gpc/kernels/utils.hpp"
+namespace ndb {
+namespace testing {
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
+}
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+    int thresholdSq = threshold * threshold;
+    uint8_t* ptr = in;
+
+    uint8_t* p11 = ptr + 0 * width;
+    uint8_t* p12 = ptr + 0 * width + 1;
+    uint8_t* p13 = ptr + 0 * width + 2;
+
+    uint8_t* p21 = ptr + 1 * width;
+    uint8_t* p22 = ptr + 1 * width + 1;
+    uint8_t* p23 = ptr + 1 * width + 2;
+
+    uint8_t* p31 = ptr + 2 * width;
+    uint8_t* p32 = ptr + 2 * width + 1;
+    uint8_t* p33 = ptr + 2 * width + 2;
+
+    // output pointer
+    uint8_t* optr = gradient + 1 * width + 1;
+    // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating
+    // boundary) (unoptimized)
+    for (int iy = 1; iy < height - 1; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            // Approximate division by 9 with fixed-point multiplication (2^16/9
+            // = 7282)
+            int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33);
+            int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33);
+
+            int sx = (static_cast<int32_t>(sum_x) * 7282) >> 16;
+            int sy = (static_cast<int32_t>(sum_y) * 7282) >> 16;
+            int val = sx * sx + sy * sy;
+
+            *optr = val > thresholdSq ? 255 : 0;
+            p11++;
+            p12++;
+            p13++;
+            p21++;
+            p22++;
+            p23++;
+            p31++;
+            p32++;
+            p33++;
+            optr++;
+        }
+    }
+}
+// #ifdef _INTRINSICS_SSE
+#if HWY_TARGET == HWY_AVX2
+#include <immintrin.h>
+
+void sobelSSE(const uint8_t* in,
+              uint8_t* blurred,
+              int width,
+              int start,
+              int end,
+              uint8_t threshold) {
+    __m128i zero = _mm_setzero_si128();
+    __m128i one_ninth = _mm_set1_epi16(7282);  // 2^16/9
+    __m128i binThres = _mm_set1_epi16(threshold * threshold);
+
+    for (int y = start; y < end; y++) {
+        const uint8_t* row1 = in + y * width;
+        const uint8_t* row0 = row1 - width;
+        const uint8_t* row2 = row1 + width;
+
+        // Output destination for this specific row
+        __m128i* dst = (__m128i*)(blurred + y * width + 1);
+
+        for (int x = 0; x < width; x += 16) {
+            __m128i a00, a01, a02, a10, a12, a20, a21, a22;
+            __m128i b00, b01, b02, b10, b12, b20, b21, b22;
+            __m128i raA, raB, rbA, rbB;
+            __m128i tmpa, tmpb, sya, syb, sxa, sxb, res;
+
+            // Load and unpack 3x3 neighborhood (excluding center a11/b11)
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02);
+
+            unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12);
+
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21);
+            unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22);
+
+            // --- SX Calculation ---
+            // Left col (1,2,1)
+            raA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)),
+                one_ninth);
+            rbA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)),
+                one_ninth);
+            // Right col (-1,-2,-1)
+            raB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)),
+                one_ninth);
+            rbB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)),
+                one_ninth);
+
+            tmpa = _mm_sub_epi16(raA, raB);
+            tmpb = _mm_sub_epi16(rbA, rbB);
+            sxa = _mm_mullo_epi16(tmpa, tmpa);
+            sxb = _mm_mullo_epi16(tmpb, tmpb);
+
+            // --- SY Calculation ---
+            // Top row (1,2,1)
+            raA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)),
+                one_ninth);
+            rbA = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)),
+                one_ninth);
+            // Bottom row (-1,-2,-1)
+            raB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)),
+                one_ninth);
+            rbB = _mm_mulhi_epi16(
+                _mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)),
+                one_ninth);
+
+            tmpa = _mm_sub_epi16(raA, raB);
+            tmpb = _mm_sub_epi16(rbA, rbB);
+            sya = _mm_mullo_epi16(tmpa, tmpa);
+            syb = _mm_mullo_epi16(tmpb, tmpb);
+
+            // --- Thresholding and Packing ---
+            pack16to8(
+                _mm_unpacklo_epi8(
+                    _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero),
+                _mm_unpacklo_epi8(
+                    _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero),
+                res);
+
+            _mm_storeu_si128(dst++, res);
+        }
+    }
+}
+#endif
+void sobel(uint8_t* in,
+           uint8_t* blurred,
+           int width,
+           int height,
+           uint8_t threshold,
+           int numThreads) {
+    assert(width % 16 == 0 && "width must be multiple of 16!");
+#if defined(__ARM_NEON) || defined(__aarch64__)
+    sobelNaive(in, blurred, width, height, threshold);
+    // testing::sobel_hwy(in, blurred, width, height, threshold); // not exact!
+#else
+#ifndef _INTRINSICS_SSE
+    sobelNaive(in, blurred, width, height, threshold);
+#else
+    sobelSSE(in, blurred, width, 1, height - 1, threshold);
+#endif
+#endif
+}
+}  // namespace ndb
diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp
new file mode 100644
index 0000000..312a70c
--- /dev/null
+++ b/lib/gpc/kernels/sobel.hpp
@@ -0,0 +1,77 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+
+#ifndef __NDB__KERNEL_SOBEL
+#define __NDB__KERNEL_SOBEL
+#include "gpc/buffer.hpp"
+
+namespace ndb {
+#if HWY_TARGET == HWY_AVX2
+void sobelSSE(const uint8_t* in,
+              uint8_t* blurred,
+              int width,
+              int start,
+              int end,
+              uint8_t threshold);
+
+#endif
+/**
+ * @brief Naive 3x3 sobel filter implementation
+ *
+ * @param      in       input image
+ * @param      blurred  The blurred output image
+ * @param[in]  width    The width
+ * @param[in]  height   The height
+ * @param[in]  numThreads number of threads to use
+ * @param      threshold  threshold to binarize sobel filter output
+ */
+void sobelNaive(
+    uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold);
+
+/**
+ * @brief      3x3 Sobel filter. Input dimension must be multiple of 16
+ *
+ * @param      in         { parameter_description }
+ * @param      blurred    The blurred
+ * @param[in]  width      The width
+ * @param[in]  height     The height
+ * @param[in]  threshold  The threshold
+ * @param[in]  numThreads number of threads to use
+ */
+
+void sobel(uint8_t* in,
+           uint8_t* blurred,
+           int width,
+           int height,
+           uint8_t threshold,
+           int numThreads);
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp
new file mode 100644
index 0000000..493b52d
--- /dev/null
+++ b/lib/gpc/kernels/sobel_hwy.cpp
@@ -0,0 +1,125 @@
+// #define HWY_TARGET HWY_NEON
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace ndb {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in,
+                      uint8_t* HWY_RESTRICT gradient,
+                      int width,
+                      int height,
+                      uint8_t threshold) {
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::Half<decltype(d8)> d8_h;
+    const hn::Rebind<int16_t, decltype(d8_h)> d16;
+    // d32 has half the lanes of d16
+    const hn::Rebind<int32_t, hn::Half<decltype(d16)>> d32;
+
+    const size_t N = hn::Lanes(d8);
+    const auto vDivMult = hn::Set(d16, (int16_t)7282);
+    const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold);
+    const auto v255_16 = hn::Set(d16, (int16_t)255);
+    const auto v255_8 = hn::Set(d8, (uint8_t)255);
+    const auto v0_8 = hn::Zero(d8);
+
+    for (int y = 1; y < height - 1; ++y) {
+        const uint8_t* r0 = in + (y - 1) * width;
+        const uint8_t* r1 = in + y * width;
+        const uint8_t* r2 = in + (y + 1) * width;
+        uint8_t* out = gradient + y * width + 1;
+
+        for (int x = 0; x < width; x += N) {
+            auto v11 = hn::LoadU(d8, r0 + x);
+            auto v12 = hn::LoadU(d8, r0 + x + 1);
+            auto v13 = hn::LoadU(d8, r0 + x + 2);
+            auto v21 = hn::LoadU(d8, r1 + x);
+            auto v23 = hn::LoadU(d8, r1 + x + 2);
+            auto v31 = hn::LoadU(d8, r2 + x);
+            auto v32 = hn::LoadU(d8, r2 + x + 1);
+            auto v33 = hn::LoadU(d8, r2 + x + 2);
+
+            // Helper to process 8 pixels into a 16-bit mask-like result
+            auto process_half = [&](auto p11,
+                                    auto p12,
+                                    auto p13,
+                                    auto p21,
+                                    auto p23,
+                                    auto p31,
+                                    auto p32,
+                                    auto p33) {
+                // Sobel derivatives in 16-bit
+                auto sx16 = hn::MulHigh(
+                    hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)),
+                            hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))),
+                    vDivMult);
+                auto sy16 = hn::MulHigh(
+                    hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)),
+                            hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))),
+                    vDivMult);
+
+                // Magnitude squared in 32-bit
+                auto sx_lo = hn::PromoteLowerTo(d32, sx16);
+                auto sy_lo = hn::PromoteLowerTo(d32, sy16);
+                auto mag_lo =
+                    hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo));
+
+                auto sx_hi = hn::PromoteUpperTo(d32, sx16);
+                auto sy_hi = hn::PromoteUpperTo(d32, sy16);
+                auto mag_hi =
+                    hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi));
+
+                // Comparison in 32-bit, returning 16-bit values (0 or 255) to
+                // avoid mask issues
+                auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq),
+                                           hn::Set(d32, 255),
+                                           hn::Zero(d32));
+                auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq),
+                                           hn::Set(d32, 255),
+                                           hn::Zero(d32));
+
+                return hn::OrderedDemote2To(d16, m_lo, m_hi);
+            };
+
+            // Process halves using standard Highway promotion
+            auto res_lo = process_half(hn::PromoteLowerTo(d16, v11),
+                                       hn::PromoteLowerTo(d16, v12),
+                                       hn::PromoteLowerTo(d16, v13),
+                                       hn::PromoteLowerTo(d16, v21),
+                                       hn::PromoteLowerTo(d16, v23),
+                                       hn::PromoteLowerTo(d16, v31),
+                                       hn::PromoteLowerTo(d16, v32),
+                                       hn::PromoteLowerTo(d16, v33));
+
+            auto res_hi = process_half(hn::PromoteUpperTo(d16, v11),
+                                       hn::PromoteUpperTo(d16, v12),
+                                       hn::PromoteUpperTo(d16, v13),
+                                       hn::PromoteUpperTo(d16, v21),
+                                       hn::PromoteUpperTo(d16, v23),
+                                       hn::PromoteUpperTo(d16, v31),
+                                       hn::PromoteUpperTo(d16, v32),
+                                       hn::PromoteUpperTo(d16, v33));
+
+            // Final store: 16-bit to 8-bit demotion
+            auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi);
+            hn::StoreU(final_val, d8, out + x);
+        }
+    }
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace ndb
+HWY_AFTER_NAMESPACE();
+
+namespace ndb {
+namespace testing {
+// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) {
+    // ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold);
+    HWY_STATIC_DISPATCH(SobelKernelNoDiv)(
+        in, blurred, width, height, threshold);
+}
+// #endif
+}  // namespace testing
+}  // namespace ndb
diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp
new file mode 100644
index 0000000..7c5d4ce
--- /dev/null
+++ b/lib/gpc/kernels/sobel_hwy.hpp
@@ -0,0 +1,18 @@
+#ifndef __NDB__KERNEL_SOBEL_HWY
+#define __NDB__KERNEL_SOBEL_HWY
+
+#include <cstdint>
+
+namespace ndb {
+
+namespace testing {
+/**
+ * Entry point for benchmarking the MulHigh (approximate) version.
+ */
+void sobel_hwy(
+    uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold);
+}  // namespace testing
+
+}  // namespace ndb
+
+#endif  // GPC_KERNELS_SOBEL_HWY_H_
diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp
new file mode 100644
index 0000000..1b1ab4b
--- /dev/null
+++ b/lib/gpc/kernels/utils.cpp
@@ -0,0 +1,105 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#include "gpc/kernels/utils.hpp"
+
+#include <cassert>
+#include <functional>
+#include <thread>
+
+using namespace std;
+
+namespace ndb {
+void arr2ind(const unsigned char* a, int n, int* ind, int* m) {
+#if HWY_TARGET == HWY_AVX2
+    int i, m0, k;
+    __m256i msk;
+    m0 = 0;
+    for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */
+        msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]),
+                                _mm256_setzero_si256());
+        k = _mm256_movemask_epi8(msk);
+        k = ~k; /* Search for nonzero bits instead of zero bits.  */
+        while (k) {
+            ind[m0] =
+                i + _tzcnt_u32(
+                        k); /* Count the number of trailing zero bits in k. */
+            m0++;
+            k = _blsr_u32(k); /* Clear the lowest set bit in k. */
+        }
+    }
+    *m = m0;
+#else
+    int nnz = 0;
+    for (int i = 0; i < n; i++) {
+        if (a[i] != 0) {
+            nnz++;
+            *ind = i;
+            ind++;
+        }
+    }
+    *m = nnz;
+#endif
+}
+#if HWY_TARGET == HWY_AVX2
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) {
+    __m128i zero = _mm_setzero_si128();
+    y0 = _mm_unpacklo_epi8(x, zero);
+    y1 = _mm_unpackhi_epi8(x, zero);
+}
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) {
+    y = _mm_packus_epi16(x0, x1);
+}
+
+#endif
+void parFor(std::function<void(int, int)> const& f,
+            int start,
+            int end,
+            int nThreads) {
+    // Range definition
+    // quantities derived from range
+    int segSize = (end - start) / nThreads;
+    int lastSeg = (end - start) % nThreads;
+
+    std::vector<std::thread> threads;
+    threads.reserve(nThreads);
+
+    // Spawn threads
+    for (int t = 0; t < nThreads - 1; t++) {
+        threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize);
+    }
+    threads.emplace_back(f,
+                         start + (nThreads - 1) * segSize,
+                         start + (nThreads)*segSize + lastSeg);
+    // Join
+    for (auto& t : threads) t.join();
+}
+
+}  // namespace ndb
diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp
new file mode 100644
index 0000000..3985c1e
--- /dev/null
+++ b/lib/gpc/kernels/utils.hpp
@@ -0,0 +1,105 @@
+// Copyright (c) 2018, ETH Zurich
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Code Author: Niklaus Bamert (bamertn@ethz.ch)
+#ifndef __NDB__KERNEL_UTILS
+#define __NDB__KERNEL_UTILS
+
+#include <hwy/highway.h>
+
+#include <cassert>
+#include <thread>
+
+#include "gpc/buffer.hpp"
+using namespace std;
+
+#if HWY_TARGET == HWY_AVX2
+#include <immintrin.h>
+// greater and lesser than simd ops for unsigned 8bit integer (epu8)
+#define _mm_cmpgt_epu8(v0, v1)                             \
+    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
+                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
+#define _mm_cmplt_epu8(v1, v0)                             \
+    _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
+                   _mm_xor_si128(v1, _mm_set1_epi8(-128)))
+#endif
+namespace ndb {
+/**
+ * @brief Gets indices of non-zero values in array  a.
+ *    Credits:
+ *    https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528
+ *
+ * @param     input array
+ * @param n   number of input elements
+ * @param ind output array (indices into n of nonzero elements)
+ * @param m   number of elements in output
+ */
+void arr2ind(const unsigned char* a, int n, int* ind, int* m);
+
+#if HWY_TARGET == HWY_AVX2
+/**
+ * @brief      Unpacks 16x8bit from a 128bit simd var into 2x128bit vars
+ *             (8x16bit)
+ *
+ * @param[in]  x     the 128 bit vector to be unpacked
+ * @param      y0    The y 0
+ * @param      y1    The y 1
+ */
+void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1);
+
+/**
+ * @brief      Packs 2x128bit vars with 16bit values(where 8 upper bits are
+ *             zero) into 1x128bit with 8bit values
+ *
+ * @param[in]  x0    The x 0
+ * @param[in]  x1    The x 1
+ * @param      y     the packed vector
+ */
+void pack16to8(const __m128i x0, const __m128i x1, __m128i& y);
+#endif
+/**
+ * @brief Calls a given functional f with subranges based on the given start
+ *        and end indices. Here the functional is assumed to take two integer
+ *        arguments indicating their respective start and end ranges.
+ *        nThreads determines the number of threads the given range shall be
+ * split into. The range is inclusive on the lower bound and exclusive on the
+ * upper bound, i.e. [start,end)
+ *
+ * @param f        function object (e.g. a lambda functional)
+ * @param start    start of the range
+ * @param end      end of the range
+ * @param nThreads number of threads to use
+ */
+void parFor(std::function<void(int, int)> const& f,
+            int start,
+            int end,
+            int nThreads);
+
+}  // namespace ndb
+#endif
diff --git a/lib/gpc/training.hpp b/lib/gpc/training.hpp
index f1e398b..8d557cc 100644
--- a/lib/gpc/training.hpp
+++ b/lib/gpc/training.hpp
@@ -49,7 +49,6 @@
 #include "gpc/SintelOpticalFlow.hpp"
 #include "gpc/SintelStereo.hpp"
 #include "gpc/buffer.hpp"
-#include "gpc/filter.hpp"
 #include "gpc/hashmatch.hpp"
 
 namespace gpc {
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 3bbe11f..14f4a2c 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,10 +1,13 @@
 add_executable(extract extract.cpp)
-target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(extract gpc_core)
 
 add_executable(train train.cpp)
-target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(train gpc_core)
 
 add_executable(sparsematch sparsematch.cpp)
-target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads)
+target_link_libraries(sparsematch gpc_core)
+
+add_executable(target target.cpp)
+target_link_libraries(target gpc_core)
 
 
diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp
index ed43016..b25a96f 100644
--- a/samples/sparsematch.cpp
+++ b/samples/sparsematch.cpp
@@ -1,12 +1,35 @@
+#include <hwy/highway.h>
+
 #include <iostream>
 
-#include "gpc/inference.hpp"
+#include "gpc/forest.hpp"
 using namespace std;
+std::vector<ndb::Descriptor> gpcFilterDense(
+    uint8_t* in, const std::vector<int32_t>& fastmask, int width, int height) {
+    uint32_t tmp;
+    uint32_t usableW = width - 26;
+    uint32_t usableH = height - 26;
+    std::vector<ndb::Descriptor> out(usableW * usableH);
+    int j = 0;
+    for (int y = 13; y < height - 13; y++) {
+        for (int x = 13; x < width - 13; x++) {
+            tmp = 0;
+            int idx = y * width + x;
+            for (size_t i = 0; i < fastmask.size(); i += 2) {
+                tmp <<= 1;  // shift by one
+                if (*(in + idx + fastmask[i]) > *(in + idx + fastmask[i + 1]))
+                    tmp++;  // set this test's result to 1
+            }
+            out[j] = ndb::Descriptor(ndb::Point(x, y), tmp);
+            j++;
+        }
+    }
+    return out;
+}
 int main(int argc, char** argv) {
-    std::string forestPath = "../../forests/defaultZeroForest.txt";
-    std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png";
-    std::string rightImgPath =
-        "../../data/kitti/training/image_1/000000_10.png";
+    std::string forestPath = "../forests/defaultZeroForest.txt";
+    std::string leftImgPath = "../data/middlebury/im0.png";
+    std::string rightImgPath = "../data/middlebury/im1.png";
 
     if (argc == 4) {
         forestPath = argv[1];
@@ -32,7 +55,8 @@ int main(int argc, char** argv) {
     gpc::inference::InferenceSettings inferencesettings =
         gpc::inference::InferenceSettings()
             .builder()
-            .gradientThreshold(5)
+            .gradientThreshold(
+                1)  // gradientthres 20: matching ~3ms, 2: matching: ~30ms.
             .verticalTolerance(
                 0)               // 0px tolerance for rectified epipolar matches
             .dispHigh(128)       // limit disparities to 128
@@ -46,15 +70,14 @@ int main(int argc, char** argv) {
     timg.readPNG(rightImgPath);
 
     // Get learned filter for the given image dimensions.
-    GPCForest_t::FilterMask fm =
+    gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
-    // Preprocess images (box filter, sobel filter, indices of high gradient
-    // pixels)
     gpc::inference::time_point t0 = gpc::inference::sysTick();
-    GPCForest_t::PreprocessedImage simgP =
+
+    gpc::inference::PreprocessedImage simgP =
         forest.preprocessImage(simg, inferencesettings);
-    GPCForest_t::PreprocessedImage timgP =
+    gpc::inference::PreprocessedImage timgP =
         forest.preprocessImage(timg, inferencesettings);
     gpc::inference::time_point t1 = gpc::inference::sysTick();
 
@@ -62,14 +85,25 @@ int main(int argc, char** argv) {
     std::vector<ndb::Support> supp =
         forest.rectifiedMatch(simgP, timgP, fm, inferencesettings);
     gpc::inference::time_point t2 = gpc::inference::sysTick();
-    cout << "tPreprocess: " << gpc::inference::tickToMs(t1, t0) << " ms"
-         << ", #candidatesL:" << simgP.mask.size()
-         << ", #candidatesR:" << timgP.mask.size()
-         << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms"
-         << ", num matches:" << supp.size() << endl;
+    std::cout << "Number of features(s,t): " << simgP.mask.size() << ","
+              << timgP.mask.size() << std::endl;
+    std::cout << "Number of matches: " << supp.size() << std::endl;
+    std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0)
+              << " ms" << std::endl;
+    std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms"
+              << std::endl;
+    /*
+    std::vector<ndb::Descriptor> statesSrc = forest.evalFastMaskOnSubsetSSE(
+        simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings);
+    std::vector<ndb::Descriptor> statesTar = forest.evalFastMaskOnSubsetSSE(
+        timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings);
+    */
+
+    std::vector<ndb::Descriptor> statesSrc = gpcFilterDense(
+        simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows());
+    std::vector<ndb::Descriptor> statesTar = gpcFilterDense(
+        timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows());
 
-    // Output sparse disparities overlayed on left input image
-    ndb::Buffer<ndb::RGBColor> renderDisp;
-    renderDisp = ndb::getDisparityVisualization(simg, supp);
-    renderDisp.writePNGRGB("disparity.png");
+    ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc);
+    ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar);
 }
diff --git a/samples/target.cpp b/samples/target.cpp
new file mode 100644
index 0000000..f457f9d
--- /dev/null
+++ b/samples/target.cpp
@@ -0,0 +1,15 @@
+#include <hwy/highway.h>
+
+#include <iostream>
+int main() {
+    std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl;
+#if HWY_TARGET == HWY_AVX2
+    std::cout << "Using 256-bit AVX2 paths." << std::endl;
+#elif HWY_TARGET == HWY_NEON
+    std::cout << "Using 128-bit NEON paths." << std::endl;
+#elif HWY_TARGET == HWY_SSE4
+    std::cout << "Using 128-bit SSE4 paths." << std::endl;
+#else
+    std::cout << "Using Scalar fallback." << std::endl;
+#endif
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index acf86db..e14ceda 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,8 +11,22 @@ FetchContent_MakeAvailable(approvaltests)
 
 
 find_package(GTest REQUIRED)
-add_executable(test_single_matching test_single_matching.cpp)
-target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main)
+function(add_gpc_approval_test TEST_NAME SOURCE_FILE)
+    add_executable(${TEST_NAME} ${SOURCE_FILE})
+    
+    target_link_libraries(${TEST_NAME} 
+        PRIVATE 
+            gpc_core 
+            ApprovalTests::ApprovalTests 
+            GTest::gtest_main
+    )
+    
 
-add_test(NAME single_matching COMMAND test_single_matching)
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+endfunction()
+
+add_gpc_approval_test(test_single_matching test_single_matching.cpp)
+add_gpc_approval_test(test_kernel_box test_kernel_box.cpp)
+add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp)
+add_gpc_approval_test(test_kernel_gpc test_kernel_gpc.cpp)
 
diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp
new file mode 100644
index 0000000..9913a7a
--- /dev/null
+++ b/tests/test_kernel_box.cpp
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/kernels/box.hpp"     // Naive version
+#include "gpc/kernels/box_hwy.hpp" // Highway version
+
+TEST(Approval, BoxKernel) {
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2; // Typical for 5x5 box
+
+    // 1. Prepare randomized input
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42); 
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> outNaive(width * height, 0);
+    std::vector<uint8_t> outHighway(width * height, 0);
+
+    // 3. Run Naive version
+    ndb::boxNaive(input.data(), outNaive.data(), width, height);
+
+    // 4. Run Highway version (only if compiled for the target)
+    ndb::testing::box_hwy(input.data(), outHighway.data(), width, height);
+
+
+    // 5. Compare results
+    // We skip the border (radius) because different implementations 
+    // might handle edges differently.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int idx = y * width + x;
+            ASSERT_EQ(outNaive[idx], outHighway[idx]) 
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}
diff --git a/tests/test_kernel_gpc.cpp b/tests/test_kernel_gpc.cpp
new file mode 100644
index 0000000..7989f35
--- /dev/null
+++ b/tests/test_kernel_gpc.cpp
@@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/forest.hpp"
+#include "gpc/kernels/gpc.hpp"     // Naive version
+#include "gpc/kernels/sobel.hpp" // Highway version
+#include "gpc/kernels/gpc_hwy.hpp" // Highway version
+#include "gpc/kernels/utils.hpp" // Highway version
+
+TEST(Approval, GPCKernel) {
+    auto file = std::filesystem::absolute(__FILE__);
+    auto dir  = file.parent_path();
+    std::filesystem::path forestPath = dir / ".." / "forests" / "defaultZeroForest.txt";
+
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2; // Typical for 5x5 bo
+    const int threshold = 0; // Example threshold for binarization
+
+    typedef gpc::inference::Forest GPCForest_t;
+    GPCForest_t forest;
+    gpc::inference::FilterMask fm =
+        forest.readForest(forestPath, width, height);
+
+    // 1. Prepare randomized input
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42); 
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> grad(width * height, 0);
+    std::vector<uint32_t> outNaive(width * height, 0);
+    std::vector<uint32_t> outHighway(width * height, 0);
+
+    // 3. Prepare gradient and fastmask
+    ndb::sobelNaive(input.data(), grad.data(), width, height, threshold);
+
+    // More prep
+    std::vector<int> idx(grad.size());
+    auto ff = [&](std::vector<int>& in, std::vector<int>& out, int m) {
+        for (int i = 0; i < m; i++) {
+            int x = in.data()[i] % width;
+            int y = in.data()[i] / width;
+            if (y >= 13 && y < height - 13 && x >= 13 && x < width - 13)
+                out.push_back(in.data()[i]);
+        }
+    };
+    int m;
+    // mask indexing gradient pixels
+    std::vector<int> fastmask;
+    ndb::arr2ind(grad.data(), width * height, idx.data(), &m);
+    ff(idx, fastmask, m);
+
+    std::vector<int> tau;
+    // 4. Run Naive version
+    ndb::gpcFilterNaive(input.data(), grad.data(), outNaive.data(),
+            fm.mask, fastmask, width, height);
+    /*
+     * fastmask.mask, fastmask.tau, idx
+        fastmask.mask is.. imo the extraction pattern. lets se...
+        it's filtermask! lol
+        where idx is... preprocessed.mask. WTF..what is that lol
+        */
+
+    // 5. Run Highway version 
+    //
+    //ndb::gpcFilterSSE(input.data(), grad.data(), outHighway.data(), fastmask, tau, width, height);
+    ndb::testing::gpc_hwy(input.data(), grad.data(), outHighway.data(),
+            fastmask, tau, width, height);
+
+    // 6. Compare results
+    // We skip the border (radius) because different implementations 
+    // might handle edges differently.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int idx = y * width + x;
+            ASSERT_EQ(outNaive[idx], outHighway[idx]) 
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}
diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp
new file mode 100644
index 0000000..bfc56ff
--- /dev/null
+++ b/tests/test_kernel_sobel.cpp
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+#include <vector>
+#include <random>
+#include "gpc/kernels/sobel.hpp"     // Naive version
+#include "gpc/kernels/sobel_hwy.hpp" // Highway version
+
+TEST(Approval, SobelKernel) {
+    const int width = 640;
+    const int height = 480;
+    const int radius = 2; // Typical for 5x5 bo
+    const int threshold = 30; // Example threshold for binarization
+
+    // 1. Prepare randomized input
+    std::vector<uint8_t> input(width * height);
+    std::mt19937 gen(42); 
+    std::uniform_int_distribution<> dis(0, 255);
+    for (auto& val : input) val = dis(gen);
+
+    // 2. Prepare output buffers
+    std::vector<uint8_t> outNaive(width * height, 0);
+    std::vector<uint8_t> outHighway(width * height, 0);
+
+    // 3. Run Naive version
+    ndb::sobelNaive(input.data(), outNaive.data(), width, height, threshold);
+
+    // 4. Run Highway version (only if compiled for the target)
+    ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, threshold);
+
+    // 5. Compare results
+    // We skip the border (radius) because different implementations 
+    // might handle edges differently.
+    for (int y = radius; y < height - radius; ++y) {
+        for (int x = radius; x < width - radius; ++x) {
+            int idx = y * width + x;
+            ASSERT_EQ(outNaive[idx], outHighway[idx]) 
+                << "Mismatch at (" << x << "," << y << ")";
+        }
+    }
+}
diff --git a/tests/test_single_matching.Approval.Inference.approved.txt b/tests/test_single_matching.Approval.Inference.approved.txt
index 294072e..d481bd2 100644
--- a/tests/test_single_matching.Approval.Inference.approved.txt
+++ b/tests/test_single_matching.Approval.Inference.approved.txt
@@ -1 +1 @@
-[(13, 569, 0), (13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 671, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 793, -5), (14, 794, -5), (14, 857, -5), (15, 566, 2), (15, 828, -5), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (215, 753, 102), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 268, 69), (244, 377, 127), (244, 378, 127), (244, 380, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (247, 442, 128), (247, 448, 128), (248, 374, 128), (248, 446, 128), (249, 276, 69), (250, 277, 69), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (252, 932, 115), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (262, 419, 127), (264, 418, 126), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (270, 938, 119), (272, 582, 120), (272, 583, 120), (273, 843, 113), (275, 934, 124), (277, 560, 127), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (290, 557, -38), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 715, 116), (296, 716, 116), (297, 674, 119), (298, 672, 120), (298, 673, 120), (299, 682, 119), (299, 723, 117), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 587, -43), (304, 953, 125), (310, 600, -46), (310, 986, 128), (311, 506, 62), (312, 205, 127), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 838, 113), (325, 844, 112), (325, 845, 112), (326, 842, 113), (328, 228, 127), (329, 228, 127), (332, 845, 113), (333, 844, 114), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (339, 718, 39), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (354, 834, 125), (355, 831, 126), (359, 223, 124), (368, 218, 126), (370, 392, -97), (377, 58, 124), (385, 315, 123), (387, 316, 124), (387, 384, 94), (388, 313, 119), (391, 312, 122), (395, 299, 122), (396, 54, 109), (396, 475, -24), (400, 918, -107), (404, 180, 123), (407, 453, -109), (408, 286, 119), (409, 208, 123), (412, 504, -44), (414, 340, 121), (415, 220, 123), (415, 221, 123), (417, 561, 87), (418, 218, 123), (420, 1035, -94), (425, 338, 118), (433, 53, 124), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 205, -47), (437, 379, 118), (439, 378, 109), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (452, 330, 119), (452, 719, 64), (454, 199, 122), (455, 52, 126), (455, 723, 65), (456, 377, 118), (457, 199, 122), (463, 325, 120), (465, 330, 112), (470, 192, 121), (472, 330, 120), (473, 327, 118), (473, 372, 115), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (482, 169, 120), (483, 185, 121), (486, 823, 63), (489, 181, 120), (492, 370, 115), (500, 312, 118), (504, 179, 119), (505, 172, 122), (508, 171, 120), (509, 171, 120), (510, 961, 126), (517, 169, 121), (517, 217, 118), (518, 401, 124), (519, 400, 122), (520, 400, 124), (523, 321, 115), (523, 407, 126), (525, 171, 118), (526, 272, 117), (527, 363, 127), (529, 315, 116), (532, 171, 119), (534, 360, 110), (539, 360, 112), (543, 297, 117), (552, 588, 27), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (558, 594, 27), (565, 161, 115), (570, 169, 116), (570, 170, 120), (572, 354, 111), (572, 398, 122), (573, 313, 114), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (591, 262, 113), (593, 352, 113), (595, 204, 115), (597, 68, 117), (597, 69, 117), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (615, 347, 111), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 138, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 130, 117), (624, 164, 116), (624, 205, 115), (624, 305, 112), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 199, 115), (626, 200, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 130, 117), (627, 141, 116), (627, 254, 115), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 97, 117), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 193, 116), (629, 255, 114), (629, 341, 111), (630, 144, 117), (630, 193, 115), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (631, 134, 118), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 109, 119), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (634, 53, 118), (634, 55, 118), (634, 64, 117), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 97, 117), (636, 301, 113), (637, 75, 118), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 277, 113), (640, 278, 113), (641, 85, 117), (641, 111, 119), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (643, 346, 111), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 166, 115), (646, 167, 115), (646, 347, 102), (647, 118, 121), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (647, 364, 115), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 170, 115), (650, 294, 113), (651, 93, 116), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 125, 66), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 265, 113), (660, 261, 113), (661, 99, 116), (662, 126, 121), (662, 127, 121), (664, 152, 120), (665, 126, 123), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (671, 343, 111), (672, 344, 111), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 426, -43), (674, 432, 11), (675, 135, 122), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 138, 123), (686, 346, 116), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 164, 126), (695, 356, 114), (697, 119, 119), (698, 130, 118), (698, 390, 121), (698, 955, 88), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (703, 122, 120), (705, 123, 120), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (715, 128, 120), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 134, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 638, -73), (744, 140, 121), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (760, 172, 128), (763, 174, 128), (764, 176, 65), (768, 369, 118), (772, 154, 125), (775, 443, -4), (777, 448, -3), (778, 457, 108), (778, 458, 108), (784, 185, 63), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (802, 937, 94), (816, 224, 126), (816, 389, -93), (816, 393, -93), (816, 876, -77), (820, 934, -91), (823, 653, 95), (824, 423, -25), (824, 965, 66), (830, 936, 86), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (851, 424, 2), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (873, 476, 7), (884, 337, -6), (885, 371, -87), (885, 372, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (957, 691, 52), (963, 269, 54), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 397, 118), (990, 1000, -97), (997, 813, -85), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1057, 665, -59), (1085, 971, -84), (1087, 382, 112), (1119, 787, 40), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1157, 635, 48), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1281, 1000, 112), (1288, 928, 120), (1288, 929, 120), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 752, 38), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1403, 715, 83), (1408, 820, 110), (1412, 825, 114), (1417, 856, 116), (1418, 570, 107), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1437, 942, 125), (1437, 943, 125), (1443, 963, 124), (1453, 896, 123), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1457, 306, 100), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1464, 410, 98), (1467, 334, 103), (1469, 407, 98), (1469, 440, 104), (1469, 441, 103), (1469, 443, 103), (1514, 460, 104), (1519, 897, 124), (1519, 899, 124), (1520, 897, 124), (1526, 929, 124), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1561, 1000, 128), (1584, 863, 49), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1766, 971, 128), (1769, 966, 128), (1778, 834, -14), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)]
+[(13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 857, -5), (15, 566, 2), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (150, 736, 102), (165, 543, 81), (165, 546, 80), (175, 982, 128), (178, 546, 87), (183, 546, 81), (183, 743, 99), (188, 742, 102), (191, 749, 105), (195, 547, 77), (203, 756, 103), (215, 753, 102), (224, 690, 95), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 376, 127), (244, 377, 127), (244, 378, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (246, 741, 102), (247, 362, 128), (247, 442, 128), (247, 448, 128), (248, 373, 128), (248, 374, 128), (248, 376, 128), (248, 446, 128), (249, 276, 69), (249, 404, 127), (250, 277, 69), (250, 411, 127), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (261, 521, 94), (262, 419, 127), (264, 418, 126), (264, 853, 114), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (272, 582, 120), (272, 583, 120), (273, 324, 116), (273, 843, 113), (275, 934, 124), (276, 839, 112), (277, 560, 127), (278, 585, 125), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 582, 127), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (288, 554, -37), (290, 557, -38), (290, 558, -39), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 716, 116), (297, 674, 119), (298, 673, 120), (299, 677, 120), (299, 682, 119), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 953, 125), (309, 952, 125), (309, 953, 125), (310, 600, -46), (312, 205, 127), (319, 236, 128), (320, 619, -51), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 238, 126), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 231, 128), (325, 838, 113), (325, 844, 112), (325, 845, 112), (325, 983, 128), (326, 842, 113), (328, 228, 127), (329, 228, 127), (329, 240, 125), (329, 706, 34), (332, 231, 126), (333, 844, 114), (336, 718, -52), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (340, 748, -65), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (347, 229, 125), (347, 969, 125), (348, 229, 123), (355, 241, 119), (355, 830, 126), (355, 831, 126), (356, 830, 126), (358, 222, 126), (361, 639, 75), (366, 217, 124), (368, 218, 126), (368, 689, 3), (377, 58, 124), (385, 314, 123), (385, 315, 123), (390, 314, 123), (391, 308, 123), (391, 312, 122), (393, 303, 121), (393, 307, 122), (394, 303, 121), (394, 305, 122), (395, 299, 122), (396, 294, 121), (396, 301, 122), (400, 918, -107), (404, 180, 123), (404, 293, 121), (407, 453, -109), (408, 286, 119), (409, 208, 123), (414, 340, 121), (414, 602, 57), (415, 220, 123), (415, 221, 123), (415, 222, 123), (418, 218, 123), (419, 224, 122), (420, 1035, -94), (420, 1036, -94), (421, 221, 122), (423, 226, 122), (423, 227, 122), (425, 338, 118), (433, 53, 124), (433, 384, 119), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 379, 118), (439, 378, 109), (439, 380, 128), (440, 204, 122), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (449, 200, 121), (449, 202, 121), (452, 330, 119), (452, 719, 64), (453, 201, 119), (453, 705, 74), (453, 720, 64), (454, 199, 122), (455, 52, 126), (456, 226, 121), (456, 377, 118), (458, 327, 119), (459, 197, 121), (463, 325, 120), (465, 330, 112), (472, 330, 120), (473, 372, 115), (474, 188, 120), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (481, 882, 118), (482, 169, 120), (483, 185, 121), (486, 823, 63), (488, 221, 115), (492, 370, 115), (493, 180, 114), (499, 313, 118), (502, 172, 120), (506, 173, 119), (508, 175, 121), (510, 961, 126), (511, 397, 123), (512, 961, 124), (513, 323, 120), (513, 988, -79), (517, 167, 123), (517, 169, 121), (517, 217, 118), (518, 401, 124), (518, 801, 111), (518, 960, 126), (519, 400, 122), (519, 957, 125), (520, 400, 124), (520, 801, 112), (523, 407, 126), (524, 961, 128), (525, 171, 118), (525, 306, 117), (526, 272, 117), (527, 363, 127), (529, 313, 117), (530, 313, 117), (530, 639, -38), (532, 171, 119), (532, 312, 118), (532, 641, -41), (533, 300, 116), (534, 360, 110), (534, 384, 121), (539, 360, 112), (541, 315, 115), (543, 297, 117), (551, 388, 121), (552, 588, 27), (555, 169, 118), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (557, 386, 122), (558, 594, 27), (561, 290, 116), (565, 161, 115), (565, 293, 116), (570, 161, 119), (570, 169, 116), (572, 354, 111), (572, 398, 122), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (589, 353, 112), (591, 262, 113), (593, 352, 113), (593, 895, 119), (594, 381, 115), (597, 68, 117), (597, 69, 117), (597, 165, 115), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 72, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 91, 117), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (604, 203, 114), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (609, 350, 112), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (616, 151, 116), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 349, 111), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 204, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 107, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 132, 117), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 169, 117), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 102, 118), (624, 130, 117), (624, 133, 117), (624, 164, 116), (624, 305, 112), (624, 306, 111), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 167, 117), (626, 199, 115), (626, 200, 115), (626, 203, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 50, 118), (627, 130, 117), (627, 141, 116), (627, 202, 116), (627, 204, 115), (627, 254, 115), (627, 322, 111), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 255, 114), (629, 341, 111), (630, 107, 118), (630, 144, 117), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (630, 410, 128), (631, 50, 117), (631, 134, 118), (631, 272, 114), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (633, 343, 111), (634, 47, 117), (634, 53, 118), (634, 55, 118), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 85, 117), (636, 97, 117), (636, 301, 113), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 92, 117), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 406, 124), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 111, 120), (640, 277, 113), (640, 278, 113), (640, 279, 113), (641, 85, 117), (641, 92, 117), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 111, 119), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 347, 102), (647, 118, 121), (647, 144, 116), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 145, 116), (650, 112, 120), (650, 293, 113), (650, 294, 113), (650, 295, 113), (651, 93, 116), (651, 302, 112), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 118, 120), (657, 125, 66), (657, 236, 114), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 381, 118), (660, 261, 113), (660, 869, -42), (661, 99, 116), (662, 126, 121), (662, 127, 121), (663, 120, 122), (664, 152, 120), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (667, 344, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (670, 380, 117), (671, 343, 111), (672, 344, 111), (672, 410, 11), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 146, 122), (674, 426, -43), (674, 432, 11), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 352, 113), (686, 126, 121), (686, 138, 123), (686, 346, 116), (686, 353, 113), (687, 160, 122), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 129, 121), (694, 164, 126), (697, 119, 119), (698, 390, 121), (698, 955, 88), (699, 130, 123), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (701, 130, 119), (701, 372, 116), (702, 130, 119), (703, 122, 120), (703, 934, 90), (705, 123, 120), (705, 360, 114), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (714, 363, 113), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 133, 120), (719, 134, 120), (720, 133, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (740, 144, 122), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 142, 122), (743, 638, -73), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (757, 818, 59), (758, 686, 112), (760, 172, 128), (763, 174, 128), (764, 176, 65), (764, 961, -75), (768, 369, 118), (772, 154, 125), (775, 443, -4), (778, 383, 121), (778, 457, 108), (779, 456, -5), (784, 185, 63), (784, 387, 122), (785, 210, 128), (793, 1029, 66), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (816, 224, 126), (816, 389, -93), (816, 393, -93), (819, 409, 104), (823, 653, 95), (824, 423, -25), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (855, 763, -120), (855, 1017, -109), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (869, 930, 84), (883, 350, 48), (884, 337, -6), (885, 371, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 388, 37), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (947, 354, 72), (957, 691, 52), (961, 901, 34), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 356, 84), (975, 397, 118), (981, 1018, 58), (989, 413, -70), (990, 1000, -97), (997, 813, -85), (1006, 832, 83), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1033, 777, 92), (1057, 665, -59), (1084, 974, -4), (1119, 787, 40), (1125, 850, 28), (1127, 574, 44), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1150, 777, 126), (1157, 635, 48), (1161, 843, -14), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1308, 60, 108), (1308, 74, 108), (1320, 691, 122), (1320, 692, 124), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 344, 97), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1388, 771, 98), (1408, 820, 110), (1412, 825, 114), (1412, 886, 120), (1413, 827, 114), (1417, 856, 116), (1418, 570, 107), (1421, 568, 103), (1422, 912, 19), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1433, 895, 120), (1437, 942, 125), (1437, 943, 125), (1441, 436, 101), (1443, 963, 124), (1447, 554, 109), (1452, 472, 113), (1453, 896, 123), (1454, 337, 102), (1454, 418, 102), (1454, 421, 100), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1456, 425, 102), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1461, 478, 115), (1461, 915, 127), (1464, 410, 98), (1467, 92, 125), (1467, 334, 103), (1467, 409, 99), (1468, 407, 98), (1468, 441, 103), (1469, 440, 104), (1469, 443, 103), (1470, 405, 98), (1472, 404, 98), (1502, 401, 111), (1504, 471, 122), (1511, 413, 59), (1514, 460, 104), (1514, 810, 126), (1519, 897, 124), (1519, 899, 124), (1520, 896, 122), (1526, 929, 124), (1527, 931, 125), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1544, 975, 120), (1556, 535, 126), (1584, 863, 49), (1634, 1013, 86), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1758, 833, -38), (1763, 833, -22), (1766, 964, 127), (1766, 971, 128), (1768, 1013, 121), (1769, 966, 128), (1778, 834, -14), (1850, 981, 38), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1903, 299, 1), (1903, 300, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)]
diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp
index e9e6a75..9443a33 100644
--- a/tests/test_single_matching.cpp
+++ b/tests/test_single_matching.cpp
@@ -1,7 +1,7 @@
 #define APPROVALS_GOOGLETEST
 #include <ApprovalTests.hpp>     
 #include <gtest/gtest.h>
-#include "gpc/inference.hpp"
+#include "gpc/forest.hpp"
 
 
 TEST(Approval, Inference)
@@ -34,12 +34,12 @@ TEST(Approval, Inference)
     simg.readPNG(leftImgPath);
     timg.readPNG(rightImgPath); 
     // Get learned filter for the given image dimensions.
-    GPCForest_t::FilterMask fm =
+    gpc::inference::FilterMask fm =
         forest.readForest(forestPath, simg.cols(), simg.rows());
 
-    GPCForest_t::PreprocessedImage simgP =
+    gpc::inference::PreprocessedImage simgP =
         forest.preprocessImage(simg, inferencesettings);
-    GPCForest_t::PreprocessedImage timgP =
+    gpc::inference::PreprocessedImage timgP =
         forest.preprocessImage(timg, inferencesettings);
 
     // Match rectified stereo images
@@ -49,6 +49,45 @@ TEST(Approval, Inference)
 
     std::stringstream ss;
     ss << supp;
-    EXPECT_EQ(866, supp.size());
+    EXPECT_EQ(1024, supp.size());
     ApprovalTests::Approvals::verify(ss.str());
 }
+std::vector<ndb::Descriptor> getSrcDescriptors() {
+    return ndb::Descriptor::deserialize("statesSrc.txt", true);
+}
+
+std::vector<ndb::Descriptor> getTarDescriptors() {
+    return ndb::Descriptor::deserialize("statesTar.txt", false);
+}
+
+
+/*
+TEST(A,B) {
+    std::vector<ndb::Descriptor> srcOriginal = getSrcDescriptors(); 
+    std::vector<ndb::Descriptor> tarOriginal = getTarDescriptors();
+    std::vector<ndb::Descriptor> srcBaseline = srcOriginal;
+    std::vector<ndb::Descriptor> tarBaseline = tarOriginal;
+    std::vector<ndb::Descriptor> srcAlt = srcOriginal;
+    std::vector<ndb::Descriptor> tarAlt = tarOriginal;
+    
+    std::vector<ndb::Correspondence> 
+        matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline);
+
+
+    // Alternative method
+    gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame;
+    srcFrame.preallocate(srcOriginal.size()); // size known
+    tarFrame.preallocate(tarOriginal.size());
+
+    std::vector<uint32_t> resultSrc, resultTar;
+    resultSrc.reserve(srcOriginal.size()/10);
+    resultTar.reserve(tarOriginal.size()/10);
+    gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(srcAlt, tarAlt, srcFrame, tarFrame);
+    gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar);
+
+    // Ensure ID pairings of (resultSrc, resultTar) match the naive version. 
+    // We ignore exact matching for now and just expect the count to be the same
+    EXPECT_EQ(matches.size(), resultSrc.size());
+    EXPECT_EQ(matches.size(), resultTar.size());
+}
+*/