diff --git a/CMakeLists.txt b/CMakeLists.txt index 55e3851..6d3a39e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,19 @@ cmake_minimum_required(VERSION 3.10) include(CheckCXXCompilerFlag) include(CheckCXXSourceRuns) +include(CMakePushCheckState) project(openGPC CXX) set (REQ_CPP11_FEATURES cxx_strong_enums cxx_auto_type) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + add_compile_options(-O3 -funroll-loops) +endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +include(FetchContent) find_package(Eigen3 REQUIRED) find_package(PNG REQUIRED) find_package(Threads REQUIRED) @@ -15,29 +21,66 @@ find_package(Threads REQUIRED) include_directories(${EIGEN3_INCLUDE_DIR}) include_directories(${PNG_INCLUDE_DIRS}) include_directories(lib) +include(FetchContent) +set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE) +set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE) +FetchContent_Declare( + highway + GIT_REPOSITORY https://github.com/google/highway.git + GIT_TAG 1.3.0 +) +FetchContent_MakeAvailable(highway) + +include(FetchContent) +set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_GTEST_LIB OFF CACHE BOOL "" FORCE) + +FetchContent_Declare( + google_benchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.9.5 +) +# MUST go before FetchContent_MakeAvailable +set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + +# Force the library itself to build in Release mode +set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) -#By default, use SSE intrinsics -option(SSE "Enable SSE/AVX optimizations if available" ON) - -add_compile_options(-O3 -funroll-loops) -if(SSE) - message(STATUS "Checking if target CPU supports AVX2 instructions...") - check_cxx_source_runs(" - #include - int main() { - __m256i x = _mm256_set1_epi32(1); - return _mm256_extract_epi32(x, 0); - } - " CPU_HAS_AVX2) - - if(CPU_HAS_AVX2) - message(STATUS "AVX2: supported and enabled") - add_compile_definitions(_INTRINSICS_SSE) - add_compile_options(-mavx2 -march=core-avx2) - endif() +FetchContent_MakeAvailable(google_benchmark) +FetchContent_MakeAvailable(google_benchmark) +add_library(gpc_core + lib/gpc/forest.cpp + lib/gpc/fern.cpp + lib/gpc/feature.cpp + lib/gpc/kernels/sobel.cpp + lib/gpc/kernels/box.cpp + lib/gpc/kernels/census.cpp + lib/gpc/kernels/gpc.cpp + lib/gpc/kernels/utils.cpp + lib/gpc/kernels/box_hwy.cpp + lib/gpc/kernels/sobel_hwy.cpp + lib/gpc/kernels/gpc_hwy.cpp +) +if(MSVC) + target_compile_options(gpc_core PUBLIC /arch:AVX2) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + target_compile_options(gpc_core PUBLIC -march=native) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + target_compile_options(gpc_core PUBLIC -mcpu=native) endif() +target_link_libraries(gpc_core + PUBLIC + Eigen3::Eigen + ${PNG_LIBRARIES} + Threads::Threads + hwy +) +target_include_directories(gpc_core PUBLIC lib) enable_testing() add_subdirectory(samples) add_subdirectory(tests) +add_subdirectory(benchmarks) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..7942fa6 --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,31 @@ +add_executable(sobel_bench sobel_bench.cpp) + +target_link_libraries(sobel_bench + PRIVATE + gpc_core + benchmark::benchmark + hwy +) + +set_target_properties(sobel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) + +add_executable(kernel_bench kernel_bench.cpp) +target_link_libraries(kernel_bench + PRIVATE + gpc_core + benchmark::benchmark +) +add_executable(box_bench box_bench.cpp) + +target_link_libraries(box_bench + PRIVATE + gpc_core + benchmark::benchmark +) +add_executable(correspondence_bench correspondence_bench.cpp) + +target_link_libraries(correspondence_bench + PRIVATE + gpc_core + benchmark::benchmark +) diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp new file mode 100644 index 0000000..f0aeeb8 --- /dev/null +++ b/benchmarks/box_bench.cpp @@ -0,0 +1,52 @@ +#include +#include +#include "gpc/kernels/box.hpp" +#include "gpc/kernels/box_hwy.hpp" +static void BM_BoxHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + state.SetLabel(hwy::TargetName(HWY_TARGET)); + for (auto _ : state) { + ndb::testing::box_hwy(in.data(), out.data(), w, h); + + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} + +#if HWY_TARGET == HWY_AVX2 +static void BM_BoxLegacySIMD(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("AVX2_legacy"); + for (auto _ : state) { + ndb::boxSSE(in.data(), out.data(), w, h); + + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +#endif +static void BM_BoxNaive(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("naive"); + for (auto _ : state) { + ndb::boxNaive(in.data(), out.data(), w, h); + + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond); +#if HWY_TARGET == HWY_AVX2 +BENCHMARK(BM_BoxLegacySIMD)->Unit(benchmark::kMillisecond); +#endif +BENCHMARK(BM_BoxNaive)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp new file mode 100644 index 0000000..b847fb1 --- /dev/null +++ b/benchmarks/correspondence_bench.cpp @@ -0,0 +1,84 @@ +#include +#include "gpc/forest.hpp" +#include "gpc/inference.hpp" +#include +#include +#include +#include + +#define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024 + +/** + * Generates a reproducible Pareto-distributed vector. + * @param count Number of IDs to generate. + * @param target_mean The theoretical mean (requires alpha > 1). + * @param seed A fixed value (e.g., 42) for deterministic benchmarks. + */ +std::vector generate_pareto_ids(size_t count, double target_mean, uint32_t seed = 42) { + std::vector ids; + ids.reserve(count); + + // Using a fixed seed for benchmark consistency + std::mt19937 gen(seed); + + // 1e-9 epsilon prevents division by zero/infinity + std::uniform_real_distribution dist(1e-9, 1.0); + + const double alpha = 1.16; + const double xm = target_mean * (alpha - 1.0) / alpha; + + for (size_t i = 0; i < count; ++i) { + // Inverse Transform Sampling + double val = xm / std::pow(dist(gen), 1.0 / alpha); + + // Casting to uint32_t will handle the Pareto "tail" by wrapping + // values that exceed 2^32-1, simulating a dense ID space. + ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast(val))); + } + + return ids; +} +std::vector getSrcDescriptors() { + std::vector v = ndb::Descriptor::deserialize("statesSrcLarge.txt", true); + std::vector out; + for (size_t i = 0; i < v.size(); i++) { + if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { + out.push_back(v[i]); + } + } + return out; + //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 +} + +std::vector getTarDescriptors() { + std::vector v = ndb::Descriptor::deserialize("statesTarLarge.txt", false); + std::vector out; + for (size_t i = 0; i < v.size(); i++) { + if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { + out.push_back(v[i]); + } + } + return out; + + //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 +} +static void matchBySorting( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + std::vector + matches = gpc::inference::Forest::findCorrespondences(src, tar); + + state.counters["matches"] = matches.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +BENCHMARK(matchBySorting) + ->Unit(benchmark::kMillisecond); +BENCHMARK_MAIN(); diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp new file mode 100644 index 0000000..d86cfe8 --- /dev/null +++ b/benchmarks/kernel_bench.cpp @@ -0,0 +1,62 @@ +#include +#include "gpc/inference.hpp" + +typedef gpc::inference::Forest GPCForest_t; +GPCForest_t forest; + +static void fullInference( + benchmark::State& state){ + + std::string forestPath = "../forests/defaultZeroForest.txt"; + std::string leftImgPath = "../data/middlebury/im0.png"; + std::string rightImgPath = "../data/middlebury/im1.png"; + gpc::inference::InferenceSettings inferencesettings = + gpc::inference::InferenceSettings() + .builder() + .gradientThreshold(state.range(0)) // 0...255 gradient threshold for sobel filter + .verticalTolerance( + 0) // 0px tolerance for rectified epipolar matches + .dispHigh(128) // limit disparities to 128 + .epipolarMode(true) // match GPC states in epipolar mode. more + // matches, lower accuracy than global + .useHashtable(false); // use sort method for matching. faster for + // <100K descriptors + + ndb::Buffer simg, timg; + // Load images + simg.readPNG(leftImgPath); + timg.readPNG(rightImgPath); + + // Get learned filter for the given image dimensions. + GPCForest_t::FilterMask fm = + forest.readForest(forestPath, simg.cols(), simg.rows()); + + + + for (auto _ : state) { + GPCForest_t::PreprocessedImage simgP = + forest.preprocessImage(simg, inferencesettings); + GPCForest_t::PreprocessedImage timgP = + forest.preprocessImage(timg, inferencesettings); + std::vector supp = + forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); + state.counters["candidates_s"] = simgP.mask.size(); + state.counters["candidates_t"] = timgP.mask.size(); + state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(supp); + benchmark::ClobberMemory(); + } + +} + +BENCHMARK(fullInference) + ->Unit(benchmark::kMillisecond) + ->Args({0}) + ->Args({5}) + ->Args({10}) + ->Args({20}) + ->Args({50}) + ->Args({100}); + + +BENCHMARK_MAIN(); diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp new file mode 100644 index 0000000..490a861 --- /dev/null +++ b/benchmarks/sobel_bench.cpp @@ -0,0 +1,56 @@ +#include +#include +#include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/sobel_hwy.hpp" +static void BM_SobelHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + state.SetLabel(hwy::TargetName(HWY_TARGET)); + // Warmup is handled automatically by the library + for (auto _ : state) { + ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} + +#if HWY_TARGET == HWY_AVX2 +static void BM_SobelLegacySIMD(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("AVX2_legacy"); + for (auto _ : state) { + ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +#endif +static void BM_SobelNaive(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("naive"); + for (auto _ : state) { + ndb::sobelNaive(in.data(), out.data(), w, h, 50); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond); +#if HWY_TARGET == HWY_AVX2 +BENCHMARK(BM_SobelLegacySIMD)->Unit(benchmark::kMillisecond); +#endif +BENCHMARK(BM_SobelNaive)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/format_code.sh b/format_code.sh index 0eaf149..16d9cb5 100755 --- a/format_code.sh +++ b/format_code.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash set -euo pipefail -EXPECTED_VERSION="21.1.5" +EXPECTED_VERSION="21.1.8" root_folder=$(git rev-parse --show-toplevel) change_in_place=false diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp index f0f7072..c2064b4 100644 --- a/lib/gpc/Feature.hpp +++ b/lib/gpc/Feature.hpp @@ -39,7 +39,6 @@ #include //for log2 #include #include -#include #include #include #include @@ -96,32 +95,13 @@ class Feature { * @param params The parameters for this split * @param[in] trip The triplet */ - inline void getDecisions(bool& ref, - bool& pos, - bool& neg, - params& params, - const GPCPatchTriplet& trip) { - ref = - ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < - params.tau); - pos = - ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) < - params.tau); - neg = - ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < - params.tau); - } - - Feature() { - std::random_device rd2; - rng = std::mt19937(rd2()); - randIJ7 = std::uniform_int_distribution(0, 48); - randIJ17 = std::uniform_int_distribution(0, 17 * 17 - 1); - randIJ27 = std::uniform_int_distribution(0, 27 * 27 - 1); - - randTAU = std::uniform_int_distribution(-15, 15); - } + void getDecisions(bool& ref, + bool& pos, + bool& neg, + params& params, + const GPCPatchTriplet& trip); + Feature(); /** * @brief Returns a random hyperplane within a 27 x 27 * pixel-sized patch. depending on the scale @@ -132,50 +112,7 @@ class Feature { * @param scale Determines which patch size is used * @param params returns the parameters */ - void inline sampleHyperplane(int scale, params& params) { - if (scale == 2) { - params.i = params.j; // s.t. they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - int i = randIJ7(rng); - int j = randIJ7(rng); - params.ix = i % 7 - 3; - params.iy = i / 7 - 3; - params.jx = j % 7 - 3; - params.jy = j / 7 - 3; - - params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3); - params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3); - } - } else if (scale == 1) { - params.i = params.j; // s.t. they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - int i = randIJ17(rng); - int j = randIJ17(rng); - params.ix = i % 17 - 8; - params.iy = i / 17 - 8; - params.jx = j % 17 - 8; - params.jy = j / 17 - 8; - - params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8); - params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8); - } - } else if (scale == 0) { - params.i = params.j; // s.t. they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - params.i = randIJ27(rng); - params.j = randIJ27(rng); - params.ix = params.i % 27 - 13; - params.iy = params.i / 27 - 13; - params.jx = params.j % 27 - 13; - params.jy = params.j / 27 - 13; - - params.i = (params.ix + 13) + 27 * (params.iy + 13); - params.j = (params.jx + 13) + 27 * (params.jy + 13); - } - } - params.tau = randTAU(rng); - } - + void sampleHyperplane(int scale, params& params); /** * @brief Gets all descriptors (triplets) for an image pair for * training given the three keypoint vectors. @@ -193,56 +130,7 @@ class Feature { std::vector& ref, std::vector& pos, std::vector& neg, - std::vector& triplets) { - ndb::Buffer LL(bwL.rows(), bwL.cols()); - LL.width = bwL.width; - ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1); - LL.clearBoundary(); - - ndb::Buffer RR(bwL.rows(), bwL.cols()); - RR.width = bwR.width; - ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1); - RR.clearBoundary(); - - auto f = [=](ndb::Point& kp) { - if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 && - kp.y < bwL.rows() - 20) - return false; - else - return true; - }; - - for (std::vector::size_type i = 0; i != ref.size(); i++) { - if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) { - // Get all descriptors: - GPCPatchTriplet newPatch; - - // Reference patch - //==================================== - newPatch.ref.x = ref[i].x; - newPatch.ref.y = ref[i].y; - - LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27); - - // Extract a positive match in the right image - //==================================== - newPatch.pos.x = pos[i].x; - newPatch.pos.y = pos[i].y; - - RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27); - - // Extract negative patch - //==================================== - newPatch.neg.x = neg[i].x; - newPatch.neg.y = neg[i].y; - - RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27); - - triplets.push_back(std::move(newPatch)); - } - } - } - + std::vector& triplets); /** * @brief Store a vector of triplets of training data to file * @@ -250,17 +138,7 @@ class Feature { * @param path The path where we'd like to store the training data * in binary form. */ - void storeAllTriplets(std::vector& data, - std::string path) { - ofstream fout; - fout.open(path, ios::binary | ios::out); - for (auto& triplet : data) { - fout.write((char*)triplet.ref.feature.data(), 27 * 27); - fout.write((char*)triplet.pos.feature.data(), 27 * 27); - fout.write((char*)triplet.neg.feature.data(), 27 * 27); - } - fout.close(); - } + void storeAllTriplets(std::vector& data, std::string path); /** * @brief Read triplets of training data from a binary file * written by the storeAllTriplets method. @@ -269,33 +147,7 @@ class Feature { * * @return The training set */ - std::vector loadAllTriplets(std::string path) { - std::vector data; - std::ifstream in(path, std::ifstream::ate | std::ifstream::binary); - uint32_t filesize = in.tellg(); - if (filesize % ((27 * 27) * 3)) { - cout << "ERR: File is not a training set of this feature type" - << endl; - cout << "FS: " << filesize << endl; - return data; - } - int numSamples = filesize / ((27 * 27) * 3); - data.resize(numSamples); - ifstream fin; - fin.open(path, ios::binary | ios::in); - for (auto& datum : data) { - datum.ref.feature.resize(27, 27); - datum.pos.feature.resize(27, 27); - datum.neg.feature.resize(27, 27); - - fin.read((char*)datum.ref.feature.data(), 27 * 27); - fin.read((char*)datum.pos.feature.data(), 27 * 27); - fin.read((char*)datum.neg.feature.data(), 27 * 27); - } - fin.close(); - return data; - } - + std::vector loadAllTriplets(std::string path); }; // Feature } // namespace training } // namespace gpc diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp index 5f5f533..68c6a1f 100644 --- a/lib/gpc/Fern.hpp +++ b/lib/gpc/Fern.hpp @@ -167,10 +167,7 @@ OptimizerSettings TauOptimizer(int taulo, int tauhi, int numResamples, bool onlyScoreNonSplitSamples, - double w1) { - return OptimizerSettings( - taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1); -} + double w1); /** * @brief Optimzer setting factory for a zero fern * @@ -184,9 +181,7 @@ OptimizerSettings TauOptimizer(int taulo, */ OptimizerSettings ZeroOptimizer(int numResamples, bool onlyScoreNonSplitSamples, - double w1) { - return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1); -} + double w1); struct FernSettings { const int maxDepth; const int scale; @@ -231,62 +226,7 @@ class Fern { FernSettings fernsetting, OptimizerSettings optsetting, int scoreUntilLevel, - splitStats& s) { - s.tp = 0; - s.fn = 0; - s.fp = 0; - s.prec = 0.; - s.rec = 0.; - s.hmean = 0.; - s.convcomb = 0.; - s.tot = 0; - for (auto& triplet : data) { - uint64_t ref = 0, pos = 0, neg = 0; - // Score the first scoreUntilLevel levels of a given fern - for (int i = 0; i < scoreUntilLevel + 1; i++) { - ref <<= 1; - pos <<= 1; - neg <<= 1; - bool refDec, posDec, negDec; - - // Decisions need to be added into a codeword - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); - if (refDec) ref++; - if (posDec) pos++; - if (negDec) neg++; - } - // Only count those that haven't been true positives yet - // Ignore samples previously classified as True positive - if (!(triplet.pos.split == true && triplet.neg.split == true)) { - s.tot++; - // Decide which are equal (i.e. set the split indicators) - if (ref == pos) { // 110(TP), 111, 001(TP), 000 - if (ref != neg) { // 110 (TP), 001(TP) - s.tp++; - } else { // 111(FN), 000(FN) - s.fn++; - } - } else { // 100, 101, 011, 010 - if (ref != neg) { // 100(FN), 011(FN) FN - s.fn++; - } else { // 101(FP), 010(FP) - s.fp++; - } - } - } - } - - // Compute statistics of this split - double w2 = 1. - optsetting.w1_; - s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp); - s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn); - - s.hmean = (s.prec + s.rec == 0.) - ? 0. - : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec); - s.convcomb = (1. - w2) * s.prec + w2 * s.rec; - } + splitStats& s); /** * @brief Mark those samples in the set as "split" if they have been * correctly classified(ref=pos and pos!=neg) with the parameter @@ -298,26 +238,7 @@ class Fern { */ void markSplitSamples(std::vector& data, std::vector& params, - int numParams) { - for (auto& triplet : data) { - // Evaluate triplet on all given parameters - uint64_t ref = 0, pos = 0, neg = 0; - for (int i = 0; i < numParams; i++) { - ref <<= 1; // shift by one - pos <<= 1; // shift by one - neg <<= 1; // shift by one - bool refDec, posDec, negDec; - - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); - if (refDec) ref++; - if (posDec) pos++; - if (negDec) neg++; - } - if (ref == pos) triplet.pos.split = true; - if (ref != neg) triplet.neg.split = true; - } - } + int numParams); /** * @brief Reset the mark on the training samples on whether they have been * split correctly or not Since we do not operate on copies of the training @@ -325,12 +246,7 @@ class Fern { * * @param data */ - void resetMarkOnSamples(std::vector& data) { - for (auto& triplet : data) { - triplet.pos.split = false; - triplet.neg.split = false; - } - } + void resetMarkOnSamples(std::vector& data); /** * @brief Train a fern given a set of training data and some optimizer @@ -340,70 +256,21 @@ class Fern { * @param optsetting the optimizer settings */ void train(std::vector& trainingSamples, - OptimizerSettings optsetting) { - splitStats stats; - float maxScore = 0.f; - SplitParams_t bestParams; - - fernparams.resize(fernsettings.maxDepth); - - cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec" - << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" - << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale" - << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl; - if (optsetting.onlyScoreNonSplitSamples_) - resetMarkOnSamples(trainingSamples); - for (int level = 0; level < fernsettings.maxDepth; level++) { - maxScore = 0.f; - for (int k = 0; k < optsetting.numResamples_; k++) { - // Samples a hyperplane in the requested scale - Feature.sampleHyperplane(fernsettings.scale, fernparams[level]); - // Iterates over a small range of tau (intercept) - for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; - tau++) { - fernparams[level].tau = tau; - // Score hyperplane set we have so far - evalSplit(trainingSamples, - fernparams, - fernsettings, - optsetting, - level, - stats); - // If score exceeds previously best, replace paramset - if (stats.hmean > maxScore) { - bestParams = fernparams[level]; - maxScore = stats.hmean; - } - } // tau loop - } // k loop - // Store best performing parameters - fernparams[level] = bestParams; - - // Mark samples as split if they were labeled true positive - if (optsetting.onlyScoreNonSplitSamples_) - markSplitSamples(trainingSamples, fernparams, level); - cout << setw(7) << level << setw(10) << stats.prec << setw(10) - << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot - << setw(8) << stats.tp << setw(8) << stats.fp << setw(8) - << stats.fn << setw(6) << fernsettings.scale << setw(5) - << fernparams[level].tau << setw(5) << fernparams[level].i - << setw(5) << fernparams[level].j << endl; - } // level loop - } // train + OptimizerSettings optsetting); /** * @brief Returns the decision of the first five levels of the ferns * * @return The parameters. */ - std::vector getParameters() { return fernparams; } + std::vector getParameters(); /** * @brief Return the scale that this fern uses * * @return The scale */ - int getScale() { return fernsettings.scale; } + int getScale(); }; // Fern @@ -417,7 +284,10 @@ class Fern { * * @return */ -std::vector FernFactory(int num_S, int num_M, int num_L, int maxDepth) { +inline std::vector FernFactory(int num_S, + int num_M, + int num_L, + int maxDepth) { std::vector ferns; for (int i = 0; i < num_S; i++) ferns.push_back(Fern(FernSettings(maxDepth, 2))); diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp index dd6dce6..0b58119 100644 --- a/lib/gpc/buffer.hpp +++ b/lib/gpc/buffer.hpp @@ -33,6 +33,10 @@ #include #include +#include +#include +#include +#include #include #include @@ -80,6 +84,58 @@ struct Descriptor { bool operator<(const Descriptor& d) const { return state < d.state; } bool operator<=(const Descriptor& d) const { return state <= d.state; } int operator%(const int& d) const { return state % d; } + static void serialize(const std::string& filename, + const std::vector& data) { + std::ofstream outFile(filename); + if (!outFile.is_open()) { + std::cerr << "Error opening file for writing: " << filename + << std::endl; + return; + } + + for (const auto& desc : data) { + outFile << desc.point.x << "," << desc.point.y << "," << desc.state + << "\n"; + } + outFile.close(); + } + + /** + * Deserializes a CSV file back into a vector of Descriptors. + */ + static std::vector deserialize(const std::string& filename, + bool srcDescr) { + std::vector result; + std::ifstream inFile(filename); + if (!inFile.is_open()) { + std::cerr << "Error opening file for reading: " << filename + << std::endl; + return result; + } + + std::string line; + while (std::getline(inFile, line)) { + if (line.empty()) continue; + + std::stringstream ss(line); + std::string x_str, y_str, state_str; + + // Split by comma + if (std::getline(ss, x_str, ',') && std::getline(ss, y_str, ',') && + std::getline(ss, state_str, ',')) { + Descriptor d; + d.point.x = std::stod(x_str); + d.point.y = std::stod(y_str); + d.state = std::stoull(state_str); + d.srcDescr = srcDescr; + + // if (d.point.y > 200 && d.point.y < 400) + result.push_back(d); + } + } + inFile.close(); + return result; + } }; // Keeps support points with associated disparity // Support points are only used in the left image @@ -896,7 +952,7 @@ class RGBBuffer : public Buffer { free(rowPointers); } }; -Buffer getDisparityVisualization( +inline Buffer getDisparityVisualization( ndb::Buffer& srcImg, std::vector& validEstimateIndices, ndb::Buffer& disparity) { @@ -969,8 +1025,8 @@ Buffer getDisparityVisualization( } return dispVis; } -Buffer getDisparityVisualization(ndb::Buffer& srcImg, - std::vector& support) { +inline Buffer getDisparityVisualization( + ndb::Buffer& srcImg, std::vector& support) { float min_disparity = 0; float max_disparity = 128; Buffer dispVis(Eigen::Vector2i(srcImg.width, srcImg.rows())); diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp new file mode 100644 index 0000000..9e1322f --- /dev/null +++ b/lib/gpc/feature.cpp @@ -0,0 +1,212 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include +#include +#include //for log2 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace gpc { +namespace training { +void Feature::getDecisions(bool& ref, + bool& pos, + bool& neg, + params& params, + const GPCPatchTriplet& trip) { + ref = ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < + params.tau); + pos = ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) < + params.tau); + neg = ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < + params.tau); +} + +Feature::Feature() { + std::random_device rd2; + rng = std::mt19937(rd2()); + randIJ7 = std::uniform_int_distribution(0, 48); + randIJ17 = std::uniform_int_distribution(0, 17 * 17 - 1); + randIJ27 = std::uniform_int_distribution(0, 27 * 27 - 1); + + randTAU = std::uniform_int_distribution(-15, 15); +} +void Feature::sampleHyperplane(int scale, params& params) { + if (scale == 2) { + params.i = params.j; // s.t. they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + int i = randIJ7(rng); + int j = randIJ7(rng); + params.ix = i % 7 - 3; + params.iy = i / 7 - 3; + params.jx = j % 7 - 3; + params.jy = j / 7 - 3; + + params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3); + params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3); + } + } else if (scale == 1) { + params.i = params.j; // s.t. they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + int i = randIJ17(rng); + int j = randIJ17(rng); + params.ix = i % 17 - 8; + params.iy = i / 17 - 8; + params.jx = j % 17 - 8; + params.jy = j / 17 - 8; + + params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8); + params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8); + } + } else if (scale == 0) { + params.i = params.j; // s.t. they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + params.i = randIJ27(rng); + params.j = randIJ27(rng); + params.ix = params.i % 27 - 13; + params.iy = params.i / 27 - 13; + params.jx = params.j % 27 - 13; + params.jy = params.j / 27 - 13; + + params.i = (params.ix + 13) + 27 * (params.iy + 13); + params.j = (params.jx + 13) + 27 * (params.jy + 13); + } + } + params.tau = randTAU(rng); +} +void Feature::extractAllTriplets(ndb::Buffer& bwL, + ndb::Buffer& bwR, + std::vector& ref, + std::vector& pos, + std::vector& neg, + std::vector& triplets) { + ndb::Buffer LL(bwL.rows(), bwL.cols()); + LL.width = bwL.width; + ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1); + LL.clearBoundary(); + + ndb::Buffer RR(bwL.rows(), bwL.cols()); + RR.width = bwR.width; + ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1); + RR.clearBoundary(); + + auto f = [=](ndb::Point& kp) { + if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 && + kp.y < bwL.rows() - 20) + return false; + else + return true; + }; + + for (std::vector::size_type i = 0; i != ref.size(); i++) { + if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) { + // Get all descriptors: + GPCPatchTriplet newPatch; + + // Reference patch + //==================================== + newPatch.ref.x = ref[i].x; + newPatch.ref.y = ref[i].y; + + LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27); + + // Extract a positive match in the right image + //==================================== + newPatch.pos.x = pos[i].x; + newPatch.pos.y = pos[i].y; + + RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27); + + // Extract negative patch + //==================================== + newPatch.neg.x = neg[i].x; + newPatch.neg.y = neg[i].y; + + RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27); + + triplets.push_back(std::move(newPatch)); + } + } +} + +void Feature::storeAllTriplets(std::vector& data, + std::string path) { + ofstream fout; + fout.open(path, ios::binary | ios::out); + for (auto& triplet : data) { + fout.write((char*)triplet.ref.feature.data(), 27 * 27); + fout.write((char*)triplet.pos.feature.data(), 27 * 27); + fout.write((char*)triplet.neg.feature.data(), 27 * 27); + } + fout.close(); +} +std::vector Feature::loadAllTriplets( + std::string path) { + std::vector data; + std::ifstream in(path, std::ifstream::ate | std::ifstream::binary); + uint32_t filesize = in.tellg(); + if (filesize % ((27 * 27) * 3)) { + cout << "ERR: File is not a training set of this feature type" << endl; + cout << "FS: " << filesize << endl; + return data; + } + int numSamples = filesize / ((27 * 27) * 3); + data.resize(numSamples); + ifstream fin; + fin.open(path, ios::binary | ios::in); + for (auto& datum : data) { + datum.ref.feature.resize(27, 27); + datum.pos.feature.resize(27, 27); + datum.neg.feature.resize(27, 27); + + fin.read((char*)datum.ref.feature.data(), 27 * 27); + fin.read((char*)datum.pos.feature.data(), 27 * 27); + fin.read((char*)datum.neg.feature.data(), 27 * 27); + } + fin.close(); + return data; +} + +} // namespace training +} // namespace gpc diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp new file mode 100644 index 0000000..5a2632c --- /dev/null +++ b/lib/gpc/fern.cpp @@ -0,0 +1,204 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/Fern.hpp" + +#include +#include +#include +#include +#include + +#include "gpc/Feature.hpp" + +using namespace std; +namespace gpc { +namespace training { +OptimizerSettings TauOptimizer(int taulo, + int tauhi, + int numResamples, + bool onlyScoreNonSplitSamples, + double w1) { + return OptimizerSettings( + taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1); +} +OptimizerSettings ZeroOptimizer(int numResamples, + bool onlyScoreNonSplitSamples, + double w1) { + return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1); +} +void Fern::evalSplit(std::vector& data, + std::vector& params, + FernSettings fernsetting, + OptimizerSettings optsetting, + int scoreUntilLevel, + splitStats& s) { + s.tp = 0; + s.fn = 0; + s.fp = 0; + s.prec = 0.; + s.rec = 0.; + s.hmean = 0.; + s.convcomb = 0.; + s.tot = 0; + for (auto& triplet : data) { + uint64_t ref = 0, pos = 0, neg = 0; + // Score the first scoreUntilLevel levels of a given fern + for (int i = 0; i < scoreUntilLevel + 1; i++) { + ref <<= 1; + pos <<= 1; + neg <<= 1; + bool refDec, posDec, negDec; + + // Decisions need to be added into a codeword + Feature.getDecisions(refDec, posDec, negDec, params[i], triplet); + if (refDec) ref++; + if (posDec) pos++; + if (negDec) neg++; + } + // Only count those that haven't been true positives yet + // Ignore samples previously classified as True positive + if (!(triplet.pos.split == true && triplet.neg.split == true)) { + s.tot++; + // Decide which are equal (i.e. set the split indicators) + if (ref == pos) { // 110(TP), 111, 001(TP), 000 + if (ref != neg) { // 110 (TP), 001(TP) + s.tp++; + } else { // 111(FN), 000(FN) + s.fn++; + } + } else { // 100, 101, 011, 010 + if (ref != neg) { // 100(FN), 011(FN) FN + s.fn++; + } else { // 101(FP), 010(FP) + s.fp++; + } + } + } + } + + // Compute statistics of this split + double w2 = 1. - optsetting.w1_; + s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp); + s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn); + + s.hmean = (s.prec + s.rec == 0.) + ? 0. + : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec); + s.convcomb = (1. - w2) * s.prec + w2 * s.rec; +} +void Fern::markSplitSamples(std::vector& data, + std::vector& params, + int numParams) { + for (auto& triplet : data) { + // Evaluate triplet on all given parameters + uint64_t ref = 0, pos = 0, neg = 0; + for (int i = 0; i < numParams; i++) { + ref <<= 1; // shift by one + pos <<= 1; // shift by one + neg <<= 1; // shift by one + bool refDec, posDec, negDec; + + Feature.getDecisions(refDec, posDec, negDec, params[i], triplet); + if (refDec) ref++; + if (posDec) pos++; + if (negDec) neg++; + } + if (ref == pos) triplet.pos.split = true; + if (ref != neg) triplet.neg.split = true; + } +} +void Fern::resetMarkOnSamples(std::vector& data) { + for (auto& triplet : data) { + triplet.pos.split = false; + triplet.neg.split = false; + } +} + +void Fern::train(std::vector& trainingSamples, + OptimizerSettings optsetting) { + splitStats stats; + float maxScore = 0.f; + SplitParams_t bestParams; + + fernparams.resize(fernsettings.maxDepth); + + cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec" + << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" << setw(8) + << "FP" << setw(8) << "FN" << setw(6) << "scale" << setw(5) << "tau" + << setw(5) << "i" << setw(5) << "j" << endl; + if (optsetting.onlyScoreNonSplitSamples_) + resetMarkOnSamples(trainingSamples); + for (int level = 0; level < fernsettings.maxDepth; level++) { + maxScore = 0.f; + for (int k = 0; k < optsetting.numResamples_; k++) { + // Samples a hyperplane in the requested scale + Feature.sampleHyperplane(fernsettings.scale, fernparams[level]); + // Iterates over a small range of tau (intercept) + for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; tau++) { + fernparams[level].tau = tau; + // Score hyperplane set we have so far + evalSplit(trainingSamples, + fernparams, + fernsettings, + optsetting, + level, + stats); + // If score exceeds previously best, replace paramset + if (stats.hmean > maxScore) { + bestParams = fernparams[level]; + maxScore = stats.hmean; + } + } // tau loop + } // k loop + // Store best performing parameters + fernparams[level] = bestParams; + + // Mark samples as split if they were labeled true positive + if (optsetting.onlyScoreNonSplitSamples_) + markSplitSamples(trainingSamples, fernparams, level); + cout << setw(7) << level << setw(10) << stats.prec << setw(10) + << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot + << setw(8) << stats.tp << setw(8) << stats.fp << setw(8) + << stats.fn << setw(6) << fernsettings.scale << setw(5) + << fernparams[level].tau << setw(5) << fernparams[level].i + << setw(5) << fernparams[level].j << endl; + } // level loop +} // train + +std::vector Fern::getParameters() { return fernparams; } + +int Fern::getScale() { return fernsettings.scale; } + +} // namespace training +} // namespace gpc diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp deleted file mode 100644 index 49384f0..0000000 --- a/lib/gpc/filter.hpp +++ /dev/null @@ -1,1000 +0,0 @@ -// Copyright (c) 2018, ETH Zurich -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Code Author: Niklaus Bamert (bamertn@ethz.ch) -#ifndef __NDB__FILTER -#define __NDB__FILTER - -#include -#include - -#include "gpc/buffer.hpp" -using namespace std; - -#ifdef _INTRINSICS_SSE -#include -// greater and lesser than simd ops for unsigned 8bit integer (epu8) -#define _mm_cmpgt_epu8(v0, v1) \ - _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ - _mm_xor_si128(v1, _mm_set1_epi8(-128))) -#define _mm_cmplt_epu8(v1, v0) \ - _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ - _mm_xor_si128(v1, _mm_set1_epi8(-128))) -#endif -namespace ndb { -/** - * @brief Gets indices of non-zero values in array a. - * Credits: - * https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528 - * - * @param input array - * @param n number of input elements - * @param ind output array (indices into n of nonzero elements) - * @param m number of elements in output - */ -__attribute__((noinline)) void arr2ind(const unsigned char* a, - int n, - int* ind, - int* m) { -#ifdef _INTRINSICS_SSE - int i, m0, k; - __m256i msk; - m0 = 0; - for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ - msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), - _mm256_setzero_si256()); - k = _mm256_movemask_epi8(msk); - k = ~k; /* Search for nonzero bits instead of zero bits. */ - while (k) { - ind[m0] = - i + _tzcnt_u32( - k); /* Count the number of trailing zero bits in k. */ - m0++; - k = _blsr_u32(k); /* Clear the lowest set bit in k. */ - } - } - *m = m0; -#else - int nnz = 0; - for (int i = 0; i < n; i++) { - if (a[i] != 0) { - nnz++; - *ind = i; - ind++; - } - } - *m = nnz; -#endif -} -/** - * @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars - * (8x16bit) - * - * @param[in] x the 128 bit vector to be unpacked - * @param y0 The y 0 - * @param y1 The y 1 - */ -#ifdef _INTRINSICS_SSE -void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { - __m128i zero = _mm_setzero_si128(); - y0 = _mm_unpacklo_epi8(x, zero); - y1 = _mm_unpackhi_epi8(x, zero); -} -/** - * @brief Packs 2x128bit vars with 16bit values(where 8 upper bits are - * zero) into 1x128bit with 8bit values - * - * @param[in] x0 The x 0 - * @param[in] x1 The x 1 - * @param y the packed vector - */ -void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { - y = _mm_packus_epi16(x0, x1); -} - -#endif -/** - * @brief Calls a given functional f with subranges based on the given start - * and end indices. Here the functional is assumed to take two integer - * arguments indicating their respective start and end ranges. - * nThreads determines the number of threads the given range shall be - * split into. The range is inclusive on the lower bound and exclusive on the - * upper bound, i.e. [start,end) - * - * @param f function object (e.g. a lambda functional) - * @param start start of the range - * @param end end of the range - * @param nThreads number of threads to use - */ -void parFor(std::function const& f, - int start, - int end, - int nThreads) { - // Range definition - // quantities derived from range - int segSize = (end - start) / nThreads; - int lastSeg = (end - start) % nThreads; - - std::vector threads; - threads.reserve(nThreads); - - // Spawn threads - for (int t = 0; t < nThreads - 1; t++) { - threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); - } - threads.emplace_back(f, - start + (nThreads - 1) * segSize, - start + (nThreads)*segSize + lastSeg); - // Join - for (auto& t : threads) t.join(); -} - -/** - * @brief Naive 3x3 sobel filter implementation - * - * @param in input image - * @param blurred The blurred output image - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - * @param threshold threshold to binarize sobel filter output - */ -void sobelNaive( - uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - int thresholdSq = threshold * threshold; - uint8_t* ptr = in; - - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - - uint8_t* p21 = ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; - - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - - // output pointer - uint8_t* optr = gradient + 1 * width + 1; - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; - int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; - - int val = sx * sx + sy * sy; - - *optr = val > thresholdSq ? 255 : 0; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} -/** - * @brief Naive 3x3 box filter implementation - * - * @param in input image - * @param blurred The blurred output image - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - */ -void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - // allocate space for result - uint8_t* ptr = in; - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - - uint8_t* p21 = ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; - - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - uint8_t* optr = blurred + 1 * width + 1; - - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int res = - (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / - 9; - *optr = res; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Naive implementation - * - * @param in The input image. - * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param idx The gradient indices. Only used if no intrincs are available - * and the call gets forwarded to the naive implementation. - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - */ -void gpcFilterNaive(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector& idx, - int width, - int height) { - // output buffer of same size - uint32_t tmp; - - int j = 0; - for (auto k : idx) { - tmp = 0; - for (uint8_t i = 0; i < fastmask.size(); i += 2) { - tmp <<= 1; // shift by one - if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1])) - tmp++; // set this test's result to 1 - } - gpc[k] = tmp; - j++; - } -} -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Additionally uses a threshold vector (tau) Naive implementation. - * - * @param in The input image. - * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - */ -void gpcFilterTauNaive(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector tau, - std::vector& idx, - int width, - int height) { - uint32_t tmp; - - int j = 0; - for (auto k : idx) { - tmp = 0; - for (uint8_t i = 0; i < fastmask.size(); i += 2) { - tmp <<= 1; // shift by one - if (*(in + k + fastmask[i]) > - *(in + k + fastmask[i + 1]) - tau[i / 2]) - tmp++; // set this test's result to 1 - } - gpc[k] = tmp; - j++; - } -} /** - * @brief boxfilter using SSE2 instructions. Loosely based on - * https://www.ignorantus.com/box_sse2/, published under - * the https://creativecommons.org/publicdomain/zero/1.0/ licence. - * - * @param in input image - * @param blurred The blurred - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - */ -void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - boxNaive(in, blurred, width, height); -#else - auto boxFilterSegment = [&](int start, int end) { - int x, y; - __m128i one_third; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - dst0 = (__m128i*)(blurred + width * (start)); - dst1 = (__m128i*)(blurred + width * (start + 1)); - for (y = start; y < end; - y += 2) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2, *row3; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - row3 = row2 + width; - - for (x = 0; x < width; x += 16) { - __m128i s00, s01, s02; - __m128i r00, r01, r02; - __m128i ra00, ra01, ra02; - __m128i rb00, rb01, rb02; - - __m128i a00, a01, a02, b00, b01, b02; - - __m128i tmp0, tmp1, res; - - s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); - s02 = _mm_load_si128((__m128i*)(row0)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); - s02 = _mm_load_si128((__m128i*)(row1)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); - s02 = _mm_load_si128((__m128i*)(row2)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst0++, res); - - s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); - s02 = _mm_load_si128((__m128i*)(row3)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst1++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - row3 += 16; - } - // still storing 128bit, but now in 16 x 8bit format, so /16 instead - // of /8 - dst0 += width / 16; - dst1 += width / 16; - } - }; // lambda - - boxFilterSegment(1, height - 3); - // parFor(boxFilterSegment,1,height-3,4); -#endif -} -/** - * @brief 3x3 Sobel filter. Input dimension must be multiple of 16 - * - * @param in { parameter_description } - * @param blurred The blurred - * @param[in] width The width - * @param[in] height The height - * @param[in] threshold The threshold - * @param[in] numThreads number of threads to use - */ - -void sobel(uint8_t* in, - uint8_t* blurred, - int width, - int height, - uint8_t threshold, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - sobelNaive(in, blurred, width, height, threshold); -#else - auto sobelSSESegment = [&](int start, int end) { - __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - int x, y; - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. - - binThres = _mm_set1_epi16(threshold * threshold); - - dst0 = (__m128i*)(blurred + width * 1); - // dst1 = (__m128i *)(blurred + width * 2); - for (y = start; y < end; - y++) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - - for (x = 0; x < width; x += 16) { - // Note: Center element not used in sobel kernels!! - // Kernel indices: - // 00 01 02 - // 10 11 12 - // 20 21 22 - - __m128i a00, a01, a02, a10, a12, a20, a21, a22; - __m128i b00, b01, b02, b10, b12, b20, b21, b22; - - __m128i raA, raB, rbA, rbB; - __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; - - unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); - unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); - unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); - - unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); - unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); - - unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); - unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); - unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); - - // Sobel kernels for x and y direction. - // 1 0 -1 1 2 1 - // sx = 2 0 -2 sy = 0 0 0 - // 1 0 -1 -1-2-1 - // Note that neither kernel uses the center element) - - // In the following, mullo is used to multiply intermediate - // results with -1 To divide by 3, 16bit overflow divide by - // multiply is used, which thus uses the upper 16bit(_mm_mulhi) - // of the 32bit temporary result. - - // sx column kernel vectors (1,2,1) - // Two chained add/sub are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), - a10), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), - b10), - one_ninth); - - // sx column kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), - a12), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), - b12), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - sxa = _mm_mullo_epi16(tmpa, tmpa); - sxb = _mm_mullo_epi16(tmpb, tmpb); - - // sy row kernel vector (1,2,1) - // Two chained add are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), - a01), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), - b01), - one_ninth); - - // sy row kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), - a21), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), - b21), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - // watch out, can't overwrite this - sya = _mm_mullo_epi16(tmpa, tmpa); - syb = _mm_mullo_epi16(tmpb, tmpb); - - __m128i zero = _mm_setzero_si128(); - - // The unpacklo is necessary because _mm_cmput_epi16 sets the - // output to 0xFFFF if the comparison is true. When packing - // 16bit to 8bit however, 0xFFFF will be interpreted (in a - // signed environment) as being negative, and hence set to 0, - // resulting in a 0 output everywhere. using unpacklo in between - // we get 0xFFFF->0xFF - pack16to8( - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), - zero), - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), - zero), - res); - - _mm_store_si128(dst0++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - } // cols - } // rows - }; // Lambda - sobelSSESegment(1, height - 3); -#endif -} - -/** - * @brief Checks if the 128bits in xmm are all zero - * - * @param xmm - * - * @return true if all zeros, false otherwise - */ -#ifdef _INTRINSICS_SSE -inline bool isAllZeros(__m128i xmm) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == - 0xFFFF; -} -#endif -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Accelerated with SSE. - * - * @param in The input image. - * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param idx The gradient indices. Only used if no intrincs are available - * and the call gets forwarded to the naive implementation. - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - * @param numThreadsNumber of threads to use - */ -void gpcFilter(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector& idx, - int width, - int height, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); -#else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words - - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1]))), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } - } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - } - } // col iteration - } // row iteration - }; - - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); -#endif -} -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Additionally uses a threshold vector (tau) - * - * @param in The input image. - * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - * @param numThreads Number of threads to use - */ -void gpcFilterTau(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector tau, - std::vector& idx, - int width, - int height, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); -#else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words - - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), - _mm_subs_epi8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1])), - _mm_set1_epi8(tau[i / 2])) // deduct tau - ), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } - } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - } - } // col iteration - } // row iteration - }; - - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); -#endif -} -/** - * @brief Naive version of 5x5 census transoform - * - * @param in Input image - * @param census 32bit census transform output - * @param width Width of the image at *in pointer - * @param height Heiht of the image at *in pointer - */ -void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { - uint32_t val; - uint32_t* dst; - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x++) { - val = 0; - dst = census + y * width + x; - int i = 0; - // patch loops - for (int px = -2; px <= 2; px++) { - for (int py = -2; py <= 2; py++) { - if (!(px == 0 && py == 0)) { - val |= (in[(y + py) * width + (x + px)] > - in[y * width + x]) - ? (1 << i) - : 0; - i++; - } - } - } // End patch loops - *dst = val; - } - } // End pixel loops -} - -/** - * @brief 5x5 dense census transform of input image. binary codes are returned - * as a 32bit image - * - * @param in - * @param census - * @param width - * @param height - */ -void census5x5(uint8_t* in, uint32_t* census, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - census5x5Naive(in, census, width, height); -#else - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); - __m128i* dst = (__m128i*)(census + y * width + - x); // Set starting point to pixel (2,2) - // row 0 - __m128i bitMask = one; - __m128i byte1 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 2 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 4 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 8 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 16 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 1 - rowPtr += width; - bitMask += bitMask; // 32 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 64 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 128 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask = one; // 1 - __m128i byte2 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 2 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 2 - rowPtr += width; - bitMask += bitMask; // 4 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 8 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 16 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 32 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 3 - rowPtr += width; - bitMask += bitMask; // 64 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 128 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask = one; // 1 - __m128i byte3 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 2 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 4 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 4 - rowPtr += width; - bitMask += bitMask; // 8 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 16 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 32 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 64 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 128 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(byte3, zero); - __m128i high2 = _mm_unpackhi_epi8(byte3, zero); - __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); - __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - - } // col iteration - } // row iteration - // if(numThreads == 1) - // gpcFilterSegment(13,height-15); - // else - // parFor(gpcFilterSegment,13,height-15,4); - -#endif -} // census5x5 -} // namespace ndb -#endif diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp new file mode 100644 index 0000000..940eb33 --- /dev/null +++ b/lib/gpc/forest.cpp @@ -0,0 +1,380 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPC includes +#include + +#include "gpc/Feature.hpp" +#include "gpc/SintelOpticalFlow.hpp" +#include "gpc/SintelStereo.hpp" +#include "gpc/buffer.hpp" +#include "gpc/forest.hpp" +#include "gpc/hashmatch.hpp" +#include "gpc/kernels/box.hpp" +#include "gpc/kernels/gpc.hpp" +#include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/utils.hpp" + +namespace gpc { +namespace inference { +/** + * @brief Computes sparse matches on a pair of rectified and smoothed + * images. Here the src and tar images refer to the left and right images, + * respectively. + * + * @param src Preprocessed source(left) image + * @param tar Preprocessed target(right) image + * @param fastmask forest mask of relative integer offsets. + * + * @return + */ +std::vector Forest::depthPriorFast( + PreprocessedImage& src, + PreprocessedImage& tar, + FilterMask& fastmask, + InferenceSettings& settings) { + std::chrono::high_resolution_clock::time_point t0, t1; + std::vector statesSrc = evalFastMaskOnSubsetSSE( + src.smooth, src.grad, src.mask, fastmask, settings); + std::vector statesTar = evalFastMaskOnSubsetSSE( + tar.smooth, tar.grad, tar.mask, fastmask, settings); + // Epipolar mode. Use upper 32bit of 64bit descriptor to store y + // coordinate + if (settings.epipolarMode_) { + for (auto& el : statesSrc) el.state |= uint64_t(el.point.y) << 32; + for (auto& el : statesTar) el.state |= uint64_t(el.point.y) << 32; + } + // Use sort method for matching + if (settings.useHashtable_ == false) { + t0 = sysTick(); + std::vector corr = + findCorrespondences(statesSrc, statesTar); + t1 = sysTick(); + std::cout << "findCorrespondences (without allocation): " + << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; + std::cout << "length src: " << statesSrc.size() << std::endl; + return corr; + } + // Use hashtable matching + else { + for (auto& q : statesSrc) q.srcDescr = true; + for (auto& q : statesTar) q.srcDescr = false; + + ndb::Hashmatch hm(214673, + statesSrc.size() + statesTar.size()); + std::vector> corr; + for (auto& q : statesSrc) hm.insert(q); + for (auto& q : statesTar) hm.insert(q); + hm.getDuplicates(corr); + // Store vertices in a format that is more convenient for us: + std::vector corr2; + for (auto& e : corr) { + corr2.push_back(ndb::Correspondence(e.first.point, e.second.point)); + } + + return corr2; + } +} +std::vector Forest::findCorrespondences( + std::vector& srcStates, + std::vector& tarStates) { + int numStates = std::min(srcStates.size(), tarStates.size()); + // Limit search to rectified epipolar case. + std::sort(srcStates.begin(), srcStates.end()); + std::sort(tarStates.begin(), tarStates.end()); + std::vector corr; + uint32_t j = 0; + for (uint32_t i = 0; i < srcStates.size(); ++i) { + bool unique = true; + while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1]) + ++i, unique = false; + + if (unique) { + for (; j < tarStates.size() - 1; ++j) { + if (!(tarStates[j] < srcStates[i])) break; + } + + if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] && + ((j + 1) == tarStates.size() - 1 || + !(tarStates[j] == tarStates[j + 1]))) + corr.push_back(ndb::Correspondence(srcStates[i].point, + tarStates[j].point)); + } + } + return corr; +} + +/** + * @brief Evaluates a given forest mask on an image and returns the + * descriptors + * + * @param img The image + * @param grad gradient image + * @param idx offsets with high gradient pixels within the grad image + * @param fastmask the forest mask + * + * @return + */ +std::vector Forest::evalFastMaskOnSubsetSSE( + ndb::Buffer& img, + ndb::Buffer& grad, + std::vector& idx, + FilterMask& fastmask, + InferenceSettings& settings) { + std::chrono::high_resolution_clock::time_point t0, t1; + + // output buffer of same size + ndb::Buffer gpcstates(img.rows(), img.cols(), 0); + if (fastmask.type == 0) { + ndb::gpcFilter(img.data(), + grad.data(), + gpcstates.data(), + fastmask.mask, + idx, + img.cols(), + img.rows()); + } else { + ndb::gpcFilterTau(img.data(), + grad.data(), + gpcstates.data(), + fastmask.mask, + fastmask.tau, + idx, + img.cols(), + img.rows()); + } + std::vector out(idx.size()); + int j = 0; + + for (auto k : idx) { + int x = k % img.cols(); + int y = k / img.cols(); + out[j] = ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]); + j++; + } + return out; +} + +/** + * @brief Preprocesses an image. (smooth, binary sobel image and gradient + * pixel indices) + * + * @param img The raw input image to be preprocessed + * @param InferenceSettings inference settings struct + * + * @return the preprocessed image + */ +PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, + InferenceSettings settings) { + assert((settings.gradientThreshold_ >= 0 && + settings.gradientThreshold_ <= 255) && + "gradientThreshold needs to be within 0...255"); + + ndb::Buffer smooth(img.rows(), img.cols()); + + smooth.width = img.width; + // 0.2ms + ndb::box(img.data(), + smooth.data(), + img.cols(), + img.rows(), + settings.numThreads_); + // 4.2 *10^-5 ms + smooth.clearBoundary(); + ndb::Buffer grad(img.rows(), img.cols()); + grad.width = img.width; + // 4.2*10-5ms (unclear how) + ndb::sobel(img.data(), + grad.data(), + img.cols(), + img.rows(), + settings.gradientThreshold_, + settings.numThreads_); + gpc::inference::time_point t0 = gpc::inference::sysTick(); + ndb::Buffer idx; + idx.resize(grad.rows(), grad.cols()); + auto ff = [&](ndb::Buffer& in, std::vector& out, int m) { + for (int i = 0; i < m; i++) { + int x = in.data()[i] % grad.cols(); + int y = in.data()[i] / grad.cols(); + if (y >= 13 && y < grad.rows() - 13 && x >= 13 && + x < grad.cols() - 13) + out.push_back(in.data()[i]); + } + }; + int m; + // mask indexing gradient pixels + std::vector mask; + ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m); + ff(idx, mask, m); + + gpc::inference::time_point t1 = gpc::inference::sysTick(); + // Our outputs are: smooth, grad, mask; + return PreprocessedImage(smooth, grad, mask); +} +/** + * @brief Finds matches between two stereo images based on a given forest + * mask. + * + * @param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * @return Set of correspondences (ptSrc, ptTar) where + * ptSrc and ptTar are points in the source and target images, respectively. + */ +std::vector Forest::stereoMatch( + PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { + // make sure the delivered mask matches the image dimensions + assert((forestmask.width == simg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Source Image: dimension does not fit dimension of supplied forest " + "mask"); + assert((forestmask.width == timg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Targe Image: dimension does not fit dimension of supplied forest " + "mask"); + bool m_debug = false; + // Match + std::vector corr = + depthPriorFast(simg, timg, forestmask, settings); + + return corr; +} + +/** + * @brief Returns support (set of x,y coordinates and + * disparity) of a pair of images that have been rectified. + * + * @@param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * In practice, values between 5...20 produce good + * results. + * + * @return Set of supports (x,y,d) with x,y the coordinate + * of a point in the left image and d the disparity. + */ +std::vector Forest::rectifiedMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { + // Do matching + std::vector corr = + stereoMatch(simg, timg, forestmask, settings); + // Filter epipolar matches + std::vector supp; + for (auto& e : corr) { + // epipolar constraint + if (std::abs(e.srcPt.y - e.tarPt.y) <= settings.verticalTolerance_ + // disparity filter + && std::abs(e.srcPt.x - e.tarPt.x) <= settings.dispHigh_) + supp.push_back( + ndb::Support(e.srcPt.x, e.srcPt.y, e.srcPt.x - e.tarPt.x)); + } + return supp; +} + +/** + * @brief Reads text-based forest format and returns a mask for a given + * image size. + * + * @param path Path to the file that contains the forest. + * @param width 16-Byte aligned width of the image in pixels + * @param height height of the image in pixels + * + * @return + */ +FilterMask Forest::readForest(std::string path, int width, int height) { + std::ifstream ff(path); + + std::vector fastmask; + std::vector taus; + if (ff.fail()) { + cout << "Error opening forest file" << endl; + return FilterMask(fastmask, width, height, 0); + } + int numNonZeroTau = 0; + int numFerns; + int type; + ff >> numFerns; + cout << "number of ferns:" << numFerns << endl; + for (int i = 0; i < numFerns; i++) { + int fernID, numTests; + std::string fernScale; + ff >> fernID >> fernScale >> numTests; + for (int j = 0; j < numTests; j++) { + int levelID, ix, iy, jx, jy, tau; + ff >> levelID >> ix >> iy >> jx >> jy >> tau; + // Limit mask size to 32 binary tests + if (fastmask.size() < 64 && taus.size() < 32) { + fastmask.push_back(ix + iy * width); + fastmask.push_back(jx + jy * width); + taus.push_back(tau); + } else { + cout << "Note: A maximum of 32 fern features are allowed, " + "discarding " + "remainder of forest." + << endl; + } + if (tau != 0) numNonZeroTau++; + } + } + if (numNonZeroTau == 0) { + type = 0; // We have a zero forest (all tau=0) + FilterMask fm(fastmask, width, height, type); + return fm; + } else { + type = 1; // We have a tau forest (some tau!=0) + FilterMask fm(fastmask, taus, width, height, type); + return fm; + } +} + +} // namespace inference +} // namespace gpc diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp new file mode 100644 index 0000000..6226b83 --- /dev/null +++ b/lib/gpc/forest.hpp @@ -0,0 +1,280 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#ifndef _GPC_inference +#define _GPC_inference +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPC includes +#include "gpc/Feature.hpp" +#include "gpc/SintelOpticalFlow.hpp" +#include "gpc/SintelStereo.hpp" +#include "gpc/buffer.hpp" +#include "gpc/hashmatch.hpp" + +/** + * @brief The inference class of the GPC forest + * + */ +namespace gpc { +namespace inference { +typedef typename std::chrono::high_resolution_clock::time_point time_point; +inline std::chrono::high_resolution_clock::time_point sysTick() { + return std::chrono::high_resolution_clock::now(); +} +inline float tickToMs(std::chrono::high_resolution_clock::time_point t0, + std::chrono::high_resolution_clock::time_point t1) { + return std::abs( + 1000. * + std::chrono::duration_cast>(t1 - t0) + .count()); +} +struct InferenceSettings { + // Threshold to be used for edge detection. Can be 0...255. + // In practice, values between 5...20 produce good results. uint8_t + // gradientThreshold; + uint8_t gradientThreshold_ = 10; + // upper absolute limit for disparity in pixels. The lower (implied) limit + // is + // 0 + int dispHigh_ = 128; + // vertical deviation tolerance in pixels for corresponding features in + // rectified stereo images. + int verticalTolerance_ = 1; + // Whether to use epipolar mode on matching or not. + bool epipolarMode_ = false; + // Use hashtable to match extracted descriptors. Usually only faster with a + // large number of descriptors (> 100k) or when using multiple threads. Note + // that the hashtable method does not return a slightly reduced amount of + // matches as a result of the hash table implementation (small bucket size) + // if false, the descriptors are sorted and matched by iterating + // alternatingly through both sets. + bool useHashtable_ = false; + + // Number of threads to use for inference + int numThreads_ = 1; + + // Default contructor defaults to using a single thread + InferenceSettings(uint8_t gradientThreshold, + int dispHigh, + int verticalTolerance, + bool epipolarMode, + bool useHashtable, + int numThreads) + : gradientThreshold_(gradientThreshold), + dispHigh_(dispHigh), + verticalTolerance_(verticalTolerance), + epipolarMode_(epipolarMode), + useHashtable_(useHashtable), + numThreads_(numThreads) {} + + InferenceSettings() {} + InferenceSettings& builder(void) { return *this; } + InferenceSettings& gradientThreshold(uint8_t gradientThreshold) { + this->gradientThreshold_ = gradientThreshold; + return *this; + } + InferenceSettings& dispHigh(int dispHigh) { + this->dispHigh_ = dispHigh; + return *this; + } + InferenceSettings& verticalTolerance(int verticalTolerance) { + this->verticalTolerance_ = verticalTolerance; + return *this; + } + InferenceSettings& epipolarMode(bool epipolarMode) { + this->epipolarMode_ = epipolarMode; + return *this; + } + InferenceSettings& useHashtable(bool useHashtable) { + this->useHashtable_ = useHashtable; + return *this; + } + InferenceSettings& numThreads(int numThreads) { + if (numThreads > std::thread::hardware_concurrency()) + this->numThreads_ = std::thread::hardware_concurrency(); + else + this->numThreads_ = numThreads; + return *this; + } +}; +/** + * @brief FilterMask object that is returned by the forest reader + */ +struct FilterMask { + std::vector mask; + std::vector tau; + int width; + int height; + int type; + FilterMask(std::vector mask, int width, int height, int type) { + this->mask = mask; + this->width = width; + this->height = height; + this->type = type; + } + FilterMask(std::vector mask, + std::vector tau, + int width, + int height, + int type) { + this->mask = mask; + this->tau = tau; + this->width = width; + this->height = height; + this->type = type; + } +}; +struct PreprocessedImage { + ndb::Buffer smooth; + ndb::Buffer grad; + std::vector mask; + PreprocessedImage(ndb::Buffer& smooth, + ndb::Buffer& grad, + std::vector& mask) + : smooth(smooth), grad(grad), mask(mask) {}; +}; + +enum CorrMethod { sorting = 's', hashtable = 'h' }; +struct MatchStats { + double prec, rec, timeProp, timeMatch; + int numInlier, numStates, numMatches; +}; +class Forest { + public: + /** + * @brief Computes sparse matches on a pair of rectified and smoothed + * images. Here the src and tar images refer to the left and right images, + * respectively. + * + * @param src Preprocessed source(left) image + * @param tar Preprocessed target(right) image + * @param fastmask forest mask of relative integer offsets. + * + * @return + */ + std::vector depthPriorFast( + PreprocessedImage& src, + PreprocessedImage& tar, + FilterMask& fastmask, + InferenceSettings& settings); + static std::vector findCorrespondences( + std::vector& srcStates, + std::vector& tarStates); + /** + * @brief Evaluates a given forest mask on an image and returns the + * descriptors + * + * @param img The image + * @param grad gradient image + * @param idx offsets with high gradient pixels within the grad image + * @param fastmask the forest mask + * + * @return + */ + std::vector evalFastMaskOnSubsetSSE( + ndb::Buffer& img, + ndb::Buffer& grad, + std::vector& idx, + FilterMask& fastmask, + InferenceSettings& settings); + + /** + * @brief Preprocesses an image. (smooth, binary sobel image and gradient + * pixel indices) + * + * @param img The raw input image to be preprocessed + * @param InferenceSettings inference settings struct + * + * @return the preprocessed image + */ + PreprocessedImage preprocessImage(ndb::Buffer& img, + InferenceSettings settings); + /** + * @brief Finds matches between two stereo images based on a given forest + * mask. + * + * @param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * @return Set of correspondences (ptSrc, ptTar) where + * ptSrc and ptTar are points in the source and target images, respectively. + */ + std::vector stereoMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings); + /** + * @brief Returns support (set of x,y coordinates and + * disparity) of a pair of images that have been rectified. + * + * @@param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * In practice, values between 5...20 produce good + * results. + * + * @return Set of supports (x,y,d) with x,y the coordinate + * of a point in the left image and d the disparity. + */ + std::vector rectifiedMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings); + + /** + * @brief Reads text-based forest format and returns a mask for a given + * image size. + * + * @param path Path to the file that contains the forest. + * @param width 16-Byte aligned width of the image in pixels + * @param height height of the image in pixels + * + * @return + */ + FilterMask readForest(std::string path, int width, int height); +}; // forest class +} // namespace inference +} // namespace gpc + +#endif diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp index e1a887a..df1c840 100644 --- a/lib/gpc/inference.hpp +++ b/lib/gpc/inference.hpp @@ -48,8 +48,11 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/filter.hpp" #include "gpc/hashmatch.hpp" +#include "gpc/kernels/box.hpp" +#include "gpc/kernels/gpc.hpp" +#include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/utils.hpp" /** * @brief The inference class of the GPC forest @@ -295,8 +298,7 @@ class Forest { fastmask.mask, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } else { ndb::gpcFilterTau(img.data(), grad.data(), @@ -305,8 +307,7 @@ class Forest { fastmask.tau, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } std::vector out(idx.size()); int j = 0; @@ -465,7 +466,6 @@ class Forest { int numFerns; int type; ff >> numFerns; - cout << "number of ferns:" << numFerns << endl; for (int i = 0; i < numFerns; i++) { int fernID, numTests; std::string fernScale; diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp new file mode 100644 index 0000000..5b51b60 --- /dev/null +++ b/lib/gpc/kernels/box.cpp @@ -0,0 +1,195 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include "gpc/kernels/box.hpp" + +#include + +#include "gpc/kernels/utils.hpp" +namespace ndb { +namespace testing { +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); +} +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + // allocate space for result + uint8_t* ptr = in; + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + uint8_t* optr = blurred + 1 * width + 1; + + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + int res = + (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / + 9; + *optr = res; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +#if HWY_TARGET == HWY_AVX2 +/** + * @brief SSE implementation of the 3x3 box filter. + * Processed two rows at a time using fixed-point multiplication for division. + */ +#include +void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { + int start = 1; + int end = height - 3; + + int x, y; + __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1 + + __m128i* dst0 = (__m128i*)(blurred + width * start); + __m128i* dst1 = (__m128i*)(blurred + width * (start + 1)); + + for (y = start; y < end; y += 2) { + const uint8_t *row0, *row1, *row2, *row3; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + row3 = row2 + width; + + for (x = 0; x < width; x += 16) { + __m128i s00, s01, s02; + __m128i ra00, ra01, ra02, rb00, rb01, rb02; + __m128i a00, a01, a02, b00, b01, b02; + __m128i tmp0, tmp1, res; + + // Row 0 Processing + s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); + s02 = _mm_load_si128((__m128i*)(row0)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Row 1 Processing + s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); + s02 = _mm_load_si128((__m128i*)(row1)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Row 2 Processing + s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); + s02 = _mm_load_si128((__m128i*)(row2)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Accumulate rows 0, 1, 2 for dst0 + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third); + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst0++, res); + + // Row 3 Processing + s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); + s02 = _mm_load_si128((__m128i*)(row3)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Accumulate rows 1, 2, 3 for dst1 + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third); + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst1++, res); + + row0 += 16; + row1 += 16; + row2 += 16; + row3 += 16; + } + dst0 += width / 16; + dst1 += width / 16; + } +} +#endif +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#if defined(__ARM_NEON) || defined(__aarch64__) + testing::box_hwy(in, blurred, width, height); +#else +#if HWY_TARGET == HWY_AVX2 + boxSSE(in, blurred, width, height); +#else + boxNaive(in, blurred, width, height); +#endif +#endif +} +} // namespace ndb diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp new file mode 100644 index 0000000..b00dc88 --- /dev/null +++ b/lib/gpc/kernels/box.hpp @@ -0,0 +1,68 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_BOX +#define __NDB__KERNEL_BOX +using namespace std; + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Naive 3x3 box filter implementation + * + * @param in input image + * @param blurred The blurred output image + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + */ +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); + +/** + * @brief boxfilter using SSE2 instructions. Loosely based on + * https://www.ignorantus.com/box_sse2/, published under + * the https://creativecommons.org/publicdomain/zero/1.0/ licence. + * + * @param in input image + * @param blurred The blurred + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + */ +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); + +#if HWY_TARGET == HWY_AVX2 +void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); +#endif + +} // namespace ndb +#endif diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp new file mode 100644 index 0000000..ba2f5dd --- /dev/null +++ b/lib/gpc/kernels/box_hwy.cpp @@ -0,0 +1,111 @@ +#define HWY_TARGET HWY_NEON +#include + +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +void BoxKernel(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT blurred, + int width, + int height) { + const hn::ScalableTag d8; + const hn::Half d8_h; + const hn::Rebind d16; + + const size_t N = hn::Lanes(d8); + const auto divisor = hn::Set(d16, (uint16_t)7282); + + for (int y = 1; y < height - 2; y += 2) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + const uint8_t* r3 = in + (y + 2) * width; + + uint8_t* out0 = blurred + y * width + 1; + uint8_t* out1 = blurred + (y + 1) * width + 1; + + for (int x = 0; x < width; x += N) { + auto v00 = hn::LoadU(d8, r0 + x); + auto v01 = hn::LoadU(d8, r0 + x + 1); + auto v02 = hn::LoadU(d8, r0 + x + 2); + auto v10 = hn::LoadU(d8, r1 + x); + auto v11 = hn::LoadU(d8, r1 + x + 1); + auto v12 = hn::LoadU(d8, r1 + x + 2); + auto v20 = hn::LoadU(d8, r2 + x); + auto v21 = hn::LoadU(d8, r2 + x + 1); + auto v22 = hn::LoadU(d8, r2 + x + 2); + auto v30 = hn::LoadU(d8, r3 + x); + auto v31 = hn::LoadU(d8, r3 + x + 1); + auto v32 = hn::LoadU(d8, r3 + x + 2); + + // Lower Half Math + auto s1_lo = + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), + hn::PromoteTo(d16, hn::LowerHalf(v12)))); + auto s2_lo = + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), + hn::PromoteTo(d16, hn::LowerHalf(v22)))); + + auto row0_lo = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), + hn::PromoteTo(d16, hn::LowerHalf(v02)))), + hn::Add(s1_lo, s2_lo)); + auto row1_lo = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), + hn::PromoteTo(d16, hn::LowerHalf(v32)))), + hn::Add(s1_lo, s2_lo)); + + // Upper Half Math + auto s1_hi = + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)))); + auto s2_hi = + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22)))); + + auto row0_hi = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), + hn::Add(s1_hi, s2_hi)); + auto row1_hi = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), + hn::Add(s1_hi, s2_hi)); + + hn::StoreU(hn::OrderedDemote2To(d8, + hn::MulHigh(row0_lo, divisor), + hn::MulHigh(row0_hi, divisor)), + d8, + out0 + x); + hn::StoreU(hn::OrderedDemote2To(d8, + hn::MulHigh(row1_lo, divisor), + hn::MulHigh(row1_hi, divisor)), + d8, + out1 + x); + } + } +} +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { +// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { + // ndb::N_NEON::BoxKernel(in, blurred, width, height); + HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height); +} +// #endif +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp new file mode 100644 index 0000000..ae39d71 --- /dev/null +++ b/lib/gpc/kernels/box_hwy.hpp @@ -0,0 +1,18 @@ +#ifndef __NDB__KERNEL_BOX_HWY +#define __NDB__KERNEL_BOX_HWY + +#include + +namespace ndb { + +namespace testing { +/** + * Entry point for benchmarking the MulHigh (approximate) version. + */ +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); + +} // namespace testing + +} // namespace ndb + +#endif // GPC_KERNELS_BOX_HWY_H_ diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp new file mode 100644 index 0000000..8b265cc --- /dev/null +++ b/lib/gpc/kernels/census.cpp @@ -0,0 +1,203 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/kernels/census.hpp" + +#include +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { + uint32_t val; + uint32_t* dst; + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x++) { + val = 0; + dst = census + y * width + x; + int i = 0; + // patch loops + for (int px = -2; px <= 2; px++) { + for (int py = -2; py <= 2; py++) { + if (!(px == 0 && py == 0)) { + val |= (in[(y + py) * width + (x + px)] > + in[y * width + x]) + ? (1 << i) + : 0; + i++; + } + } + } // End patch loops + *dst = val; + } + } // End pixel loops +} +void census5x5(uint8_t* in, uint32_t* census, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + census5x5Naive(in, census, width, height); +#else + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); + __m128i* dst = (__m128i*)(census + y * width + + x); // Set starting point to pixel (2,2) + // row 0 + __m128i bitMask = one; + __m128i byte1 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 2 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 4 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 8 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 16 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 1 + rowPtr += width; + bitMask += bitMask; // 32 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 64 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 128 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask = one; // 1 + __m128i byte2 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 2 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 2 + rowPtr += width; + bitMask += bitMask; // 4 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 8 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 16 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 32 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 3 + rowPtr += width; + bitMask += bitMask; // 64 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 128 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask = one; // 1 + __m128i byte3 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 2 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 4 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 4 + rowPtr += width; + bitMask += bitMask; // 8 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 16 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 32 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 64 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 128 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(byte3, zero); + __m128i high2 = _mm_unpackhi_epi8(byte3, zero); + __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); + __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + + } // col iteration + } // row iteration + // if(numThreads == 1) + // gpcFilterSegment(13,height-15); + // else + // parFor(gpcFilterSegment,13,height-15,4); + +#endif +} // census5x5 diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp new file mode 100644 index 0000000..054a45f --- /dev/null +++ b/lib/gpc/kernels/census.hpp @@ -0,0 +1,60 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_CENSUS +#define __NDB__KERNEL_CENSUS + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Naive version of 5x5 census transoform + * + * @param in Input image + * @param census 32bit census transform output + * @param width Width of the image at *in pointer + * @param height Heiht of the image at *in pointer + */ +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); + +/** + * @brief 5x5 dense census transform of input image. binary codes are returned + * as a 32bit image + * + * @param in + * @param census + * @param width + * @param height + */ +void census5x5(uint8_t* in, uint32_t* census, int width, int height); + +} // namespace ndb +#endif diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp new file mode 100644 index 0000000..7bdf97b --- /dev/null +++ b/lib/gpc/kernels/gpc.cpp @@ -0,0 +1,257 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/kernels/gpc.hpp" + +#include +namespace ndb { +void gpcFilterNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height) { + // output buffer of same size + uint32_t tmp; + + int j = 0; + for (auto k : idx) { + tmp = 0; + for (uint8_t i = 0; i < fastmask.size(); i += 2) { + tmp <<= 1; // shift by one + if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1])) + tmp++; // set this test's result to 1 + } + gpc[k] = tmp; + j++; + } +} + +void gpcFilterTauNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height) { + uint32_t tmp; + + int j = 0; + for (auto k : idx) { + tmp = 0; + for (uint8_t i = 0; i < fastmask.size(); i += 2) { + tmp <<= 1; // shift by one + if (*(in + k + fastmask[i]) > + *(in + k + fastmask[i + 1]) - tau[i / 2]) + tmp++; // set this test's result to 1 + } + gpc[k] = tmp; + j++; + } +} + +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) +bool isAllZeros(__m128i xmm) { + return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == + 0xFFFF; +} +void gpcFilterSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height) { + const int start = 13; + const int end = height - 15; + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128((__m128i*)(center + fastmask[i])), + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i + 1]))), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; + k++; + } else { + bitMask += bitMask; + } + } + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration +} +#endif +void gpcFilter(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#if defined(__ARM_NEON) || defined(__aarch64__) + // Replace with call to highway + gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); +#else +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height); +#else + gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); +#endif +#endif +} + +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) +void gpcFilterTauSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height) { + const int start = 13; + const int end = height - 15; + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128((__m128i*)(center + fastmask[i])), + _mm_subs_epi8( + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i + 1])), + _mm_set1_epi8(tau[i / 2])) // deduct tau + ), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; + k++; + } else { + bitMask += bitMask; + } + } + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration +} +#endif + +void gpcFilterTau(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#if defined(__ARM_NEON) || defined(__aarch64__) + // Replace with call to highway + gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); +#else +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height); +#else + gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); +#endif +#endif +} +} // namespace ndb diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp new file mode 100644 index 0000000..bdd4bf4 --- /dev/null +++ b/lib/gpc/kernels/gpc.hpp @@ -0,0 +1,151 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_GPC +#define __NDB__KERNEL_GPC +using namespace std; + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Accelerated with SSE. + * + * @param in The input image. + * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param idx The gradient indices. Only used if no intrincs are available + * and the call gets forwarded to the naive implementation. + * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + */ +void gpcFilter(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height); + +/** + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Naive implementation + * + * @param in The input image. + * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param idx The gradient indices. Only used if no intrincs are available + * and the call gets forwarded to the naive implementation. + * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + */ +void gpcFilterNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height); +/** + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Additionally uses a threshold vector (tau) + * + * @param in The input image. + * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + */ +void gpcFilterTau(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height); + +/** + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Additionally uses a threshold vector (tau) Naive implementation. + * + * @param in The input image. + * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + */ +void gpcFilterTauNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height); +/** + * @brief Checks if the 128bits in xmm are all zero + * + * @param xmm + * + * @return true if all zeros, false otherwise + */ +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) +bool isAllZeros(__m128i xmm); +void gpcFilterTauSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height); +void gpcFilterSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height); + +#endif + +} // namespace ndb +#endif diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp new file mode 100644 index 0000000..d84710a --- /dev/null +++ b/lib/gpc/kernels/gpc_hwy.cpp @@ -0,0 +1,158 @@ +// #define HWY_TARGET HWY_NEON +#include "gpc_hwy.hpp" +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +// dense! +#include + +namespace hn = hwy::HWY_NAMESPACE; + +// Dense Version +void GPCKernel(const uint8_t* HWY_RESTRICT in, + const uint8_t* HWY_RESTRICT grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height) { + const hn::ScalableTag d8; + const hn::ScalableTag d32; + const size_t N = hn::Lanes(d8); + + const int border = 13; + const auto v_zero8 = hn::Zero(d8); + const auto v_one8 = hn::Set(d8, 1); + const int32_t* fm = fastmask.data(); + + for (int y = border; y < height - border; ++y) { + const int row_base = y * width; + uint32_t* HWY_RESTRICT row_out = gpc + row_base; + + for (int x = border; x <= width - border - (int)N; x += N) { + const int k = row_base + x; + + auto v_acc0 = hn::Zero(d8); // Bits 0-7 + auto v_acc1 = hn::Zero(d8); // Bits 8-15 + auto v_acc2 = hn::Zero(d8); // Bits 16-23 + auto v_acc3 = hn::Zero(d8); // Bits 24-31 + + // Pass 1: Bits 0-7 + for (int i = 0; i < 16; i += 2) { + v_acc0 = hn::Add(v_acc0, v_acc0); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); + v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 2: Bits 8-15 + for (int i = 16; i < 32; i += 2) { + v_acc1 = hn::Add(v_acc1, v_acc1); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); + v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 3: Bits 16-23 + for (int i = 32; i < 48; i += 2) { + v_acc2 = hn::Add(v_acc2, v_acc2); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); + v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 4: Bits 24-31 + for (int i = 48; i < 64; i += 2) { + v_acc3 = hn::Add(v_acc3, v_acc3); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); + v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + //extract and combine: + for (size_t lane = 0; lane < N; ++lane) { + uint32_t final_val = + (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) | + (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) | + (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) | + (uint32_t(hn::ExtractLane(v_acc3, lane))); + row_out[x + lane] = final_val; + } + } + } +} +void GPCKerneli(const uint8_t* HWY_RESTRICT in, + const uint8_t* HWY_RESTRICT grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height) { + // We use the ScalableTag, but we will "Narrow" our view manually + const hn::ScalableTag d32; + const hn::Rebind + d8_n; // Same number of lanes as d32 + + const size_t N = hn::Lanes(d32); + const auto v_zero = hn::Zero(d32); + const bool use_tau = !tau.empty(); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; x += N) { + const uint8_t* centerGrad = grad + y * width + x; + + // Load the gradient bytes for the current N lanes + auto v_grad = hn::LoadU(d8_n, centerGrad); + + // Promotion-free zero check + if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) { + continue; + } + + auto v_tmp = hn::Zero(d32); + + for (size_t i = 0; i < fastmask.size(); i += 2) { + v_tmp = hn::ShiftLeft<1>(v_tmp); + + // Promote N lanes of uint8 to N lanes of uint32 + auto v1 = hn::PromoteTo( + d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i])); + auto v2 = hn::PromoteTo( + d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1])); + + hn::Mask mask; + if (use_tau) { + auto v_tau = hn::Set(d32, tau[i / 2]); + mask = hn::Gt(v1, hn::Sub(v2, v_tau)); + } else { + mask = hn::Gt(v1, v2); + } + + v_tmp = hn::Add(v_tmp, + hn::IfThenElse(mask, hn::Set(d32, 1), v_zero)); + } + + hn::StoreU(v_tmp, d32, gpc + y * width + x); + } + } +} + +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { +void gpc_hwy(uint8_t* in, + uint8_t* grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height) { + HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height); +} +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp new file mode 100644 index 0000000..8a83751 --- /dev/null +++ b/lib/gpc/kernels/gpc_hwy.hpp @@ -0,0 +1,23 @@ +#ifndef __NDB__KERNEL_GPC_HWY +#define __NDB__KERNEL_GPC_HWY + +#include + +#include + +namespace ndb { + +namespace testing { +void gpc_hwy(uint8_t* in, + uint8_t* grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height); + +} + +} // namespace ndb + +#endif // GPC_KERNELS_SOBEL_HWY_H_ diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp new file mode 100644 index 0000000..becaf50 --- /dev/null +++ b/lib/gpc/kernels/sobel.cpp @@ -0,0 +1,201 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/kernels/sobel.hpp" + +#include + +#include "gpc/kernels/utils.hpp" +namespace ndb { +namespace testing { +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +} +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + int thresholdSq = threshold * threshold; + uint8_t* ptr = in; + + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + + // output pointer + uint8_t* optr = gradient + 1 * width + 1; + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + // Approximate division by 9 with fixed-point multiplication (2^16/9 + // = 7282) + int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33); + int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33); + + int sx = (static_cast(sum_x) * 7282) >> 16; + int sy = (static_cast(sum_y) * 7282) >> 16; + int val = sx * sx + sy * sy; + + *optr = val > thresholdSq ? 255 : 0; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +// #ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 +#include + +void sobelSSE(const uint8_t* in, + uint8_t* blurred, + int width, + int start, + int end, + uint8_t threshold) { + __m128i zero = _mm_setzero_si128(); + __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9 + __m128i binThres = _mm_set1_epi16(threshold * threshold); + + for (int y = start; y < end; y++) { + const uint8_t* row1 = in + y * width; + const uint8_t* row0 = row1 - width; + const uint8_t* row2 = row1 + width; + + // Output destination for this specific row + __m128i* dst = (__m128i*)(blurred + y * width + 1); + + for (int x = 0; x < width; x += 16) { + __m128i a00, a01, a02, a10, a12, a20, a21, a22; + __m128i b00, b01, b02, b10, b12, b20, b21, b22; + __m128i raA, raB, rbA, rbB; + __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; + + // Load and unpack 3x3 neighborhood (excluding center a11/b11) + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02); + + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10); + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12); + + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22); + + // --- SX Calculation --- + // Left col (1,2,1) + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), + one_ninth); + // Right col (-1,-2,-1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), + one_ninth); + + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + sxa = _mm_mullo_epi16(tmpa, tmpa); + sxb = _mm_mullo_epi16(tmpb, tmpb); + + // --- SY Calculation --- + // Top row (1,2,1) + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), + one_ninth); + // Bottom row (-1,-2,-1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), + one_ninth); + + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + sya = _mm_mullo_epi16(tmpa, tmpa); + syb = _mm_mullo_epi16(tmpb, tmpb); + + // --- Thresholding and Packing --- + pack16to8( + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero), + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero), + res); + + _mm_storeu_si128(dst++, res); + } + } +} +#endif +void sobel(uint8_t* in, + uint8_t* blurred, + int width, + int height, + uint8_t threshold, + int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#if defined(__ARM_NEON) || defined(__aarch64__) + sobelNaive(in, blurred, width, height, threshold); + // testing::sobel_hwy(in, blurred, width, height, threshold); // not exact! +#else +#ifndef _INTRINSICS_SSE + sobelNaive(in, blurred, width, height, threshold); +#else + sobelSSE(in, blurred, width, 1, height - 1, threshold); +#endif +#endif +} +} // namespace ndb diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp new file mode 100644 index 0000000..312a70c --- /dev/null +++ b/lib/gpc/kernels/sobel.hpp @@ -0,0 +1,77 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_SOBEL +#define __NDB__KERNEL_SOBEL +#include "gpc/buffer.hpp" + +namespace ndb { +#if HWY_TARGET == HWY_AVX2 +void sobelSSE(const uint8_t* in, + uint8_t* blurred, + int width, + int start, + int end, + uint8_t threshold); + +#endif +/** + * @brief Naive 3x3 sobel filter implementation + * + * @param in input image + * @param blurred The blurred output image + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + * @param threshold threshold to binarize sobel filter output + */ +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold); + +/** + * @brief 3x3 Sobel filter. Input dimension must be multiple of 16 + * + * @param in { parameter_description } + * @param blurred The blurred + * @param[in] width The width + * @param[in] height The height + * @param[in] threshold The threshold + * @param[in] numThreads number of threads to use + */ + +void sobel(uint8_t* in, + uint8_t* blurred, + int width, + int height, + uint8_t threshold, + int numThreads); +} // namespace ndb +#endif diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp new file mode 100644 index 0000000..493b52d --- /dev/null +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -0,0 +1,125 @@ +// #define HWY_TARGET HWY_NEON +#include + +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; +void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT gradient, + int width, + int height, + uint8_t threshold) { + const hn::ScalableTag d8; + const hn::Half d8_h; + const hn::Rebind d16; + // d32 has half the lanes of d16 + const hn::Rebind> d32; + + const size_t N = hn::Lanes(d8); + const auto vDivMult = hn::Set(d16, (int16_t)7282); + const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold); + const auto v255_16 = hn::Set(d16, (int16_t)255); + const auto v255_8 = hn::Set(d8, (uint8_t)255); + const auto v0_8 = hn::Zero(d8); + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + uint8_t* out = gradient + y * width + 1; + + for (int x = 0; x < width; x += N) { + auto v11 = hn::LoadU(d8, r0 + x); + auto v12 = hn::LoadU(d8, r0 + x + 1); + auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); + auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); + auto v32 = hn::LoadU(d8, r2 + x + 1); + auto v33 = hn::LoadU(d8, r2 + x + 2); + + // Helper to process 8 pixels into a 16-bit mask-like result + auto process_half = [&](auto p11, + auto p12, + auto p13, + auto p21, + auto p23, + auto p31, + auto p32, + auto p33) { + // Sobel derivatives in 16-bit + auto sx16 = hn::MulHigh( + hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), + vDivMult); + auto sy16 = hn::MulHigh( + hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), + vDivMult); + + // Magnitude squared in 32-bit + auto sx_lo = hn::PromoteLowerTo(d32, sx16); + auto sy_lo = hn::PromoteLowerTo(d32, sy16); + auto mag_lo = + hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo)); + + auto sx_hi = hn::PromoteUpperTo(d32, sx16); + auto sy_hi = hn::PromoteUpperTo(d32, sy16); + auto mag_hi = + hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi)); + + // Comparison in 32-bit, returning 16-bit values (0 or 255) to + // avoid mask issues + auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), + hn::Set(d32, 255), + hn::Zero(d32)); + auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), + hn::Set(d32, 255), + hn::Zero(d32)); + + return hn::OrderedDemote2To(d16, m_lo, m_hi); + }; + + // Process halves using standard Highway promotion + auto res_lo = process_half(hn::PromoteLowerTo(d16, v11), + hn::PromoteLowerTo(d16, v12), + hn::PromoteLowerTo(d16, v13), + hn::PromoteLowerTo(d16, v21), + hn::PromoteLowerTo(d16, v23), + hn::PromoteLowerTo(d16, v31), + hn::PromoteLowerTo(d16, v32), + hn::PromoteLowerTo(d16, v33)); + + auto res_hi = process_half(hn::PromoteUpperTo(d16, v11), + hn::PromoteUpperTo(d16, v12), + hn::PromoteUpperTo(d16, v13), + hn::PromoteUpperTo(d16, v21), + hn::PromoteUpperTo(d16, v23), + hn::PromoteUpperTo(d16, v31), + hn::PromoteUpperTo(d16, v32), + hn::PromoteUpperTo(d16, v33)); + + // Final store: 16-bit to 8-bit demotion + auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi); + hn::StoreU(final_val, d8, out + x); + } + } +} + +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { +// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { + // ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); + HWY_STATIC_DISPATCH(SobelKernelNoDiv)( + in, blurred, width, height, threshold); +} +// #endif +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp new file mode 100644 index 0000000..7c5d4ce --- /dev/null +++ b/lib/gpc/kernels/sobel_hwy.hpp @@ -0,0 +1,18 @@ +#ifndef __NDB__KERNEL_SOBEL_HWY +#define __NDB__KERNEL_SOBEL_HWY + +#include + +namespace ndb { + +namespace testing { +/** + * Entry point for benchmarking the MulHigh (approximate) version. + */ +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +} // namespace testing + +} // namespace ndb + +#endif // GPC_KERNELS_SOBEL_HWY_H_ diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp new file mode 100644 index 0000000..1b1ab4b --- /dev/null +++ b/lib/gpc/kernels/utils.cpp @@ -0,0 +1,105 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/kernels/utils.hpp" + +#include +#include +#include + +using namespace std; + +namespace ndb { +void arr2ind(const unsigned char* a, int n, int* ind, int* m) { +#if HWY_TARGET == HWY_AVX2 + int i, m0, k; + __m256i msk; + m0 = 0; + for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ + msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), + _mm256_setzero_si256()); + k = _mm256_movemask_epi8(msk); + k = ~k; /* Search for nonzero bits instead of zero bits. */ + while (k) { + ind[m0] = + i + _tzcnt_u32( + k); /* Count the number of trailing zero bits in k. */ + m0++; + k = _blsr_u32(k); /* Clear the lowest set bit in k. */ + } + } + *m = m0; +#else + int nnz = 0; + for (int i = 0; i < n; i++) { + if (a[i] != 0) { + nnz++; + *ind = i; + ind++; + } + } + *m = nnz; +#endif +} +#if HWY_TARGET == HWY_AVX2 +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { + __m128i zero = _mm_setzero_si128(); + y0 = _mm_unpacklo_epi8(x, zero); + y1 = _mm_unpackhi_epi8(x, zero); +} +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { + y = _mm_packus_epi16(x0, x1); +} + +#endif +void parFor(std::function const& f, + int start, + int end, + int nThreads) { + // Range definition + // quantities derived from range + int segSize = (end - start) / nThreads; + int lastSeg = (end - start) % nThreads; + + std::vector threads; + threads.reserve(nThreads); + + // Spawn threads + for (int t = 0; t < nThreads - 1; t++) { + threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); + } + threads.emplace_back(f, + start + (nThreads - 1) * segSize, + start + (nThreads)*segSize + lastSeg); + // Join + for (auto& t : threads) t.join(); +} + +} // namespace ndb diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp new file mode 100644 index 0000000..3985c1e --- /dev/null +++ b/lib/gpc/kernels/utils.hpp @@ -0,0 +1,105 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#ifndef __NDB__KERNEL_UTILS +#define __NDB__KERNEL_UTILS + +#include + +#include +#include + +#include "gpc/buffer.hpp" +using namespace std; + +#if HWY_TARGET == HWY_AVX2 +#include +// greater and lesser than simd ops for unsigned 8bit integer (epu8) +#define _mm_cmpgt_epu8(v0, v1) \ + _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ + _mm_xor_si128(v1, _mm_set1_epi8(-128))) +#define _mm_cmplt_epu8(v1, v0) \ + _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ + _mm_xor_si128(v1, _mm_set1_epi8(-128))) +#endif +namespace ndb { +/** + * @brief Gets indices of non-zero values in array a. + * Credits: + * https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528 + * + * @param input array + * @param n number of input elements + * @param ind output array (indices into n of nonzero elements) + * @param m number of elements in output + */ +void arr2ind(const unsigned char* a, int n, int* ind, int* m); + +#if HWY_TARGET == HWY_AVX2 +/** + * @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars + * (8x16bit) + * + * @param[in] x the 128 bit vector to be unpacked + * @param y0 The y 0 + * @param y1 The y 1 + */ +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1); + +/** + * @brief Packs 2x128bit vars with 16bit values(where 8 upper bits are + * zero) into 1x128bit with 8bit values + * + * @param[in] x0 The x 0 + * @param[in] x1 The x 1 + * @param y the packed vector + */ +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y); +#endif +/** + * @brief Calls a given functional f with subranges based on the given start + * and end indices. Here the functional is assumed to take two integer + * arguments indicating their respective start and end ranges. + * nThreads determines the number of threads the given range shall be + * split into. The range is inclusive on the lower bound and exclusive on the + * upper bound, i.e. [start,end) + * + * @param f function object (e.g. a lambda functional) + * @param start start of the range + * @param end end of the range + * @param nThreads number of threads to use + */ +void parFor(std::function const& f, + int start, + int end, + int nThreads); + +} // namespace ndb +#endif diff --git a/lib/gpc/training.hpp b/lib/gpc/training.hpp index f1e398b..8d557cc 100644 --- a/lib/gpc/training.hpp +++ b/lib/gpc/training.hpp @@ -49,7 +49,6 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/filter.hpp" #include "gpc/hashmatch.hpp" namespace gpc { diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3bbe11f..14f4a2c 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,10 +1,13 @@ add_executable(extract extract.cpp) -target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(extract gpc_core) add_executable(train train.cpp) -target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(train gpc_core) add_executable(sparsematch sparsematch.cpp) -target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(sparsematch gpc_core) + +add_executable(target target.cpp) +target_link_libraries(target gpc_core) diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index ed43016..b25a96f 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -1,12 +1,35 @@ +#include + #include -#include "gpc/inference.hpp" +#include "gpc/forest.hpp" using namespace std; +std::vector gpcFilterDense( + uint8_t* in, const std::vector& fastmask, int width, int height) { + uint32_t tmp; + uint32_t usableW = width - 26; + uint32_t usableH = height - 26; + std::vector out(usableW * usableH); + int j = 0; + for (int y = 13; y < height - 13; y++) { + for (int x = 13; x < width - 13; x++) { + tmp = 0; + int idx = y * width + x; + for (size_t i = 0; i < fastmask.size(); i += 2) { + tmp <<= 1; // shift by one + if (*(in + idx + fastmask[i]) > *(in + idx + fastmask[i + 1])) + tmp++; // set this test's result to 1 + } + out[j] = ndb::Descriptor(ndb::Point(x, y), tmp); + j++; + } + } + return out; +} int main(int argc, char** argv) { - std::string forestPath = "../../forests/defaultZeroForest.txt"; - std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png"; - std::string rightImgPath = - "../../data/kitti/training/image_1/000000_10.png"; + std::string forestPath = "../forests/defaultZeroForest.txt"; + std::string leftImgPath = "../data/middlebury/im0.png"; + std::string rightImgPath = "../data/middlebury/im1.png"; if (argc == 4) { forestPath = argv[1]; @@ -32,7 +55,8 @@ int main(int argc, char** argv) { gpc::inference::InferenceSettings inferencesettings = gpc::inference::InferenceSettings() .builder() - .gradientThreshold(5) + .gradientThreshold( + 1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. .verticalTolerance( 0) // 0px tolerance for rectified epipolar matches .dispHigh(128) // limit disparities to 128 @@ -46,15 +70,14 @@ int main(int argc, char** argv) { timg.readPNG(rightImgPath); // Get learned filter for the given image dimensions. - GPCForest_t::FilterMask fm = + gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); - // Preprocess images (box filter, sobel filter, indices of high gradient - // pixels) gpc::inference::time_point t0 = gpc::inference::sysTick(); - GPCForest_t::PreprocessedImage simgP = + + gpc::inference::PreprocessedImage simgP = forest.preprocessImage(simg, inferencesettings); - GPCForest_t::PreprocessedImage timgP = + gpc::inference::PreprocessedImage timgP = forest.preprocessImage(timg, inferencesettings); gpc::inference::time_point t1 = gpc::inference::sysTick(); @@ -62,14 +85,25 @@ int main(int argc, char** argv) { std::vector supp = forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); gpc::inference::time_point t2 = gpc::inference::sysTick(); - cout << "tPreprocess: " << gpc::inference::tickToMs(t1, t0) << " ms" - << ", #candidatesL:" << simgP.mask.size() - << ", #candidatesR:" << timgP.mask.size() - << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms" - << ", num matches:" << supp.size() << endl; + std::cout << "Number of features(s,t): " << simgP.mask.size() << "," + << timgP.mask.size() << std::endl; + std::cout << "Number of matches: " << supp.size() << std::endl; + std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) + << " ms" << std::endl; + std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" + << std::endl; + /* + std::vector statesSrc = forest.evalFastMaskOnSubsetSSE( + simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings); + std::vector statesTar = forest.evalFastMaskOnSubsetSSE( + timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings); + */ + + std::vector statesSrc = gpcFilterDense( + simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows()); + std::vector statesTar = gpcFilterDense( + timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows()); - // Output sparse disparities overlayed on left input image - ndb::Buffer renderDisp; - renderDisp = ndb::getDisparityVisualization(simg, supp); - renderDisp.writePNGRGB("disparity.png"); + ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc); + ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar); } diff --git a/samples/target.cpp b/samples/target.cpp new file mode 100644 index 0000000..f457f9d --- /dev/null +++ b/samples/target.cpp @@ -0,0 +1,15 @@ +#include + +#include +int main() { + std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl; +#if HWY_TARGET == HWY_AVX2 + std::cout << "Using 256-bit AVX2 paths." << std::endl; +#elif HWY_TARGET == HWY_NEON + std::cout << "Using 128-bit NEON paths." << std::endl; +#elif HWY_TARGET == HWY_SSE4 + std::cout << "Using 128-bit SSE4 paths." << std::endl; +#else + std::cout << "Using Scalar fallback." << std::endl; +#endif +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index acf86db..e14ceda 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,8 +11,22 @@ FetchContent_MakeAvailable(approvaltests) find_package(GTest REQUIRED) -add_executable(test_single_matching test_single_matching.cpp) -target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main) +function(add_gpc_approval_test TEST_NAME SOURCE_FILE) + add_executable(${TEST_NAME} ${SOURCE_FILE}) + + target_link_libraries(${TEST_NAME} + PRIVATE + gpc_core + ApprovalTests::ApprovalTests + GTest::gtest_main + ) + -add_test(NAME single_matching COMMAND test_single_matching) + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) +endfunction() + +add_gpc_approval_test(test_single_matching test_single_matching.cpp) +add_gpc_approval_test(test_kernel_box test_kernel_box.cpp) +add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp) +add_gpc_approval_test(test_kernel_gpc test_kernel_gpc.cpp) diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp new file mode 100644 index 0000000..9913a7a --- /dev/null +++ b/tests/test_kernel_box.cpp @@ -0,0 +1,39 @@ +#include +#include +#include +#include "gpc/kernels/box.hpp" // Naive version +#include "gpc/kernels/box_hwy.hpp" // Highway version + +TEST(Approval, BoxKernel) { + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 box + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. Run Naive version + ndb::boxNaive(input.data(), outNaive.data(), width, height); + + // 4. Run Highway version (only if compiled for the target) + ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); + + + // 5. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. + for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} diff --git a/tests/test_kernel_gpc.cpp b/tests/test_kernel_gpc.cpp new file mode 100644 index 0000000..7989f35 --- /dev/null +++ b/tests/test_kernel_gpc.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include "gpc/forest.hpp" +#include "gpc/kernels/gpc.hpp" // Naive version +#include "gpc/kernels/sobel.hpp" // Highway version +#include "gpc/kernels/gpc_hwy.hpp" // Highway version +#include "gpc/kernels/utils.hpp" // Highway version + +TEST(Approval, GPCKernel) { + auto file = std::filesystem::absolute(__FILE__); + auto dir = file.parent_path(); + std::filesystem::path forestPath = dir / ".." / "forests" / "defaultZeroForest.txt"; + + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 bo + const int threshold = 0; // Example threshold for binarization + + typedef gpc::inference::Forest GPCForest_t; + GPCForest_t forest; + gpc::inference::FilterMask fm = + forest.readForest(forestPath, width, height); + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector grad(width * height, 0); + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. Prepare gradient and fastmask + ndb::sobelNaive(input.data(), grad.data(), width, height, threshold); + + // More prep + std::vector idx(grad.size()); + auto ff = [&](std::vector& in, std::vector& out, int m) { + for (int i = 0; i < m; i++) { + int x = in.data()[i] % width; + int y = in.data()[i] / width; + if (y >= 13 && y < height - 13 && x >= 13 && x < width - 13) + out.push_back(in.data()[i]); + } + }; + int m; + // mask indexing gradient pixels + std::vector fastmask; + ndb::arr2ind(grad.data(), width * height, idx.data(), &m); + ff(idx, fastmask, m); + + std::vector tau; + // 4. Run Naive version + ndb::gpcFilterNaive(input.data(), grad.data(), outNaive.data(), + fm.mask, fastmask, width, height); + /* + * fastmask.mask, fastmask.tau, idx + fastmask.mask is.. imo the extraction pattern. lets se... + it's filtermask! lol + where idx is... preprocessed.mask. WTF..what is that lol + */ + + // 5. Run Highway version + // + //ndb::gpcFilterSSE(input.data(), grad.data(), outHighway.data(), fastmask, tau, width, height); + ndb::testing::gpc_hwy(input.data(), grad.data(), outHighway.data(), + fastmask, tau, width, height); + + // 6. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. + for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp new file mode 100644 index 0000000..bfc56ff --- /dev/null +++ b/tests/test_kernel_sobel.cpp @@ -0,0 +1,39 @@ +#include +#include +#include +#include "gpc/kernels/sobel.hpp" // Naive version +#include "gpc/kernels/sobel_hwy.hpp" // Highway version + +TEST(Approval, SobelKernel) { + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 bo + const int threshold = 30; // Example threshold for binarization + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. Run Naive version + ndb::sobelNaive(input.data(), outNaive.data(), width, height, threshold); + + // 4. Run Highway version (only if compiled for the target) + ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, threshold); + + // 5. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. + for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} diff --git a/tests/test_single_matching.Approval.Inference.approved.txt b/tests/test_single_matching.Approval.Inference.approved.txt index 294072e..d481bd2 100644 --- a/tests/test_single_matching.Approval.Inference.approved.txt +++ b/tests/test_single_matching.Approval.Inference.approved.txt @@ -1 +1 @@ -[(13, 569, 0), (13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 671, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 793, -5), (14, 794, -5), (14, 857, -5), (15, 566, 2), (15, 828, -5), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (215, 753, 102), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 268, 69), (244, 377, 127), (244, 378, 127), (244, 380, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (247, 442, 128), (247, 448, 128), (248, 374, 128), (248, 446, 128), (249, 276, 69), (250, 277, 69), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (252, 932, 115), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (262, 419, 127), (264, 418, 126), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (270, 938, 119), (272, 582, 120), (272, 583, 120), (273, 843, 113), (275, 934, 124), (277, 560, 127), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (290, 557, -38), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 715, 116), (296, 716, 116), (297, 674, 119), (298, 672, 120), (298, 673, 120), (299, 682, 119), (299, 723, 117), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 587, -43), (304, 953, 125), (310, 600, -46), (310, 986, 128), (311, 506, 62), (312, 205, 127), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 838, 113), (325, 844, 112), (325, 845, 112), (326, 842, 113), (328, 228, 127), (329, 228, 127), (332, 845, 113), (333, 844, 114), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (339, 718, 39), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (354, 834, 125), (355, 831, 126), (359, 223, 124), (368, 218, 126), (370, 392, -97), (377, 58, 124), (385, 315, 123), (387, 316, 124), (387, 384, 94), (388, 313, 119), (391, 312, 122), (395, 299, 122), (396, 54, 109), (396, 475, -24), (400, 918, -107), (404, 180, 123), (407, 453, -109), (408, 286, 119), (409, 208, 123), (412, 504, -44), (414, 340, 121), (415, 220, 123), (415, 221, 123), (417, 561, 87), (418, 218, 123), (420, 1035, -94), (425, 338, 118), (433, 53, 124), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 205, -47), (437, 379, 118), (439, 378, 109), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (452, 330, 119), (452, 719, 64), (454, 199, 122), (455, 52, 126), (455, 723, 65), (456, 377, 118), (457, 199, 122), (463, 325, 120), (465, 330, 112), (470, 192, 121), (472, 330, 120), (473, 327, 118), (473, 372, 115), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (482, 169, 120), (483, 185, 121), (486, 823, 63), (489, 181, 120), (492, 370, 115), (500, 312, 118), (504, 179, 119), (505, 172, 122), (508, 171, 120), (509, 171, 120), (510, 961, 126), (517, 169, 121), (517, 217, 118), (518, 401, 124), (519, 400, 122), (520, 400, 124), (523, 321, 115), (523, 407, 126), (525, 171, 118), (526, 272, 117), (527, 363, 127), (529, 315, 116), (532, 171, 119), (534, 360, 110), (539, 360, 112), (543, 297, 117), (552, 588, 27), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (558, 594, 27), (565, 161, 115), (570, 169, 116), (570, 170, 120), (572, 354, 111), (572, 398, 122), (573, 313, 114), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (591, 262, 113), (593, 352, 113), (595, 204, 115), (597, 68, 117), (597, 69, 117), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (615, 347, 111), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 138, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 130, 117), (624, 164, 116), (624, 205, 115), (624, 305, 112), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 199, 115), (626, 200, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 130, 117), (627, 141, 116), (627, 254, 115), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 97, 117), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 193, 116), (629, 255, 114), (629, 341, 111), (630, 144, 117), (630, 193, 115), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (631, 134, 118), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 109, 119), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (634, 53, 118), (634, 55, 118), (634, 64, 117), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 97, 117), (636, 301, 113), (637, 75, 118), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 277, 113), (640, 278, 113), (641, 85, 117), (641, 111, 119), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (643, 346, 111), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 166, 115), (646, 167, 115), (646, 347, 102), (647, 118, 121), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (647, 364, 115), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 170, 115), (650, 294, 113), (651, 93, 116), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 125, 66), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 265, 113), (660, 261, 113), (661, 99, 116), (662, 126, 121), (662, 127, 121), (664, 152, 120), (665, 126, 123), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (671, 343, 111), (672, 344, 111), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 426, -43), (674, 432, 11), (675, 135, 122), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 138, 123), (686, 346, 116), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 164, 126), (695, 356, 114), (697, 119, 119), (698, 130, 118), (698, 390, 121), (698, 955, 88), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (703, 122, 120), (705, 123, 120), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (715, 128, 120), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 134, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 638, -73), (744, 140, 121), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (760, 172, 128), (763, 174, 128), (764, 176, 65), (768, 369, 118), (772, 154, 125), (775, 443, -4), (777, 448, -3), (778, 457, 108), (778, 458, 108), (784, 185, 63), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (802, 937, 94), (816, 224, 126), (816, 389, -93), (816, 393, -93), (816, 876, -77), (820, 934, -91), (823, 653, 95), (824, 423, -25), (824, 965, 66), (830, 936, 86), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (851, 424, 2), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (873, 476, 7), (884, 337, -6), (885, 371, -87), (885, 372, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (957, 691, 52), (963, 269, 54), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 397, 118), (990, 1000, -97), (997, 813, -85), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1057, 665, -59), (1085, 971, -84), (1087, 382, 112), (1119, 787, 40), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1157, 635, 48), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1281, 1000, 112), (1288, 928, 120), (1288, 929, 120), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 752, 38), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1403, 715, 83), (1408, 820, 110), (1412, 825, 114), (1417, 856, 116), (1418, 570, 107), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1437, 942, 125), (1437, 943, 125), (1443, 963, 124), (1453, 896, 123), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1457, 306, 100), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1464, 410, 98), (1467, 334, 103), (1469, 407, 98), (1469, 440, 104), (1469, 441, 103), (1469, 443, 103), (1514, 460, 104), (1519, 897, 124), (1519, 899, 124), (1520, 897, 124), (1526, 929, 124), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1561, 1000, 128), (1584, 863, 49), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1766, 971, 128), (1769, 966, 128), (1778, 834, -14), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)] +[(13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 857, -5), (15, 566, 2), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (150, 736, 102), (165, 543, 81), (165, 546, 80), (175, 982, 128), (178, 546, 87), (183, 546, 81), (183, 743, 99), (188, 742, 102), (191, 749, 105), (195, 547, 77), (203, 756, 103), (215, 753, 102), (224, 690, 95), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 376, 127), (244, 377, 127), (244, 378, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (246, 741, 102), (247, 362, 128), (247, 442, 128), (247, 448, 128), (248, 373, 128), (248, 374, 128), (248, 376, 128), (248, 446, 128), (249, 276, 69), (249, 404, 127), (250, 277, 69), (250, 411, 127), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (261, 521, 94), (262, 419, 127), (264, 418, 126), (264, 853, 114), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (272, 582, 120), (272, 583, 120), (273, 324, 116), (273, 843, 113), (275, 934, 124), (276, 839, 112), (277, 560, 127), (278, 585, 125), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 582, 127), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (288, 554, -37), (290, 557, -38), (290, 558, -39), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 716, 116), (297, 674, 119), (298, 673, 120), (299, 677, 120), (299, 682, 119), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 953, 125), (309, 952, 125), (309, 953, 125), (310, 600, -46), (312, 205, 127), (319, 236, 128), (320, 619, -51), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 238, 126), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 231, 128), (325, 838, 113), (325, 844, 112), (325, 845, 112), (325, 983, 128), (326, 842, 113), (328, 228, 127), (329, 228, 127), (329, 240, 125), (329, 706, 34), (332, 231, 126), (333, 844, 114), (336, 718, -52), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (340, 748, -65), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (347, 229, 125), (347, 969, 125), (348, 229, 123), (355, 241, 119), (355, 830, 126), (355, 831, 126), (356, 830, 126), (358, 222, 126), (361, 639, 75), (366, 217, 124), (368, 218, 126), (368, 689, 3), (377, 58, 124), (385, 314, 123), (385, 315, 123), (390, 314, 123), (391, 308, 123), (391, 312, 122), (393, 303, 121), (393, 307, 122), (394, 303, 121), (394, 305, 122), (395, 299, 122), (396, 294, 121), (396, 301, 122), (400, 918, -107), (404, 180, 123), (404, 293, 121), (407, 453, -109), (408, 286, 119), (409, 208, 123), (414, 340, 121), (414, 602, 57), (415, 220, 123), (415, 221, 123), (415, 222, 123), (418, 218, 123), (419, 224, 122), (420, 1035, -94), (420, 1036, -94), (421, 221, 122), (423, 226, 122), (423, 227, 122), (425, 338, 118), (433, 53, 124), (433, 384, 119), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 379, 118), (439, 378, 109), (439, 380, 128), (440, 204, 122), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (449, 200, 121), (449, 202, 121), (452, 330, 119), (452, 719, 64), (453, 201, 119), (453, 705, 74), (453, 720, 64), (454, 199, 122), (455, 52, 126), (456, 226, 121), (456, 377, 118), (458, 327, 119), (459, 197, 121), (463, 325, 120), (465, 330, 112), (472, 330, 120), (473, 372, 115), (474, 188, 120), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (481, 882, 118), (482, 169, 120), (483, 185, 121), (486, 823, 63), (488, 221, 115), (492, 370, 115), (493, 180, 114), (499, 313, 118), (502, 172, 120), (506, 173, 119), (508, 175, 121), (510, 961, 126), (511, 397, 123), (512, 961, 124), (513, 323, 120), (513, 988, -79), (517, 167, 123), (517, 169, 121), (517, 217, 118), (518, 401, 124), (518, 801, 111), (518, 960, 126), (519, 400, 122), (519, 957, 125), (520, 400, 124), (520, 801, 112), (523, 407, 126), (524, 961, 128), (525, 171, 118), (525, 306, 117), (526, 272, 117), (527, 363, 127), (529, 313, 117), (530, 313, 117), (530, 639, -38), (532, 171, 119), (532, 312, 118), (532, 641, -41), (533, 300, 116), (534, 360, 110), (534, 384, 121), (539, 360, 112), (541, 315, 115), (543, 297, 117), (551, 388, 121), (552, 588, 27), (555, 169, 118), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (557, 386, 122), (558, 594, 27), (561, 290, 116), (565, 161, 115), (565, 293, 116), (570, 161, 119), (570, 169, 116), (572, 354, 111), (572, 398, 122), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (589, 353, 112), (591, 262, 113), (593, 352, 113), (593, 895, 119), (594, 381, 115), (597, 68, 117), (597, 69, 117), (597, 165, 115), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 72, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 91, 117), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (604, 203, 114), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (609, 350, 112), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (616, 151, 116), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 349, 111), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 204, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 107, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 132, 117), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 169, 117), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 102, 118), (624, 130, 117), (624, 133, 117), (624, 164, 116), (624, 305, 112), (624, 306, 111), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 167, 117), (626, 199, 115), (626, 200, 115), (626, 203, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 50, 118), (627, 130, 117), (627, 141, 116), (627, 202, 116), (627, 204, 115), (627, 254, 115), (627, 322, 111), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 255, 114), (629, 341, 111), (630, 107, 118), (630, 144, 117), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (630, 410, 128), (631, 50, 117), (631, 134, 118), (631, 272, 114), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (633, 343, 111), (634, 47, 117), (634, 53, 118), (634, 55, 118), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 85, 117), (636, 97, 117), (636, 301, 113), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 92, 117), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 406, 124), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 111, 120), (640, 277, 113), (640, 278, 113), (640, 279, 113), (641, 85, 117), (641, 92, 117), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 111, 119), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 347, 102), (647, 118, 121), (647, 144, 116), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 145, 116), (650, 112, 120), (650, 293, 113), (650, 294, 113), (650, 295, 113), (651, 93, 116), (651, 302, 112), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 118, 120), (657, 125, 66), (657, 236, 114), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 381, 118), (660, 261, 113), (660, 869, -42), (661, 99, 116), (662, 126, 121), (662, 127, 121), (663, 120, 122), (664, 152, 120), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (667, 344, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (670, 380, 117), (671, 343, 111), (672, 344, 111), (672, 410, 11), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 146, 122), (674, 426, -43), (674, 432, 11), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 352, 113), (686, 126, 121), (686, 138, 123), (686, 346, 116), (686, 353, 113), (687, 160, 122), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 129, 121), (694, 164, 126), (697, 119, 119), (698, 390, 121), (698, 955, 88), (699, 130, 123), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (701, 130, 119), (701, 372, 116), (702, 130, 119), (703, 122, 120), (703, 934, 90), (705, 123, 120), (705, 360, 114), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (714, 363, 113), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 133, 120), (719, 134, 120), (720, 133, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (740, 144, 122), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 142, 122), (743, 638, -73), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (757, 818, 59), (758, 686, 112), (760, 172, 128), (763, 174, 128), (764, 176, 65), (764, 961, -75), (768, 369, 118), (772, 154, 125), (775, 443, -4), (778, 383, 121), (778, 457, 108), (779, 456, -5), (784, 185, 63), (784, 387, 122), (785, 210, 128), (793, 1029, 66), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (816, 224, 126), (816, 389, -93), (816, 393, -93), (819, 409, 104), (823, 653, 95), (824, 423, -25), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (855, 763, -120), (855, 1017, -109), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (869, 930, 84), (883, 350, 48), (884, 337, -6), (885, 371, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 388, 37), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (947, 354, 72), (957, 691, 52), (961, 901, 34), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 356, 84), (975, 397, 118), (981, 1018, 58), (989, 413, -70), (990, 1000, -97), (997, 813, -85), (1006, 832, 83), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1033, 777, 92), (1057, 665, -59), (1084, 974, -4), (1119, 787, 40), (1125, 850, 28), (1127, 574, 44), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1150, 777, 126), (1157, 635, 48), (1161, 843, -14), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1308, 60, 108), (1308, 74, 108), (1320, 691, 122), (1320, 692, 124), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 344, 97), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1388, 771, 98), (1408, 820, 110), (1412, 825, 114), (1412, 886, 120), (1413, 827, 114), (1417, 856, 116), (1418, 570, 107), (1421, 568, 103), (1422, 912, 19), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1433, 895, 120), (1437, 942, 125), (1437, 943, 125), (1441, 436, 101), (1443, 963, 124), (1447, 554, 109), (1452, 472, 113), (1453, 896, 123), (1454, 337, 102), (1454, 418, 102), (1454, 421, 100), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1456, 425, 102), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1461, 478, 115), (1461, 915, 127), (1464, 410, 98), (1467, 92, 125), (1467, 334, 103), (1467, 409, 99), (1468, 407, 98), (1468, 441, 103), (1469, 440, 104), (1469, 443, 103), (1470, 405, 98), (1472, 404, 98), (1502, 401, 111), (1504, 471, 122), (1511, 413, 59), (1514, 460, 104), (1514, 810, 126), (1519, 897, 124), (1519, 899, 124), (1520, 896, 122), (1526, 929, 124), (1527, 931, 125), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1544, 975, 120), (1556, 535, 126), (1584, 863, 49), (1634, 1013, 86), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1758, 833, -38), (1763, 833, -22), (1766, 964, 127), (1766, 971, 128), (1768, 1013, 121), (1769, 966, 128), (1778, 834, -14), (1850, 981, 38), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1903, 299, 1), (1903, 300, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)] diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index e9e6a75..9443a33 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ -1,7 +1,7 @@ #define APPROVALS_GOOGLETEST #include #include -#include "gpc/inference.hpp" +#include "gpc/forest.hpp" TEST(Approval, Inference) @@ -34,12 +34,12 @@ TEST(Approval, Inference) simg.readPNG(leftImgPath); timg.readPNG(rightImgPath); // Get learned filter for the given image dimensions. - GPCForest_t::FilterMask fm = + gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); - GPCForest_t::PreprocessedImage simgP = + gpc::inference::PreprocessedImage simgP = forest.preprocessImage(simg, inferencesettings); - GPCForest_t::PreprocessedImage timgP = + gpc::inference::PreprocessedImage timgP = forest.preprocessImage(timg, inferencesettings); // Match rectified stereo images @@ -49,6 +49,45 @@ TEST(Approval, Inference) std::stringstream ss; ss << supp; - EXPECT_EQ(866, supp.size()); + EXPECT_EQ(1024, supp.size()); ApprovalTests::Approvals::verify(ss.str()); } +std::vector getSrcDescriptors() { + return ndb::Descriptor::deserialize("statesSrc.txt", true); +} + +std::vector getTarDescriptors() { + return ndb::Descriptor::deserialize("statesTar.txt", false); +} + + +/* +TEST(A,B) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + std::vector srcBaseline = srcOriginal; + std::vector tarBaseline = tarOriginal; + std::vector srcAlt = srcOriginal; + std::vector tarAlt = tarOriginal; + + std::vector + matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline); + + + // Alternative method + gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + + std::vector resultSrc, resultTar; + resultSrc.reserve(srcOriginal.size()/10); + resultTar.reserve(tarOriginal.size()/10); + gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(srcAlt, tarAlt, srcFrame, tarFrame); + gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar); + + // Ensure ID pairings of (resultSrc, resultTar) match the naive version. + // We ignore exact matching for now and just expect the count to be the same + EXPECT_EQ(matches.size(), resultSrc.size()); + EXPECT_EQ(matches.size(), resultTar.size()); +} +*/