Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5a98492
cmakelists eigen3
bamert Feb 14, 2026
a4436e7
decouple forest.hpp
bamert Feb 14, 2026
7d00a91
decouple fern
bamert Feb 14, 2026
5d43602
decouple feature
bamert Feb 14, 2026
af850bb
decouple filter
bamert Feb 15, 2026
d4dcdf6
add filter cpp
bamert Feb 15, 2026
b682d44
break out sobel kernel
bamert Feb 15, 2026
6ed662a
break out box filter
bamert Feb 15, 2026
fb4d626
break out census filter
bamert Feb 15, 2026
7d0ce8d
extract gpc filter, move utils
bamert Feb 15, 2026
f0ed437
add highway
bamert Feb 15, 2026
06bda6e
add highway implementation of box filter
bamert Feb 15, 2026
5e06b64
highway sobel kernel
bamert Feb 15, 2026
0285f71
checkin
bamert Feb 18, 2026
832a651
cmakelist eigen include
bamert Feb 18, 2026
ca65a42
update dir
bamert Feb 18, 2026
d3e1c32
update
bamert Feb 18, 2026
362f3f5
add benchmark. coarse runtime measurement
bamert Feb 18, 2026
75846ed
update legacy bench
bamert Feb 22, 2026
0b37d45
benchmark, perf, box and sobel acceptance tests
bamert Feb 22, 2026
018e23c
move
bamert Feb 22, 2026
e3873bc
Merge branch 'timings' into decouple-headers
bamert Feb 22, 2026
83c18b5
rename
bamert Feb 22, 2026
5358de1
update hwy kernels
bamert Feb 22, 2026
8f65484
static dispatch
bamert Feb 22, 2026
a9443c0
update hwy sobel filter to be pixel accurate with naive version (alth…
bamert Feb 22, 2026
3506349
wip dense gpt hwy kernel
bamert Feb 24, 2026
b1b5dee
add matching kernels
bamert Mar 8, 2026
55994f5
add correspondence bench and(de)serialization
bamert Mar 8, 2026
31bd8a2
add naive hash match version
bamert Mar 8, 2026
46b8d5d
add additional matching method
bamert Mar 15, 2026
62807c6
move individual HT benchmarks to separate repo
bamert Apr 7, 2026
e8ed2f3
add target
bamert Apr 7, 2026
ee46a5b
no div
bamert Apr 7, 2026
cbddf6b
approximate division by 9 with fixed point multiplication for compari…
bamert Apr 7, 2026
bd27dfa
formatting
bamert Apr 7, 2026
432cc92
update approval test blob due to div /9 approximation in sobel filter
bamert Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 62 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,43 +1,86 @@
# FetchContent_MakeAvailable() (used below for Highway and Google Benchmark)
# requires CMake 3.14, and add_compile_definitions() requires 3.12, so 3.14 is
# the lowest version that can actually configure this project.
cmake_minimum_required(VERSION 3.14)
# Modules providing configure-time compiler/CPU probes
# (check_cxx_compiler_flag, check_cxx_source_runs, cmake_push_check_state).
include(CheckCXXCompilerFlag)
include(CheckCXXSourceRuns)
include(CMakePushCheckState)
# C++ is the only language enabled for this project.
project(openGPC CXX)
# NOTE(review): REQ_CPP11_FEATURES is not referenced anywhere else in the
# visible build files — confirm whether it is still needed.
set (REQ_CPP11_FEATURES cxx_strong_enums cxx_auto_type)
# Fall back to a Release build when the user did not choose a build type.
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
# NOTE(review): these flags are added only on this fallback path, i.e. not
# when the user passes -DCMAKE_BUILD_TYPE=Release explicitly — confirm intent.
add_compile_options(-O3 -funroll-loops)
endif()

# Compile every target as C++17 (no silent fallback to an older standard) and
# write compile_commands.json for editor/tooling integration.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# System dependencies: all three must be found or configuration fails.
include(FetchContent)
find_package(Eigen3 REQUIRED)
find_package(PNG REQUIRED)
find_package(Threads REQUIRED)

# Directory-scoped include paths: these apply to every target defined after
# this point, including the ones in the subdirectories added later.
include_directories(${EIGEN3_INCLUDE_DIR})
include_directories(${PNG_INCLUDE_DIRS})
include_directories(lib)
# NOTE(review): FetchContent is included more than once in this file; the
# repeated include() calls are harmless but redundant.
include(FetchContent)
# Highway's own tests and examples are switched off before it is fetched.
set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE)
set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE)
# Fetch Google Highway at the pinned tag and add it to this build.
FetchContent_Declare(
highway
GIT_REPOSITORY https://github.com/google/highway.git
GIT_TAG 1.3.0
)
FetchContent_MakeAvailable(highway)

# Google Benchmark: switch off its tests and install rules before fetching it.
include(FetchContent)
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
# NOTE(review): confirm BENCHMARK_ENABLE_GTEST_LIB is an option the pinned
# benchmark release actually reads.
set(BENCHMARK_ENABLE_GTEST_LIB OFF CACHE BOOL "" FORCE)

# Pin Google Benchmark to a fixed release tag.
FetchContent_Declare(
google_benchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.5
)
# These options must be set before FetchContent_MakeAvailable is called.
# NOTE(review): they repeat the two identical set() calls a few lines above.
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)

# Force a Release build.
# NOTE(review): CMAKE_BUILD_TYPE is a single cache entry for the whole build
# tree, so this FORCE overrides the user's choice for every target, not only
# for the benchmark library — confirm that is intended.
set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)

# User switch for the hand-written SSE/AVX code paths; ON by default.
option(SSE "Enable SSE/AVX optimizations if available" ON)

# Directory-scoped optimization flags applied to every target defined below.
add_compile_options(-O3 -funroll-loops)
# NOTE(review): no endif() matching this if(SSE) is visible in this span.
if(SSE)
message(STATUS "Checking if target CPU supports AVX2 instructions...")
# Compile and run a tiny AVX2 program at configure time; the result is cached
# in CPU_HAS_AVX2.
# NOTE(review): check_cxx_source_runs() only reports success when the program
# exits with 0, but this program returns lane 0 of a vector filled with 1.
# Also, no CMAKE_REQUIRED_FLAGS (e.g. -mavx2) are set in the visible code, so
# the probe may not even compile on GCC/Clang — confirm the check can succeed.
check_cxx_source_runs("
#include <immintrin.h>
int main() {
__m256i x = _mm256_set1_epi32(1);
return _mm256_extract_epi32(x, 0);
}
" CPU_HAS_AVX2)

# When the probe succeeds, define _INTRINSICS_SSE for all targets and compile
# them with AVX2 code generation enabled.
if(CPU_HAS_AVX2)
message(STATUS "AVX2: supported and enabled")
add_compile_definitions(_INTRINSICS_SSE)
add_compile_options(-mavx2 -march=core-avx2)
endif()
# Download (if needed) and add Google Benchmark to the build.
# NOTE(review): the call appears twice; the second call is a no-op.
FetchContent_MakeAvailable(google_benchmark)
FetchContent_MakeAvailable(google_benchmark)
# Core GPC library: forest/fern/feature code plus the image kernels, each in
# its plain implementation and (for box, sobel and gpc) a Highway variant.
add_library(gpc_core
  lib/gpc/forest.cpp
  lib/gpc/fern.cpp
  lib/gpc/feature.cpp
  lib/gpc/kernels/sobel.cpp
  lib/gpc/kernels/box.cpp
  lib/gpc/kernels/census.cpp
  lib/gpc/kernels/gpc.cpp
  lib/gpc/kernels/utils.cpp
  lib/gpc/kernels/box_hwy.cpp
  lib/gpc/kernels/sobel_hwy.cpp
  lib/gpc/kernels/gpc_hwy.cpp
)

# Host-specific code generation. Binaries built with these flags may not run
# on a different CPU, so packagers and cross-compilers can turn this OFF; the
# default (ON) keeps the previous behaviour.
option(GPC_NATIVE_ARCH
  "Tune gpc_core and its consumers for the build machine's CPU" ON)
if(GPC_NATIVE_ARCH)
  # The flags are PUBLIC so every target linking gpc_core is compiled with the
  # same instruction set as the library itself.
  if(MSVC)
    target_compile_options(gpc_core PUBLIC /arch:AVX2)
  # CMAKE_SYSTEM_PROCESSOR is spelled "AMD64"/"ARM64" on Windows hosts, so the
  # upper-case forms are matched alongside the Unix spellings.
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64")
    target_compile_options(gpc_core PUBLIC -march=native)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|ARM64|aarch64")
    target_compile_options(gpc_core PUBLIC -mcpu=native)
  endif()
endif()

# Link through imported targets so include paths and transitive requirements
# travel with the dependency; PNG::PNG is the imported target that
# find_package(PNG) defines.
target_link_libraries(gpc_core
  PUBLIC
    Eigen3::Eigen
    PNG::PNG
    Threads::Threads
    hwy
)

# Headers are included as "gpc/..." relative to lib/.
target_include_directories(gpc_core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/lib")

# Enable CTest for this directory and below, then descend into the sample
# programs, the tests and the benchmarks.
enable_testing()
add_subdirectory(samples)
add_subdirectory(tests)
add_subdirectory(benchmarks)

31 changes: 31 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# gpc_add_benchmark(<name> [extra libraries...])
#
# Defines the executable <name> from <name>.cpp and links it privately against
# gpc_core and Google Benchmark, followed by any extra libraries given.
function(gpc_add_benchmark name)
  add_executable(${name} ${name}.cpp)
  target_link_libraries(${name}
    PRIVATE
      gpc_core
      benchmark::benchmark
      ${ARGN}
  )
endfunction()

# The Sobel benchmark additionally links Highway directly and is the only
# benchmark built with interprocedural optimization.
gpc_add_benchmark(sobel_bench hwy)
set_target_properties(sobel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)

gpc_add_benchmark(kernel_bench)
gpc_add_benchmark(box_bench)
gpc_add_benchmark(correspondence_bench)
52 changes: 52 additions & 0 deletions benchmarks/box_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <benchmark/benchmark.h>
#include <hwy/highway.h>
#include "gpc/kernels/box.hpp"
#include "gpc/kernels/box_hwy.hpp"
// Times ndb::testing::box_hwy on a 1920x1080 byte buffer filled with 128.
// The benchmark label is the name of the Highway target selected at compile
// time.
static void BM_BoxHighway(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel(hwy::TargetName(HWY_TARGET));

    for (auto _ : state) {
        ndb::testing::box_hwy(src.data(), dst.data(), width, height);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}

#if HWY_TARGET == HWY_AVX2
// Times the hand-written ndb::boxSSE kernel on a 1920x1080 byte buffer filled
// with 128. Only compiled when the Highway target is AVX2 (see the enclosing
// #if).
static void BM_BoxLegacySIMD(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel("AVX2_legacy");

    for (auto _ : state) {
        ndb::boxSSE(src.data(), dst.data(), width, height);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}
#endif
// Times the reference ndb::boxNaive kernel on a 1920x1080 byte buffer filled
// with 128, as the baseline for the SIMD variants.
static void BM_BoxNaive(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel("naive");

    for (auto _ : state) {
        ndb::boxNaive(src.data(), dst.data(), width, height);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}
// Register the box-filter benchmarks; all of them report in milliseconds.
BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond);
// The legacy kernel is registered only when it was compiled (same guard as
// its definition).
#if HWY_TARGET == HWY_AVX2
BENCHMARK(BM_BoxLegacySIMD)->Unit(benchmark::kMillisecond);
#endif
BENCHMARK(BM_BoxNaive)->Unit(benchmark::kMillisecond);

// Expands to the main() that runs every registered benchmark.
BENCHMARK_MAIN();
84 changes: 84 additions & 0 deletions benchmarks/correspondence_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#include <benchmark/benchmark.h>
#include "gpc/forest.hpp"
#include "gpc/inference.hpp"
#include <vector>
#include <random>
#include <cmath>
#include <cstdint>

#define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024

/**
 * Generates a reproducible vector of descriptors whose IDs follow a Pareto
 * distribution (shape parameter alpha fixed at 1.16).
 *
 * @param count Number of IDs to generate.
 * @param target_mean The theoretical mean of the distribution; expected to be
 *        positive (the mean is only defined because alpha > 1).
 * @param seed Seed for the random engine; a fixed value (e.g., 42) gives
 *        deterministic benchmarks.
 * @return `count` descriptors, each constructed at Point(0,0) with one sampled
 *         ID reduced modulo 2^32.
 */
std::vector<ndb::Descriptor> generate_pareto_ids(size_t count, double target_mean, uint32_t seed = 42) {
    std::vector<ndb::Descriptor> ids;
    ids.reserve(count);

    // Using a fixed seed for benchmark consistency
    std::mt19937 gen(seed);

    // The 1e-9 lower bound keeps the sampled value away from zero, so the
    // division below never produces infinity.
    std::uniform_real_distribution<double> dist(1e-9, 1.0);

    const double alpha = 1.16;
    const double xm = target_mean * (alpha - 1.0) / alpha;
    const double id_space = 4294967296.0;  // 2^32, the size of the uint32_t range

    for (size_t i = 0; i < count; ++i) {
        // Inverse Transform Sampling
        const double val = xm / std::pow(dist(gen), 1.0 / alpha);

        // The Pareto tail can exceed 2^32 - 1. Converting an out-of-range
        // double to uint32_t is undefined behaviour (it does NOT wrap), so
        // fold the value into [0, 2^32) explicitly before the cast.
        const double wrapped = std::fmod(val, id_space);
        ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast<uint32_t>(wrapped)));
    }

    return ids;
}
std::vector<ndb::Descriptor> getSrcDescriptors() {
std::vector<ndb::Descriptor> v = ndb::Descriptor::deserialize("statesSrcLarge.txt", true);
std::vector<ndb::Descriptor> out;
for (size_t i = 0; i < v.size(); i++) {
if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) {
out.push_back(v[i]);
}
}
return out;
//return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000
}

std::vector<ndb::Descriptor> getTarDescriptors() {
std::vector<ndb::Descriptor> v = ndb::Descriptor::deserialize("statesTarLarge.txt", false);
std::vector<ndb::Descriptor> out;
for (size_t i = 0; i < v.size(); i++) {
if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) {
out.push_back(v[i]);
}
}
return out;

//return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000
}
// Times gpc::inference::Forest::findCorrespondences on the filtered source and
// target descriptor sets. The descriptors are loaded once, outside the
// measured loop; the number of matches is reported as a counter.
static void matchBySorting(benchmark::State& state) {
    const std::vector<ndb::Descriptor> srcPristine = getSrcDescriptors();
    const std::vector<ndb::Descriptor> tarPristine = getTarDescriptors();

    for (auto _ : state) {
        // Hand the matcher fresh copies on every iteration, with the timer
        // paused so the copies are not measured.
        // NOTE(review): presumably findCorrespondences modifies its inputs
        // (e.g. sorts them) — confirm against its declaration.
        state.PauseTiming();
        std::vector<ndb::Descriptor> src = srcPristine;
        std::vector<ndb::Descriptor> tar = tarPristine;
        state.ResumeTiming();

        std::vector<ndb::Correspondence> matches =
            gpc::inference::Forest::findCorrespondences(src, tar);

        state.counters["matches"] = matches.size();
        benchmark::DoNotOptimize(matches);
        benchmark::ClobberMemory();
    }
}
// Register the matching benchmark (reported in milliseconds).
BENCHMARK(matchBySorting)
->Unit(benchmark::kMillisecond);
// Expands to the main() that runs every registered benchmark.
BENCHMARK_MAIN();
62 changes: 62 additions & 0 deletions benchmarks/kernel_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <benchmark/benchmark.h>
#include "gpc/inference.hpp"

using GPCForest_t = gpc::inference::Forest;

// Forest instance shared by the benchmark; populated by readForest() below.
GPCForest_t forest;

// End-to-end benchmark: preprocess both images and run rectified matching.
// state.range(0) supplies the gradient threshold. Image and forest loading
// happens once, outside the measured loop.
// NOTE(review): all three paths are relative to the working directory, and
// none of the load calls is checked for failure here.
static void fullInference(benchmark::State& state) {
    std::string forestFile = "../forests/defaultZeroForest.txt";
    std::string leftFile = "../data/middlebury/im0.png";
    std::string rightFile = "../data/middlebury/im1.png";

    gpc::inference::InferenceSettings settings =
        gpc::inference::InferenceSettings()
            .builder()
            // 0...255 gradient threshold for the sobel filter
            .gradientThreshold(state.range(0))
            // 0px tolerance for rectified epipolar matches
            .verticalTolerance(0)
            // limit disparities to 128
            .dispHigh(128)
            // match GPC states in epipolar mode: more matches, lower accuracy
            // than global
            .epipolarMode(true)
            // use the sort method for matching: faster for <100K descriptors
            .useHashtable(false);

    // Load the stereo pair.
    ndb::Buffer<uint8_t> leftImage, rightImage;
    leftImage.readPNG(leftFile);
    rightImage.readPNG(rightFile);

    // Get the learned filter for the given image dimensions.
    GPCForest_t::FilterMask filterMask =
        forest.readForest(forestFile, leftImage.cols(), leftImage.rows());

    for (auto _ : state) {
        GPCForest_t::PreprocessedImage leftPre =
            forest.preprocessImage(leftImage, settings);
        GPCForest_t::PreprocessedImage rightPre =
            forest.preprocessImage(rightImage, settings);
        std::vector<ndb::Support> supports =
            forest.rectifiedMatch(leftPre, rightPre, filterMask, settings);

        // Report the workload sizes next to the timing.
        state.counters["candidates_s"] = leftPre.mask.size();
        state.counters["candidates_t"] = rightPre.mask.size();
        state.counters["matches"] = supports.size();

        benchmark::DoNotOptimize(supports);
        benchmark::ClobberMemory();
    }
}

// Run the full pipeline once per gradient threshold; each Args value is read
// inside the benchmark through state.range(0). Results are in milliseconds.
BENCHMARK(fullInference)
->Unit(benchmark::kMillisecond)
->Args({0})
->Args({5})
->Args({10})
->Args({20})
->Args({50})
->Args({100});


// Expands to the main() that runs every registered benchmark.
BENCHMARK_MAIN();
56 changes: 56 additions & 0 deletions benchmarks/sobel_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <benchmark/benchmark.h>
#include <hwy/highway.h>
#include "gpc/kernels/sobel.hpp"
#include "gpc/kernels/sobel_hwy.hpp"
// Times ndb::testing::sobel_hwy (last argument 50) on a 1920x1080 byte buffer
// filled with 128. The benchmark label is the name of the Highway target
// selected at compile time.
static void BM_SobelHighway(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel(hwy::TargetName(HWY_TARGET));

    for (auto _ : state) {
        ndb::testing::sobel_hwy(src.data(), dst.data(), width, height, 50);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}

#if HWY_TARGET == HWY_AVX2
// Times the hand-written ndb::sobelSSE kernel on a 1920x1080 byte buffer
// filled with 128. Only compiled when the Highway target is AVX2 (see the
// enclosing #if).
// NOTE(review): the trailing arguments here are (1, height - 1, 1), whereas
// the Highway and naive benchmarks pass 50 as their last argument. Confirm
// against sobelSSE's declaration that the three benchmarks do comparable work.
static void BM_SobelLegacySIMD(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel("AVX2_legacy");

    for (auto _ : state) {
        ndb::sobelSSE(src.data(), dst.data(), width, 1, height - 1, 1);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}
#endif
// Times the reference ndb::sobelNaive kernel (last argument 50) on a 1920x1080
// byte buffer filled with 128, as the baseline for the SIMD variants.
static void BM_SobelNaive(benchmark::State& state) {
    int width = 1920;
    int height = 1080;
    int pixelCount = width * height;
    std::vector<uint8_t> src(pixelCount, 128);
    std::vector<uint8_t> dst(pixelCount, 0);

    state.SetLabel("naive");

    for (auto _ : state) {
        ndb::sobelNaive(src.data(), dst.data(), width, height, 50);

        // Mark the output as used so the filter call cannot be optimized away.
        benchmark::DoNotOptimize(dst.data());
        benchmark::ClobberMemory();
    }
}
// Register the Sobel benchmarks; all of them report in milliseconds.
BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond);
// The legacy kernel is registered only when it was compiled (same guard as
// its definition).
#if HWY_TARGET == HWY_AVX2
BENCHMARK(BM_SobelLegacySIMD)->Unit(benchmark::kMillisecond);
#endif
BENCHMARK(BM_SobelNaive)->Unit(benchmark::kMillisecond);

// Expands to the main() that runs every registered benchmark.
BENCHMARK_MAIN();
Loading
Loading