From b76dc5fc06d8af8fed98ad82d2055ceaf98ddd72 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 28 Jan 2026 16:52:07 +0100 Subject: [PATCH] feat: track occupancy without dynamic hibf --- include/hibf/config.hpp | 35 ++++++++++++- src/build/construct_ibf.cpp | 2 +- src/config.cpp | 3 ++ src/interleaved_bloom_filter.cpp | 2 +- test/snippet/hibf/hibf_construction.cpp | 2 + test/unit/hibf/config_test.cpp | 69 +++++++++++++++++++++++-- 6 files changed, 106 insertions(+), 7 deletions(-) diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index fda23d27..360b68d3 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -41,6 +41,7 @@ namespace seqan::hibf * | Layout | seqan::hibf::config::sketch_bits | 12 | | * | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset | * | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout | + * | General | seqan::hibf::config::track_occupancy | false | | * | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | | * | Layout | seqan::hibf::config::alpha | 1.2 | | * | Layout | seqan::hibf::config::disable_estimate_union | false | | @@ -243,11 +244,32 @@ struct config * designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58` * and no empty bins. * + * Choosing a value larger than `0.0` will also enable the `track_occupancy` option. + * * Value must be in range [0.0,1.0). * Recommendation: default value (0.0). This option is not recommended for general use. */ double empty_bin_fraction{}; + /*!\brief Track the amount of emplaced elements for each technical bin. + * + * An IBF can track how many elements were emplaced into each technical bin. + * This option can be useful for a dynamic index, or to compute the exact FPR for a technical bin. + * + * The occupancy of a technical bin `i` of IBF `ibf` can be accessed via `ibf.occupancy[i]`. 
+ * + * For occupancy, emplacing an element means that a bit of the conceptual Bloom Filter representing the respective + * technical bin changes. + * For example, adding the same value multiple times to the same technical bin will not increase the occupancy. + * Likewise, if the respective bits for a value have already been set by previous emplacing operations, the + * occupancy will not increase. + * + * This option comes with a minor performance penalty for seqan::hibf::interleaved_bloom_filter::emplace. + * + * Recommendation: default value (false). + */ + bool track_occupancy{false}; + /*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm. * * The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of @@ -330,6 +352,8 @@ struct config * * Not setting seqan::hibf::config::tmax, or setting it to `0`, results in a default tmax * `std::ceil(std::sqrt(number_of_user_bins))` being used. * * seqan::hibf::config::tmax is increased to the next multiple of 64. + * * Setting seqan::hibf::config::empty_bin_fraction to a value larger than `0.0` will also enable + * seqan::hibf::config::track_occupancy. 
*/ void validate_and_set_defaults(); @@ -354,7 +378,7 @@ struct config private: friend class cereal::access; - static constexpr uint32_t version{2}; + static constexpr uint32_t version{3u}; template <typename archive_t> void serialize(archive_t & archive) @@ -371,9 +395,16 @@ struct config archive(CEREAL_NVP(sketch_bits)); archive(CEREAL_NVP(tmax)); - if (parsed_version > 1u) + if (parsed_version >= 2u) + { archive(CEREAL_NVP(empty_bin_fraction)); + if (parsed_version >= 3u) + archive(CEREAL_NVP(track_occupancy)); + else + track_occupancy = empty_bin_fraction != 0.0; + } + archive(CEREAL_NVP(alpha)); archive(CEREAL_NVP(max_rearrangement_ratio)); archive(CEREAL_NVP(disable_estimate_union)); diff --git a/src/build/construct_ibf.cpp b/src/build/construct_ibf.cpp index ef529650..68abcffc 100644 --- a/src/build/construct_ibf.cpp +++ b/src/build/construct_ibf.cpp @@ -51,7 +51,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s seqan::hibf::interleaved_bloom_filter ibf{bin_count, bin_size, seqan::hibf::hash_function_count{data.config.number_of_hash_functions}, - data.config.empty_bin_fraction > 0.0}; + data.config.track_occupancy}; local_index_allocation_timer.stop(); data.index_allocation_timer += local_index_allocation_timer; diff --git a/src/config.cpp b/src/config.cpp index b9f95d34..cb39b169 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -116,6 +116,9 @@ void config::validate_and_set_defaults() if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."}; + if (empty_bin_fraction != 0.0) + track_occupancy = true; + if (alpha < 0.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."}; diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index 05721d67..47a6dd4a 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -105,7 +105,7 @@
interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_ interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins}, seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)}, seqan::hibf::hash_function_count{configuration.number_of_hash_functions}, - configuration.empty_bin_fraction > 0.0} + configuration.track_occupancy} { size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u); diff --git a/test/snippet/hibf/hibf_construction.cpp b/test/snippet/hibf/hibf_construction.cpp index a6fc6811..2f2713b2 100644 --- a/test/snippet/hibf/hibf_construction.cpp +++ b/test/snippet/hibf/hibf_construction.cpp @@ -28,6 +28,8 @@ int main() .threads = 1, // recommended to adapt .sketch_bits = 12, .tmax = 0, // triggers default copmutation + .empty_bin_fraction = 0.0, + .track_occupancy = false, .alpha = 1.2, .max_rearrangement_ratio = 0.5, .disable_estimate_union = false, diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index 02adfb6e..5162908b 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -37,7 +37,7 @@ TEST(config_test, write_to) std::string const expected_file{"@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 2,\n" + "@ \"version\": 3,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -46,6 +46,7 @@ TEST(config_test, write_to) "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" "@ \"empty_bin_fraction\": 0.0,\n" + "@ \"track_occupancy\": false,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" @@ -62,7 +63,7 @@ TEST(config_test, read_from) std::stringstream ss{"@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 2,\n" + "@ \"version\": 3,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -71,6 +72,7
@@ TEST(config_test, read_from) "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" "@ \"empty_bin_fraction\": 0.5,\n" + "@ \"track_occupancy\": true,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" @@ -90,6 +92,7 @@ TEST(config_test, read_from) EXPECT_EQ(configuration.sketch_bits, 8); EXPECT_EQ(configuration.tmax, 128); EXPECT_EQ(configuration.empty_bin_fraction, 0.5); + EXPECT_EQ(configuration.track_occupancy, true); EXPECT_EQ(configuration.alpha, 1.0); EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); EXPECT_EQ(configuration.disable_estimate_union, true); @@ -134,6 +137,46 @@ TEST(config_test, read_from_v1) EXPECT_EQ(configuration.disable_rearrangement, false); } +TEST(config_test, read_from_v2) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_fpr\": 0.0001,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.5,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + seqan::hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_fpr, 0.0001); + EXPECT_EQ(configuration.relaxed_fpr, 0.3); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.5); + EXPECT_EQ(configuration.track_occupancy, true); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + 
EXPECT_EQ(configuration.disable_rearrangement, false); +} + TEST(config_test, read_from_with_more_meta) { std::stringstream ss{"@blah some chopper stuff\n" @@ -144,7 +187,7 @@ TEST(config_test, read_from_with_more_meta) "@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 1,\n" + "@ \"version\": 3,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -152,6 +195,8 @@ TEST(config_test, read_from_with_more_meta) "@ \"threads\": 31,\n" "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.0,\n" + "@ \"track_occupancy\": true,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" @@ -170,6 +215,8 @@ TEST(config_test, read_from_with_more_meta) EXPECT_EQ(configuration.threads, 31); EXPECT_EQ(configuration.sketch_bits, 8); EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.0); + EXPECT_EQ(configuration.track_occupancy, true); EXPECT_EQ(configuration.alpha, 1.0); EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); EXPECT_EQ(configuration.disable_estimate_union, true); @@ -349,6 +396,20 @@ TEST(config_test, validate_and_set_defaults) "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); } + // empty_bin_fraction != 0.0 also enables tracking occupancy + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .empty_bin_fraction = 0.0, + .track_occupancy = false}; + configuration.validate_and_set_defaults(); + EXPECT_EQ(configuration.track_occupancy, false); + + configuration.empty_bin_fraction = 0.3; + configuration.validate_and_set_defaults(); + EXPECT_EQ(configuration.track_occupancy, true); + } + // alpha must be positive { seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1}; @@ -413,6 +474,8 @@ TEST(config_test, serialisation) .threads = 31, .sketch_bits = 8, .tmax = 
128, + .empty_bin_fraction = 0.13, + .track_occupancy = true, .alpha = 1.0, .max_rearrangement_ratio = 0.333, .disable_estimate_union = true,