diff --git a/.github/workflows/benchmarks-test.yml b/.github/workflows/benchmarks-test.yml index c957c0c..a1e20d8 100644 --- a/.github/workflows/benchmarks-test.yml +++ b/.github/workflows/benchmarks-test.yml @@ -2,55 +2,57 @@ name: CMake Build and Run Benchmark Tests on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: build-and-test: runs-on: ubuntu-latest - + steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Create Build Directory - run: mkdir build + - name: Create Build Directory + run: mkdir build - - name: Configure CMake - working-directory: ./build - run: cmake -DDISABLE_AVX512=ON -DENABLE_ADDRESS_SANITIZER=ON .. + - name: Configure CMake + working-directory: ./build + run: cmake -DDISABLE_AVX512=ON -DENABLE_ADDRESS_SANITIZER=ON .. - - name: Build Project - working-directory: ./build - run: make -j + - name: Build Project + working-directory: ./build + run: make -j - - name: Run Benchmark Tests - working-directory: ./build - run: ./benchmark_tests - - build-and-test-with-SDE: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 + - name: Run Benchmark Tests + working-directory: ./build + run: ./benchmark_tests + + # Note! Benchmark tests on SDE are taking too long to run on CI. 
+ # Need to find a better way to check with AVX-512 + # build-and-test-with-SDE: + # runs-on: ubuntu-latest + + # steps: + # - uses: actions/checkout@v4 - - name: Create Build Directory - run: mkdir build + # - name: Create Build Directory + # run: mkdir build - - name: Download and Unpack SDE - working-directory: ./build - run: | - wget https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - tar xf sde-external-9.58.0-2025-06-16-lin.tar.xz + # - name: Download and Unpack SDE + # working-directory: ./build + # run: | + # wget https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz + # tar xf sde-external-9.58.0-2025-06-16-lin.tar.xz - - name: Configure CMake - working-directory: ./build - run: cmake -DENABLE_ADDRESS_SANITIZER=ON -DMARCH=icelake-client -DHAVE_STD_REGEX=ON .. + # - name: Configure CMake + # working-directory: ./build + # run: cmake -DENABLE_ADDRESS_SANITIZER=ON -DMARCH=icelake-client -DHAVE_STD_REGEX=ON .. - - name: Build Project - working-directory: ./build - run: make -j + # - name: Build Project + # working-directory: ./build + # run: make -j - - name: Run Benchmark Tests - working-directory: ./build - run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./benchmark_tests \ No newline at end of file + # - name: Run Benchmark Tests + # working-directory: ./build + # run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./benchmark_tests diff --git a/include/pixie/bits.h b/include/pixie/bits.h index c7f0091..d1a170b 100644 --- a/include/pixie/bits.h +++ b/include/pixie/bits.h @@ -46,17 +46,17 @@ static inline const __m256i mask_first_half = _mm256_setr_epi8( // clang-format on #endif -inline uint64_t first_bits_mask(size_t num) { +static inline uint64_t first_bits_mask(size_t num) { return num >= 64 ? UINT64_MAX : ((1llu << num) - 1); } /** * @brief Number of 1 bits in positions 0 .. 
count - 1 - * @details - * Surprisingly one cannot just do (1 << L) - 1 for - * L > 64 to produce mask of ones of length L. - * The best we can do with a single instruction is (1 << L) - 1 for L=8k - * using maskz_set1. + * @details Assumes count + * < 512. + * @details Surprisingly one cannot just do (1 << L) - 1 for L > 64 to + * produce mask of ones of length L. The best we can do with a single + * instruction is (1 << L) - 1 for L=8k using maskz_set1. * * To make mask for arbitrary L we use shldv instuction, it doesn't * really matter what epi is used the recipe is the following: @@ -70,7 +70,7 @@ inline uint64_t first_bits_mask(size_t num) { * The rest is standard, i.e. popcount_epi64 to perform popcount on * 64 bits and then reduce_add to sum the result. */ -uint64_t rank_512(const uint64_t* x, uint64_t count) { +static inline uint64_t rank_512(const uint64_t* x, uint64_t count) { #ifdef PIXIE_AVX512_SUPPORT __m512i a = _mm512_maskz_set1_epi64((1ull << ((count >> 6))) - 1, @@ -105,7 +105,7 @@ uint64_t rank_512(const uint64_t* x, uint64_t count) { /** * @brief Return position of @p rank 1 bit in @p x */ -uint64_t select_64(uint64_t x, uint64_t rank) { +static inline uint64_t select_64(uint64_t x, uint64_t rank) { return _tzcnt_u64(_pdep_u64(1ull << rank, x)); } @@ -126,18 +126,34 @@ uint64_t select_64(uint64_t x, uint64_t rank) { * It can also be used as an alternative for linear search but i don't * see a proper SIMD algorithm to make it faster. 
*/ -uint64_t select_512(const uint64_t* x, uint64_t rank) { +static inline uint64_t select_512(const uint64_t* x, uint64_t rank) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); - alignas(64) std::array counts; - _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); - - size_t i = 0; - while (i < 8 && counts[i] <= rank) { - rank -= counts[i++]; + __m512i counts = _mm512_popcnt_epi64(res); + __m512i prefix = counts; + + const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0); + const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0); + const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0); + + __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + + __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank)); + uint32_t i = _tzcnt_u32(static_cast(mask)); + uint64_t prev = 0; + if (i != 0) { + __m512i idx_prev = _mm512_set1_epi64(static_cast(i - 1)); + __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix); + prev = static_cast( + _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec))); } - return i * 64 + select_64(x[i], rank); + return i * 64 + select_64(x[i], rank - prev); #else @@ -156,19 +172,35 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { * @brief Return position of @p rank0 0 bit in @p x * @details select_512 with bit inversion. 
*/ -uint64_t select0_512(const uint64_t* x, uint64_t rank0) { +static inline uint64_t select0_512(const uint64_t* x, uint64_t rank0) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1)); - alignas(64) std::array counts; - _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); - - size_t i = 0; - while (i < 8 && counts[i] <= rank0) { - rank0 -= counts[i++]; + __m512i counts = _mm512_popcnt_epi64(res); + __m512i prefix = counts; + + const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0); + const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0); + const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0); + + __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + + __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank0)); + uint32_t i = _tzcnt_u32(static_cast(mask)); + uint64_t prev = 0; + if (i != 0) { + __m512i idx_prev = _mm512_set1_epi64(static_cast(i - 1)); + __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix); + prev = static_cast( + _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec))); } - return i * 64 + select_64(~x[i], rank0); + return i * 64 + select_64(~x[i], rank0 - prev); #else @@ -187,7 +219,7 @@ uint64_t select0_512(const uint64_t* x, uint64_t rank0) { * @brief Compare 4 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x */ -uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { +static inline uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_4 = _mm256_set1_epi64x(y); @@ -236,10 +268,10 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { * 
offsets. * @param delta_scalar Shared delta offset. */ -uint16_t lower_bound_delta_4x64(const uint64_t* x, - uint64_t y, - const uint64_t* delta_array, - uint64_t delta_scalar) { +static inline uint16_t lower_bound_delta_4x64(const uint64_t* x, + uint64_t y, + const uint64_t* delta_array, + uint64_t delta_scalar) { #ifdef PIXIE_AVX512_SUPPORT const __m256i dlt_256 = _mm256_loadu_epi64(delta_array); @@ -290,7 +322,7 @@ uint16_t lower_bound_delta_4x64(const uint64_t* x, * @brief Compare 8 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x */ -uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { +static inline uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_8 = _mm512_set1_epi64(y); @@ -336,10 +368,10 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { * offsets. * @param delta_scalar Shared delta offset. */ -uint16_t lower_bound_delta_8x64(const uint64_t* x, - uint64_t y, - const uint64_t* delta_array, - uint64_t delta_scalar) { +static inline uint16_t lower_bound_delta_8x64(const uint64_t* x, + uint64_t y, + const uint64_t* delta_array, + uint64_t delta_scalar) { #ifdef PIXIE_AVX512_SUPPORT const __m512i dlt_512 = _mm512_loadu_epi64(delta_array); @@ -591,10 +623,14 @@ void popcount_32x8(const uint8_t* x, uint8_t* result) { /** * @brief Calculates 32 bit ranks of every 8th bit, result is stored as 32 - * 8-bit integers. + + * * 8-bit integers. + * @details Prefix sums are computed modulo 256 (uint8_t + * wraparound). 
* * @param x Pointer to 32 input 8-bit integers - * @param result Pointer to store the resulting 32 8-bit integers + * @param + * result Pointer to store the resulting 32 8-bit integers */ void rank_32x8(const uint8_t* x, uint8_t* result) { #ifdef PIXIE_AVX512_SUPPORT diff --git a/include/pixie/bitvector.h b/include/pixie/bitvector.h index 4012cd2..66791ce 100644 --- a/include/pixie/bitvector.h +++ b/include/pixie/bitvector.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -61,18 +62,18 @@ class BitVector { constexpr static size_t kSuperBlockRankIntSize = 64; constexpr static size_t kBasicBlockRankIntSize = 16; constexpr static size_t kBasicBlockSize = 512; - constexpr static size_t kSuperBlockSize = 65536; constexpr static size_t kWordsPerBlock = 8; - constexpr static size_t kSelectSampleFrequency = 16384; + constexpr static size_t kSuperBlockSize = 65536; constexpr static size_t kBlocksPerSuperBlock = 128; + constexpr static size_t kSelectSampleFrequency = 16384; alignas(64) uint64_t delta_super[8]; alignas(64) uint16_t delta_basic[32]; - std::vector super_block_rank_; - std::vector basic_block_rank_; - std::vector select1_samples_; - std::vector select0_samples_; + AlignedStorage super_block_rank_; // 64-bit global prefix sums + AlignedStorage basic_block_rank_; // 16-bit local prefix sums + AlignedStorage select1_samples_; // 64-bit global positions + AlignedStorage select0_samples_; // 64-bit global positions const size_t num_bits_; const size_t padded_size_; size_t max_rank_; @@ -89,21 +90,24 @@ class BitVector { // This reduces branching in select num_superblocks = ((num_superblocks + 7) / 8) * 8; size_t num_basicblocks = num_superblocks * kBlocksPerSuperBlock; - super_block_rank_.resize(num_superblocks); - basic_block_rank_.resize(num_basicblocks); + super_block_rank_.resize(num_superblocks * 64); + basic_block_rank_.resize(num_basicblocks * 16); + + auto super_block_rank = super_block_rank_.As64BitInts(); + auto basic_block_rank = 
basic_block_rank_.As16BitInts(); uint64_t super_block_sum = 0; uint16_t basic_block_sum = 0; - for (size_t i = 0; i / kBasicBlockSize < basic_block_rank_.size(); + for (size_t i = 0; i / kBasicBlockSize < basic_block_rank.size(); i += kWordSize) { if (i % kSuperBlockSize == 0) { super_block_sum += basic_block_sum; - super_block_rank_[i / kSuperBlockSize] = super_block_sum; + super_block_rank[i / kSuperBlockSize] = super_block_sum; basic_block_sum = 0; } if (i % kBasicBlockSize == 0) { - basic_block_rank_[i / kBasicBlockSize] = basic_block_sum; + basic_block_rank[i / kBasicBlockSize] = basic_block_sum; } if (i / kWordSize < bits_.size()) { basic_block_sum += std::popcount(bits_[i / kWordSize]); @@ -120,8 +124,23 @@ class BitVector { uint64_t milestone0 = kSelectSampleFrequency; uint64_t rank = 0; uint64_t rank0 = 0; - select1_samples_.emplace_back(0); - select0_samples_.emplace_back(0); + + size_t num_one_samples = + 1 + (max_rank_ + kSelectSampleFrequency - 1) / kSelectSampleFrequency; + size_t num_zero_samples = + 1 + (num_bits_ - max_rank_ + kSelectSampleFrequency - 1) / + kSelectSampleFrequency; + + select1_samples_.resize(num_one_samples * 64); + select0_samples_.resize(num_zero_samples * 64); + auto select1_samples = select1_samples_.As64BitInts(); + auto select0_samples = select0_samples_.As64BitInts(); + + select1_samples[0] = 0; + select0_samples[0] = 0; + + size_t num_zeros = 1, num_ones = 1; + for (size_t i = 0; i < bits_.size(); ++i) { auto ones = std::popcount(bits_[i]); auto zeros = 64 - ones; @@ -129,12 +148,12 @@ class BitVector { auto pos = select_64(bits_[i], milestone - rank - 1); // TODO: try including global rank into select samples to save // a cache miss on global rank scan - select1_samples_.emplace_back((64 * i + pos) / kSuperBlockSize); + select1_samples[num_ones++] = (64 * i + pos) / kSuperBlockSize; milestone += kSelectSampleFrequency; } if (rank0 + zeros >= milestone0) { auto pos = select_64(~bits_[i], milestone0 - rank0 - 1); - 
select0_samples_.emplace_back((64 * i + pos) / kSuperBlockSize); + select0_samples[num_zeros++] = (64 * i + pos) / kSuperBlockSize; milestone0 += kSelectSampleFrequency; } rank += ones; @@ -155,23 +174,26 @@ class BitVector { * rank of the 1-bit to locate. */ uint64_t find_superblock(uint64_t rank) const { - uint64_t left = select1_samples_[rank / kSelectSampleFrequency]; + auto select1_samples = select1_samples_.AsConst64BitInts(); + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + + uint64_t left = select1_samples[rank / kSelectSampleFrequency]; - while (left + 7 < super_block_rank_.size()) { - auto len = lower_bound_8x64(&super_block_rank_[left], rank); + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_8x64(&super_block_rank[left], rank); if (len < 8) { return left + len - 1; } left += 8; } - if (left + 3 < super_block_rank_.size()) { - auto len = lower_bound_4x64(&super_block_rank_[left], rank); + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_4x64(&super_block_rank[left], rank); if (len < 4) { return left + len - 1; } left += 4; } - while (left < super_block_rank_.size() && super_block_rank_[left] < rank) { + while (left < super_block_rank.size() && super_block_rank[left] < rank) { left++; } return left - 1; @@ -183,26 +205,29 @@ class BitVector { * rank of the 0-bit to locate. 
*/ uint64_t find_superblock_zeros(uint64_t rank0) const { - uint64_t left = select0_samples_[rank0 / kSelectSampleFrequency]; + auto select0_samples = select0_samples_.AsConst64BitInts(); + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + + uint64_t left = select0_samples[rank0 / kSelectSampleFrequency]; - while (left + 7 < super_block_rank_.size()) { - auto len = lower_bound_delta_8x64(&super_block_rank_[left], rank0, + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_delta_8x64(&super_block_rank[left], rank0, delta_super, kSuperBlockSize * left); if (len < 8) { return left + len - 1; } left += 8; } - if (left + 3 < super_block_rank_.size()) { - auto len = lower_bound_delta_4x64(&super_block_rank_[left], rank0, + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_delta_4x64(&super_block_rank[left], rank0, delta_super, kSuperBlockSize * left); if (len < 4) { return left + len - 1; } left += 4; } - while (left < super_block_rank_.size() && - kSuperBlockSize * left - super_block_rank_[left] < rank0) { + while (left < super_block_rank.size() && + kSuperBlockSize * left - super_block_rank[left] < rank0) { left++; } return left - 1; @@ -220,9 +245,11 @@ class BitVector { * * 4 iterations. */ uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const { + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; } @@ -242,9 +269,10 @@ class BitVector { * 4 iterations. 
*/ uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const { + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; @@ -271,15 +299,18 @@ class BitVector { * we backoff to find_basicblock */ uint64_t find_basicblock_is(uint16_t local_rank, uint64_t s_block) const { - auto lower = super_block_rank_[s_block]; - auto upper = super_block_rank_[s_block + 1]; + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + + auto lower = super_block_rank[s_block]; + auto upper = super_block_rank[s_block + 1]; uint64_t pos = kBlocksPerSuperBlock * local_rank / (upper - lower); pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 
96 : pos; while (pos < 96) { auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -290,7 +321,7 @@ class BitVector { } pos = 96; auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -317,16 +348,19 @@ class BitVector { */ uint64_t find_basicblock_is_zeros(uint16_t local_rank0, uint64_t s_block) const { - auto lower = kSuperBlockSize * s_block - super_block_rank_[s_block]; + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + + auto lower = kSuperBlockSize * s_block - super_block_rank[s_block]; auto upper = - kSuperBlockSize * (s_block + 1) - super_block_rank_[s_block + 1]; + kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1]; uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower); pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 
96 : pos; while (pos < 96) { auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); @@ -338,7 +372,7 @@ class BitVector { } pos = 96; auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); @@ -389,10 +423,14 @@ class BitVector { if (pos >= bits_.size() * kWordSize) [[unlikely]] { return max_rank_; } + + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t b_block = pos / kBasicBlockSize; uint64_t s_block = pos / kSuperBlockSize; // Precomputed rank - uint64_t result = super_block_rank_[s_block] + basic_block_rank_[b_block]; + uint64_t result = super_block_rank[s_block] + basic_block_rank[b_block]; // Basic block tail result += rank_512(&bits_[b_block * kWordsPerBlock], pos - (b_block * kBasicBlockSize)); @@ -426,10 +464,13 @@ class BitVector { if (rank == 0) [[unlikely]] { return 0; } + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t s_block = find_superblock(rank); - rank -= super_block_rank_[s_block]; + rank -= super_block_rank[s_block]; auto pos = find_basicblock_is(rank, s_block); - rank -= basic_block_rank_[pos]; + rank -= basic_block_rank[pos]; pos *= kWordsPerBlock; // Final search @@ -458,11 +499,14 @@ class BitVector { if (rank0 == 0) [[unlikely]] { return 0; } + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t s_block = 
find_superblock_zeros(rank0); - rank0 -= kSuperBlockSize * s_block - super_block_rank_[s_block]; + rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block]; auto pos = find_basicblock_is_zeros(rank0, s_block); auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1); - rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank_[pos]; + rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos]; pos *= kWordsPerBlock; // Final search diff --git a/include/pixie/cache_line.h b/include/pixie/cache_line.h new file mode 100644 index 0000000..4a6d543 --- /dev/null +++ b/include/pixie/cache_line.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +/** + * @brief A simple struct to represent a aligned storage for a cache line. + */ +struct alignas(64) CacheLine { + std::array data; +}; + +/** + * @brief A simple aligned storage for cache-line sized blocks. + * + * @details Provides typed views over the same underlying storage as cache + * lines, 64-bit words, or bytes. All spans are contiguous and sized to the + * total storage capacity. + */ +class AlignedStorage { + private: + std::vector data_; + + public: + AlignedStorage() = default; + /** + * @brief Construct storage for at least @p bits bytes, rounded up to 512 + * bits. + */ + AlignedStorage(size_t bits) : data_((bits + 511) / 512) {} + + /** + * @brief Resize storage to hold at least @p bits bits, rounded up to 512 + */ + void resize(size_t bits) { data_.resize((bits + 511) / 512); } + /** @brief Mutable view as cache lines. */ + std::span AsLines() { return data_; } + /** @brief Const view as cache lines. */ + std::span AsConstLines() const { return data_; } + + /** + * @brief Mutable view as 64-bit words. + */ + std::span As64BitInts() { + return std::span(reinterpret_cast(data_.data()), + data_.size() * 8); + } + + /** @brief Const view as 64-bit words. 
*/ + std::span AsConst64BitInts() const { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 8); + } + + /** + * @brief Mutable view as bytes. + * @note Uses a byte pointer to the underlying storage. + */ + std::span AsBytes() { + return std::span(reinterpret_cast(data_.data()), + data_.size() * 64); + } + + /** @brief Const view as bytes. */ + std::span AsConstBytes() const { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 64); + } + + /** + * @brief Mutable view as bytes. + * @note Uses a byte pointer to the underlying storage. + */ + std::span As16BitInts() { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 32); + } + + /** @brief Const view as bytes. */ + std::span AsConst16BitInts() const { + return std::span( + reinterpret_cast(data_.data()), + data_.size() * 32); + } +}; diff --git a/src/benchmarks/benchmarks.cpp b/src/benchmarks/benchmarks.cpp index 03efed3..4682fe6 100644 --- a/src/benchmarks/benchmarks.cpp +++ b/src/benchmarks/benchmarks.cpp @@ -1,23 +1,96 @@ #include #include +#include + #ifdef PIXIE_THIRD_PARTY_BENCHMARKS // TODO: change the pasta/bit_vector usage of std::aligned_alloc #include #endif +#include #include #include #include -static void BM_RankNonInterleaved(benchmark::State& state) { - size_t n = state.range(0); +constexpr size_t kBenchmarkRandomCopies = 8; + +#ifdef _WIN32 +#include +void platform_setup() { + // Pin to core 3 + SetThreadAffinityMask(GetCurrentThread(), DWORD_PTR(1) << 3); + SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); + SetProcessPriorityBoost(GetCurrentProcess(), TRUE); +} +#else +#include +void platform_setup() { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(3, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +} +#endif + +static void PrepareRandomBits50pFill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; + 
std::mt19937_64 rng(42); + + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng(); + } +} + +static void PrepareRandomBits12p5Fill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; + std::mt19937_64 rng(42); + + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng() & rng() & rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng() & rng() & rng(); + } +} + +static void PrepareRandomBits87p5Fill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng() | rng() | rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng() | rng() | rng(); } - pixie::BitVector bv(bits, n); +} +static void BM_RankNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + AlignedStorage bits(n); + auto bits_as_words = bits.As64BitInts(); + PrepareRandomBits50pFill(bits_as_words); + pixie::BitVector bv(bits_as_words, n); + + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank(pos)); @@ -26,14 +99,12 @@ static void BM_RankNonInterleaved(benchmark::State& state) { 
static void BM_RankZeroNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); @@ -42,14 +113,12 @@ static void BM_RankZeroNonInterleaved(benchmark::State& state) { static void BM_RankInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVectorInterleaved bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVectorInterleaved bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank(pos)); @@ -58,16 +127,14 @@ static void BM_RankInterleaved(benchmark::State& state) { static void BM_SelectNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); @@ -76,176 +143,143 @@ static void BM_SelectNonInterleaved(benchmark::State& state) { static void BM_SelectZeroNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); 
+ PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); } } -static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_RankNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); - for (auto _ : state) { - uint64_t pos = rng() % n; - benchmark::DoNotOptimize(bv.rank(pos)); + std::mt19937_64 rng(42); + if (std::popcount(n) == 1) { + for (auto _ : state) { + uint64_t pos = rng() & (n - 1); + benchmark::DoNotOptimize(bv.rank(pos)); + } + } else { + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank(pos)); + } } } -static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_RankZeroNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); } } -static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { +static void 
BM_SelectNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); } } -static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_SelectZeroNonInterleaved12p5PercentFill( + benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); } } -static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_RankNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; 
benchmark::DoNotOptimize(bv.rank(pos)); } } -static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_RankZeroNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); } } -static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_SelectNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); } } -static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_SelectZeroNonInterleaved87p5PercentFill( + benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); 
auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); @@ -256,76 +290,89 @@ BENCHMARK(BM_RankInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_RankNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_RankZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_SelectNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_SelectZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankNonInterleaved10PercentFill) +BENCHMARK(BM_RankNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankZeroNonInterleaved10PercentFill) +BENCHMARK(BM_RankZeroNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectNonInterleaved10PercentFill) +BENCHMARK(BM_SelectNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill) +BENCHMARK(BM_SelectZeroNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); 
-BENCHMARK(BM_RankNonInterleaved90PercentFill) +BENCHMARK(BM_RankNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankZeroNonInterleaved90PercentFill) +BENCHMARK(BM_RankZeroNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectNonInterleaved90PercentFill) +BENCHMARK(BM_SelectNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill) +BENCHMARK(BM_SelectZeroNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); diff --git a/src/docs/Doxyfile.in b/src/docs/Doxyfile.in index 630a86e..34796fc 100644 --- a/src/docs/Doxyfile.in +++ b/src/docs/Doxyfile.in @@ -190,7 +190,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where Doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = @CMAKE_CURRENT_SOURCE_DIR@ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -1005,7 +1005,8 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = . include +INPUT = @CMAKE_CURRENT_SOURCE_DIR@/include \ + @CMAKE_CURRENT_SOURCE_DIR@/README.md # This tag can be used to specify the character encoding of the source files # that Doxygen parses. Internally Doxygen uses the UTF-8 encoding. Doxygen uses @@ -1100,7 +1101,7 @@ FILE_PATTERNS = *.c \ # be searched for input files as well. # The default value is: NO. 
-RECURSIVE = NO +RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a @@ -1220,7 +1221,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the Doxygen output. -USE_MDFILE_AS_MAINPAGE = README.md +USE_MDFILE_AS_MAINPAGE = @CMAKE_CURRENT_SOURCE_DIR@/README.md # If the IMPLICIT_DIR_DOCS tag is set to YES, any README.md file found in sub- # directories of the project's root, is used as the documentation for that sub-