diff --git a/.github/workflows/benchmarks-test.yml b/.github/workflows/benchmarks-test.yml index c957c0c..a1e20d8 100644 --- a/.github/workflows/benchmarks-test.yml +++ b/.github/workflows/benchmarks-test.yml @@ -2,55 +2,57 @@ name: CMake Build and Run Benchmark Tests on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: build-and-test: runs-on: ubuntu-latest - + steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Create Build Directory - run: mkdir build + - name: Create Build Directory + run: mkdir build - - name: Configure CMake - working-directory: ./build - run: cmake -DDISABLE_AVX512=ON -DENABLE_ADDRESS_SANITIZER=ON .. + - name: Configure CMake + working-directory: ./build + run: cmake -DDISABLE_AVX512=ON -DENABLE_ADDRESS_SANITIZER=ON .. - - name: Build Project - working-directory: ./build - run: make -j + - name: Build Project + working-directory: ./build + run: make -j - - name: Run Benchmark Tests - working-directory: ./build - run: ./benchmark_tests - - build-and-test-with-SDE: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 + - name: Run Benchmark Tests + working-directory: ./build + run: ./benchmark_tests + + # Note! Benchmark tests on SDE are taking too long to run on CI. 
+ # Need to find a better way to check with AVX-512 + # build-and-test-with-SDE: + # runs-on: ubuntu-latest + + # steps: + # - uses: actions/checkout@v4 - - name: Create Build Directory - run: mkdir build + # - name: Create Build Directory + # run: mkdir build - - name: Download and Unpack SDE - working-directory: ./build - run: | - wget https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - tar xf sde-external-9.58.0-2025-06-16-lin.tar.xz + # - name: Download and Unpack SDE + # working-directory: ./build + # run: | + # wget https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz + # tar xf sde-external-9.58.0-2025-06-16-lin.tar.xz - - name: Configure CMake - working-directory: ./build - run: cmake -DENABLE_ADDRESS_SANITIZER=ON -DMARCH=icelake-client -DHAVE_STD_REGEX=ON .. + # - name: Configure CMake + # working-directory: ./build + # run: cmake -DENABLE_ADDRESS_SANITIZER=ON -DMARCH=icelake-client -DHAVE_STD_REGEX=ON .. - - name: Build Project - working-directory: ./build - run: make -j + # - name: Build Project + # working-directory: ./build + # run: make -j - - name: Run Benchmark Tests - working-directory: ./build - run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./benchmark_tests \ No newline at end of file + # - name: Run Benchmark Tests + # working-directory: ./build + # run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./benchmark_tests diff --git a/include/pixie/bits.h b/include/pixie/bits.h index c7f0091..d1a170b 100644 --- a/include/pixie/bits.h +++ b/include/pixie/bits.h @@ -46,17 +46,17 @@ static inline const __m256i mask_first_half = _mm256_setr_epi8( // clang-format on #endif -inline uint64_t first_bits_mask(size_t num) { +static inline uint64_t first_bits_mask(size_t num) { return num >= 64 ? UINT64_MAX : ((1llu << num) - 1); } /** * @brief Number of 1 bits in positions 0 .. 
count - 1 - * @details - * Surprisingly one cannot just do (1 << L) - 1 for - * L > 64 to produce mask of ones of length L. - * The best we can do with a single instruction is (1 << L) - 1 for L=8k - * using maskz_set1. + * @details Assumes count + * < 512. + * @details Surprisingly one cannot just do (1 << L) - 1 for L > 64 to + * produce mask of ones of length L. The best we can do with a single + * instruction is (1 << L) - 1 for L=8k using maskz_set1. * * To make mask for arbitrary L we use shldv instuction, it doesn't * really matter what epi is used the recipe is the following: @@ -70,7 +70,7 @@ inline uint64_t first_bits_mask(size_t num) { * The rest is standard, i.e. popcount_epi64 to perform popcount on * 64 bits and then reduce_add to sum the result. */ -uint64_t rank_512(const uint64_t* x, uint64_t count) { +static inline uint64_t rank_512(const uint64_t* x, uint64_t count) { #ifdef PIXIE_AVX512_SUPPORT __m512i a = _mm512_maskz_set1_epi64((1ull << ((count >> 6))) - 1, @@ -105,7 +105,7 @@ uint64_t rank_512(const uint64_t* x, uint64_t count) { /** * @brief Return position of @p rank 1 bit in @p x */ -uint64_t select_64(uint64_t x, uint64_t rank) { +static inline uint64_t select_64(uint64_t x, uint64_t rank) { return _tzcnt_u64(_pdep_u64(1ull << rank, x)); } @@ -126,18 +126,34 @@ uint64_t select_64(uint64_t x, uint64_t rank) { * It can also be used as an alternative for linear search but i don't * see a proper SIMD algorithm to make it faster. 
*/ -uint64_t select_512(const uint64_t* x, uint64_t rank) { +static inline uint64_t select_512(const uint64_t* x, uint64_t rank) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); - alignas(64) std::array counts; - _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); - - size_t i = 0; - while (i < 8 && counts[i] <= rank) { - rank -= counts[i++]; + __m512i counts = _mm512_popcnt_epi64(res); + __m512i prefix = counts; + + const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0); + const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0); + const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0); + + __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + + __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank)); + uint32_t i = _tzcnt_u32(static_cast(mask)); + uint64_t prev = 0; + if (i != 0) { + __m512i idx_prev = _mm512_set1_epi64(static_cast(i - 1)); + __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix); + prev = static_cast( + _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec))); } - return i * 64 + select_64(x[i], rank); + return i * 64 + select_64(x[i], rank - prev); #else @@ -156,19 +172,35 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { * @brief Return position of @p rank0 0 bit in @p x * @details select_512 with bit inversion. 
*/ -uint64_t select0_512(const uint64_t* x, uint64_t rank0) { +static inline uint64_t select0_512(const uint64_t* x, uint64_t rank0) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1)); - alignas(64) std::array counts; - _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); - - size_t i = 0; - while (i < 8 && counts[i] <= rank0) { - rank0 -= counts[i++]; + __m512i counts = _mm512_popcnt_epi64(res); + __m512i prefix = counts; + + const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0); + const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0); + const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0); + + __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix); + prefix = _mm512_add_epi64(prefix, tmp); + + __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank0)); + uint32_t i = _tzcnt_u32(static_cast(mask)); + uint64_t prev = 0; + if (i != 0) { + __m512i idx_prev = _mm512_set1_epi64(static_cast(i - 1)); + __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix); + prev = static_cast( + _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec))); } - return i * 64 + select_64(~x[i], rank0); + return i * 64 + select_64(~x[i], rank0 - prev); #else @@ -187,7 +219,7 @@ uint64_t select0_512(const uint64_t* x, uint64_t rank0) { * @brief Compare 4 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x */ -uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { +static inline uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_4 = _mm256_set1_epi64x(y); @@ -236,10 +268,10 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { * 
offsets. * @param delta_scalar Shared delta offset. */ -uint16_t lower_bound_delta_4x64(const uint64_t* x, - uint64_t y, - const uint64_t* delta_array, - uint64_t delta_scalar) { +static inline uint16_t lower_bound_delta_4x64(const uint64_t* x, + uint64_t y, + const uint64_t* delta_array, + uint64_t delta_scalar) { #ifdef PIXIE_AVX512_SUPPORT const __m256i dlt_256 = _mm256_loadu_epi64(delta_array); @@ -290,7 +322,7 @@ uint16_t lower_bound_delta_4x64(const uint64_t* x, * @brief Compare 8 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x */ -uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { +static inline uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_8 = _mm512_set1_epi64(y); @@ -336,10 +368,10 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { * offsets. * @param delta_scalar Shared delta offset. */ -uint16_t lower_bound_delta_8x64(const uint64_t* x, - uint64_t y, - const uint64_t* delta_array, - uint64_t delta_scalar) { +static inline uint16_t lower_bound_delta_8x64(const uint64_t* x, + uint64_t y, + const uint64_t* delta_array, + uint64_t delta_scalar) { #ifdef PIXIE_AVX512_SUPPORT const __m512i dlt_512 = _mm512_loadu_epi64(delta_array); @@ -591,10 +623,14 @@ void popcount_32x8(const uint8_t* x, uint8_t* result) { /** * @brief Calculates 32 bit ranks of every 8th bit, result is stored as 32 - * 8-bit integers. + + * * 8-bit integers. + * @details Prefix sums are computed modulo 256 (uint8_t + * wraparound). 
* * @param x Pointer to 32 input 8-bit integers - * @param result Pointer to store the resulting 32 8-bit integers + * @param + * result Pointer to store the resulting 32 8-bit integers */ void rank_32x8(const uint8_t* x, uint8_t* result) { #ifdef PIXIE_AVX512_SUPPORT diff --git a/include/pixie/bitvector.h b/include/pixie/bitvector.h index 4012cd2..66791ce 100644 --- a/include/pixie/bitvector.h +++ b/include/pixie/bitvector.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -61,18 +62,18 @@ class BitVector { constexpr static size_t kSuperBlockRankIntSize = 64; constexpr static size_t kBasicBlockRankIntSize = 16; constexpr static size_t kBasicBlockSize = 512; - constexpr static size_t kSuperBlockSize = 65536; constexpr static size_t kWordsPerBlock = 8; - constexpr static size_t kSelectSampleFrequency = 16384; + constexpr static size_t kSuperBlockSize = 65536; constexpr static size_t kBlocksPerSuperBlock = 128; + constexpr static size_t kSelectSampleFrequency = 16384; alignas(64) uint64_t delta_super[8]; alignas(64) uint16_t delta_basic[32]; - std::vector super_block_rank_; - std::vector basic_block_rank_; - std::vector select1_samples_; - std::vector select0_samples_; + AlignedStorage super_block_rank_; // 64-bit global prefix sums + AlignedStorage basic_block_rank_; // 16-bit local prefix sums + AlignedStorage select1_samples_; // 64-bit global positions + AlignedStorage select0_samples_; // 64-bit global positions const size_t num_bits_; const size_t padded_size_; size_t max_rank_; @@ -89,21 +90,24 @@ class BitVector { // This reduces branching in select num_superblocks = ((num_superblocks + 7) / 8) * 8; size_t num_basicblocks = num_superblocks * kBlocksPerSuperBlock; - super_block_rank_.resize(num_superblocks); - basic_block_rank_.resize(num_basicblocks); + super_block_rank_.resize(num_superblocks * 64); + basic_block_rank_.resize(num_basicblocks * 16); + + auto super_block_rank = super_block_rank_.As64BitInts(); + auto basic_block_rank = 
basic_block_rank_.As16BitInts(); uint64_t super_block_sum = 0; uint16_t basic_block_sum = 0; - for (size_t i = 0; i / kBasicBlockSize < basic_block_rank_.size(); + for (size_t i = 0; i / kBasicBlockSize < basic_block_rank.size(); i += kWordSize) { if (i % kSuperBlockSize == 0) { super_block_sum += basic_block_sum; - super_block_rank_[i / kSuperBlockSize] = super_block_sum; + super_block_rank[i / kSuperBlockSize] = super_block_sum; basic_block_sum = 0; } if (i % kBasicBlockSize == 0) { - basic_block_rank_[i / kBasicBlockSize] = basic_block_sum; + basic_block_rank[i / kBasicBlockSize] = basic_block_sum; } if (i / kWordSize < bits_.size()) { basic_block_sum += std::popcount(bits_[i / kWordSize]); @@ -120,8 +124,23 @@ class BitVector { uint64_t milestone0 = kSelectSampleFrequency; uint64_t rank = 0; uint64_t rank0 = 0; - select1_samples_.emplace_back(0); - select0_samples_.emplace_back(0); + + size_t num_one_samples = + 1 + (max_rank_ + kSelectSampleFrequency - 1) / kSelectSampleFrequency; + size_t num_zero_samples = + 1 + (num_bits_ - max_rank_ + kSelectSampleFrequency - 1) / + kSelectSampleFrequency; + + select1_samples_.resize(num_one_samples * 64); + select0_samples_.resize(num_zero_samples * 64); + auto select1_samples = select1_samples_.As64BitInts(); + auto select0_samples = select0_samples_.As64BitInts(); + + select1_samples[0] = 0; + select0_samples[0] = 0; + + size_t num_zeros = 1, num_ones = 1; + for (size_t i = 0; i < bits_.size(); ++i) { auto ones = std::popcount(bits_[i]); auto zeros = 64 - ones; @@ -129,12 +148,12 @@ class BitVector { auto pos = select_64(bits_[i], milestone - rank - 1); // TODO: try including global rank into select samples to save // a cache miss on global rank scan - select1_samples_.emplace_back((64 * i + pos) / kSuperBlockSize); + select1_samples[num_ones++] = (64 * i + pos) / kSuperBlockSize; milestone += kSelectSampleFrequency; } if (rank0 + zeros >= milestone0) { auto pos = select_64(~bits_[i], milestone0 - rank0 - 1); - 
select0_samples_.emplace_back((64 * i + pos) / kSuperBlockSize); + select0_samples[num_zeros++] = (64 * i + pos) / kSuperBlockSize; milestone0 += kSelectSampleFrequency; } rank += ones; @@ -155,23 +174,26 @@ class BitVector { * rank of the 1-bit to locate. */ uint64_t find_superblock(uint64_t rank) const { - uint64_t left = select1_samples_[rank / kSelectSampleFrequency]; + auto select1_samples = select1_samples_.AsConst64BitInts(); + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + + uint64_t left = select1_samples[rank / kSelectSampleFrequency]; - while (left + 7 < super_block_rank_.size()) { - auto len = lower_bound_8x64(&super_block_rank_[left], rank); + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_8x64(&super_block_rank[left], rank); if (len < 8) { return left + len - 1; } left += 8; } - if (left + 3 < super_block_rank_.size()) { - auto len = lower_bound_4x64(&super_block_rank_[left], rank); + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_4x64(&super_block_rank[left], rank); if (len < 4) { return left + len - 1; } left += 4; } - while (left < super_block_rank_.size() && super_block_rank_[left] < rank) { + while (left < super_block_rank.size() && super_block_rank[left] < rank) { left++; } return left - 1; @@ -183,26 +205,29 @@ class BitVector { * rank of the 0-bit to locate. 
*/ uint64_t find_superblock_zeros(uint64_t rank0) const { - uint64_t left = select0_samples_[rank0 / kSelectSampleFrequency]; + auto select0_samples = select0_samples_.AsConst64BitInts(); + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + + uint64_t left = select0_samples[rank0 / kSelectSampleFrequency]; - while (left + 7 < super_block_rank_.size()) { - auto len = lower_bound_delta_8x64(&super_block_rank_[left], rank0, + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_delta_8x64(&super_block_rank[left], rank0, delta_super, kSuperBlockSize * left); if (len < 8) { return left + len - 1; } left += 8; } - if (left + 3 < super_block_rank_.size()) { - auto len = lower_bound_delta_4x64(&super_block_rank_[left], rank0, + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_delta_4x64(&super_block_rank[left], rank0, delta_super, kSuperBlockSize * left); if (len < 4) { return left + len - 1; } left += 4; } - while (left < super_block_rank_.size() && - kSuperBlockSize * left - super_block_rank_[left] < rank0) { + while (left < super_block_rank.size() && + kSuperBlockSize * left - super_block_rank[left] < rank0) { left++; } return left - 1; @@ -220,9 +245,11 @@ class BitVector { * * 4 iterations. */ uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const { + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; } @@ -242,9 +269,10 @@ class BitVector { * 4 iterations. 
*/ uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const { + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; @@ -271,15 +299,18 @@ class BitVector { * we backoff to find_basicblock */ uint64_t find_basicblock_is(uint16_t local_rank, uint64_t s_block) const { - auto lower = super_block_rank_[s_block]; - auto upper = super_block_rank_[s_block + 1]; + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + + auto lower = super_block_rank[s_block]; + auto upper = super_block_rank[s_block + 1]; uint64_t pos = kBlocksPerSuperBlock * local_rank / (upper - lower); pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 
96 : pos; while (pos < 96) { auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -290,7 +321,7 @@ class BitVector { } pos = 96; auto count = lower_bound_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank); + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -317,16 +348,19 @@ class BitVector { */ uint64_t find_basicblock_is_zeros(uint16_t local_rank0, uint64_t s_block) const { - auto lower = kSuperBlockSize * s_block - super_block_rank_[s_block]; + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + + auto lower = kSuperBlockSize * s_block - super_block_rank[s_block]; auto upper = - kSuperBlockSize * (s_block + 1) - super_block_rank_[s_block + 1]; + kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1]; uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower); pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 
96 : pos; while (pos < 96) { auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); @@ -338,7 +372,7 @@ class BitVector { } pos = 96; auto count = lower_bound_delta_32x16( - &basic_block_rank_[kBlocksPerSuperBlock * s_block + pos], local_rank0, + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, delta_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); @@ -389,10 +423,14 @@ class BitVector { if (pos >= bits_.size() * kWordSize) [[unlikely]] { return max_rank_; } + + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t b_block = pos / kBasicBlockSize; uint64_t s_block = pos / kSuperBlockSize; // Precomputed rank - uint64_t result = super_block_rank_[s_block] + basic_block_rank_[b_block]; + uint64_t result = super_block_rank[s_block] + basic_block_rank[b_block]; // Basic block tail result += rank_512(&bits_[b_block * kWordsPerBlock], pos - (b_block * kBasicBlockSize)); @@ -426,10 +464,13 @@ class BitVector { if (rank == 0) [[unlikely]] { return 0; } + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t s_block = find_superblock(rank); - rank -= super_block_rank_[s_block]; + rank -= super_block_rank[s_block]; auto pos = find_basicblock_is(rank, s_block); - rank -= basic_block_rank_[pos]; + rank -= basic_block_rank[pos]; pos *= kWordsPerBlock; // Final search @@ -458,11 +499,14 @@ class BitVector { if (rank0 == 0) [[unlikely]] { return 0; } + auto super_block_rank = super_block_rank_.AsConst64BitInts(); + auto basic_block_rank = basic_block_rank_.AsConst16BitInts(); + uint64_t s_block = 
find_superblock_zeros(rank0); - rank0 -= kSuperBlockSize * s_block - super_block_rank_[s_block]; + rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block]; auto pos = find_basicblock_is_zeros(rank0, s_block); auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1); - rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank_[pos]; + rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos]; pos *= kWordsPerBlock; // Final search diff --git a/include/pixie/cache_line.h b/include/pixie/cache_line.h new file mode 100644 index 0000000..4a6d543 --- /dev/null +++ b/include/pixie/cache_line.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +/** + * @brief A simple struct to represent a aligned storage for a cache line. + */ +struct alignas(64) CacheLine { + std::array data; +}; + +/** + * @brief A simple aligned storage for cache-line sized blocks. + * + * @details Provides typed views over the same underlying storage as cache + * lines, 64-bit words, or bytes. All spans are contiguous and sized to the + * total storage capacity. + */ +class AlignedStorage { + private: + std::vector data_; + + public: + AlignedStorage() = default; + /** + * @brief Construct storage for at least @p bits bytes, rounded up to 512 + * bits. + */ + AlignedStorage(size_t bits) : data_((bits + 511) / 512) {} + + /** + * @brief Resize storage to hold at least @p bits bits, rounded up to 512 + */ + void resize(size_t bits) { data_.resize((bits + 511) / 512); } + /** @brief Mutable view as cache lines. */ + std::span AsLines() { return data_; } + /** @brief Const view as cache lines. */ + std::span AsConstLines() const { return data_; } + + /** + * @brief Mutable view as 64-bit words. + */ + std::span As64BitInts() { + return std::span(reinterpret_cast(data_.data()), + data_.size() * 8); + } + + /** @brief Const view as 64-bit words. 
*/ + std::span AsConst64BitInts() const { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 8); + } + + /** + * @brief Mutable view as bytes. + * @note Uses a byte pointer to the underlying storage. + */ + std::span AsBytes() { + return std::span(reinterpret_cast(data_.data()), + data_.size() * 64); + } + + /** @brief Const view as bytes. */ + std::span AsConstBytes() const { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 64); + } + + /** + * @brief Mutable view as bytes. + * @note Uses a byte pointer to the underlying storage. + */ + std::span As16BitInts() { + return std::span( + reinterpret_cast(data_.data()), data_.size() * 32); + } + + /** @brief Const view as bytes. */ + std::span AsConst16BitInts() const { + return std::span( + reinterpret_cast(data_.data()), + data_.size() * 32); + } +}; diff --git a/src/benchmarks/benchmarks.cpp b/src/benchmarks/benchmarks.cpp index 03efed3..4682fe6 100644 --- a/src/benchmarks/benchmarks.cpp +++ b/src/benchmarks/benchmarks.cpp @@ -1,23 +1,96 @@ #include #include +#include + #ifdef PIXIE_THIRD_PARTY_BENCHMARKS // TODO: change the pasta/bit_vector usage of std::aligned_alloc #include #endif +#include #include #include #include -static void BM_RankNonInterleaved(benchmark::State& state) { - size_t n = state.range(0); +constexpr size_t kBenchmarkRandomCopies = 8; + +#ifdef _WIN32 +#include +void platform_setup() { + // Pin to core 3 + SetThreadAffinityMask(GetCurrentThread(), DWORD_PTR(1) << 3); + SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); + SetProcessPriorityBoost(GetCurrentProcess(), TRUE); +} +#else +#include +void platform_setup() { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(3, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +} +#endif + +static void PrepareRandomBits50pFill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; + 
std::mt19937_64 rng(42); + + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng(); + } +} + +static void PrepareRandomBits12p5Fill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; + std::mt19937_64 rng(42); + + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng() & rng() & rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng() & rng() & rng(); + } +} + +static void PrepareRandomBits87p5Fill(std::span bits) { + auto randomly_filled_length = bits.size() / kBenchmarkRandomCopies; std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); + for (int i = 0; i < randomly_filled_length; ++i) { + bits[i] = rng() | rng() | rng(); + } + for (int i = 1; i < kBenchmarkRandomCopies; ++i) { + std::copy_n(bits.begin(), randomly_filled_length, + bits.begin() + i * randomly_filled_length); + } + for (int i = kBenchmarkRandomCopies * randomly_filled_length; i < bits.size(); + ++i) { + bits[i] = rng() | rng() | rng(); } - pixie::BitVector bv(bits, n); +} +static void BM_RankNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + AlignedStorage bits(n); + auto bits_as_words = bits.As64BitInts(); + PrepareRandomBits50pFill(bits_as_words); + pixie::BitVector bv(bits_as_words, n); + + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank(pos)); @@ -26,14 +99,12 @@ static void BM_RankNonInterleaved(benchmark::State& state) { 
static void BM_RankZeroNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); @@ -42,14 +113,12 @@ static void BM_RankZeroNonInterleaved(benchmark::State& state) { static void BM_RankInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVectorInterleaved bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVectorInterleaved bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank(pos)); @@ -58,16 +127,14 @@ static void BM_RankInterleaved(benchmark::State& state) { static void BM_SelectNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); @@ -76,176 +143,143 @@ static void BM_SelectNonInterleaved(benchmark::State& state) { static void BM_SelectZeroNonInterleaved(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - for (auto& x : bits) { - x = rng(); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); 
+ PrepareRandomBits50pFill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); } } -static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_RankNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); - for (auto _ : state) { - uint64_t pos = rng() % n; - benchmark::DoNotOptimize(bv.rank(pos)); + std::mt19937_64 rng(42); + if (std::popcount(n) == 1) { + for (auto _ : state) { + uint64_t pos = rng() & (n - 1); + benchmark::DoNotOptimize(bv.rank(pos)); + } + } else { + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank(pos)); + } } } -static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_RankZeroNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); } } -static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { +static void 
BM_SelectNonInterleaved12p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); } } -static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) { +static void BM_SelectZeroNonInterleaved12p5PercentFill( + benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.1; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits12p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); } } -static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_RankNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; 
benchmark::DoNotOptimize(bv.rank(pos)); } } -static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_RankZeroNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t pos = rng() % n; benchmark::DoNotOptimize(bv.rank0(pos)); } } -static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_SelectNonInterleaved87p5PercentFill(benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); auto max_rank = bv.rank(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank = rng() % max_rank; benchmark::DoNotOptimize(bv.select(rank)); } } -static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) { +static void BM_SelectZeroNonInterleaved87p5PercentFill( + benchmark::State& state) { size_t n = state.range(0); - std::mt19937_64 rng(42); - std::vector bits(((8 + n / 64) / 8) * 8); - size_t num_ones = n * 0.9; - for (int i = 0; i < num_ones; i++) { - uint64_t pos = rng() % n; - bits[pos / 64] |= (1ULL << pos % 64); - } - - pixie::BitVector bv(bits, n); + AlignedStorage bits(n); + PrepareRandomBits87p5Fill(bits.As64BitInts()); + pixie::BitVector bv(bits.As64BitInts(), n); 
auto max_rank0 = bv.rank0(bv.size()) + 1; + std::mt19937_64 rng(42); for (auto _ : state) { uint64_t rank0 = rng() % max_rank0; benchmark::DoNotOptimize(bv.select0(rank0)); @@ -256,76 +290,89 @@ BENCHMARK(BM_RankInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_RankNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_RankZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_SelectNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); BENCHMARK(BM_SelectZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankNonInterleaved10PercentFill) +BENCHMARK(BM_RankNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankZeroNonInterleaved10PercentFill) +BENCHMARK(BM_RankZeroNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectNonInterleaved10PercentFill) +BENCHMARK(BM_SelectNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill) +BENCHMARK(BM_SelectZeroNonInterleaved12p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); 
-BENCHMARK(BM_RankNonInterleaved90PercentFill) +BENCHMARK(BM_RankNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_RankZeroNonInterleaved90PercentFill) +BENCHMARK(BM_RankZeroNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectNonInterleaved90PercentFill) +BENCHMARK(BM_SelectNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); -BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill) +BENCHMARK(BM_SelectZeroNonInterleaved87p5PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); + ->Iterations(100000) + ->Repetitions(100); diff --git a/src/docs/Doxyfile.in b/src/docs/Doxyfile.in index 630a86e..34796fc 100644 --- a/src/docs/Doxyfile.in +++ b/src/docs/Doxyfile.in @@ -190,7 +190,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where Doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = @CMAKE_CURRENT_SOURCE_DIR@ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -1005,7 +1005,8 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = . include +INPUT = @CMAKE_CURRENT_SOURCE_DIR@/include \ + @CMAKE_CURRENT_SOURCE_DIR@/README.md # This tag can be used to specify the character encoding of the source files # that Doxygen parses. Internally Doxygen uses the UTF-8 encoding. Doxygen uses @@ -1100,7 +1101,7 @@ FILE_PATTERNS = *.c \ # be searched for input files as well. # The default value is: NO. 
-RECURSIVE = NO +RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a @@ -1220,7 +1221,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the Doxygen output. -USE_MDFILE_AS_MAINPAGE = README.md +USE_MDFILE_AS_MAINPAGE = @CMAKE_CURRENT_SOURCE_DIR@/README.md # If the IMPLICIT_DIR_DOCS tag is set to YES, any README.md file found in sub- # directories of the project's root, is used as the documentation for that sub-