Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 32 additions & 35 deletions src/algorithms/cobra.rs
Original file line number Diff line number Diff line change
Expand Up @@ -987,18 +987,21 @@ fn bit_rev_1024<T>(buf: &mut [T]) {
buf.swap(991, 1007);
}

/// ## References
/// [1] <https://www.katjaas.nl/bitreversal/bitreversal.html>
pub(crate) fn bit_rev<T>(buf: &mut [T], log_n: usize) {
// Use unrolled versions for common sizes
/// Dispatches to fully unrolled versions for small sizes
pub(crate) fn bit_rev_unrolled<T>(buf: &mut [T], log_n: usize) {
match log_n {
6 => return bit_rev_64(buf),
7 => return bit_rev_128(buf),
8 => return bit_rev_256(buf),
9 => return bit_rev_512(buf),
10 => return bit_rev_1024(buf),
_ => {} // Fall through to generic implementation
6 => bit_rev_64(buf),
7 => bit_rev_128(buf),
8 => bit_rev_256(buf),
9 => bit_rev_512(buf),
10 => bit_rev_1024(buf),
_ => bit_rev_gray(buf, log_n),
}
}

/// ## References
/// [1] <https://www.katjaas.nl/bitreversal/bitreversal.html>
pub(crate) fn bit_rev_gray<T>(buf: &mut [T], log_n: usize) {
let mut nodd: usize;
let mut noddrev; // to hold bitwise negated or odd values

Expand Down Expand Up @@ -1036,12 +1039,8 @@ pub(crate) fn bit_rev<T>(buf: &mut [T], log_n: usize) {
}
}

#[allow(dead_code)]
#[deprecated(
since = "0.1.0",
note = "Naive bit reverse permutation is slow and not cache friendly. COBRA should be used instead."
)]
pub(crate) fn bit_reverse_permutation<T>(buf: &mut [T]) {
/// Slow, naive implementation of bit reversal
pub(crate) fn bit_rev_naive<T>(buf: &mut [T], _log_n: usize) {
let n = buf.len();
let mut j = 0;

Expand Down Expand Up @@ -1078,23 +1077,23 @@ pub(crate) fn bit_reverse_permutation<T>(buf: &mut [T]) {
"x86+sse4.2",
"x86+sse2",
))]
pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {
pub fn bit_rev_cobra<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {
const { assert!(BLOCK_WIDTH == 1 << LOG_BLOCK_WIDTH) };
if log_n <= 2 * LOG_BLOCK_WIDTH {
bit_rev(v, log_n);
bit_rev_gray(v, log_n);
return;
}
let num_b_bits = log_n - 2 * LOG_BLOCK_WIDTH;
let b_size: usize = 1 << num_b_bits;
let block_width: usize = 1 << LOG_BLOCK_WIDTH;

let mut buffer = [T::default(); BLOCK_WIDTH * BLOCK_WIDTH];

for b in 0..b_size {
let b_rev = b.reverse_bits() >> ((b_size - 1).leading_zeros());

// Copy block to buffer
for a in 0..block_width {
let a_rev = a.reverse_bits() >> ((block_width - 1).leading_zeros());
for a in 0..BLOCK_WIDTH {
let a_rev = a.reverse_bits() >> ((BLOCK_WIDTH - 1).leading_zeros());
for c in 0..BLOCK_WIDTH {
buffer[(a_rev << LOG_BLOCK_WIDTH) | c] =
v[(a << num_b_bits << LOG_BLOCK_WIDTH) | (b << LOG_BLOCK_WIDTH) | c];
Expand All @@ -1103,10 +1102,10 @@ pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {

for c in 0..BLOCK_WIDTH {
// NOTE: Typo in original pseudocode by Carter and Gatlin at the following line:
let c_rev = c.reverse_bits() >> ((block_width - 1).leading_zeros());
let c_rev = c.reverse_bits() >> ((BLOCK_WIDTH - 1).leading_zeros());

for a_rev in 0..BLOCK_WIDTH {
let a = a_rev.reverse_bits() >> ((block_width - 1).leading_zeros());
let a = a_rev.reverse_bits() >> ((BLOCK_WIDTH - 1).leading_zeros());

// To guarantee each value is swapped only one time:
// index < reversed_index <-->
Expand All @@ -1130,9 +1129,9 @@ pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {

// Copy changes that were swapped into buffer above:
for a in 0..BLOCK_WIDTH {
let a_rev = a.reverse_bits() >> ((block_width - 1).leading_zeros());
let a_rev = a.reverse_bits() >> ((BLOCK_WIDTH - 1).leading_zeros());
for c in 0..BLOCK_WIDTH {
let c_rev = c.reverse_bits() >> ((block_width - 1).leading_zeros());
let c_rev = c.reverse_bits() >> ((BLOCK_WIDTH - 1).leading_zeros());
let index_less_than_reverse = a < c_rev
|| (a == c_rev && b < b_rev)
|| (a == c_rev && b == b_rev && a_rev < c);
Expand Down Expand Up @@ -1175,30 +1174,30 @@ mod tests {
}

/// Checks COBRA bit reversal against the naive top-down reference
/// permutation for sizes 2^4 through 2^22.
#[test]
fn test_cobra_bit_reversal() {
    for n in 4..23 {
        let big_n = 1 << n;
        let mut v: Vec<_> = (0..big_n).collect();
        bit_rev_cobra(&mut v, n);

        // Reference: permutation produced by the straightforward
        // top-down recursive split.
        let x: Vec<_> = (0..big_n).collect();
        assert_eq!(v, top_down_bit_reverse_permutation(&x));
    }
}

#[test]
fn bit_reversal() {
fn test_gray_bit_reversal() {
let n = 3;
let big_n = 1 << n;
let mut buf: Vec<f64> = (0..big_n).map(f64::from).collect();
bit_rev(&mut buf, n);
bit_rev_gray(&mut buf, n);
println!("{buf:?}");
assert_eq!(buf, vec![0.0, 4.0, 2.0, 6.0, 1.0, 5.0, 3.0, 7.0]);

let n = 4;
let big_n = 1 << n;
let mut buf: Vec<f64> = (0..big_n).map(f64::from).collect();
bit_rev(&mut buf, n);
bit_rev_gray(&mut buf, n);
println!("{buf:?}");
assert_eq!(
buf,
Expand All @@ -1210,17 +1209,15 @@ mod tests {
}

#[test]
fn naive_bit_reverse_permutation() {
fn test_naive_bit_reversal() {
for n in 2..24 {
let big_n = 1 << n;
let mut actual_re: Vec<f64> = (0..big_n).map(f64::from).collect();
let mut actual_im: Vec<f64> = (0..big_n).map(f64::from).collect();

#[allow(deprecated)]
bit_reverse_permutation(&mut actual_re);
bit_rev_naive(&mut actual_re, n);

#[allow(deprecated)]
bit_reverse_permutation(&mut actual_im);
bit_rev_naive(&mut actual_im, n);

let input_re: Vec<f64> = (0..big_n).map(f64::from).collect();
let expected_re = top_down_bit_reverse_permutation(&input_re);
Expand Down
10 changes: 5 additions & 5 deletions src/algorithms/dif.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
//! 2. Works up to the last stage, with log(N) stages in total.
//! 3. Optionally apply bit-reversal at the end
//!
use crate::algorithms::cobra::cobra_apply;
use crate::algorithms::cobra::bit_rev_cobra;
use crate::kernels::common::fft_chunk_2;
use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_4, fft_chunk_n};
use crate::options::Options;
Expand Down Expand Up @@ -121,8 +121,8 @@ pub fn fft_64_with_opts_and_plan(
if opts.dif_perform_bit_reversal {
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, n),
|| cobra_apply(imags, n),
|| bit_rev_cobra(reals, n),
|| bit_rev_cobra(imags, n),
);
}

Expand Down Expand Up @@ -224,8 +224,8 @@ pub fn fft_32_with_opts_and_plan(
if opts.dif_perform_bit_reversal {
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, n),
|| cobra_apply(imags, n),
|| bit_rev_cobra(reals, n),
|| bit_rev_cobra(imags, n),
);
}

Expand Down
9 changes: 4 additions & 5 deletions src/algorithms/dit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
//! DIT starts with fine-grained memory access and progressively works with
//! larger contiguous chunks.
//!
use crate::algorithms::cobra::cobra_apply;
use crate::kernels::dit::{
fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32,
fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32,
Expand Down Expand Up @@ -250,8 +249,8 @@ pub fn fft_64_dit_with_planner_and_opts(
// DIT requires bit-reversed input
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, log_n),
|| cobra_apply(imags, log_n),
|| (planner.bit_rev_impl)(reals, log_n),
|| (planner.bit_rev_impl)(imags, log_n),
);

// Handle inverse FFT
Expand Down Expand Up @@ -293,8 +292,8 @@ pub fn fft_32_dit_with_planner_and_opts(
// DIT requires bit-reversed input
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, log_n),
|| cobra_apply(imags, log_n),
|| (planner.bit_rev_impl)(reals, log_n),
|| (planner.bit_rev_impl)(imags, log_n),
);

// Handle inverse FFT
Expand Down
68 changes: 68 additions & 0 deletions src/bencher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
//! Benchmarks various function implementations and returns the fastest one on the current hardware

use std::time::Instant;

use crate::algorithms::cobra::*;
use crate::planner::BitRevFunc;

/// Benchmarks each candidate bit-reversal implementation on a copy of
/// `test_data` and returns the one with the lowest total runtime.
///
/// Only the permutation call itself is timed; the buffer reset between
/// iterations is excluded so its memcpy cost cannot dilute the comparison.
///
/// # Panics
/// Panics if `implementations` is empty or `test_data` is empty
/// (`ilog2` on length 0).
fn find_fastest_implementation<T: Default + Copy + Clone>(
    implementations: &[BitRevFunc<T>],
    test_data: &[T],
    iterations: usize,
) -> BitRevFunc<T> {
    // The buffer length — and therefore log_n — is the same for every
    // candidate, so compute it once instead of per implementation.
    let log_n = test_data.len().ilog2() as usize;
    let mut results = Vec::with_capacity(implementations.len());

    for &func in implementations {
        // Fresh working copy for this implementation.
        let mut data = test_data.to_vec();

        // Warm-up run (caches, branch predictors) — not timed.
        func(&mut data, log_n);

        // Accumulate only the time spent inside the permutation.
        let mut elapsed: u128 = 0;
        for _ in 0..iterations {
            // Reset data between iterations, outside the timed region.
            data.copy_from_slice(test_data);
            let start = Instant::now();
            func(&mut data, log_n);
            elapsed += start.elapsed().as_nanos();
        }

        results.push((func, elapsed));
    }

    results
        .into_iter()
        .min_by_key(|&(_, time)| time)
        .expect("at least one implementation must be provided")
        .0
}

/// Empirically selects the fastest bit-reversal implementation on the
/// current hardware for buffers of length `2^log_n`.
///
/// Benchmarks every known implementation on a zero-initialized buffer;
/// the buffer contents are irrelevant to the permutation cost, only the
/// length matters.
pub fn measure_fastest_bit_reversal_impl<T: Default + Copy + Clone>(log_n: usize) -> BitRevFunc<T> {
    let implementations: &[BitRevFunc<T>] =
        &[bit_rev_cobra, bit_rev_gray, bit_rev_naive, bit_rev_unrolled];

    let test_data: Vec<T> = vec![T::default(); 1 << log_n];
    let iterations = 10;

    find_fastest_implementation(implementations, &test_data, iterations)
}

/// Heuristically picks a bit-reversal implementation without running any
/// benchmark: the Gray-code walk tends to win for small buffers
/// (`log_n <= 10`), while the cache-oblivious COBRA algorithm wins for
/// larger ones.
pub fn guess_fastest_bit_reversal_impl<T: Default + Copy + Clone>(log_n: usize) -> BitRevFunc<T> {
    match log_n {
        0..=10 => bit_rev_gray,
        _ => bit_rev_cobra,
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::planner::{Direction, Planner32, Planner64, PlannerDit32, PlannerDit64
use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64};

mod algorithms;
mod bencher;
mod kernels;
pub mod options;
mod parallel;
Expand Down
Loading
Loading