Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion benchmarks/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ void evaluateProperties(const std::vector<T> &lines,
}

// Plain value type holding a decomposed number: an unsigned significand, an
// integer exponent and a sign flag. The members are public and are only
// stored here; their interpretation is left to the code that uses the struct.
struct diy_float_t {
  // Stores the three components as given. Members are initialized in
  // declaration order (significand, exponent, is_negative), which matches the
  // order of the initializer list.
  diy_float_t(uint64_t significand, int exponent, bool is_negative)
      : significand(significand), exponent(exponent),
        is_negative(is_negative) {}

  uint64_t significand; // declared exactly once; a second declaration is an error
  int exponent;
  bool is_negative;
};
Expand Down
180 changes: 68 additions & 112 deletions benchmarks/benchutil.h
Original file line number Diff line number Diff line change
@@ -1,136 +1,92 @@
#ifndef BENCHUTIL_H
#define BENCHUTIL_H

#include "counters/event_counter.h"
#include <cfloat>
#include <cstdio>

#if defined(__linux__) || (__APPLE__ && __aarch64__)
#define USING_COUNTERS
#include "counters/event_counter.h"
#else
#include <chrono>
#endif
#include <atomic>
// File-level collector used by the helpers in this header: bench() brackets
// every run with collector.start() / collector.end(), and pretty_print()
// checks collector.has_events() before printing counter-based metrics.
// NOTE(review): this is a non-inline variable definition inside a header;
// presumably the header is included from a single translation unit - confirm.
event_collector collector;

#ifdef USING_COUNTERS
template <class T, class Func>
std::vector<event_count> time_it_ns(const std::vector<T> &lines,
Func&& function, size_t repeat) {
std::vector<event_count> aggregate;
event_collector collector;
bool printed_bug = false;
for (size_t i = 0; i < repeat; i++) {
template <class function_type>
event_aggregate bench(const function_type &&function, size_t min_repeat = 10,
size_t min_time_ns = 400'000'000,
size_t max_repeat = 1000000) {
size_t N = min_repeat;
if (N == 0) {
N = 1;
}
volatile double dontoptimize = 0.0;
// We warmm up first. We warmup for at least 0.4s (by default). This makes
// sure that the processor is in a consistent state.
event_aggregate warm_aggregate{};
for (size_t i = 0; i < N; i++) {
std::atomic_thread_fence(std::memory_order_acquire);
collector.start();
dontoptimize = double(function());
std::atomic_thread_fence(std::memory_order_release);
event_count allocate_count = collector.end();
warm_aggregate << allocate_count;
if ((i + 1 == N) && (warm_aggregate.total_elapsed_ns() < min_time_ns) &&
(N < max_repeat)) {
N *= 10;
}
}
// Actual measure, another 0.4s (by default), this time with a processor
// warmed up.
event_aggregate aggregate{};
for (size_t i = 0; i < N; i++) {
std::atomic_thread_fence(std::memory_order_acquire);
collector.start();
if (function(lines) == 0 && !printed_bug) {
printf("bug\n");
printed_bug = true;
dontoptimize = double(function());
std::atomic_thread_fence(std::memory_order_release);
event_count allocate_count = collector.end();
aggregate << allocate_count;
if ((i + 1 == N) && (aggregate.total_elapsed_ns() < min_time_ns) &&
(N < max_repeat)) {
N *= 10;
}
aggregate.push_back(collector.end());
}
return aggregate;
}

template <class T, class Func>
void pretty_print(const std::vector<T> &lines, const std::string &name,
Func&& function, size_t repeat = 100) {
Func &&function, size_t repeat = 100) {
const size_t number_of_floats = lines.size();
const double volume = static_cast<double>(function(lines));
const double volumeMB = volume / (1024. * 1024.);
const std::vector<event_count> events = time_it_ns(lines, function, repeat);
double average_ns{0};
double min_ns{DBL_MAX};
double cycles_min{DBL_MAX};
double instructions_min{DBL_MAX};
double cycles_avg{0};
double instructions_avg{0};
double branches_min{0};
double branches_avg{0};
double branch_misses_min{0};
double branch_misses_avg{0};
for (event_count e : events) {
const double ns = e.elapsed_ns();
average_ns += ns;
min_ns = std::min(min_ns, ns);

const double cycles = e.cycles();
cycles_avg += cycles;
cycles_min = std::min(cycles_min, cycles);

const double instructions = e.instructions();
instructions_avg += instructions;
instructions_min = std::min(instructions_min, instructions);

const double branches = e.branches();
branches_avg += branches;
branches_min = std::min(branches_min, branches);

const double branch_misses = e.missed_branches();
branch_misses_avg += branch_misses;
branch_misses_min = std::min(branch_misses_min, branch_misses);
}
cycles_avg /= events.size();
instructions_avg /= events.size();
average_ns /= events.size();
branches_avg /= events.size();
const double volumeMB = volume / 1'000'000;
auto agg = bench([&function, &lines]() { return function(lines); }, repeat);

printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
volumeMB * 1000000000 / min_ns,
(average_ns - min_ns) * 100.0 / average_ns);
volumeMB * 1000'000'000 / agg.fastest_elapsed_ns(),
(agg.elapsed_ns() - agg.fastest_elapsed_ns()) * 100.0 /
agg.elapsed_ns());
printf("%8.2f MB ", volumeMB);
printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns);
if (instructions_min > 0) {
printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume,
instructions_min / number_of_floats,
(instructions_avg - instructions_min) * 100.0 / instructions_avg);
printf(" %8.2f ns/f ", agg.fastest_elapsed_ns() / number_of_floats);
printf("%8.2f Mfloat/s\n",
number_of_floats * 1000 / agg.fastest_elapsed_ns());
// We only print out performance counters if they are available.
if (collector.has_events()) {
// Somewhat arbitrarily, we use two new lines for the counters.
printf(" ");
printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ",
agg.fastest_instructions() / volume,
agg.fastest_instructions() / number_of_floats,
(agg.instructions() - agg.fastest_instructions()) * 100.0 /
agg.instructions());

printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume,
cycles_min / number_of_floats,
(cycles_avg - cycles_min) * 100.0 / cycles_avg);
printf(" %8.2f i/c ", instructions_min / cycles_min);
printf(" %8.2f b/f ", branches_avg / number_of_floats);
printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats);
printf(" %8.2f GHz ", cycles_min / min_ns);
}
printf("\n");
}
#else
template <class T, class Func>
std::pair<double, double> time_it_ns(const std::vector<T> &lines,
Func&& function, size_t repeat) {
typename std::chrono::high_resolution_clock::time_point t1, t2;
double average = 0;
double min_value = DBL_MAX;
bool printed_bug = false;
for (size_t i = 0; i < repeat; i++) {
t1 = std::chrono::high_resolution_clock::now();
if (function(lines) == 0 && !printed_bug) {
printf("bug\n");
printed_bug = true;
}
t2 = std::chrono::high_resolution_clock::now();
const double dif =
std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
average += dif;
min_value = std::min(min_value, dif);
printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n",
agg.fastest_cycles() / volume,
agg.fastest_cycles() / number_of_floats,
(agg.cycles() - agg.fastest_cycles()) * 100.0 / agg.cycles());
printf(" ");
printf(" %8.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles());
printf(" %8.2f b/f ", agg.branches() / number_of_floats);
printf(" ");
printf(" %8.2f bm/f ", agg.branch_misses() / number_of_floats);
printf(" %8.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
printf("\n");
}
average /= repeat;
return std::make_pair(min_value, average);
}

template <class T, class Func>
void pretty_print(const std::vector<T> &lines, const std::string &name,
Func&& function, size_t repeat = 100) {
const size_t number_of_floats = lines.size();
const double volume = static_cast<double>(function(lines));
const double volumeMB = volume / (1024. * 1024.);
const std::pair<double, double> result = time_it_ns(lines, function, repeat);

printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
volumeMB * 1000000000 / result.first,
(result.second - result.first) * 100.0 / result.second);
printf("%8.2f MB ", volumeMB);
printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first);
printf(" %8.2f ns/f \n", double(result.first) / number_of_floats);
}

#endif
#endif //// BENCHUTIL_H
Loading