diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 00b9f2b..4c0f497 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -86,7 +86,9 @@ void evaluateProperties(const std::vector &lines, } struct diy_float_t { - uint64_t significand; + diy_float_t(uint64_t significand, int exponent, bool is_negative) + : significand(significand), exponent(exponent), is_negative(is_negative) {} + uint64_t significand; int exponent; bool is_negative; }; diff --git a/benchmarks/benchutil.h b/benchmarks/benchutil.h index e88a89e..7ae5ddd 100644 --- a/benchmarks/benchutil.h +++ b/benchmarks/benchutil.h @@ -1,136 +1,92 @@ #ifndef BENCHUTIL_H #define BENCHUTIL_H +#include "counters/event_counter.h" #include #include -#if defined(__linux__) || (__APPLE__ && __aarch64__) - #define USING_COUNTERS - #include "counters/event_counter.h" -#else - #include -#endif +#include +event_collector collector; -#ifdef USING_COUNTERS -template -std::vector time_it_ns(const std::vector &lines, - Func&& function, size_t repeat) { - std::vector aggregate; - event_collector collector; - bool printed_bug = false; - for (size_t i = 0; i < repeat; i++) { +template +event_aggregate bench(const function_type &&function, size_t min_repeat = 10, + size_t min_time_ns = 400'000'000, + size_t max_repeat = 1000000) { + size_t N = min_repeat; + if (N == 0) { + N = 1; + } + volatile double dontoptimize = 0.0; + // We warmm up first. We warmup for at least 0.4s (by default). This makes + // sure that the processor is in a consistent state. + event_aggregate warm_aggregate{}; + for (size_t i = 0; i < N; i++) { + std::atomic_thread_fence(std::memory_order_acquire); + collector.start(); + dontoptimize = double(function()); + std::atomic_thread_fence(std::memory_order_release); + event_count allocate_count = collector.end(); + warm_aggregate << allocate_count; + if ((i + 1 == N) && (warm_aggregate.total_elapsed_ns() < min_time_ns) && + (N < max_repeat)) { + N *= 10; + } + } + // Actual measure, another 0.4s (by default), this time with a processor + // warmed up. + event_aggregate aggregate{}; + for (size_t i = 0; i < N; i++) { + std::atomic_thread_fence(std::memory_order_acquire); collector.start(); - if (function(lines) == 0 && !printed_bug) { - printf("bug\n"); - printed_bug = true; + dontoptimize = double(function()); + std::atomic_thread_fence(std::memory_order_release); + event_count allocate_count = collector.end(); + aggregate << allocate_count; + if ((i + 1 == N) && (aggregate.total_elapsed_ns() < min_time_ns) && + (N < max_repeat)) { + N *= 10; } - aggregate.push_back(collector.end()); } return aggregate; } template void pretty_print(const std::vector &lines, const std::string &name, - Func&& function, size_t repeat = 100) { + Func &&function, size_t repeat = 100) { const size_t number_of_floats = lines.size(); const double volume = static_cast(function(lines)); - const double volumeMB = volume / (1024. * 1024.); - const std::vector events = time_it_ns(lines, function, repeat); - double average_ns{0}; - double min_ns{DBL_MAX}; - double cycles_min{DBL_MAX}; - double instructions_min{DBL_MAX}; - double cycles_avg{0}; - double instructions_avg{0}; - double branches_min{0}; - double branches_avg{0}; - double branch_misses_min{0}; - double branch_misses_avg{0}; - for (event_count e : events) { - const double ns = e.elapsed_ns(); - average_ns += ns; - min_ns = std::min(min_ns, ns); - - const double cycles = e.cycles(); - cycles_avg += cycles; - cycles_min = std::min(cycles_min, cycles); - - const double instructions = e.instructions(); - instructions_avg += instructions; - instructions_min = std::min(instructions_min, instructions); - - const double branches = e.branches(); - branches_avg += branches; - branches_min = std::min(branches_min, branches); - - const double branch_misses = e.missed_branches(); - branch_misses_avg += branch_misses; - branch_misses_min = std::min(branch_misses_min, branch_misses); - } - cycles_avg /= events.size(); - instructions_avg /= events.size(); - average_ns /= events.size(); - branches_avg /= events.size(); + const double volumeMB = volume / 1'000'000; + auto agg = bench([&function, &lines]() { return function(lines); }, repeat); printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(), - volumeMB * 1000000000 / min_ns, - (average_ns - min_ns) * 100.0 / average_ns); + volumeMB * 1000'000'000 / agg.fastest_elapsed_ns(), + (agg.elapsed_ns() - agg.fastest_elapsed_ns()) * 100.0 / + agg.elapsed_ns()); printf("%8.2f MB ", volumeMB); - printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns); - if (instructions_min > 0) { - printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume, - instructions_min / number_of_floats, - (instructions_avg - instructions_min) * 100.0 / instructions_avg); + printf(" %8.2f ns/f ", agg.fastest_elapsed_ns() / number_of_floats); + printf("%8.2f Mfloat/s\n", + number_of_floats * 1000 / agg.fastest_elapsed_ns()); + // We only print out performance counters if they are available. + if (collector.has_events()) { + // Somewhat arbitrarily, we use two new lines for the counters. + printf(" "); + printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", + agg.fastest_instructions() / volume, + agg.fastest_instructions() / number_of_floats, + (agg.instructions() - agg.fastest_instructions()) * 100.0 / + agg.instructions()); - printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume, - cycles_min / number_of_floats, - (cycles_avg - cycles_min) * 100.0 / cycles_avg); - printf(" %8.2f i/c ", instructions_min / cycles_min); - printf(" %8.2f b/f ", branches_avg / number_of_floats); - printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats); - printf(" %8.2f GHz ", cycles_min / min_ns); - } - printf("\n"); -} -#else -template -std::pair time_it_ns(const std::vector &lines, - Func&& function, size_t repeat) { - typename std::chrono::high_resolution_clock::time_point t1, t2; - double average = 0; - double min_value = DBL_MAX; - bool printed_bug = false; - for (size_t i = 0; i < repeat; i++) { - t1 = std::chrono::high_resolution_clock::now(); - if (function(lines) == 0 && !printed_bug) { - printf("bug\n"); - printed_bug = true; - } - t2 = std::chrono::high_resolution_clock::now(); - const double dif = - std::chrono::duration_cast(t2 - t1).count(); - average += dif; - min_value = std::min(min_value, dif); + printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n", + agg.fastest_cycles() / volume, + agg.fastest_cycles() / number_of_floats, + (agg.cycles() - agg.fastest_cycles()) * 100.0 / agg.cycles()); + printf(" "); + printf(" %8.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles()); + printf(" %8.2f b/f ", agg.branches() / number_of_floats); + printf(" "); + printf(" %8.2f bm/f ", agg.branch_misses() / number_of_floats); + printf(" %8.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns()); + printf("\n"); } - average /= repeat; - return std::make_pair(min_value, average); -} - -template -void pretty_print(const std::vector &lines, const std::string &name, - Func&& function, size_t repeat = 100) { - const size_t number_of_floats = lines.size(); - const double volume = static_cast(function(lines)); - const double volumeMB = volume / (1024. * 1024.); - const std::pair result = time_it_ns(lines, function, repeat); - - printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(), - volumeMB * 1000000000 / result.first, - (result.second - result.first) * 100.0 / result.second); - printf("%8.2f MB ", volumeMB); - printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first); - printf(" %8.2f ns/f \n", double(result.first) / number_of_floats); } - -#endif #endif //// BENCHUTIL_H diff --git a/benchmarks/counters/apple_arm_events.h b/benchmarks/counters/apple_arm_events.h index da45900..e4cac1d 100644 --- a/benchmarks/counters/apple_arm_events.h +++ b/benchmarks/counters/apple_arm_events.h @@ -1,3 +1,5 @@ +/* clang-format off */ + // Original design from: // ============================================================================= // XNU kperf/kpc @@ -41,11 +43,11 @@ #include #include -#include // for dlopen() and dlsym() -#include // for mach_absolute_time() -#include // for kdebug trace decode -#include // for sysctl() -#include // for usleep() +#include // for dlopen() and dlsym() +#include // for mach_absolute_time() +#include // for kdebug trace decode +#include // for sysctl() +#include // for usleep() struct performance_counters { double cycles; @@ -57,7 +59,9 @@ struct performance_counters { performance_counters(double c, double b, double m, double i) : cycles(c), branches(b), missed_branches(m), instructions(i) {} performance_counters(double init) - : cycles(init), branches(init), missed_branches(init), + : cycles(init), + branches(init), + missed_branches(init), instructions(init) {} inline performance_counters &operator-=(const performance_counters &other) { @@ -126,17 +130,17 @@ typedef size_t usize; #define KPC_CLASS_RAWPMU (3) // Cross-platform class mask constants. -#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 -#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 -#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 -#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 +#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 // PMU version constants. -#define KPC_PMU_ERROR (0) // Error -#define KPC_PMU_INTEL_V3 (1) // Intel -#define KPC_PMU_ARM_APPLE (2) // ARM64 -#define KPC_PMU_INTEL_V2 (3) // Old Intel -#define KPC_PMU_ARM_V2 (4) // Old ARM +#define KPC_PMU_ERROR (0) // Error +#define KPC_PMU_INTEL_V3 (1) // Intel +#define KPC_PMU_ARM_APPLE (2) // ARM64 +#define KPC_PMU_INTEL_V2 (3) // Old Intel +#define KPC_PMU_ARM_V2 (4) // Old ARM // The maximum number of counters we could read from every class in one go. // ARMV7: FIXED: 1, CONFIGURABLE: 4 @@ -354,19 +358,6 @@ static u64 (*kperf_ticks_to_ns)(u64 ticks); /// CPU ticks frequency (mach_absolute_time). static u64 (*kperf_tick_frequency)(void); -/// Get lightweight PET mode (not in kperf.framework). -static int kperf_lightweight_pet_get(u32 *enabled) { - if (!enabled) - return -1; - usize size = 4; - return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0); -} - -/// Set lightweight PET mode (not in kperf.framework). -static int kperf_lightweight_pet_set(u32 enabled) { - return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4); -} - // ----------------------------------------------------------------------------- // header (reverse engineered) // This framework provides some functions to access the local CPU database. @@ -381,11 +372,11 @@ static int kperf_lightweight_pet_set(u32 enabled) { /// KPEP event (size: 48/28 bytes on 64/32 bit OS) typedef struct kpep_event { - const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". - const char *description; ///< Description for this event. - const char *errata; ///< Errata, currently NULL. - const char *alias; ///< Alias name, such as "Instructions", "Cycles". - const char *fallback; ///< Fallback event name for fixed counter. + const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". + const char *description; ///< Description for this event. + const char *errata; ///< Errata, currently NULL. + const char *alias; ///< Alias name, such as "Instructions", "Cycles". + const char *fallback; ///< Fallback event name for fixed counter. u32 mask; u8 number; u8 umask; @@ -395,25 +386,25 @@ typedef struct kpep_event { /// KPEP database (size: 144/80 bytes on 64/32 bit OS) typedef struct kpep_db { - const char *name; ///< Database name, such as "haswell". - const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". - const char *marketing_name; ///< Marketing name, such as "Intel Haswell". - void *plist_data; ///< Plist data (CFDataRef), currently NULL. - void *event_map; ///< All events (CFDict). + const char *name; ///< Database name, such as "haswell". + const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". + const char *marketing_name; ///< Marketing name, such as "Intel Haswell". + void *plist_data; ///< Plist data (CFDataRef), currently NULL. + void *event_map; ///< All events (CFDict). kpep_event - *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). - kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) - ///< * fixed_counter_count) - void *alias_map; ///< All aliases (CFDict). + *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). + kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) + ///< * fixed_counter_count) + void *alias_map; ///< All aliases (CFDict). usize reserved_1; usize reserved_2; usize reserved_3; - usize event_count; ///< All events count. + usize event_count; ///< All events count. usize alias_count; usize fixed_counter_count; usize config_counter_count; usize power_counter_count; - u32 archtecture; ///< see `KPEP CPU archtecture constants` above. + u32 archtecture; ///< see `KPEP CPU archtecture constants` above. u32 fixed_counter_bits; u32 config_counter_bits; u32 power_counter_bits; @@ -422,14 +413,14 @@ typedef struct kpep_db { /// KPEP config (size: 80/44 bytes on 64/32 bit OS) typedef struct kpep_config { kpep_db *db; - kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL - usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 - usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 - u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 - u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 - usize event_count; /// kpep_config_events_count() + kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL + usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 + usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 + u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 + u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 + usize event_count; /// kpep_config_events_count() usize counter_count; - u32 classes; ///< See `class mask constants` above. + u32 classes; ///< See `class mask constants` above. u32 config_counter; u32 power_counter; u32 reserved; @@ -613,7 +604,7 @@ typedef struct { } lib_symbol; #define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) -#define lib_symbol_def(name) \ +#define lib_symbol_def(name) \ { #name, (void **)&name } static const lib_symbol lib_symbols_kperf[] = { @@ -680,7 +671,7 @@ static const lib_symbol lib_symbols_kperfdata[] = { }; #define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" -#define lib_path_kperfdata \ +#define lib_path_kperfdata \ "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" static bool lib_inited = false; @@ -693,10 +684,8 @@ static void *lib_handle_kperfdata = NULL; static void lib_deinit(void) { lib_inited = false; lib_has_err = false; - if (lib_handle_kperf) - dlclose(lib_handle_kperf); - if (lib_handle_kperfdata) - dlclose(lib_handle_kperfdata); + if (lib_handle_kperf) dlclose(lib_handle_kperf); + if (lib_handle_kperfdata) dlclose(lib_handle_kperfdata); lib_handle_kperf = NULL; lib_handle_kperfdata = NULL; for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { @@ -710,16 +699,15 @@ static void lib_deinit(void) { } static bool lib_init(void) { -#define return_err() \ - do { \ - lib_deinit(); \ - lib_inited = true; \ - lib_has_err = true; \ - return false; \ +#define return_err() \ + do { \ + lib_deinit(); \ + lib_inited = true; \ + lib_has_err = true; \ + return false; \ } while (false) - if (lib_inited) - return !lib_has_err; + if (lib_inited) return !lib_has_err; // load dynamic library lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); @@ -831,133 +819,44 @@ typedef struct { // kdebug utils // ----------------------------------------------------------------------------- -/// Clean up trace buffers and reset ktrace/kdebug/kperf. -/// @return 0 on success. -static int kdebug_reset(void) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE}; - return sysctl(mib, 3, NULL, NULL, NULL, 0); -} - -/// Disable and reinitialize the trace buffers. -/// @return 0 on success. -static int kdebug_reinit(void) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP}; - return sysctl(mib, 3, NULL, NULL, NULL, 0); -} - -/// Set debug filter. -static int kdebug_setreg(kd_regtype *kdr) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG}; - usize size = sizeof(kd_regtype); - return sysctl(mib, 3, kdr, &size, NULL, 0); -} - -/// Set maximum number of trace entries (kd_buf). -/// Only allow allocation up to half the available memory (sane_size). -/// @return 0 on success. -static int kdebug_trace_setbuf(int nbufs) { - int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs}; - return sysctl(mib, 4, NULL, NULL, NULL, 0); -} - -/// Enable or disable kdebug trace. -/// Trace buffer must already be initialized. -/// @return 0 on success. -static int kdebug_trace_enable(bool enable) { - int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable}; - return sysctl(mib, 4, NULL, 0, NULL, 0); -} - -/// Retrieve trace buffer information from kernel. -/// @return 0 on success. -static int kdebug_get_bufinfo(kbufinfo_t *info) { - if (!info) - return -1; - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF}; - size_t needed = sizeof(kbufinfo_t); - return sysctl(mib, 3, info, &needed, NULL, 0); -} - -/// Retrieve trace buffers from kernel. -/// @param buf Memory to receive buffer data, array of `kd_buf`. -/// @param len Length of `buf` in bytes. -/// @param count Number of trace entries (kd_buf) obtained. -/// @return 0 on success. -static int kdebug_trace_read(void *buf, usize len, usize *count) { - if (count) - *count = 0; - if (!buf || !len) - return -1; - - // Note: the input and output units are not the same. - // input: bytes - // output: number of kd_buf - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR}; - int ret = sysctl(mib, 3, buf, &len, NULL, 0); - if (ret != 0) - return ret; - *count = len; - return 0; -} - -/// Block until there are new buffers filled or `timeout_ms` have passed. -/// @param timeout_ms timeout milliseconds, 0 means wait forever. -/// @param suc set true if new buffers filled. -/// @return 0 on success. -static int kdebug_wait(usize timeout_ms, bool *suc) { - if (timeout_ms == 0) - return -1; - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT}; - usize val = timeout_ms; - int ret = sysctl(mib, 3, NULL, &val, NULL, 0); - if (suc) - *suc = !!val; - return ret; -} - -// ----------------------------------------------------------------------------- -// Demo -// ----------------------------------------------------------------------------- - #define EVENT_NAME_MAX 8 typedef struct { - const char *alias; /// name for print - const char *names[EVENT_NAME_MAX]; /// name from pmc db + const char *alias; /// name for print + const char *names[EVENT_NAME_MAX]; /// name from pmc db } event_alias; /// Event names from /usr/share/kpep/.plist static const event_alias profile_events[] = { {"cycles", { - "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE - "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th - "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom + "FIXED_CYCLES", // Apple A7-A15 + "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th + "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom }}, {"instructions", { - "FIXED_INSTRUCTIONS", // Apple A7-A15 - "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th + "FIXED_INSTRUCTIONS", // Apple A7-A15 + "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th }}, {"branches", { - "INST_BRANCH", // Apple A7-A15 - "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th - "INST_RETIRED.ANY", // Intel Yonah, Merom + "INST_BRANCH", // Apple A7-A15 + "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th + "INST_RETIRED.ANY", // Intel Yonah, Merom }}, {"branch-misses", { - "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 - "BRANCH_MISPREDICT", // Apple A7-A14 - "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th - "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom + "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 + "BRANCH_MISPREDICT", // Apple A7-A14 + "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th + "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom }}, }; static kpep_event *get_event(kpep_db *db, const event_alias *alias) { for (usize j = 0; j < EVENT_NAME_MAX; j++) { const char *name = alias->names[j]; - if (!name) - break; + if (!name) break; kpep_event *ev = NULL; if (kpep_db_event(db, name, &ev) == 0) { return ev; @@ -966,146 +865,143 @@ static kpep_event *get_event(kpep_db *db, const event_alias *alias) { return NULL; } -kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; -usize counter_map[KPC_MAX_COUNTERS] = {0}; -u64 counters_0[KPC_MAX_COUNTERS] = {0}; -u64 counters_1[KPC_MAX_COUNTERS] = {0}; -const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]); +struct AppleEvents { + kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; + usize counter_map[KPC_MAX_COUNTERS] = {0}; + u64 counters_0[KPC_MAX_COUNTERS] = {0}; + u64 counters_1[KPC_MAX_COUNTERS] = {0}; + static constexpr usize ev_count = + sizeof(profile_events) / sizeof(profile_events[0]); + bool init = false; + bool worked = false; + inline bool setup_performance_counters() { + if (init) { + return worked; + } + init = true; -bool setup_performance_counters() { - static bool init = false; - static bool worked = false; + // load dylib + if (!lib_init()) { + printf("Error: %s\n", lib_err_msg); + return (worked = false); + } - if (init) { - return worked; - } - init = true; + // check permission + int force_ctrs = 0; + if (kpc_force_all_ctrs_get(&force_ctrs)) { + printf("Permission denied, xnu/kpc requires root privileges.\n"); + return (worked = false); + } + int ret; + // load pmc db + kpep_db *db = NULL; + if ((ret = kpep_db_create(NULL, &db))) { + printf("Error: cannot load pmc database: %d.\n", ret); + return (worked = false); + } + //printf("loaded db: %s (%s)\n", db->name, db->marketing_name); + // printf("number of fixed counters: %zu\n", db->fixed_counter_count); + // printf("number of configurable counters: %zu\n", + // db->config_counter_count); + + // create a config + kpep_config *cfg = NULL; + if ((ret = kpep_config_create(db, &cfg))) { + printf("Failed to create kpep config: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_force_counters(cfg))) { + printf("Failed to force counters: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } - // load dylib - if (!lib_init()) { - printf("Error: %s\n", lib_err_msg); - return (worked = false); - } + // get events + kpep_event *ev_arr[ev_count] = {0}; + for (usize i = 0; i < ev_count; i++) { + const event_alias *alias = profile_events + i; + ev_arr[i] = get_event(db, alias); + if (!ev_arr[i]) { + printf("Cannot find event: %s.\n", alias->alias); + return (worked = false); + } + } - // check permission - int force_ctrs = 0; - if (kpc_force_all_ctrs_get(&force_ctrs)) { - printf("Permission denied, xnu/kpc requires root privileges.\n"); - return (worked = false); - } - int ret; - // load pmc db - kpep_db *db = NULL; - if ((ret = kpep_db_create(NULL, &db))) { - printf("Error: cannot load pmc database: %d.\n", ret); - return (worked = false); - } - printf("loaded db: %s (%s)\n", db->name, db->marketing_name); - - // create a config - kpep_config *cfg = NULL; - if ((ret = kpep_config_create(db, &cfg))) { - printf("Failed to create kpep config: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_force_counters(cfg))) { - printf("Failed to force counters: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } + // add event to config + for (usize i = 0; i < ev_count; i++) { + kpep_event *ev = ev_arr[i]; + if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { + printf("Failed to add event: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + } - // get events - kpep_event *ev_arr[ev_count] = {0}; - for (usize i = 0; i < ev_count; i++) { - const event_alias *alias = profile_events + i; - ev_arr[i] = get_event(db, alias); - if (!ev_arr[i]) { - printf("Cannot find event: %s.\n", alias->alias); + // prepare buffer and config + u32 classes = 0; + usize reg_count = 0; + if ((ret = kpep_config_kpc_classes(cfg, &classes))) { + printf("Failed get kpc classes: %d (%s).\n", ret, + kpep_config_error_desc(ret)); return (worked = false); } - } - - // add event to config - for (usize i = 0; i < ev_count; i++) { - kpep_event *ev = ev_arr[i]; - if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { - printf("Failed to add event: %d (%s).\n", ret, + if ((ret = kpep_config_kpc_count(cfg, ®_count))) { + printf("Failed get kpc count: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { + printf("Failed get kpc map: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { + printf("Failed get kpc registers: %d (%s).\n", ret, kpep_config_error_desc(ret)); return (worked = false); } - } - // prepare buffer and config - u32 classes = 0; - usize reg_count = 0; - if ((ret = kpep_config_kpc_classes(cfg, &classes))) { - printf("Failed get kpc classes: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc_count(cfg, ®_count))) { - printf("Failed get kpc count: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { - printf("Failed get kpc map: %d (%s).\n", ret, kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { - printf("Failed get kpc registers: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } + // set config to kernel + if ((ret = kpc_force_all_ctrs_set(1))) { + printf("Failed force all ctrs: %d.\n", ret); + return (worked = false); + } + if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { + if ((ret = kpc_set_config(classes, regs))) { + printf("Failed set kpc config: %d.\n", ret); + return (worked = false); + } + } - // set config to kernel - if ((ret = kpc_force_all_ctrs_set(1))) { - printf("Failed force all ctrs: %d.\n", ret); - return (worked = false); - } - if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { - if ((ret = kpc_set_config(classes, regs))) { - printf("Failed set kpc config: %d.\n", ret); + // start counting + if ((ret = kpc_set_counting(classes))) { + printf("Failed set counting: %d.\n", ret); + return (worked = false); + } + if ((ret = kpc_set_thread_counting(classes))) { + printf("Failed set thread counting: %d.\n", ret); return (worked = false); } - } - // start counting - if ((ret = kpc_set_counting(classes))) { - printf("Failed set counting: %d.\n", ret); - return (worked = false); - } - if ((ret = kpc_set_thread_counting(classes))) { - printf("Failed set thread counting: %d.\n", ret); - return (worked = false); + return (worked = true); } - return (worked = true); -} - -inline performance_counters get_counters() { - static bool warned = false; - int ret; - // get counters before - if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { - if (!warned) { - - printf("Failed get thread counters before: %d.\n", ret); - warned = true; + inline performance_counters get_counters() { + static bool warned = false; + int ret; + // get counters before + if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { + if (!warned) { + printf("Failed get thread counters before: %d.\n", ret); + warned = true; + } + return 1; } - return 1; + return performance_counters{ + counters_0[counter_map[0]], counters_0[counter_map[2]], + counters_0[counter_map[3]], counters_0[counter_map[1]]}; } - /*printf("counters value:\n"); - for (usize i = 0; i < ev_count; i++) { - const event_alias *alias = profile_events + i; - usize idx = counter_map[i]; - u64 val = counters_1[idx] - counters_0[idx]; - printf("%14s: %llu\n", alias->alias, val); - }*/ - return performance_counters{ - counters_0[counter_map[0]], counters_0[counter_map[2]], - counters_0[counter_map[3]], counters_0[counter_map[1]]}; -} +}; #endif diff --git a/benchmarks/counters/event_counter.h b/benchmarks/counters/event_counter.h index 2826883..b37e367 100644 --- a/benchmarks/counters/event_counter.h +++ b/benchmarks/counters/event_counter.h @@ -5,6 +5,7 @@ #ifndef _MSC_VER #include #endif +#include #include @@ -27,42 +28,30 @@ struct event_count { event_count(const std::chrono::duration _elapsed, const std::vector _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {} - event_count(const event_count &other) + event_count(const event_count& other) : elapsed(other.elapsed), event_counts(other.event_counts) {} // The types of counters (so we can read the getter more easily) enum event_counter_types { - CPU_CYCLES = 0, - INSTRUCTIONS = 1, - BRANCHES = 2, - MISSED_BRANCHES = 3 + CPU_CYCLES, + INSTRUCTIONS, + BRANCH_MISSES=2, + BRANCH=4 }; - double elapsed_sec() const { - return std::chrono::duration(elapsed).count(); - } - double elapsed_ns() const { - return std::chrono::duration(elapsed).count(); - } - double cycles() const { - return static_cast(event_counts[CPU_CYCLES]); - } - double instructions() const { - return static_cast(event_counts[INSTRUCTIONS]); - } - double branches() const { - return static_cast(event_counts[BRANCHES]); - } - double missed_branches() const { - return static_cast(event_counts[MISSED_BRANCHES]); - } + double elapsed_sec() const { return std::chrono::duration(elapsed).count(); } + double elapsed_ns() const { return std::chrono::duration(elapsed).count(); } + double cycles() const { return static_cast(event_counts[CPU_CYCLES]); } + double instructions() const { return static_cast(event_counts[INSTRUCTIONS]); } + double branch_misses() const { return static_cast(event_counts[BRANCH_MISSES]); } + double branches() const { return static_cast(event_counts[BRANCH]); } - event_count &operator=(const event_count &other) { + event_count& operator=(const event_count& other) { this->elapsed = other.elapsed; this->event_counts = other.event_counts; return *this; } - event_count operator+(const event_count &other) const { + event_count operator+(const event_count& other) const { return event_count(elapsed + other.elapsed, { event_counts[0] + other.event_counts[0], @@ -73,7 +62,7 @@ struct event_count { }); } - void operator+=(const event_count &other) { *this = *this + other; } + void operator+=(const event_count& other) { *this = *this + other; } }; struct event_aggregate { @@ -85,7 +74,7 @@ struct event_aggregate { event_aggregate() = default; - void operator<<(const event_count &other) { + void operator<<(const event_count& other) { if (iterations == 0 || other.elapsed < best.elapsed) { best = other; } @@ -97,13 +86,15 @@ struct event_aggregate { } double elapsed_sec() const { return total.elapsed_sec() / iterations; } + double total_elapsed_ns() const { return total.elapsed_ns(); } double elapsed_ns() const { return total.elapsed_ns() / iterations; } double cycles() const { return total.cycles() / iterations; } + double branch_misses() const { return total.branch_misses() / iterations; } + double branches() const { return total.branches() / iterations; } double instructions() const { return total.instructions() / iterations; } - double branches() const { return total.branches() / iterations; } - double missed_branches() const { - return total.missed_branches() / iterations; - } + double fastest_elapsed_ns() const { return best.elapsed_ns(); } + double fastest_cycles() const { return best.cycles(); } + double fastest_instructions() const { return best.instructions(); } }; struct event_collector { @@ -114,14 +105,15 @@ struct event_collector { LinuxEvents linux_events; event_collector() : linux_events(std::vector{ - PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions - PERF_COUNT_HW_BRANCH_MISSES}) {} + PERF_COUNT_HW_CPU_CYCLES, + PERF_COUNT_HW_INSTRUCTIONS, + }) {} bool has_events() { return linux_events.is_working(); } #elif __APPLE__ && __aarch64__ + AppleEvents apple_events; performance_counters diff; - event_collector() : diff(0) { setup_performance_counters(); } - bool has_events() { return setup_performance_counters(); } + event_collector() : diff(0) { apple_events.setup_performance_counters(); } + bool has_events() { return apple_events.setup_performance_counters(); } #else event_collector() {} bool has_events() { return false; } @@ -132,25 +124,25 @@ struct event_collector { linux_events.start(); #elif __APPLE__ && __aarch64__ if (has_events()) { - diff = get_counters(); + diff = apple_events.get_counters(); } #endif start_clock = std::chrono::steady_clock::now(); } - inline event_count &end() { + inline event_count& end() { const auto end_clock = std::chrono::steady_clock::now(); #if defined(__linux) linux_events.end(count.event_counts); #elif __APPLE__ && __aarch64__ if (has_events()) { - performance_counters end = get_counters(); + performance_counters end = apple_events.get_counters(); diff = end - diff; } count.event_counts[0] = diff.cycles; count.event_counts[1] = diff.instructions; - count.event_counts[2] = diff.branches; - count.event_counts[3] = diff.missed_branches; - count.event_counts[4] = 0; + count.event_counts[2] = diff.missed_branches; + count.event_counts[3] = 0; + count.event_counts[4] = diff.branches; #endif count.elapsed = end_clock - start_clock; return count;