diff --git a/benchmark/connection/tensor_cache_locality.cpp b/benchmark/connection/tensor_cache_locality.cpp index 441b49c0..c25bef67 100644 --- a/benchmark/connection/tensor_cache_locality.cpp +++ b/benchmark/connection/tensor_cache_locality.cpp @@ -1,8 +1,8 @@ -// Simulates the "10k tensor actors" cache-locality question: -// - SML path: one SML machine per tensor, dispatch fill/unlink events. -// - Flat path: one tightly packed refs/live array with direct function calls. +// Generic "10k actor" dispatch benchmark focused on SML dispatch overhead. +// This intentionally avoids use-case-specific event semantics (e.g. link/unlink). -#include +#include +#include #include #include #include @@ -10,585 +10,260 @@ #include #include +#if defined(TEST_GBENCH) +#include +#endif + namespace sml = boost::sml; namespace { -constexpr std::size_t kTensorCount = 10'000; -constexpr std::size_t kNodeCount = 10'000; -constexpr std::size_t kFanIn = 4; -constexpr std::uint16_t kInitialRefs = 4; +constexpr std::size_t kActorCount = 10'000; +constexpr std::size_t kDispatchCount = 50'000; -struct ev_fill {}; -struct ev_unlink {}; -struct ev_fill_idx { - std::uint16_t id{}; -}; -struct ev_unlink_idx { - std::uint16_t id{}; -}; -struct ev_tick {}; -using pooled_fill = sml::utility::indexed_event; -using pooled_unlink = sml::utility::indexed_event; -using pooled_tick = sml::utility::indexed_event; - -struct tensor_slot { - std::uint16_t refs{}; - std::uint16_t live{}; -}; - -struct node_op { - std::uint16_t dst{}; - std::array src{}; -}; +struct ev_pulse {}; +using pooled_pulse = sml::utility::indexed_event; enum class access_pattern : std::uint8_t { local, random }; -std::vector make_ops(const access_pattern pattern) { - std::vector ops; - ops.reserve(kNodeCount); +std::vector make_ids(const access_pattern pattern) { + std::vector ids; + ids.reserve(kDispatchCount); if (pattern == access_pattern::local) { - for (std::size_t i = 0; i < kNodeCount; ++i) { - node_op op{}; - op.dst = static_cast(i % kTensorCount); - for (std::size_t j = 0; j < kFanIn; ++j) { - op.src[j] = static_cast((i + j + 1) % kTensorCount); - } - ops.push_back(op); + for (std::size_t i = 0; i < kDispatchCount; ++i) { + ids.push_back(static_cast(i % kActorCount)); } - return ops; + return ids; } std::minstd_rand rng{1337}; - std::uniform_int_distribution dist{0u, static_cast(kTensorCount - 1)}; - for (std::size_t i = 0; i < kNodeCount; ++i) { - node_op op{}; - op.dst = static_cast(dist(rng)); - for (auto& src : op.src) { - src = static_cast(dist(rng)); - } - ops.push_back(op); + std::uniform_int_distribution dist{0u, static_cast(kActorCount - 1)}; + for (std::size_t i = 0; i < kDispatchCount; ++i) { + ids.push_back(static_cast(dist(rng))); } - return ops; -} - -const std::vector& local_ops() { - static const auto ops = make_ops(access_pattern::local); - return ops; + return ids; } -const std::vector& random_ops() { - static const auto ops = make_ops(access_pattern::random); - return ops; +const std::vector& local_ids() { + static const auto ids = make_ids(access_pattern::local); + return ids; } -const std::vector& random_tensor_ids() { - static const auto ids = [] { - std::vector out; - out.reserve(kNodeCount * (1 + kFanIn)); - std::minstd_rand rng{424242}; - std::uniform_int_distribution dist{0u, static_cast(kTensorCount - 1)}; - for (std::size_t i = 0; i < kNodeCount * (1 + kFanIn); ++i) { - out.push_back(static_cast(dist(rng))); - } - return out; - }(); +const std::vector& random_ids() { + static const auto ids = make_ids(access_pattern::random); return ids; } -struct flat_tensor_pool { - explicit flat_tensor_pool() : slots(kTensorCount) {} - - void reset() { - for (auto& slot : slots) { - slot.refs = kInitialRefs; - slot.live = 0; - } - } +struct direct_pool { + direct_pool() : flags(kActorCount) {} - std::uint16_t fill(const std::uint16_t id) { - auto& slot = slots[id]; - ++slot.live; - return slot.live; - } - - std::uint16_t fill_ev(const ev_fill_idx& ev) { return fill(ev.id); } - - std::uint16_t unlink(const std::uint16_t id) { - auto& slot = slots[id]; - if (slot.refs > 0) { - --slot.refs; - } - return slot.refs; + bool process_event(const std::uint16_t id) { + auto& flag = flags[id]; + flag ^= static_cast(1); + return flag != 0; } - std::uint16_t unlink_ev(const ev_unlink_idx& ev) { return unlink(ev.id); } + std::uint8_t sample(const std::uint16_t id) const { return flags[id]; } - std::vector slots; + std::vector flags; }; -struct tensor_actor { +struct pulse_actor { auto operator()() const { using namespace sml; - const auto hot = "hot"_s; - const auto can_unlink = [](const tensor_slot& slot) { return slot.refs > 0; }; - const auto do_fill = [](tensor_slot& slot) { ++slot.live; }; - const auto do_unlink = [](tensor_slot& slot) { --slot.refs; }; // clang-format off return make_transition_table( - *hot + event / do_fill, - hot + event [can_unlink] / do_unlink + *"off"_s + event = "on"_s, + "on"_s + event = "off"_s ); // clang-format on } }; -struct nodata_actor { - auto operator()() const { - using namespace sml; - const auto a = "a"_s; - const auto b = "b"_s; - // clang-format off - return make_transition_table( - *a + event = b, - b + event = a - ); - // clang-format on +struct actor_pool { + actor_pool() { + actors.reserve(kActorCount); + for (std::size_t i = 0; i < kActorCount; ++i) { + actors.emplace_back(); + } } -}; -struct nodata_direct { - bool state{}; - bool process_event(const ev_tick&) noexcept { - state = !state; - return state; + bool process_event(const std::uint16_t id) { return actors[id].process_event(ev_pulse{}); } + + bool sample(const std::uint16_t id) const { + using namespace sml; + return actors[id].is("on"_s); } + + std::vector> actors; }; -struct nodata_pool_storage { - explicit nodata_pool_storage(const std::size_t count) : flags(count) {} +struct pool_storage { + explicit pool_storage(const std::size_t count) : flags(count) {} - void reset() { - for (auto& flag : flags) { - flag = 0; - } - } + void reset() { std::fill(flags.begin(), flags.end(), static_cast(0)); } std::vector flags; }; -struct nodata_router_actor { +struct router_actor { auto operator()() const { using namespace sml; const auto hot = "hot"_s; - const auto toggle = [](nodata_pool_storage& storage, const pooled_tick& ev) { + const auto toggle = [](pool_storage& storage, const pooled_pulse& ev) { storage.flags[ev.id] ^= static_cast(1); }; // clang-format off return make_transition_table( - *hot + event / toggle + *hot + event / toggle ); // clang-format on } }; -struct nodata_sm_pool { - nodata_sm_pool() : pool(kTensorCount) {} +struct pooled_dispatch { + pooled_dispatch() : pool(kActorCount) {} bool process_event(const std::uint16_t id) { - pool.template process_indexed(id); + pool.template process_indexed(id); return pool.storage().flags[id] != 0; } - template - std::size_t process_event_batch(const TRange& ids) { - return pool.template process_indexed_batch(ids); + std::size_t process_batch(const std::vector& ids) { + return pool.template process_indexed_batch(ids); } std::uint8_t sample(const std::uint16_t id) const { return pool.storage().flags[id]; } - sml::utility::sm_pool pool; + sml::utility::sm_pool pool; }; -struct sml_tensor_pool { - sml_tensor_pool() : slots(kTensorCount) { - actors.reserve(kTensorCount); - for (auto& slot : slots) { - actors.emplace_back(slot); - } - } - - void reset() { - for (auto& slot : slots) { - slot.refs = kInitialRefs; - slot.live = 0; - } - } - - std::uint16_t fill(const std::uint16_t id) { - actors[id].process_event(ev_fill{}); - return slots[id].live; - } - - std::uint16_t unlink(const std::uint16_t id) { - actors[id].process_event(ev_unlink{}); - return slots[id].refs; - } - - std::vector slots; - std::vector> actors; -}; - -struct sml_tensor_pool_fused { - struct entry { - entry() : sm(slot) {} - tensor_slot slot{}; - sml::sm sm; - }; - - sml_tensor_pool_fused() { - entries.reserve(kTensorCount); - for (std::size_t i = 0; i < kTensorCount; ++i) { - entries.emplace_back(); - } - } - - void reset() { - for (auto& e : entries) { - e.slot.refs = kInitialRefs; - e.slot.live = 0; - } - } - - std::uint16_t fill(const std::uint16_t id) { - auto& e = entries[id]; - e.sm.process_event(ev_fill{}); - return e.slot.live; - } - - std::uint16_t unlink(const std::uint16_t id) { - auto& e = entries[id]; - e.sm.process_event(ev_unlink{}); - return e.slot.refs; - } - - std::vector entries; -}; - -struct tensor_router_actor { - auto operator()() const { - using namespace sml; - const auto hot = "hot"_s; - const auto do_fill = [](flat_tensor_pool& pool, const pooled_fill& ev) { ++pool.slots[ev.id].live; }; - const auto can_unlink = [](const flat_tensor_pool& pool, const pooled_unlink& ev) { return pool.slots[ev.id].refs > 0; }; - const auto do_unlink = [](flat_tensor_pool& pool, const pooled_unlink& ev) { --pool.slots[ev.id].refs; }; - // clang-format off - return make_transition_table( - *hot + event / do_fill, - hot + event [can_unlink] / do_unlink - ); - // clang-format on - } -}; - -struct sml_router_pool { - sml_router_pool() : pool(kTensorCount) {} - - void reset() { pool.reset(); } - - std::uint16_t fill(const std::uint16_t id) { - pool.template process_indexed(id); - return pool.storage().slots[id].live; - } - - std::uint16_t unlink(const std::uint16_t id) { - pool.template process_indexed(id); - return pool.storage().slots[id].refs; - } - - sml::utility::sm_pool pool; -}; - -struct sml_router_pool_fold { - sml_router_pool_fold() : pool(kTensorCount) {} - - void reset() { pool.reset(); } - - std::uint16_t fill(const std::uint16_t id) { - pool.template process_indexed(id); - return pool.storage().slots[id].live; - } - - std::uint16_t unlink(const std::uint16_t id) { - pool.template process_indexed(id); - return pool.storage().slots[id].refs; - } - - sml::utility::sm_pool> pool; -}; - -template -std::uint64_t run_once(TPool& pool, const std::vector& ops) { - std::uint64_t sink = 0; - for (const auto& op : ops) { - sink += pool.fill(op.dst); - for (const auto src : op.src) { - sink += pool.unlink(src); - } - } - return sink; -} - -template -std::uint64_t run_once_event_api(TPool& pool, const std::vector& ops) { - std::uint64_t sink = 0; - for (const auto& op : ops) { - const ev_fill_idx fill_event{op.dst}; - sink += pool.fill_ev(fill_event); - for (const auto src : op.src) { - const ev_unlink_idx unlink_event{src}; - sink += pool.unlink_ev(unlink_event); - } - } - return sink; -} - } // namespace #if defined(TEST_ASM) int main() { - flat_tensor_pool flat{}; - sml_tensor_pool actors{}; + const auto& ids = random_ids(); + + direct_pool direct; + actor_pool actor; + pooled_dispatch pooled; - flat.reset(); - actors.reset(); + for (std::size_t i = 0; i < 128; ++i) { + const auto id = ids[i % ids.size()]; + direct.process_event(id); + actor.process_event(id); + pooled.process_event(id); + } - volatile std::uint64_t sink = 0; - sink += run_once(flat, local_ops()); - sink += run_once(actors, local_ops()); - return sink == 0; + return static_cast(direct.sample(0) + actor.sample(0) + pooled.sample(0)); } #elif defined(TEST_PERF) int main() { - flat_tensor_pool flat{}; - sml_tensor_pool actors{}; + const auto& ids = random_ids(); + direct_pool direct; + actor_pool actor; + pooled_dispatch pooled; - volatile std::uint64_t sink = 0; - for (auto i = 0; i < 2'000; ++i) { - flat.reset(); - sink += run_once(flat, random_ops()); - } - for (auto i = 0; i < 2'000; ++i) { - actors.reset(); - sink += run_once(actors, random_ops()); + std::uint64_t sink = 0; + for (std::size_t round = 0; round < 1000; ++round) { + for (const auto id : ids) { + sink += static_cast(direct.process_event(id)); + } + for (const auto id : ids) { + sink += static_cast(actor.process_event(id)); + } + sink += pooled.process_batch(ids); } - return sink == 0; + + return static_cast(sink & 0xFFu); } #elif defined(TEST_GBENCH) -#include - -template -static void run_bench(benchmark::State& state, TPool& pool, const std::vector& ops) { +void run_scalar_bench(benchmark::State& state, const std::vector& ids, direct_pool& pool) { for (auto _ : state) { - state.PauseTiming(); - pool.reset(); - state.ResumeTiming(); - - auto sink = run_once(pool, ops); - benchmark::DoNotOptimize(sink); - benchmark::ClobberMemory(); + for (const auto id : ids) { + benchmark::DoNotOptimize(pool.process_event(id)); + } } + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(ids.size())); + benchmark::DoNotOptimize(pool.sample(ids.front())); } -static void BM_tensor_flat_local(benchmark::State& state) { - flat_tensor_pool pool{}; - run_bench(state, pool, local_ops()); -} - -static void BM_tensor_sml_local(benchmark::State& state) { - sml_tensor_pool pool{}; - run_bench(state, pool, local_ops()); -} - -static void BM_tensor_flat_random(benchmark::State& state) { - flat_tensor_pool pool{}; - run_bench(state, pool, random_ops()); -} - -static void BM_tensor_flat_event_local(benchmark::State& state) { - flat_tensor_pool pool{}; +void run_scalar_bench(benchmark::State& state, const std::vector& ids, actor_pool& pool) { for (auto _ : state) { - state.PauseTiming(); - pool.reset(); - state.ResumeTiming(); - - auto sink = run_once_event_api(pool, local_ops()); - benchmark::DoNotOptimize(sink); - benchmark::ClobberMemory(); + for (const auto id : ids) { + benchmark::DoNotOptimize(pool.process_event(id)); + } } + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(ids.size())); + benchmark::DoNotOptimize(pool.sample(ids.front())); } -static void BM_tensor_flat_event_random(benchmark::State& state) { - flat_tensor_pool pool{}; +void run_scalar_bench(benchmark::State& state, const std::vector& ids, pooled_dispatch& pool) { for (auto _ : state) { - state.PauseTiming(); - pool.reset(); - state.ResumeTiming(); - - auto sink = run_once_event_api(pool, random_ops()); - benchmark::DoNotOptimize(sink); - benchmark::ClobberMemory(); + for (const auto id : ids) { + benchmark::DoNotOptimize(pool.process_event(id)); + } } + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(ids.size())); + benchmark::DoNotOptimize(pool.sample(ids.front())); } -static void BM_tensor_sml_random(benchmark::State& state) { - sml_tensor_pool pool{}; - run_bench(state, pool, random_ops()); -} - -static void BM_tensor_sml_fused_local(benchmark::State& state) { - sml_tensor_pool_fused pool{}; - run_bench(state, pool, local_ops()); -} - -static void BM_tensor_sml_fused_random(benchmark::State& state) { - sml_tensor_pool_fused pool{}; - run_bench(state, pool, random_ops()); -} - -static void BM_tensor_sml_router_local(benchmark::State& state) { - sml_router_pool pool{}; - run_bench(state, pool, local_ops()); -} - -static void BM_tensor_sml_router_random(benchmark::State& state) { - sml_router_pool pool{}; - run_bench(state, pool, random_ops()); -} - -static void BM_tensor_sml_router_fold_local(benchmark::State& state) { - sml_router_pool_fold pool{}; - run_bench(state, pool, local_ops()); -} - -static void BM_tensor_sml_router_fold_random(benchmark::State& state) { - sml_router_pool_fold pool{}; - run_bench(state, pool, random_ops()); -} - -static void BM_dispatch_direct_single(benchmark::State& state) { - nodata_direct actor{}; - std::uint64_t sink = 0; +void run_batch_bench(benchmark::State& state, const std::vector& ids, pooled_dispatch& pool) { for (auto _ : state) { - sink += static_cast(actor.process_event(ev_tick{})); - benchmark::DoNotOptimize(sink); + benchmark::DoNotOptimize(pool.process_batch(ids)); } + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(ids.size())); + benchmark::DoNotOptimize(pool.sample(ids.front())); } -static void BM_dispatch_sml_single(benchmark::State& state) { - sml::sm actor{}; - std::uint64_t sink = 0; - for (auto _ : state) { - sink += static_cast(actor.process_event(ev_tick{})); - benchmark::DoNotOptimize(sink); - } +static void BM_direct_local(benchmark::State& state) { + direct_pool pool; + run_scalar_bench(state, local_ids(), pool); } -static void BM_dispatch_direct_actor_array_random(benchmark::State& state) { - std::vector actors(kTensorCount); - const auto& ids = random_tensor_ids(); - std::uint64_t sink = 0; - for (auto _ : state) { - for (const auto id : ids) { - sink += static_cast(actors[id].process_event(ev_tick{})); - } - benchmark::DoNotOptimize(sink); - } - state.SetItemsProcessed(static_cast(state.iterations() * ids.size())); +static void BM_direct_random(benchmark::State& state) { + direct_pool pool; + run_scalar_bench(state, random_ids(), pool); } -static void BM_dispatch_sml_actor_array_random(benchmark::State& state) { - std::vector> actors; - actors.reserve(kTensorCount); - for (std::size_t i = 0; i < kTensorCount; ++i) { - actors.emplace_back(); - } - - const auto& ids = random_tensor_ids(); - std::uint64_t sink = 0; - for (auto _ : state) { - for (const auto id : ids) { - sink += static_cast(actors[id].process_event(ev_tick{})); - } - benchmark::DoNotOptimize(sink); - } - state.SetItemsProcessed(static_cast(state.iterations() * ids.size())); +static void BM_sml_actor_local(benchmark::State& state) { + actor_pool pool; + run_scalar_bench(state, local_ids(), pool); } -static void BM_dispatch_sml_actor_array_random_fold(benchmark::State& state) { - std::vector>> actors; - actors.reserve(kTensorCount); - for (std::size_t i = 0; i < kTensorCount; ++i) { - actors.emplace_back(); - } +static void BM_sml_actor_random(benchmark::State& state) { + actor_pool pool; + run_scalar_bench(state, random_ids(), pool); +} - const auto& ids = random_tensor_ids(); - std::uint64_t sink = 0; - for (auto _ : state) { - for (const auto id : ids) { - sink += static_cast(actors[id].process_event(ev_tick{})); - } - benchmark::DoNotOptimize(sink); - } - state.SetItemsProcessed(static_cast(state.iterations() * ids.size())); +static void BM_sml_pool_local(benchmark::State& state) { + pooled_dispatch pool; + run_scalar_bench(state, local_ids(), pool); } -static void BM_dispatch_sml_pool_random(benchmark::State& state) { - nodata_sm_pool actors{}; - const auto& ids = random_tensor_ids(); - std::uint64_t sink = 0; - for (auto _ : state) { - for (const auto id : ids) { - sink += static_cast(actors.process_event(id)); - } - benchmark::DoNotOptimize(sink); - } - state.SetItemsProcessed(static_cast(state.iterations() * ids.size())); +static void BM_sml_pool_random(benchmark::State& state) { + pooled_dispatch pool; + run_scalar_bench(state, random_ids(), pool); } -static void BM_dispatch_sml_pool_batch_random(benchmark::State& state) { - nodata_sm_pool actors{}; - const auto& ids = random_tensor_ids(); - std::uint64_t sink = 0; - for (auto _ : state) { - sink += actors.process_event_batch(ids); - sink += actors.sample(ids[0]); - sink += actors.sample(ids[1]); - benchmark::DoNotOptimize(sink); - } - state.SetItemsProcessed(static_cast(state.iterations() * ids.size())); +static void BM_sml_pool_batch_local(benchmark::State& state) { + pooled_dispatch pool; + run_batch_bench(state, local_ids(), pool); } -BENCHMARK(BM_tensor_flat_local); -BENCHMARK(BM_tensor_sml_local); -BENCHMARK(BM_tensor_flat_random); -BENCHMARK(BM_tensor_sml_random); -BENCHMARK(BM_tensor_flat_event_local); -BENCHMARK(BM_tensor_flat_event_random); -BENCHMARK(BM_tensor_sml_fused_local); -BENCHMARK(BM_tensor_sml_fused_random); -BENCHMARK(BM_tensor_sml_router_local); -BENCHMARK(BM_tensor_sml_router_random); -BENCHMARK(BM_tensor_sml_router_fold_local); -BENCHMARK(BM_tensor_sml_router_fold_random); -BENCHMARK(BM_dispatch_direct_single); -BENCHMARK(BM_dispatch_sml_single); -BENCHMARK(BM_dispatch_direct_actor_array_random); -BENCHMARK(BM_dispatch_sml_actor_array_random); -BENCHMARK(BM_dispatch_sml_actor_array_random_fold); -BENCHMARK(BM_dispatch_sml_pool_random); -BENCHMARK(BM_dispatch_sml_pool_batch_random); +static void BM_sml_pool_batch_random(benchmark::State& state) { + pooled_dispatch pool; + run_batch_bench(state, random_ids(), pool); +} +BENCHMARK(BM_direct_local); +BENCHMARK(BM_direct_random); +BENCHMARK(BM_sml_actor_local); +BENCHMARK(BM_sml_actor_random); +BENCHMARK(BM_sml_pool_local); +BENCHMARK(BM_sml_pool_random); +BENCHMARK(BM_sml_pool_batch_local); +BENCHMARK(BM_sml_pool_batch_random); BENCHMARK_MAIN(); #endif diff --git a/include/boost/sml/utility/sm_pool.hpp b/include/boost/sml/utility/sm_pool.hpp index 70277ab4..d1ae0a28 100644 --- a/include/boost/sml/utility/sm_pool.hpp +++ b/include/boost/sml/utility/sm_pool.hpp @@ -46,14 +46,16 @@ class sm_pool { template bool process_indexed(const std::size_t id, const TEvent& event = {}) { - return sm_.process_event(with_id(id, event)); + return sm_.process_event(indexed_event{id, event}); } template std::size_t process_indexed_batch(TIt first, TIt last, const TEvent& event = {}) { std::size_t handled = 0; + indexed_event indexed{0u, event}; for (; first != last; ++first) { - handled += static_cast(process_indexed(static_cast(*first), event)); + indexed.id = static_cast(*first); + handled += static_cast(sm_.process_event(indexed)); } return handled; } @@ -72,7 +74,7 @@ class sm_pool { std::size_t process_event_batch(TIt first, TIt last) { std::size_t handled = 0; for (; first != last; ++first) { - handled += static_cast(process_event(*first)); + handled += static_cast(sm_.process_event(*first)); } return handled; }