diff --git a/hwy/contrib/hash/phast-inl.h b/hwy/contrib/hash/phast-inl.h index cdb466f2b8..893811491f 100644 --- a/hwy/contrib/hash/phast-inl.h +++ b/hwy/contrib/hash/phast-inl.h @@ -193,6 +193,7 @@ class Phast { Phast() = default; Phast(PhastConfig config, const Triple32 hash, PackedSeeds&& seeds_packed) : config_(config), hash_(hash), seeds_packed_(std::move(seeds_packed)) {} + Phast(Phast&& other) = default; Phast& operator=(Phast&& other) = default; @@ -208,6 +209,21 @@ class Phast { return QueryWithSeeds(hash, seed, config_); } + // Two vectors have higher throughput because they utilize all 16-bit lanes + // for Hash16. Can be called directly, or via QueryBatch. + template > + HWY_INLINE void Query2(DU32 du32, VU32 key0, VU32 key1, VU32& idx0, + VU32& idx1) const { + hash_.TwoVec(du32, key0, key1); + + // Load 8-bit seeds from bucket. + const VU32 bucket_mask = Set(du32, config_.BucketMask()); + const VU32 seed0 = seeds_packed_.Get(du32, And(key0, bucket_mask)); + const VU32 seed1 = seeds_packed_.Get(du32, And(key1, bucket_mask)); + + QueryWithSeeds(du32, key0, key1, seed0, seed1, config_, idx0, idx1); + } + // Same, for a batch of keys. Considerably higher throughput than repeated // single queries: 7.8 GB/s on Turin for 1M keys. void QueryBatch(const uint32_t* HWY_RESTRICT keys, size_t num_keys, @@ -326,21 +342,6 @@ class Phast { return Hash16(combined); // caller will AND } - // Two vectors have higher throughput because they utilize all 16-bit lanes - // for Hash16. Called by QueryBatch. - template > - HWY_INLINE void Query2(DU32 du32, VU32 key0, VU32 key1, VU32& idx0, - VU32& idx1) const { - hash_.TwoVec(du32, key0, key1); - - // Load 8-bit seeds from bucket. - const VU32 bucket_mask = Set(du32, config_.BucketMask()); - const VU32 seed0 = seeds_packed_.Get(du32, And(key0, bucket_mask)); - const VU32 seed1 = seeds_packed_.Get(du32, And(key1, bucket_mask)); - - QueryWithSeeds(du32, key0, key1, seed0, seed1, config_, idx0, idx1); - } - PhastConfig config_ = {}; Triple32 hash_; PackedSeeds seeds_packed_; @@ -673,7 +674,7 @@ class PhastBuilder { return hash_for_key_idx_[key_idx] & config_.BucketMask(); } - static constexpr size_t kMaxHashesPerBucket = 16; + static constexpr size_t kMaxHashesPerBucket = 32; size_t PopulateBuckets(size_t num_keys) { PROFILER_FUNC; diff --git a/hwy/contrib/hash/phast_bench.cc b/hwy/contrib/hash/phast_bench.cc index 939d35e5fe..e61aedb447 100644 --- a/hwy/contrib/hash/phast_bench.cc +++ b/hwy/contrib/hash/phast_bench.cc @@ -13,9 +13,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +// If set, we also benchmark absl::flat_hash_set. +#include "hwy/detect_compiler_arch.h" +#define HWY_HAVE_ABSL 0 + #include #include +#include + +#if HWY_HAVE_ABSL +#include "third_party/absl/container/flat_hash_set.h" +#endif + #ifndef HWY_DISABLED_TARGETS #define HWY_DISABLED_TARGETS (HWY_SSE2 | HWY_SSSE3 | HWY_SSE4) #endif // HWY_DISABLED_TARGETS @@ -23,6 +33,8 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/contrib/thread_pool/topology.h" #include "hwy/nanobenchmark.h" +#include "hwy/per_target.h" // VectorBytes +#include "hwy/robust_statistics.h" #include "hwy/timer.h" // clang-format off @@ -42,7 +54,50 @@ namespace HWY_NAMESPACE { namespace { #if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128) || HWY_IDE -HWY_NOINLINE void TestLatency(const Phast& phast) { +// Increase when running manually; this is to keep tests fast. Must be a power +// of two of at least NumWorkers * 2 * VectorBytes()/4 to enable wraparound. +HWY_INLINE_VAR constexpr size_t kNumKeys = + HWY_IS_DEBUG_BUILD ? 16 * 1024 : 128 * 1024; + +static ThreadPool MakePool() { + static Topology topology; + if (topology.packages.empty()) return ThreadPool(ThreadPool::MaxThreads()); + // Minus one because these are in addition to the main thread. + return ThreadPool(topology.packages[0].cores.size() - 1); +} + +HWY_NOINLINE AlignedVector GenerateKeys(size_t num_keys) { + // Round up to two vectors so we do not have to handle remainders here. + num_keys = RoundUpTo(num_keys, 2 * VectorBytes() / sizeof(uint32_t)); + // Must be distinct, hence do not use FillRandom(). + AlignedVector keys; + keys.reserve(num_keys); + AesCtrEngine engine(/*deterministic=*/true); + Triple32 permutation(engine, Unpredictable1()); + for (size_t i = 0; i < num_keys; ++i) { + keys.push_back(permutation(i)); + } + return keys; +} + +HWY_NOINLINE Phast MakePhast(const AlignedVector& keys, + ThreadPool& pool) { + const size_t num_keys = keys.size(); + const uint32_t slice_length = num_keys > 4 * 1024 * 1024 ? 8192 + : num_keys > 256 * 1024 ? 4096 + : num_keys >= 10 * 1000 ? 512 + : 256; + const uint32_t headroom_percent = num_keys > 4 * 1024 * 1024 ? 6 + : num_keys > 256 * 1024 ? 2 + : 5; + PhastConfig config(num_keys, 2, slice_length, headroom_percent); + return BuildPhast(keys.data(), config, pool); +} + +HWY_NOINLINE void TestLatency() { + const AlignedVector keys = GenerateKeys(1000); + ThreadPool pool = MakePool(); + const Phast phast = MakePhast(keys, pool); if (phast.IsEmpty()) { HWY_WARN("Phast build failed, skipping latency test.\n"); return; @@ -68,82 +123,193 @@ HWY_NOINLINE void TestLatency(const Phast& phast) { } } -HWY_NOINLINE void TestThroughput(const Phast& phast, - const AlignedVector& keys) { +HWY_NOINLINE void TestBW() { + const ScalableTag du8; + using VU8 = Vec; + HWY_LANES_CONSTEXPR size_t N = Lanes(du8); + const VU8 k1 = Set(du8, static_cast(Unpredictable1())); + + ThreadPool pool = MakePool(); + pool.SetWaitMode(PoolWaitMode::kSpin); + + const size_t kBytesPerWorker = kNumKeys * sizeof(uint32_t); + const size_t num_bytes = pool.NumWorkers() * kBytesPerWorker; + // Large array, avoid AlignedVector because it zero-initializes on 1 thread. + AlignedFreeUniquePtr bytes = AllocateAligned(num_bytes); + pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t /*worker*/) { + FillBytes(&bytes[task_idx * kBytesPerWorker], + static_cast(Unpredictable1()), kBytesPerWorker); + }); + + // Using nanobenchmark is too slow because it involves multiple iterations. + constexpr size_t kNumReps = AdjustedReps(20); + std::vector elapsed_times; + elapsed_times.reserve(kNumReps); + for (size_t rep = 0; rep < kNumReps; ++rep) { + const double t0 = platform::Now(); + pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t /*worker*/) { + uint8_t* my_bytes = &bytes[task_idx * kBytesPerWorker]; + for (size_t i = 0; i < kBytesPerWorker; i += 2 * N) { + const VU8 v0 = Load(du8, my_bytes + i); + const VU8 v1 = Load(du8, my_bytes + i + N); + Store(Add(v0, k1), du8, my_bytes + i); + Store(Add(v1, k1), du8, my_bytes + i + N); + } + }); + uint32_t result = bytes[Unpredictable1()]; + PreventElision(result); + elapsed_times.push_back(platform::Now() - t0); + } + const double elapsed = + robust_statistics::Median(elapsed_times.data(), elapsed_times.size()); + printf("MemBW: %7.2f ms = %4.1f GB/s\n", elapsed * 1E3, + num_bytes / elapsed * 1E-9); +} + +// Benchmarks PHAST used as a hash table, with an extra Gather at the returned +// index to verify set membership. +HWY_NOINLINE void TestThroughput() { + const AlignedVector keys = GenerateKeys(kNumKeys); + ThreadPool pool = MakePool(); + pool.SetWaitMode(PoolWaitMode::kSpin); + + const Phast phast = MakePhast(keys, pool); if (phast.IsEmpty()) { HWY_WARN("Phast build failed, skipping throughput test.\n"); return; } - AlignedVector indices(keys.size()); + const ScalableTag du32; + const RebindToSigned di32; + using VU32 = Vec; + using MU32 = Mask; + HWY_LANES_CONSTEXPR size_t N = Lanes(du32); + HWY_DASSERT(kNumKeys % (2 * N) == 0); // See GenerateKeys(). + + // Scatter keys to the verification slots. Could also sort K32V32 and copy. + AlignedVector key_verify(phast.Config().NumSlots()); + for (size_t i = 0; i < kNumKeys; i += 2 * N) { + const VU32 keys0 = Load(du32, &keys[i]); + const VU32 keys1 = Load(du32, &keys[i + N]); + VU32 idx0, idx1; + phast.Query2(du32, keys0, keys1, idx0, idx1); + ScatterIndex(keys0, du32, key_verify.data(), BitCast(di32, idx0)); + ScatterIndex(keys1, du32, key_verify.data(), BitCast(di32, idx1)); + } FuncInput input = Unpredictable1(); Result results[1]; Params params = DefaultBenchmarkParams(); + params.min_samples_per_eval = 2; + params.max_evals = 3; params.verbose = false; + // Each worker starts at a different offset in the keys to avoid unrealistic + // cache behavior, without requiring separate per-worker allocations. + const size_t num_workers_pow2 = 1u << hwy::CeilLog2(pool.NumWorkers()); + const size_t keys_per_chunk = kNumKeys / (num_workers_pow2 * 2 * N); + + AlignedVector per_worker(pool.NumWorkers() * HWY_ALIGNMENT); const size_t num_results = MeasureClosure( [&](FuncInput func_input) { - phast.QueryBatch(keys.data(), keys.size(), indices.data()); - return indices[func_input]; + pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t worker) { + MU32 eq0 = SetMask(du32, true); + MU32 eq1 = SetMask(du32, true); + for (size_t i = 0; i < kNumKeys; i += 2 * N) { + const size_t wrapped_i = (worker * keys_per_chunk + i) % kNumKeys; + const VU32 keys0 = Load(du32, &keys[wrapped_i]); + const VU32 keys1 = Load(du32, &keys[wrapped_i + N]); + VU32 idx0, idx1; + phast.Query2(du32, keys0, keys1, idx0, idx1); + eq0 = MaskedEq( + eq0, keys0, + GatherIndex(du32, key_verify.data(), BitCast(di32, idx0))); + eq1 = MaskedEq( + eq1, keys1, + GatherIndex(du32, key_verify.data(), BitCast(di32, idx1))); + } + per_worker[worker * HWY_ALIGNMENT] = AllTrue(du32, And(eq0, eq1)); + }); + return per_worker[Unpredictable1() * HWY_ALIGNMENT]; }, &input, 1, results, params); + for (size_t i = 0; i < pool.NumWorkers(); ++i) { + HWY_ASSERT(per_worker[i * HWY_ALIGNMENT]); + } + printf("\n"); if (num_results == 1) { const double ns = results[0].ticks / platform::InvariantTicksPerSecond() * 1E9; + const size_t bytes = kNumKeys * sizeof(uint32_t) * pool.NumWorkers(); printf( - "Query batch throughput: %7.2f ns = %4.1f MB/s; measurement " + "Batch verify throughput: %7.2f ns = %4.1f GB/s; measurement " "MAD=%4.2f%%\n", - ns, static_cast(keys.size() * sizeof(uint32_t)) / ns * 1E3, - results[0].variability * 100.0); + ns, static_cast(bytes) / ns, results[0].variability * 100.0); } else { HWY_WARN("Measurement failed."); } } -HWY_NOINLINE AlignedVector GenerateKeys(size_t num_keys) { - // Must be distinct, hence do not use FillRandom(). - AlignedVector keys; - keys.reserve(num_keys); - AesCtrEngine engine(/*deterministic=*/true); - Triple32 permutation(engine, Unpredictable1()); - for (size_t i = 0; i < num_keys; ++i) { - keys.push_back(permutation(i)); - } - return keys; -} +// Compare with absl::flat_hash_set - just set membership. +HWY_NOINLINE void TestAbslThroughput() { + if constexpr (HWY_HAVE_ABSL) { + const AlignedVector keys = GenerateKeys(kNumKeys); + absl::flat_hash_set set(keys.begin(), keys.end()); -static ThreadPool MakePool() { - static Topology topology; - if (topology.packages.empty()) return ThreadPool(ThreadPool::MaxThreads()); - // Minus one because these are in addition to the main thread. - return ThreadPool(topology.packages[0].cores.size() - 1); -} + FuncInput input = Unpredictable1(); + Result results[1]; + Params params = DefaultBenchmarkParams(); + params.min_samples_per_eval = 2; + params.max_evals = 3; + params.verbose = false; -HWY_NOINLINE Phast MakePhast(const AlignedVector& keys) { - const size_t num_keys = keys.size(); - const uint32_t slice_length = num_keys > 256 * 1024 ? 4096 - : num_keys >= 10 * 1000 ? 512 - : 256; - const uint32_t headroom_percent = num_keys > 256 * 1024 ? 2 : 5; - PhastConfig config(num_keys, 2, slice_length, headroom_percent); - ThreadPool pool = MakePool(); - return BuildPhast(keys.data(), config, pool); -} + ThreadPool pool = MakePool(); + pool.SetWaitMode(PoolWaitMode::kSpin); -HWY_NOINLINE void TestAllLatency() { - const AlignedVector keys = GenerateKeys(1000); - TestLatency(MakePhast(keys)); -} -HWY_NOINLINE void TestAllThroughput() { - const size_t num_keys = HWY_IS_DEBUG_BUILD ? 10 * 1000 : 1000 * 1000; - const AlignedVector keys = GenerateKeys(num_keys); - TestThroughput(MakePhast(keys), keys); + // Each worker starts at a different offset in the keys to avoid unrealistic + // cache behavior, without requiring separate per-worker allocations. + const size_t num_workers_pow2 = 1u << hwy::CeilLog2(pool.NumWorkers()); + const size_t N = VectorBytes() / sizeof(uint32_t); + const size_t keys_per_chunk = kNumKeys / (num_workers_pow2 * 2 * N); + + AlignedVector per_worker(pool.NumWorkers() * HWY_ALIGNMENT); + const size_t num_results = MeasureClosure( + [&](FuncInput func_input) { + pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t worker) { + bool all_found = true; + for (size_t i = 0; i < kNumKeys; ++i) { + all_found &= + set.contains(keys[(i + worker * keys_per_chunk) % kNumKeys]); + } + per_worker[worker * HWY_ALIGNMENT] = all_found; + }); + return per_worker[Unpredictable1() * HWY_ALIGNMENT]; + }, + &input, 1, results, params); + for (size_t i = 0; i < pool.NumWorkers(); ++i) { + HWY_ASSERT(per_worker[i * HWY_ALIGNMENT]); + } + if (num_results == 1) { + const double ns = + results[0].ticks / platform::InvariantTicksPerSecond() * 1E9; + const size_t bytes = kNumKeys * sizeof(uint32_t) * pool.NumWorkers(); + printf( + "Batch absl verify throughput: %7.2f ns = %4.1f GB/s; measurement " + "MAD=%4.2f%%\n", + ns, static_cast(bytes) / ns, results[0].variability * 100.0); + } else { + HWY_WARN("Measurement failed."); + } + } else { + HWY_WARN("absl::flat_hash_set not available, skipping test."); + } } #else // HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 -void TestAllLatency() {} -void TestAllThroughput() {} +void TestLatency() {} +void TestBW() {} +void TestThroughput() {} +void TestAbslThroughput() {} #endif // HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128 } // namespace @@ -155,8 +321,10 @@ HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace hwy { HWY_BEFORE_TEST(PhastBench); -HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAllLatency); -HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAllThroughput); +HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestLatency); +HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestBW); +HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestThroughput); +HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAbslThroughput); HWY_AFTER_TEST(); } // namespace hwy #endif // HWY_ONCE diff --git a/hwy_tests.bzl b/hwy_tests.bzl index 6cb2046f55..24683801d6 100644 --- a/hwy_tests.bzl +++ b/hwy_tests.bzl @@ -129,8 +129,10 @@ HWY_CONTRIB_TESTS = ( ":hash", ":profiler", ":random", + ":robust_statistics", ":thread_pool", ":topology", + # Placeholder for flat_hash_set, do not remove ], ), (