Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions hwy/contrib/hash/phast-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ class Phast {
Phast() = default;
Phast(PhastConfig config, const Triple32 hash, PackedSeeds&& seeds_packed)
: config_(config), hash_(hash), seeds_packed_(std::move(seeds_packed)) {}

Phast(Phast&& other) = default;
Phast& operator=(Phast&& other) = default;

Expand All @@ -208,6 +209,21 @@ class Phast {
return QueryWithSeeds(hash, seed, config_);
}

// Two vectors have higher throughput because they utilize all 16-bit lanes
// for Hash16. Can be called directly, or via QueryBatch.
template <class DU32, class VU32 = Vec<DU32>>
HWY_INLINE void Query2(DU32 du32, VU32 key0, VU32 key1, VU32& idx0,
VU32& idx1) const {
hash_.TwoVec(du32, key0, key1);

// Load 8-bit seeds from bucket.
const VU32 bucket_mask = Set(du32, config_.BucketMask());
const VU32 seed0 = seeds_packed_.Get(du32, And(key0, bucket_mask));
const VU32 seed1 = seeds_packed_.Get(du32, And(key1, bucket_mask));

QueryWithSeeds(du32, key0, key1, seed0, seed1, config_, idx0, idx1);
}

// Same, for a batch of keys. Considerably higher throughput than repeated
// single queries: 7.8 GB/s on Turin for 1M keys.
void QueryBatch(const uint32_t* HWY_RESTRICT keys, size_t num_keys,
Expand Down Expand Up @@ -326,21 +342,6 @@ class Phast {
return Hash16(combined); // caller will AND
}

// Two vectors have higher throughput because they utilize all 16-bit lanes
// for Hash16. Called by QueryBatch.
template <class DU32, class VU32 = Vec<DU32>>
HWY_INLINE void Query2(DU32 du32, VU32 key0, VU32 key1, VU32& idx0,
VU32& idx1) const {
hash_.TwoVec(du32, key0, key1);

// Load 8-bit seeds from bucket.
const VU32 bucket_mask = Set(du32, config_.BucketMask());
const VU32 seed0 = seeds_packed_.Get(du32, And(key0, bucket_mask));
const VU32 seed1 = seeds_packed_.Get(du32, And(key1, bucket_mask));

QueryWithSeeds(du32, key0, key1, seed0, seed1, config_, idx0, idx1);
}

PhastConfig config_ = {};
Triple32 hash_;
PackedSeeds seeds_packed_;
Expand Down Expand Up @@ -673,7 +674,7 @@ class PhastBuilder {
return hash_for_key_idx_[key_idx] & config_.BucketMask();
}

static constexpr size_t kMaxHashesPerBucket = 16;
static constexpr size_t kMaxHashesPerBucket = 32;

size_t PopulateBuckets(size_t num_keys) {
PROFILER_FUNC;
Expand Down
264 changes: 216 additions & 48 deletions hwy/contrib/hash/phast_bench.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// If set, we also benchmark absl::flat_hash_set.
#include "hwy/detect_compiler_arch.h"
#define HWY_HAVE_ABSL 0

#include <stdint.h>
#include <stdio.h>

#include <vector>

#if HWY_HAVE_ABSL
#include "third_party/absl/container/flat_hash_set.h"
#endif

#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS (HWY_SSE2 | HWY_SSSE3 | HWY_SSE4)
#endif // HWY_DISABLED_TARGETS

#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/contrib/thread_pool/topology.h"
#include "hwy/nanobenchmark.h"
#include "hwy/per_target.h" // VectorBytes
#include "hwy/robust_statistics.h"
#include "hwy/timer.h"

// clang-format off
Expand All @@ -42,7 +54,50 @@ namespace HWY_NAMESPACE {
namespace {
#if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128) || HWY_IDE

HWY_NOINLINE void TestLatency(const Phast& phast) {
// Increase when running manually; this is to keep tests fast. Must be a power
// of two of at least NumWorkers * 2 * VectorBytes()/4 to enable wraparound.
HWY_INLINE_VAR constexpr size_t kNumKeys =
HWY_IS_DEBUG_BUILD ? 16 * 1024 : 128 * 1024;

static ThreadPool MakePool() {
static Topology topology;
if (topology.packages.empty()) return ThreadPool(ThreadPool::MaxThreads());
// Minus one because these are in addition to the main thread.
return ThreadPool(topology.packages[0].cores.size() - 1);
}

HWY_NOINLINE AlignedVector<uint32_t> GenerateKeys(size_t num_keys) {
// Round up to two vectors so we do not have to handle remainders here.
num_keys = RoundUpTo(num_keys, 2 * VectorBytes() / sizeof(uint32_t));
// Must be distinct, hence do not use FillRandom().
AlignedVector<uint32_t> keys;
keys.reserve(num_keys);
AesCtrEngine engine(/*deterministic=*/true);
Triple32 permutation(engine, Unpredictable1());
for (size_t i = 0; i < num_keys; ++i) {
keys.push_back(permutation(i));
}
return keys;
}

HWY_NOINLINE Phast MakePhast(const AlignedVector<uint32_t>& keys,
ThreadPool& pool) {
const size_t num_keys = keys.size();
const uint32_t slice_length = num_keys > 4 * 1024 * 1024 ? 8192
: num_keys > 256 * 1024 ? 4096
: num_keys >= 10 * 1000 ? 512
: 256;
const uint32_t headroom_percent = num_keys > 4 * 1024 * 1024 ? 6
: num_keys > 256 * 1024 ? 2
: 5;
PhastConfig config(num_keys, 2, slice_length, headroom_percent);
return BuildPhast(keys.data(), config, pool);
}

HWY_NOINLINE void TestLatency() {
const AlignedVector<uint32_t> keys = GenerateKeys(1000);
ThreadPool pool = MakePool();
const Phast phast = MakePhast(keys, pool);
if (phast.IsEmpty()) {
HWY_WARN("Phast build failed, skipping latency test.\n");
return;
Expand All @@ -68,82 +123,193 @@ HWY_NOINLINE void TestLatency(const Phast& phast) {
}
}

HWY_NOINLINE void TestThroughput(const Phast& phast,
const AlignedVector<uint32_t>& keys) {
HWY_NOINLINE void TestBW() {
const ScalableTag<uint8_t> du8;
using VU8 = Vec<decltype(du8)>;
HWY_LANES_CONSTEXPR size_t N = Lanes(du8);
const VU8 k1 = Set(du8, static_cast<uint8_t>(Unpredictable1()));

ThreadPool pool = MakePool();
pool.SetWaitMode(PoolWaitMode::kSpin);

const size_t kBytesPerWorker = kNumKeys * sizeof(uint32_t);
const size_t num_bytes = pool.NumWorkers() * kBytesPerWorker;
// Large array, avoid AlignedVector because it zero-initializes on 1 thread.
AlignedFreeUniquePtr<uint8_t[]> bytes = AllocateAligned<uint8_t>(num_bytes);
pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t /*worker*/) {
FillBytes(&bytes[task_idx * kBytesPerWorker],
static_cast<uint8_t>(Unpredictable1()), kBytesPerWorker);
});

// Using nanobenchmark is too slow because it involves multiple iterations.
constexpr size_t kNumReps = AdjustedReps(20);
std::vector<double> elapsed_times;
elapsed_times.reserve(kNumReps);
for (size_t rep = 0; rep < kNumReps; ++rep) {
const double t0 = platform::Now();
pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t /*worker*/) {
uint8_t* my_bytes = &bytes[task_idx * kBytesPerWorker];
for (size_t i = 0; i < kBytesPerWorker; i += 2 * N) {
const VU8 v0 = Load(du8, my_bytes + i);
const VU8 v1 = Load(du8, my_bytes + i + N);
Store(Add(v0, k1), du8, my_bytes + i);
Store(Add(v1, k1), du8, my_bytes + i + N);
}
});
uint32_t result = bytes[Unpredictable1()];
PreventElision(result);
elapsed_times.push_back(platform::Now() - t0);
}
const double elapsed =
robust_statistics::Median(elapsed_times.data(), elapsed_times.size());
printf("MemBW: %7.2f ms = %4.1f GB/s\n", elapsed * 1E3,
num_bytes / elapsed * 1E-9);
}

// Benchmarks PHAST used as a hash table, with an extra Gather at the returned
// index to verify set membership.
HWY_NOINLINE void TestThroughput() {
const AlignedVector<uint32_t> keys = GenerateKeys(kNumKeys);
ThreadPool pool = MakePool();
pool.SetWaitMode(PoolWaitMode::kSpin);

const Phast phast = MakePhast(keys, pool);
if (phast.IsEmpty()) {
HWY_WARN("Phast build failed, skipping throughput test.\n");
return;
}

AlignedVector<uint32_t> indices(keys.size());
const ScalableTag<uint32_t> du32;
const RebindToSigned<decltype(du32)> di32;
using VU32 = Vec<decltype(du32)>;
using MU32 = Mask<decltype(du32)>;
HWY_LANES_CONSTEXPR size_t N = Lanes(du32);
HWY_DASSERT(kNumKeys % (2 * N) == 0); // See GenerateKeys().

// Scatter keys to the verification slots. Could also sort K32V32 and copy.
AlignedVector<uint32_t> key_verify(phast.Config().NumSlots());
for (size_t i = 0; i < kNumKeys; i += 2 * N) {
const VU32 keys0 = Load(du32, &keys[i]);
const VU32 keys1 = Load(du32, &keys[i + N]);
VU32 idx0, idx1;
phast.Query2(du32, keys0, keys1, idx0, idx1);
ScatterIndex(keys0, du32, key_verify.data(), BitCast(di32, idx0));
ScatterIndex(keys1, du32, key_verify.data(), BitCast(di32, idx1));
}

FuncInput input = Unpredictable1();
Result results[1];
Params params = DefaultBenchmarkParams();
params.min_samples_per_eval = 2;
params.max_evals = 3;
params.verbose = false;

// Each worker starts at a different offset in the keys to avoid unrealistic
// cache behavior, without requiring separate per-worker allocations.
const size_t num_workers_pow2 = 1u << hwy::CeilLog2(pool.NumWorkers());
const size_t keys_per_chunk = kNumKeys / (num_workers_pow2 * 2 * N);

AlignedVector<uint8_t> per_worker(pool.NumWorkers() * HWY_ALIGNMENT);
const size_t num_results = MeasureClosure(
[&](FuncInput func_input) {
phast.QueryBatch(keys.data(), keys.size(), indices.data());
return indices[func_input];
pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t worker) {
MU32 eq0 = SetMask(du32, true);
MU32 eq1 = SetMask(du32, true);
for (size_t i = 0; i < kNumKeys; i += 2 * N) {
const size_t wrapped_i = (worker * keys_per_chunk + i) % kNumKeys;
const VU32 keys0 = Load(du32, &keys[wrapped_i]);
const VU32 keys1 = Load(du32, &keys[wrapped_i + N]);
VU32 idx0, idx1;
phast.Query2(du32, keys0, keys1, idx0, idx1);
eq0 = MaskedEq(
eq0, keys0,
GatherIndex(du32, key_verify.data(), BitCast(di32, idx0)));
eq1 = MaskedEq(
eq1, keys1,
GatherIndex(du32, key_verify.data(), BitCast(di32, idx1)));
}
per_worker[worker * HWY_ALIGNMENT] = AllTrue(du32, And(eq0, eq1));
});
return per_worker[Unpredictable1() * HWY_ALIGNMENT];
},
&input, 1, results, params);
for (size_t i = 0; i < pool.NumWorkers(); ++i) {
HWY_ASSERT(per_worker[i * HWY_ALIGNMENT]);
}
printf("\n");
if (num_results == 1) {
const double ns =
results[0].ticks / platform::InvariantTicksPerSecond() * 1E9;
const size_t bytes = kNumKeys * sizeof(uint32_t) * pool.NumWorkers();
printf(
"Query batch throughput: %7.2f ns = %4.1f MB/s; measurement "
"Batch verify throughput: %7.2f ns = %4.1f GB/s; measurement "
"MAD=%4.2f%%\n",
ns, static_cast<double>(keys.size() * sizeof(uint32_t)) / ns * 1E3,
results[0].variability * 100.0);
ns, static_cast<double>(bytes) / ns, results[0].variability * 100.0);
} else {
HWY_WARN("Measurement failed.");
}
}

HWY_NOINLINE AlignedVector<uint32_t> GenerateKeys(size_t num_keys) {
// Must be distinct, hence do not use FillRandom().
AlignedVector<uint32_t> keys;
keys.reserve(num_keys);
AesCtrEngine engine(/*deterministic=*/true);
Triple32 permutation(engine, Unpredictable1());
for (size_t i = 0; i < num_keys; ++i) {
keys.push_back(permutation(i));
}
return keys;
}
// Compare with absl::flat_hash_set - just set membership.
HWY_NOINLINE void TestAbslThroughput() {
if constexpr (HWY_HAVE_ABSL) {
const AlignedVector<uint32_t> keys = GenerateKeys(kNumKeys);
absl::flat_hash_set<uint32_t> set(keys.begin(), keys.end());

static ThreadPool MakePool() {
static Topology topology;
if (topology.packages.empty()) return ThreadPool(ThreadPool::MaxThreads());
// Minus one because these are in addition to the main thread.
return ThreadPool(topology.packages[0].cores.size() - 1);
}
FuncInput input = Unpredictable1();
Result results[1];
Params params = DefaultBenchmarkParams();
params.min_samples_per_eval = 2;
params.max_evals = 3;
params.verbose = false;

HWY_NOINLINE Phast MakePhast(const AlignedVector<uint32_t>& keys) {
const size_t num_keys = keys.size();
const uint32_t slice_length = num_keys > 256 * 1024 ? 4096
: num_keys >= 10 * 1000 ? 512
: 256;
const uint32_t headroom_percent = num_keys > 256 * 1024 ? 2 : 5;
PhastConfig config(num_keys, 2, slice_length, headroom_percent);
ThreadPool pool = MakePool();
return BuildPhast(keys.data(), config, pool);
}
ThreadPool pool = MakePool();
pool.SetWaitMode(PoolWaitMode::kSpin);

HWY_NOINLINE void TestAllLatency() {
const AlignedVector<uint32_t> keys = GenerateKeys(1000);
TestLatency(MakePhast(keys));
}
HWY_NOINLINE void TestAllThroughput() {
const size_t num_keys = HWY_IS_DEBUG_BUILD ? 10 * 1000 : 1000 * 1000;
const AlignedVector<uint32_t> keys = GenerateKeys(num_keys);
TestThroughput(MakePhast(keys), keys);
// Each worker starts at a different offset in the keys to avoid unrealistic
// cache behavior, without requiring separate per-worker allocations.
const size_t num_workers_pow2 = 1u << hwy::CeilLog2(pool.NumWorkers());
const size_t N = VectorBytes() / sizeof(uint32_t);
const size_t keys_per_chunk = kNumKeys / (num_workers_pow2 * 2 * N);

AlignedVector<uint8_t> per_worker(pool.NumWorkers() * HWY_ALIGNMENT);
const size_t num_results = MeasureClosure(
[&](FuncInput func_input) {
pool.Run(0, pool.NumWorkers(), [&](uint64_t task_idx, size_t worker) {
bool all_found = true;
for (size_t i = 0; i < kNumKeys; ++i) {
all_found &=
set.contains(keys[(i + worker * keys_per_chunk) % kNumKeys]);
}
per_worker[worker * HWY_ALIGNMENT] = all_found;
});
return per_worker[Unpredictable1() * HWY_ALIGNMENT];
},
&input, 1, results, params);
for (size_t i = 0; i < pool.NumWorkers(); ++i) {
HWY_ASSERT(per_worker[i * HWY_ALIGNMENT]);
}
if (num_results == 1) {
const double ns =
results[0].ticks / platform::InvariantTicksPerSecond() * 1E9;
const size_t bytes = kNumKeys * sizeof(uint32_t) * pool.NumWorkers();
printf(
"Batch absl verify throughput: %7.2f ns = %4.1f GB/s; measurement "
"MAD=%4.2f%%\n",
ns, static_cast<double>(bytes) / ns, results[0].variability * 100.0);
} else {
HWY_WARN("Measurement failed.");
}
} else {
HWY_WARN("absl::flat_hash_set not available, skipping test.");
}
}

#else // HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
void TestAllLatency() {}
void TestAllThroughput() {}
void TestLatency() {}
void TestBW() {}
void TestThroughput() {}
void TestAbslThroughput() {}
#endif // HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128

} // namespace
Expand All @@ -155,8 +321,10 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(PhastBench);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAllLatency);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAllThroughput);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestLatency);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestBW);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestThroughput);
HWY_EXPORT_AND_TEST_BEST_P(PhastBench, TestAbslThroughput);
HWY_AFTER_TEST();
} // namespace hwy
#endif // HWY_ONCE
Loading
Loading