From 8a5131702efd216fd8659b9d1395f1a4e881338b Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 08:59:49 -0700
Subject: [PATCH 1/7] [NetKAT] Shrink node storage pages from 64 MiB to 16 KiB.

The managers' node vectors allocate memory in pages. At 64 MiB, every page
allocation exceeds malloc's mmap threshold (typically 128 KiB), so each
manager pays an mmap/munmap syscall pair - significant for short-lived
managers, which compile a policy and are discarded. At 16 KiB, pages are
recycled through the allocator's freelists, while still amortizing
allocation over hundreds of nodes.

In benchmarks, this speeds up first-time compilation of small policies by
up to 3x (e.g. BM_FirstTimeCompileOverlappingPredicate: 10.8us -> 3.7us);
the syscall cost was diagnosed with strace -c.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/packet_set.h         | 11 +++++++----
 netkat/packet_transformer.h | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/netkat/packet_set.h b/netkat/packet_set.h
index f11088f..e7daf8e 100644
--- a/netkat/packet_set.h
+++ b/netkat/packet_set.h
@@ -343,10 +343,13 @@ class PacketSetManager {
 
   [[nodiscard]] std::string ToString(const DecisionNode& node) const;
 
-  // The page size of the `nodes_` vector: 64 MiB or ~ 67 MB.
-  // Chosen large enough to reduce the cost of dynamic allocation, and small
-  // enough to avoid excessive memory overhead.
-  static constexpr size_t kPageSize = (1 << 26) / sizeof(DecisionNode);
+  // The page size of the `nodes_` vector: 16 KiB.
+  // Chosen large enough to amortize the cost of dynamic allocation over
+  // hundreds of nodes, and small enough that pages stay below the malloc
+  // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers
+  // recycle pages through the allocator's freelists instead of paying an
+  // mmap/munmap syscall pair per manager.
+  static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode);
 
   // The decision nodes forming the BDD-style DAG representation of packet sets.
   // `PacketSetHandle::node_index_` indexes into this vector.
diff --git a/netkat/packet_transformer.h b/netkat/packet_transformer.h
index 4c9c90d..9ce7215 100644
--- a/netkat/packet_transformer.h
+++ b/netkat/packet_transformer.h
@@ -399,10 +399,13 @@ class PacketTransformerManager {
 
   [[nodiscard]] std::string ToString(const DecisionNode& node) const;
 
-  // The page size of the `nodes_` vector: 64 MiB or ~ 67 MB.
-  // Chosen large enough to reduce the cost of dynamic allocation, and small
-  // enough to avoid excessive memory overhead.
-  static constexpr size_t kPageSize = (1 << 26) / sizeof(DecisionNode);
+  // The page size of the `nodes_` vector: 16 KiB.
+  // Chosen large enough to amortize the cost of dynamic allocation over
+  // hundreds of nodes, and small enough that pages stay below the malloc
+  // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers
+  // recycle pages through the allocator's freelists instead of paying an
+  // mmap/munmap syscall pair per manager.
+  static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode);
 
   // Helper functions to deal with DecisionNodes directly.
   // TODO(dilo): Is there a convenient way to either avoid these or avoid making

From 06c0551ae325af47a4dc8c52c3813f757ee8e7cd Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 09:08:08 -0700
Subject: [PATCH 2/7] [NetKAT] Define the small page sizes as power-of-two node
 counts.

Deriving the page size from a byte budget yields a non-power-of-two
node count for packet sets (16 KiB / 24 B = 682), which forces the
index arithmetic in PagedStableVector::operator[] -- on the hot path of
nearly every operation -- to compile to multiply sequences instead of
single shift/mask instructions. Round to 512 nodes (12 KiB) instead;
transformer pages become an explicit 256 nodes (16 KiB), numerically
unchanged. Both stay far below the malloc mmap/trim thresholds, which
is what this PR is about.

This also unblocks stacking #101, which enforces power-of-two page
sizes at compile time.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/packet_set.h         | 8 +++++---
 netkat/packet_transformer.h | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/netkat/packet_set.h b/netkat/packet_set.h
index e7daf8e..8b24d51 100644
--- a/netkat/packet_set.h
+++ b/netkat/packet_set.h
@@ -343,13 +343,15 @@ class PacketSetManager {
 
   [[nodiscard]] std::string ToString(const DecisionNode& node) const;
 
-  // The page size of the `nodes_` vector: 16 KiB.
+  // The page size of the `nodes_` vector: 512 nodes, or 12 KiB.
   // Chosen large enough to amortize the cost of dynamic allocation over
   // hundreds of nodes, and small enough that pages stay below the malloc
   // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers
   // recycle pages through the allocator's freelists instead of paying an
-  // mmap/munmap syscall pair per manager.
-  static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode);
+  // mmap/munmap syscall pair per manager. A power of two so that indexing
+  // into the vector -- which is on the hot path of nearly every operation --
+  // compiles to shifts and masks rather than multiply sequences.
+  static constexpr size_t kPageSize = size_t{1} << 9;
 
   // The decision nodes forming the BDD-style DAG representation of packet sets.
   // `PacketSetHandle::node_index_` indexes into this vector.
diff --git a/netkat/packet_transformer.h b/netkat/packet_transformer.h
index 9ce7215..ae9ab40 100644
--- a/netkat/packet_transformer.h
+++ b/netkat/packet_transformer.h
@@ -399,13 +399,15 @@ class PacketTransformerManager {
 
   [[nodiscard]] std::string ToString(const DecisionNode& node) const;
 
-  // The page size of the `nodes_` vector: 16 KiB.
+  // The page size of the `nodes_` vector: 256 nodes, or 16 KiB.
   // Chosen large enough to amortize the cost of dynamic allocation over
   // hundreds of nodes, and small enough that pages stay below the malloc
   // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers
   // recycle pages through the allocator's freelists instead of paying an
-  // mmap/munmap syscall pair per manager.
-  static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode);
+  // mmap/munmap syscall pair per manager. A power of two so that indexing
+  // into the vector -- which is on the hot path of nearly every operation --
+  // compiles to shifts and masks rather than multiply sequences.
+  static constexpr size_t kPageSize = size_t{1} << 8;
 
   // Helper functions to deal with DecisionNodes directly.
   // TODO(dilo): Is there a convenient way to either avoid these or avoid making

From 0d67df8b22d4f7d2b4cab60d464329cad1eb8874 Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 03:31:44 -0700
Subject: [PATCH 3/7] [NetKAT] Simplify PagedStableVector bookkeeping; add a
 microbenchmark.

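Detect page boundaries via the last page's size instead of recomputing
size() % PageSize on every insertion, and cache the element count in a
size_ member so that size() no longer derives it from the pages. Besides
being cheaper, the page-boundary check fixes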
a latent invariant bug: if an insertion threw after allocating a fresh
page, the next insertion would allocate a second empty page, leaving a
hole mid-vector that silently corrupts the index arithmetic for all
subsequent elements. (Unreachable today since DecisionNode insertions
do not throw, but a footgun for future element types.)

The new microbenchmark quantifies the data structure's design choices:
power-of-two vs non-power-of-two page sizes for operator[] indexing
(up to 2.6x), and paged vs flat appends (~1.4x in favor of paged, no
relocation copies), with a flat std::vector as the reference.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/BUILD.bazel                      |  10 ++
 netkat/paged_stable_vector.h            |  27 +++--
 netkat/paged_stable_vector_benchmark.cc | 133 ++++++++++++++++++++++++
 3 files changed, 164 insertions(+), 6 deletions(-)
 create mode 100644 netkat/paged_stable_vector_benchmark.cc

diff --git a/netkat/BUILD.bazel b/netkat/BUILD.bazel
index 9121fef..9c15abc 100644
--- a/netkat/BUILD.bazel
+++ b/netkat/BUILD.bazel
@@ -397,6 +397,16 @@ cc_test(
     ],
 )
 
+cc_binary(
+    name = "paged_stable_vector_benchmark",
+    testonly = True,
+    srcs = ["paged_stable_vector_benchmark.cc"],
+    deps = [
+        ":paged_stable_vector",
+        "@com_google_benchmark//:benchmark_main",
+    ],
+)
+
 cc_test(
     name = "packet_transformer_test",
     srcs = ["packet_transformer_test.cc"],
diff --git a/netkat/paged_stable_vector.h b/netkat/paged_stable_vector.h
index 04ac895..6e6a67e 100644
--- a/netkat/paged_stable_vector.h
+++ b/netkat/paged_stable_vector.h
@@ -36,26 +36,28 @@ namespace netkat {
 // significant for very large vectors in performance-sensitive applications.
 //
 // The API of this class is kept just large enough to cover our use cases.
+//
+// PERFORMANCE: Prefer a power-of-two `PageSize` so that the index arithmetic
+// in `operator[]` compiles to shifts and masks rather than multiply sequences.
 template <class T, size_t PageSize>
 class PagedStableVector {
  public:
   PagedStableVector() = default;
 
-  size_t size() const {
-    return data_.empty() ? 0
-                         : (data_.size() - 1) * PageSize + data_.back().size();
-  }
+  size_t size() const { return size_; }
 
   template <class Value>
   void push_back(Value&& value) {
-    if (size() % PageSize == 0) data_.emplace_back().reserve(PageSize);
+    ReserveSpaceForNextElement();
     data_.back().push_back(std::forward<Value>(value));
+    ++size_;
   }
 
   template <class... Args>
   void emplace_back(Args&&... value) {
-    if (size() % PageSize == 0) data_.emplace_back().reserve(PageSize);
+    ReserveSpaceForNextElement();
     data_.back().emplace_back(std::forward<Args>(value)...);
+    ++size_;
   }
 
   T& operator[](size_t index) {
@@ -66,7 +68,20 @@ class PagedStableVector {
   }
 
  private:
+  void ReserveSpaceForNextElement() {
+    if (data_.empty() || data_.back().size() == PageSize) {
+      // Reserving each page upfront is what guarantees pointer stability: a
+      // page never grows beyond its initial capacity, so its elements are
+      // never relocated.
+      data_.emplace_back().reserve(PageSize);
+    }
+  }
+
   std::vector<std::vector<T>> data_;
+
+  // Tracked explicitly (rather than computed from `data_`) since clients call
+  // `size()` on every element insertion.
+  size_t size_ = 0;
 };
 
 }  // namespace netkat
diff --git a/netkat/paged_stable_vector_benchmark.cc b/netkat/paged_stable_vector_benchmark.cc
new file mode 100644
index 0000000..dc13e0a
--- /dev/null
+++ b/netkat/paged_stable_vector_benchmark.cc
@@ -0,0 +1,133 @@
+// Copyright 2026 The NetKAT authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Benchmarks for `PagedStableVector`, exercising the access patterns of its
+// only clients (`PacketSetManager`/`PacketTransformerManager`): indexed reads
+// of decision nodes during BDD traversal, and appends of new nodes.
+//
+// The benchmarks are instantiated with a power-of-two and a non-power-of-two
+// page size to quantify the cost of `operator[]`'s index arithmetic: division
+// by a non-power-of-two constant compiles to a multiply sequence rather than a
+// shift. A flat `std::vector` (no paging, no pointer stability) serves as the
+// lower-bound reference.
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "netkat/paged_stable_vector.h"
+
+namespace netkat {
+namespace {
+
+// Same size and alignment as `PacketSetManager::DecisionNode`.
+struct FakeNode {
+  uint64_t a = 0;
+  uint64_t b = 0;
+  uint64_t c = 0;
+};
+static_assert(sizeof(FakeNode) == 24);
+
+// The page size of `PacketSetManager::nodes_` at the time of writing:
+// a 64 MiB byte budget divided by the node size, yielding a non-power-of-two
+// number of elements per page.
+constexpr size_t kNonPow2PageSize = (size_t{1} << 26) / sizeof(FakeNode);
+static_assert((kNonPow2PageSize & (kNonPow2PageSize - 1)) != 0);
+
+// A power-of-two page size of comparable magnitude (~48 MiB worth of nodes).
+constexpr size_t kPow2PageSize = size_t{1} << 21;
+
+template <class Vector>
+Vector MakeFilledVector(size_t size) {
+  Vector vec;
+  for (size_t i = 0; i < size; ++i) {
+    vec.push_back(FakeNode{.a = i, .b = i, .c = i});
+  }
+  return vec;
+}
+
+// Returns `size` pseudo-random indices in [0, size), sampled with replacement,
+// simulating the data-dependent node lookups of BDD traversal. Uses a
+// fixed-seed LCG so all instantiations see the identical sequence.
+std::vector<uint32_t> PseudoRandomIndices(size_t size) {
+  std::vector<uint32_t> indices;
+  indices.reserve(size);
+  uint64_t state = 42;
+  for (size_t i = 0; i < size; ++i) {
+    state = state * 6364136223846793005ULL + 1442695040888963407ULL;
+    indices.push_back(static_cast<uint32_t>((state >> 33) % size));
+  }
+  return indices;
+}
+
+template <class Vector>
+void BM_PushBack(benchmark::State& state) {
+  const size_t size = state.range(0);
+  for (auto s : state) {
+    Vector vec = MakeFilledVector<Vector>(size);
+    benchmark::DoNotOptimize(vec);
+  }
+  state.SetItemsProcessed(state.iterations() * size);
+}
+
+template <class Vector>
+void BM_SequentialRead(benchmark::State& state) {
+  const size_t size = state.range(0);
+  Vector vec = MakeFilledVector<Vector>(size);
+  for (auto s : state) {
+    uint64_t sum = 0;
+    for (size_t i = 0; i < size; ++i) sum += vec[i].a;
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * size);
+}
+
+template <class Vector>
+void BM_RandomRead(benchmark::State& state) {
+  const size_t size = state.range(0);
+  Vector vec = MakeFilledVector<Vector>(size);
+  const std::vector<uint32_t> indices = PseudoRandomIndices(size);
+  for (auto s : state) {
+    uint64_t sum = 0;
+    for (uint32_t index : indices) sum += vec[index].a;
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * size);
+}
+
+// 4M elements ≈ 96 MiB: spans multiple pages and far exceeds L3, like the
+// node vectors of large NetKAT models. 256k elements ≈ 6 MiB: fits in L3,
+// making the index arithmetic (rather than memory stalls) the bottleneck.
+constexpr size_t kSmall = size_t{1} << 18;
+constexpr size_t kLarge = size_t{1} << 22;
+
+using NonPow2Vector = PagedStableVector<FakeNode, kNonPow2PageSize>;
+using Pow2Vector = PagedStableVector<FakeNode, kPow2PageSize>;
+using FlatVector = std::vector<FakeNode>;
+
+BENCHMARK_TEMPLATE(BM_PushBack, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_PushBack, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_PushBack, FlatVector)->Arg(kSmall)->Arg(kLarge);
+
+BENCHMARK_TEMPLATE(BM_SequentialRead, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_SequentialRead, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_SequentialRead, FlatVector)->Arg(kSmall)->Arg(kLarge);
+
+BENCHMARK_TEMPLATE(BM_RandomRead, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_RandomRead, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_RandomRead, FlatVector)->Arg(kSmall)->Arg(kLarge);
+
+}  // namespace
+}  // namespace netkat

From 7a2335c6ad82cdd5cbe264b8f1465283be6dac92 Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 03:38:23 -0700
Subject: [PATCH 4/7] [NetKAT] Enforce power-of-two PagedStableVector pages at
 compile time.

Turn the prose recommendation into a static_assert: fast index
arithmetic is the reason this class can match a flat std::vector on
reads, so a non-power-of-two PageSize should be a compile error rather
than a silent 2x regression. With the property enforced, the benchmark
no longer needs to re-litigate the page-size choice, and is slimmed
down to its long-term job of guarding paged-vs-flat performance.

Also drop the cached size_ member in favor of deriving the size from
the pages again: it measured neutral, and the static_assert now
guarantees the derived computation compiles to a shift, so the extra
invariant bought nothing.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/paged_stable_vector.h            | 21 +++++++++--------
 netkat/paged_stable_vector_benchmark.cc | 31 ++++++++-----------------
 netkat/paged_stable_vector_test.cc      |  3 ++-
 3 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/netkat/paged_stable_vector.h b/netkat/paged_stable_vector.h
index 6e6a67e..13b355c 100644
--- a/netkat/paged_stable_vector.h
+++ b/netkat/paged_stable_vector.h
@@ -19,6 +19,7 @@
 #ifndef GOOGLE_NETKAT_NETKAT_PAGED_STABLE_VECTOR_H_
 #define GOOGLE_NETKAT_NETKAT_PAGED_STABLE_VECTOR_H_
 
+#include <bit>
 #include <cstddef>
 #include <utility>
 #include <vector>
@@ -36,28 +37,32 @@ namespace netkat {
 // significant for very large vectors in performance-sensitive applications.
 //
 // The API of this class is kept just large enough to cover our use cases.
-//
-// PERFORMANCE: Prefer a power-of-two `PageSize` so that the index arithmetic
-// in `operator[]` compiles to shifts and masks rather than multiply sequences.
 template <class T, size_t PageSize>
 class PagedStableVector {
  public:
+  // Index arithmetic (`operator[]`, `size()`) is on our clients' hot paths.
+  // Requiring a power-of-two `PageSize` guarantees it compiles to shifts and
+  // masks rather than multiply sequences.
+  static_assert(std::has_single_bit(PageSize),
+                "PageSize must be a power of two");
+
   PagedStableVector() = default;
 
-  size_t size() const { return size_; }
+  size_t size() const {
+    return data_.empty() ? 0
+                         : (data_.size() - 1) * PageSize + data_.back().size();
+  }
 
   template <class Value>
   void push_back(Value&& value) {
     ReserveSpaceForNextElement();
     data_.back().push_back(std::forward<Value>(value));
-    ++size_;
   }
 
   template <class... Args>
   void emplace_back(Args&&... value) {
     ReserveSpaceForNextElement();
     data_.back().emplace_back(std::forward<Args>(value)...);
-    ++size_;
   }
 
   T& operator[](size_t index) {
@@ -78,10 +83,6 @@ class PagedStableVector {
   }
 
   std::vector<std::vector<T>> data_;
-
-  // Tracked explicitly (rather than computed from `data_`) since clients call
-  // `size()` on every element insertion.
-  size_t size_ = 0;
 };
 
 }  // namespace netkat
diff --git a/netkat/paged_stable_vector_benchmark.cc b/netkat/paged_stable_vector_benchmark.cc
index dc13e0a..b3af0be 100644
--- a/netkat/paged_stable_vector_benchmark.cc
+++ b/netkat/paged_stable_vector_benchmark.cc
@@ -16,11 +16,10 @@
 // only clients (`PacketSetManager`/`PacketTransformerManager`): indexed reads
 // of decision nodes during BDD traversal, and appends of new nodes.
 //
-// The benchmarks are instantiated with a power-of-two and a non-power-of-two
-// page size to quantify the cost of `operator[]`'s index arithmetic: division
-// by a non-power-of-two constant compiles to a multiply sequence rather than a
-// shift. A flat `std::vector` (no paging, no pointer stability) serves as the
-// lower-bound reference.
+// A flat `std::vector` (no paging, no pointer stability) serves as the
+// reference: it bounds read performance from above (no double indirection,
+// perfect contiguity) and append performance from below (it must relocate all
+// elements whenever it grows beyond its capacity).
 
 #include <cstddef>
 #include <cstdint>
@@ -40,14 +39,8 @@ struct FakeNode {
 };
 static_assert(sizeof(FakeNode) == 24);
 
-// The page size of `PacketSetManager::nodes_` at the time of writing:
-// a 64 MiB byte budget divided by the node size, yielding a non-power-of-two
-// number of elements per page.
-constexpr size_t kNonPow2PageSize = (size_t{1} << 26) / sizeof(FakeNode);
-static_assert((kNonPow2PageSize & (kNonPow2PageSize - 1)) != 0);
-
-// A power-of-two page size of comparable magnitude (~48 MiB worth of nodes).
-constexpr size_t kPow2PageSize = size_t{1} << 21;
+// The page size of `PacketSetManager::nodes_`: 2^21 nodes, or 48 MiB.
+constexpr size_t kPageSize = size_t{1} << 21;
 
 template <class Vector>
 Vector MakeFilledVector(size_t size) {
@@ -113,20 +106,16 @@ void BM_RandomRead(benchmark::State& state) {
 constexpr size_t kSmall = size_t{1} << 18;
 constexpr size_t kLarge = size_t{1} << 22;
 
-using NonPow2Vector = PagedStableVector<FakeNode, kNonPow2PageSize>;
-using Pow2Vector = PagedStableVector<FakeNode, kPow2PageSize>;
+using PagedVector = PagedStableVector<FakeNode, kPageSize>;
 using FlatVector = std::vector<FakeNode>;
 
-BENCHMARK_TEMPLATE(BM_PushBack, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
-BENCHMARK_TEMPLATE(BM_PushBack, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_PushBack, PagedVector)->Arg(kSmall)->Arg(kLarge);
 BENCHMARK_TEMPLATE(BM_PushBack, FlatVector)->Arg(kSmall)->Arg(kLarge);
 
-BENCHMARK_TEMPLATE(BM_SequentialRead, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
-BENCHMARK_TEMPLATE(BM_SequentialRead, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_SequentialRead, PagedVector)->Arg(kSmall)->Arg(kLarge);
 BENCHMARK_TEMPLATE(BM_SequentialRead, FlatVector)->Arg(kSmall)->Arg(kLarge);
 
-BENCHMARK_TEMPLATE(BM_RandomRead, NonPow2Vector)->Arg(kSmall)->Arg(kLarge);
-BENCHMARK_TEMPLATE(BM_RandomRead, Pow2Vector)->Arg(kSmall)->Arg(kLarge);
+BENCHMARK_TEMPLATE(BM_RandomRead, PagedVector)->Arg(kSmall)->Arg(kLarge);
 BENCHMARK_TEMPLATE(BM_RandomRead, FlatVector)->Arg(kSmall)->Arg(kLarge);
 
 }  // namespace
diff --git a/netkat/paged_stable_vector_test.cc b/netkat/paged_stable_vector_test.cc
index 5b73580..747005a 100644
--- a/netkat/paged_stable_vector_test.cc
+++ b/netkat/paged_stable_vector_test.cc
@@ -25,7 +25,8 @@ namespace {
 
 // A small, but otherwise random page size used throughout the tests.
 // Using a small page size is useful for exercising the page replacement logic.
-static constexpr int kSmallPageSize = 3;
+// Must be a power of two, as required by `PagedStableVector`.
+static constexpr int kSmallPageSize = 4;
 
 void PushBackInreasesSize(std::vector<std::string> elements) {
   PagedStableVector<std::string, kSmallPageSize> vector;

From 3592d99121545dbd0485d30738e65fa9f36fb6ea Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 03:49:01 -0700
Subject: [PATCH 5/7] [NetKAT] Add large-scale packet set benchmarks.

The existing benchmarks compile predicates of only tens of BDD nodes,
so they cannot detect effects that only manifest at scale: node arena
performance, unique-table pressure, or the algorithmic complexity of
the set operations. This makes them blind both to regressions at
realistic model sizes and to the improvements we most care about
landing next, such as operation memoization (b/382379263) and
complement edges (b/382380335).

Add benchmarks over pseudo-random sets drawn from a ~1M element space
encoded across 5 hex-digit fields. Random sets have incompressible
BDDs, so node counts scale with set size (~10^5-10^6 nodes), mimicking
large real-world NetKAT models and providing a yardstick for future
algorithmic work.

These benchmarks put the page-size change of this PR in perspective:
it yields ~1-3% end to end, since unique-table hashing dominates node
creation. The 1.1-2.6x microbenchmark win applies to the arena in
isolation only.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/packet_set_benchmark.cc | 80 ++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/netkat/packet_set_benchmark.cc b/netkat/packet_set_benchmark.cc
index 7de2fd0..7685713 100644
--- a/netkat/packet_set_benchmark.cc
+++ b/netkat/packet_set_benchmark.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cstdint>
 #include <optional>
+#include <utility>
 
 #include "absl/strings/str_cat.h"
 #include "benchmark/benchmark.h"
@@ -120,4 +122,82 @@ void BM_ReCompileOverlappingPredicate(benchmark::State& state) {
 }
 BENCHMARK(BM_ReCompileOverlappingPredicate);
 
+// -- Large-scale benchmarks ---------------------------------------------------
+//
+// The benchmarks above build BDDs of only tens of nodes, so they cannot detect
+// effects that only manifest at scale (node arena performance, unique-table
+// pressure, algorithmic complexity of the set operations). The benchmarks
+// below operate on sets of pseudo-random members of a 16^5 ~= 1M element
+// space, encoded over 5 hex-digit fields. Random sets have incompressible
+// BDDs, so node counts scale with set size, mimicking large real-world NetKAT
+// models.
+
+constexpr int kNumDigits = 5;
+
+// The `i`-th pseudo-random member under `seed` (collisions shrink the set).
+// `seed` and `i` occupy disjoint input bits, so a seed is not an index shift.
+uint32_t Member(uint32_t i, uint32_t seed) {
+  uint64_t state = ((uint64_t{seed} << 32) | i) * 6364136223846793005ULL;
+  return static_cast<uint32_t>(state >> 33) & ((1u << (4 * kNumDigits)) - 1);
+}
+
+// Matches exactly the packets whose digit fields encode `member`.
+PredicateProto MemberPredicate(uint32_t member) {
+  PredicateProto pred = MatchProto("f0", member & 15);
+  for (int d = 1; d < kNumDigits; ++d) {
+    pred = AndProto(std::move(pred),
+                    MatchProto(absl::StrCat("f", d), (member >> (4 * d)) & 15));
+  }
+  return pred;
+}
+
+// A balanced Or-tree over members [lo, hi) -- balanced to keep proto/compile
+// recursion depth logarithmic.
+PredicateProto RandomSetPredicate(uint32_t lo, uint32_t hi, uint32_t seed) {
+  if (hi - lo == 1) return MemberPredicate(Member(lo, seed));
+  uint32_t mid = lo + (hi - lo) / 2;
+  return OrProto(RandomSetPredicate(lo, mid, seed),
+                 RandomSetPredicate(mid, hi, seed));
+}
+
+// Benchmarks first-time compilation of a large random set, dominated by node
+// creation: unique-table hashing and arena appends.
+void BM_CompileLargeRandomSet(benchmark::State& state) {
+  PredicateProto pred = RandomSetPredicate(0, state.range(0), /*seed=*/1);
+  for (auto s : state) {
+    PacketSetManager manager;
+    PacketSetHandle set = manager.Compile(pred);
+    benchmark::DoNotOptimize(set);
+  }
+}
+BENCHMARK(BM_CompileLargeRandomSet)->Arg(1 << 12)->Arg(1 << 15);
+
+// Benchmarks `Not` of a large random set: a full traversal that copies every
+// node of the operand (no complement edges yet, see b/382380335).
+void BM_NotOfLargeRandomSet(benchmark::State& state) {
+  PacketSetManager manager;
+  PacketSetHandle set =
+      manager.Compile(RandomSetPredicate(0, state.range(0), /*seed=*/1));
+  for (auto s : state) {
+    PacketSetHandle result = manager.Not(set);
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_NotOfLargeRandomSet)->Arg(1 << 12)->Arg(1 << 15);
+
+// Benchmarks `Xor` of two large random sets: a compound operation (two `And`s,
+// several `Not`s) that traverses both operands and creates many nodes.
+void BM_XorOfLargeRandomSets(benchmark::State& state) {
+  PacketSetManager manager;
+  PacketSetHandle lhs =
+      manager.Compile(RandomSetPredicate(0, state.range(0), /*seed=*/1));
+  PacketSetHandle rhs =
+      manager.Compile(RandomSetPredicate(0, state.range(0), /*seed=*/2));
+  for (auto s : state) {
+    PacketSetHandle result = manager.Xor(lhs, rhs);
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_XorOfLargeRandomSets)->Arg(1 << 12)->Arg(1 << 15);
+
 }  // namespace netkat

From dabc4dcbef59dd74b80572cd30e2e8c4644966a6 Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 03:58:10 -0700
Subject: [PATCH 6/7] [NetKAT] Strengthen the PagedStableVector
 pointer-stability test.

The test only held references to the first two elements, both in the
first page. The positions most at risk of invalidation are elements
adjacent to page boundaries, at the moments a new page or a larger page
table gets allocated. Hold a reference to every element across several
pages instead, and verify contents in addition to addresses.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/paged_stable_vector_test.cc | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/netkat/paged_stable_vector_test.cc b/netkat/paged_stable_vector_test.cc
index 747005a..9ad2345 100644
--- a/netkat/paged_stable_vector_test.cc
+++ b/netkat/paged_stable_vector_test.cc
@@ -85,11 +85,15 @@ FUZZ_TEST(PagedStableVectorTest, BracketAssigmentWorks);
 TEST(PagedStableVectorTest, ReferencesDontGetInvalidated) {
   PagedStableVector<std::string, kSmallPageSize> vector;
 
-  // Store a few references.
-  vector.push_back("first element");
-  std::string* first_element_ptr = &vector[0];
-  vector.push_back("second element");
-  std::string* second_element_ptr = &vector[1];
+  // Store a reference to every element as it is added, spanning several pages
+  // so that some references point to elements right before and right after
+  // page boundaries -- the positions most at risk when a new page or a larger
+  // page table gets allocated.
+  std::vector<std::string*> element_ptrs;
+  for (int i = 0; i < 10 * kSmallPageSize; ++i) {
+    vector.push_back(std::to_string(i));
+    element_ptrs.push_back(&vector[i]);
+  }
 
   // Push a ton of elements to trigger page allocation.
   // If this were a regular std::vector, the references would be invalidated.
@@ -98,8 +102,10 @@ TEST(PagedStableVectorTest, ReferencesDontGetInvalidated) {
   }
 
   // Check that the references are still valid.
-  EXPECT_EQ(&vector[0], first_element_ptr);
-  EXPECT_EQ(&vector[1], second_element_ptr);
+  for (size_t i = 0; i < element_ptrs.size(); ++i) {
+    EXPECT_EQ(&vector[i], element_ptrs[i]);
+    EXPECT_EQ(*element_ptrs[i], std::to_string(i));
+  }
 };
 
 }  // namespace

From 18014212941df96e1c8987df778645c30ff14770 Mon Sep 17 00:00:00 2001
From: Steffen Smolka <steffen.smolka@gmail.com>
Date: Wed, 10 Jun 2026 09:10:26 -0700
Subject: [PATCH 7/7] [NetKAT] Align microbenchmark page size with the
 production page size.

The managers' pages shrank from 2^21 to 2^9 nodes (see #105); keep the
microbenchmark representative of what production uses.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 netkat/paged_stable_vector_benchmark.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/netkat/paged_stable_vector_benchmark.cc b/netkat/paged_stable_vector_benchmark.cc
index b3af0be..41c6386 100644
--- a/netkat/paged_stable_vector_benchmark.cc
+++ b/netkat/paged_stable_vector_benchmark.cc
@@ -39,8 +39,8 @@ struct FakeNode {
 };
 static_assert(sizeof(FakeNode) == 24);
 
-// The page size of `PacketSetManager::nodes_`: 2^21 nodes, or 48 MiB.
-constexpr size_t kPageSize = size_t{1} << 21;
+// The page size of `PacketSetManager::nodes_`: 512 nodes, or 12 KiB.
+constexpr size_t kPageSize = size_t{1} << 9;
 
 template <class Vector>
 Vector MakeFilledVector(size_t size) {