From 8a5131702efd216fd8659b9d1395f1a4e881338b Mon Sep 17 00:00:00 2001 From: Steffen Smolka Date: Wed, 10 Jun 2026 08:59:49 -0700 Subject: [PATCH 1/2] [NetKAT] Shrink node storage pages from 64 MiB to 16 KiB. The managers' node vectors allocate memory in pages. At 64 MiB, every page allocation exceeds malloc's mmap threshold (typically 128 KiB), so each manager pays an mmap/munmap syscall pair - significant for short-lived managers, which compile a policy and are discarded. At 16 KiB, pages are recycled through the allocator's freelists, while still amortizing allocation over hundreds of nodes. In benchmarks, this speeds up first-time compilation of small policies by up to 3x (e.g. BM_FirstTimeCompileOverlappingPredicate: 10.8us -> 3.7us); the syscall cost was diagnosed with strace -c. Co-Authored-By: Claude Fable 5 --- netkat/packet_set.h | 11 +++++++---- netkat/packet_transformer.h | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/netkat/packet_set.h b/netkat/packet_set.h index f11088f..e7daf8e 100644 --- a/netkat/packet_set.h +++ b/netkat/packet_set.h @@ -343,10 +343,13 @@ class PacketSetManager { [[nodiscard]] std::string ToString(const DecisionNode& node) const; - // The page size of the `nodes_` vector: 64 MiB or ~ 67 MB. - // Chosen large enough to reduce the cost of dynamic allocation, and small - // enough to avoid excessive memory overhead. - static constexpr size_t kPageSize = (1 << 26) / sizeof(DecisionNode); + // The page size of the `nodes_` vector: 16 KiB. + // Chosen large enough to amortize the cost of dynamic allocation over + // hundreds of nodes, and small enough that pages stay below the malloc + // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers + // recycle pages through the allocator's freelists instead of paying an + // mmap/munmap syscall pair per manager. + static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode); // The decision nodes forming the BDD-style DAG representation of packet sets. // `PacketSetHandle::node_index_` indexes into this vector. diff --git a/netkat/packet_transformer.h b/netkat/packet_transformer.h index 4c9c90d..9ce7215 100644 --- a/netkat/packet_transformer.h +++ b/netkat/packet_transformer.h @@ -399,10 +399,13 @@ class PacketTransformerManager { [[nodiscard]] std::string ToString(const DecisionNode& node) const; - // The page size of the `nodes_` vector: 64 MiB or ~ 67 MB. - // Chosen large enough to reduce the cost of dynamic allocation, and small - // enough to avoid excessive memory overhead. - static constexpr size_t kPageSize = (1 << 26) / sizeof(DecisionNode); + // The page size of the `nodes_` vector: 16 KiB. + // Chosen large enough to amortize the cost of dynamic allocation over + // hundreds of nodes, and small enough that pages stay below the malloc + // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers + // recycle pages through the allocator's freelists instead of paying an + // mmap/munmap syscall pair per manager. + static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode); // Helper functions to deal with DecisionNodes directly. // TODO(dilo): Is there a convenient way to either avoid these or avoid making From 06c0551ae325af47a4dc8c52c3813f757ee8e7cd Mon Sep 17 00:00:00 2001 From: Steffen Smolka Date: Wed, 10 Jun 2026 09:08:08 -0700 Subject: [PATCH 2/2] [NetKAT] Define the small page sizes as power-of-two node counts. Deriving the page size from a byte budget yields a non-power-of-two node count for packet sets (16 KiB / 24 B = 682), which forces the index arithmetic in PagedStableVector::operator[] -- on the hot path of nearly every operation -- to compile to multiply sequences instead of single shift/mask instructions. Round to 512 nodes (12 KiB) instead; transformer pages become an explicit 256 nodes (16 KiB), numerically unchanged. Both stay far below the malloc mmap/trim thresholds, which is what this PR is about. This also unblocks stacking #101, which enforces power-of-two page sizes at compile time. Co-Authored-By: Claude Fable 5 --- netkat/packet_set.h | 8 +++++--- netkat/packet_transformer.h | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/netkat/packet_set.h b/netkat/packet_set.h index e7daf8e..8b24d51 100644 --- a/netkat/packet_set.h +++ b/netkat/packet_set.h @@ -343,13 +343,15 @@ class PacketSetManager { [[nodiscard]] std::string ToString(const DecisionNode& node) const; - // The page size of the `nodes_` vector: 16 KiB. + // The page size of the `nodes_` vector: 512 nodes, or 12 KiB. // Chosen large enough to amortize the cost of dynamic allocation over // hundreds of nodes, and small enough that pages stay below the malloc // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers // recycle pages through the allocator's freelists instead of paying an - // mmap/munmap syscall pair per manager. - static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode); + // mmap/munmap syscall pair per manager. A power of two so that indexing + // into the vector -- which is on the hot path of nearly every operation -- + // compiles to shifts and masks rather than multiply sequences. + static constexpr size_t kPageSize = size_t{1} << 9; // The decision nodes forming the BDD-style DAG representation of packet sets. // `PacketSetHandle::node_index_` indexes into this vector. diff --git a/netkat/packet_transformer.h b/netkat/packet_transformer.h index 9ce7215..ae9ab40 100644 --- a/netkat/packet_transformer.h +++ b/netkat/packet_transformer.h @@ -399,13 +399,15 @@ class PacketTransformerManager { [[nodiscard]] std::string ToString(const DecisionNode& node) const; - // The page size of the `nodes_` vector: 16 KiB. + // The page size of the `nodes_` vector: 256 nodes, or 16 KiB. // Chosen large enough to amortize the cost of dynamic allocation over // hundreds of nodes, and small enough that pages stay below the malloc // mmap/trim thresholds (typically 128 KiB): this way, short-lived managers // recycle pages through the allocator's freelists instead of paying an - // mmap/munmap syscall pair per manager. - static constexpr size_t kPageSize = (1 << 14) / sizeof(DecisionNode); + // mmap/munmap syscall pair per manager. A power of two so that indexing + // into the vector -- which is on the hot path of nearly every operation -- + // compiles to shifts and masks rather than multiply sequences. + static constexpr size_t kPageSize = size_t{1} << 8; // Helper functions to deal with DecisionNodes directly. // TODO(dilo): Is there a convenient way to either avoid these or avoid making