Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
430 changes: 430 additions & 0 deletions bench/live-fork-pause-window/bench-hugepages.py

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions crates/forkd-cli/src/doctor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ pub fn run(daemon_url: &str, daemon_token: Option<String>) -> anyhow::Result<()>
// unprivileged_userfaultfd / runs an older kernel.
check_uffd_wp(),
check_memfd_create(),
check_hugepages(),
];

print_report(&checks);
Expand Down Expand Up @@ -597,6 +598,60 @@ fn check_memfd_create() -> Check {
}
}

/// Doctor check: are 2 MiB hugepages available for `--hugepages` spawns?
///
/// On Linux this reads `/proc/meminfo` and inspects three fields:
/// `Hugepagesize`, `HugePages_Total` and `HugePages_Free`. The two
/// `HugePages_*` counters only describe the pool of the kernel's *default*
/// hugepage size, so they are trusted only when `Hugepagesize` is 2048 kB
/// (a missing `Hugepagesize` line is treated as 2048 kB).
///
/// Outcomes:
/// - warn — `/proc/meminfo` cannot be read;
/// - warn — the default hugepage size is not 2048 kB (the counters would
///   describe a different pool than the 2 MiB one `--hugepages` needs);
/// - warn — no pages are reserved (`HugePages_Total` is 0 or unparsable);
/// - warn — pages are reserved but none are free;
/// - pass — at least one 2 MiB page is free; the message reports free/total
///   and the free amount in MiB.
///
/// On non-Linux targets the check is skipped.
fn check_hugepages() -> Check {
    #[cfg(target_os = "linux")]
    {
        // Page size, in kB, that the `HugePages_*` counters must refer to for
        // the "2 MiB" reporting below to be truthful.
        const EXPECTED_PAGE_KIB: u64 = 2048;

        let meminfo = match std::fs::read_to_string("/proc/meminfo") {
            Ok(s) => s,
            Err(e) => {
                return Check::warn(
                    "hugepages",
                    format!("read /proc/meminfo: {e}"),
                    "expected on Linux",
                )
            }
        };

        // First numeric token following `key` on the first line that starts
        // with it; `None` when the line is absent or the value is not a u64.
        let parse = |key: &str| -> Option<u64> {
            meminfo
                .lines()
                .find_map(|l| l.strip_prefix(key))
                .and_then(|rest| rest.split_whitespace().next())
                .and_then(|v| v.parse().ok())
        };

        // Check the page size before looking at the counters: with a
        // different default size the counters (and the nr_hugepages hint
        // below) would refer to the wrong pool.
        let page_kib = parse("Hugepagesize:").unwrap_or(EXPECTED_PAGE_KIB);
        if page_kib != EXPECTED_PAGE_KIB {
            return Check::warn(
                "hugepages",
                format!(
                    "default hugepage size is {page_kib} kB, not 2048 kB; \
                     HugePages_* in /proc/meminfo do not describe the 2 MiB pool"
                ),
                "check /sys/kernel/mm/hugepages/hugepages-2048kB/{nr_hugepages,free_hugepages}; \
                 --hugepages spawns need 2 MiB pages",
            );
        }

        let total = parse("HugePages_Total:").unwrap_or(0);
        let free = parse("HugePages_Free:").unwrap_or(0);

        if total == 0 {
            return Check::warn(
                "hugepages",
                "HugePages_Total=0 (none reserved)",
                "echo 512 | sudo tee /proc/sys/vm/nr_hugepages \
(or rerun scripts/setup-host.sh); needed for --hugepages spawns",
            );
        }
        if free == 0 {
            return Check::warn(
                "hugepages",
                format!("HugePages_Total={total} but HugePages_Free=0 (pool exhausted)"),
                "increase /proc/sys/vm/nr_hugepages or stop other hugepage consumers",
            );
        }
        Check::pass(
            "hugepages",
            format!(
                "{free}/{total} 2 MiB pages free ({} MiB available)",
                // Saturate rather than overflow on an absurd counter value.
                free.saturating_mul(2)
            ),
        )
    }
    #[cfg(not(target_os = "linux"))]
    {
        Check::skip("hugepages", "not Linux")
    }
}

fn check_daemon(daemon_url: &str, token: Option<&str>) -> Check {
let url = format!("{}/v1/snapshots", daemon_url.trim_end_matches('/'));
let agent = ureq::AgentBuilder::new()
Expand Down
16 changes: 15 additions & 1 deletion crates/forkd-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ enum Cmd {
/// backend swap; cost shows up on the first live BRANCH.
#[arg(long)]
live_fork: bool,
/// Back the memfd with 2 MiB hugepages (`MFD_HUGETLB | MFD_HUGE_2MB`).
/// Only meaningful with `--live-fork`. Reduces TLB pressure during
/// spawn-many and live BRANCH bulk-copy. Requires hugepages to be
/// reserved on the host (`echo N > /proc/sys/vm/nr_hugepages`);
/// `forkd doctor` checks availability. Falls back to normal pages
/// with a warning if the pool is exhausted.
#[arg(long, requires = "live_fork")]
hugepages: bool,
/// Keep `/tmp/forkd-fork-<tag>/` after shutdown (default: remove).
/// Useful for post-mortem inspection of child console logs and
/// Firecracker API sockets.
Expand Down Expand Up @@ -727,6 +735,7 @@ fn main() -> Result<()> {
per_child_netns,
memory_limit_mib,
live_fork,
hugepages,
keep_workdir,
} => fork_cmd(
tag,
Expand All @@ -735,6 +744,7 @@ fn main() -> Result<()> {
per_child_netns,
memory_limit_mib,
live_fork,
hugepages,
keep_workdir,
),
Cmd::Exec {
Expand Down Expand Up @@ -2275,13 +2285,15 @@ fn branch_snapshot_via_daemon(
Ok(())
}

#[allow(clippy::too_many_arguments)]
fn fork_cmd(
tag: String,
n: usize,
settle_secs: u64,
per_child_netns: bool,
memory_limit_mib: Option<u64>,
live_fork: bool,
hugepages: bool,
keep_workdir: bool,
) -> Result<()> {
validate_tag(&tag)?;
Expand Down Expand Up @@ -2325,7 +2337,9 @@ fn fork_cmd(
// it can arm UFFD_WP on the shmem-backed VMA. Default
// stays File for backward compat with v0.3.x flows.
memory_backend: if live_fork {
forkd_vmm::MemoryBackend::MemfdShared
forkd_vmm::MemoryBackend::MemfdShared {
use_hugepages: hugepages,
}
} else {
forkd_vmm::MemoryBackend::File
},
Expand Down
9 changes: 9 additions & 0 deletions crates/forkd-controller/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,15 @@ pub struct CreateSandboxRequest {
/// sandbox without going through the CLI's surface.
#[serde(default)]
pub live_fork: bool,
/// Back the sandbox's memfd with 2 MiB hugepages (`MFD_HUGETLB |
/// MFD_HUGE_2MB`). Reduces TLB pressure during spawn-many and the
/// BRANCH bulk-copy pass. Only meaningful when `live_fork: true`;
/// ignored otherwise. Requires non-zero `HugePages_Free` in
/// `/proc/meminfo` — `forkd doctor` checks this. Falls back to
/// normal 4 KiB pages with a warning if the pool is exhausted at
/// spawn time.
#[serde(default)]
pub hugepages: bool,
}

fn default_one() -> usize {
Expand Down
4 changes: 3 additions & 1 deletion crates/forkd-controller/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,9 @@ async fn create_sandbox(
// memfd-backed RAM so the Phase 6 mode=live BRANCH path can
// arm UFFD_WP on it. Default stays File for backward compat.
memory_backend: if req.live_fork {
forkd_vmm::MemoryBackend::MemfdShared
forkd_vmm::MemoryBackend::MemfdShared {
use_hugepages: req.hugepages,
}
} else {
forkd_vmm::MemoryBackend::File
},
Expand Down
22 changes: 19 additions & 3 deletions crates/forkd-vmm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,16 @@ pub enum MemoryBackend {
/// silently falls back to `MAP_PRIVATE`, breaking the WP-capture
/// invariant. `forkd doctor` (Phase 8) will check for the patched
/// binary at daemon start.
MemfdShared,
///
/// `use_hugepages`: when true, the memfd is backed with 2 MiB
/// hugepages (`MFD_HUGETLB | MFD_HUGE_2MB`). Reduces TLB pressure
/// during spawn-many and the BRANCH bulk-copy pass. Requires the
/// host hugepage pool to be non-empty (`HugePages_Free > 0` in
/// `/proc/meminfo`); `forkd doctor` checks this. Falls back to
/// normal 4 KiB pages with a warning if the pool is exhausted.
MemfdShared {
use_hugepages: bool,
},
}

/// Options controlling a fork-many operation.
Expand Down Expand Up @@ -1335,7 +1344,7 @@ impl Snapshot {
// v0.4 MemfdShared (Phase 5b) IS wired below. Anything else
// fails loudly so callers don't silently get File semantics.
match opts.memory_backend {
MemoryBackend::File | MemoryBackend::MemfdShared => {}
MemoryBackend::File | MemoryBackend::MemfdShared { .. } => {}
MemoryBackend::Userfault { .. } => bail!(
"MemoryBackend::Userfault is v0.3 scaffolding and not yet \
implemented — see docs/design/userfaultfd.md for status"
Expand Down Expand Up @@ -1418,11 +1427,18 @@ impl Snapshot {
// request goes out. The memfd holds the FC-visible RAM pages;
// forkd-controller keeps an mmap on the same memfd so Phase 6
// can arm UFFDIO_WRITEPROTECT on the shared VMA.
if matches!(opts.memory_backend, MemoryBackend::MemfdShared) {
if matches!(opts.memory_backend, MemoryBackend::MemfdShared { .. }) {
let use_hugepages = matches!(
opts.memory_backend,
MemoryBackend::MemfdShared {
use_hugepages: true
}
);
for (i, child) in children.iter_mut().enumerate() {
let region = memfd::create_and_populate(
&self.memory,
&format!("forkd-source-mem-{}", opts.netns_offset + i + 1),
use_hugepages,
)
.with_context(|| {
format!(
Expand Down
Loading