diff --git a/bench/live-fork-pause-window/bench-hugepages.py b/bench/live-fork-pause-window/bench-hugepages.py new file mode 100644 index 0000000..ef8db56 --- /dev/null +++ b/bench/live-fork-pause-window/bench-hugepages.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +"""Hugepage vs baseline spawn latency + live-BRANCH pause-window benchmark. + +Compares two spawn configurations back-to-back: + - baseline: live_fork=true, hugepages=false (4 KiB pages) + - hugepages: live_fork=true, hugepages=true (2 MiB pages, MFD_HUGETLB) + +For each iteration the script: + 1. Spawns N sandboxes, records wall-clock spawn time. + 2. Picks the first sandbox and runs a BRANCH off it (--branch-mode, default diff), records pause_ms. + 3. Kills all sandboxes + the branch snapshot. + +Iterations are interleaved (baseline, hugepages, baseline, ...) so +cache effects average out rather than stacking on one configuration. + +Metrics emitted +--------------- +- spawn_ms wall-clock from POST /v1/sandboxes to last sandbox confirmed +- ms_per_child spawn_ms / n +- pause_ms source-VM pause window from the BRANCH response + +Output +------ +- bench-hugepages.csv one row per iteration +- stdout summary table: p50/p99 per metric (plus max for spawn_ms and pause_ms), one row per configuration + +Usage (must run as root — FC API sockets are root-owned):: + + sudo python3 bench-hugepages.py \\ + --source-tag python-numpy \\ + --n 100 \\ + --iterations 10 + +On memory-constrained hosts (< 4 GiB RAM or < 50 free hugepages) reduce +--n to avoid OOM. The script warns when HugePages_Free < n. 
+""" +import argparse +import json +import os +import shutil +import socket +import statistics +import subprocess +import sys +import time +import urllib.error +import urllib.request + +DEFAULT_BIN = os.path.expanduser("~/forkd/target/release/forkd-controller") +DEFAULT_FC = "/usr/local/bin/firecracker" +DEFAULT_SNAP_ROOT = os.path.expanduser("~/.local/share/forkd/snapshots") + +WORK = "/tmp/forkd-bench-hugepages" +CSV_PATH = os.path.join(WORK, "bench-hugepages.csv") + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + +def http(base_url, method, path, body=None, timeout=120): + data = json.dumps(body).encode() if body is not None else None + headers = {"Content-Type": "application/json"} if body is not None else {} + req = urllib.request.Request( + f"{base_url}{path}", data=data, method=method, headers=headers + ) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + raw = resp.read().decode("utf-8", errors="replace") + return resp.status, json.loads(raw) if raw else None + except urllib.error.HTTPError as e: + raw = e.read().decode("utf-8", errors="replace") + try: + return e.code, json.loads(raw) + except json.JSONDecodeError: + return e.code, raw + + +def wait_for_healthy(base_url, port, deadline_s=20): + end = time.time() + deadline_s + while time.time() < end: + try: + s = socket.create_connection(("127.0.0.1", port), timeout=1) + s.close() + status, _ = http(base_url, "GET", "/healthz", timeout=2) + if status == 200: + return + except (ConnectionRefusedError, socket.timeout, OSError): + pass + time.sleep(0.3) + raise RuntimeError(f"daemon not healthy after {deadline_s}s") + + +# --------------------------------------------------------------------------- +# Daemon lifecycle +# --------------------------------------------------------------------------- + +def setup_workdir(source_tag, source_dir): + shutil.rmtree(WORK, 
ignore_errors=True) + os.makedirs(f"{WORK}/snapshots", exist_ok=True) + os.makedirs(f"{WORK}/audit", exist_ok=True) + + # Create a real directory for the snapshot so we can rewrite + # snapshot.json with paths correct for this machine. Symlink the + # large binary files (memory.bin, vmstate) to avoid copying GiBs. + target = f"{WORK}/snapshots/{source_tag}" + os.makedirs(target, exist_ok=True) + + snap_json_src = os.path.join(source_dir, "snapshot.json") + with open(snap_json_src) as f: + snap = json.load(f) + + # Rewrite vmstate and memory paths to point at their actual locations + # on this machine. Handles snapshots packed on a different host where + # the absolute paths are baked in (e.g. /home/yangdongxu/...). + for key in ("vmstate", "memory"): + if key in snap: + filename = os.path.basename(snap[key]) + actual = os.path.join(source_dir, filename) + snap[key] = actual + # Symlink into work dir so the daemon can find them there too. + link = os.path.join(target, filename) + if not os.path.lexists(link): + os.symlink(actual, link) + + with open(os.path.join(target, "snapshot.json"), "w") as f: + json.dump(snap, f, indent=2) + + state = { + "snapshots": { + source_tag: { + "tag": source_tag, + "dir": target, + "created_at_unix": int(time.time()), + "status": "ready", + } + } + } + with open(f"{WORK}/state.json", "w") as f: + json.dump(state, f, indent=2) + + +def start_daemon(bin_path, bind): + log = open(f"{WORK}/controller.log", "wb") + return subprocess.Popen( + [ + "sudo", bin_path, "serve", + "--bind", bind, + "--state", f"{WORK}/state.json", + "--snapshot-root", f"{WORK}/snapshots", + "--audit-log", f"{WORK}/audit/audit.log", + ], + stdout=log, + stderr=log, + stdin=subprocess.DEVNULL, + ) + + +def kill_leftovers(bind): + subprocess.run( + ["sudo", "pkill", "-f", f"forkd-controller serve --bind {bind}"], + stderr=subprocess.DEVNULL, + ) + subprocess.run( + ["sudo", "pkill", "-9", "-f", "/usr/local/bin/firecracker"], + stderr=subprocess.DEVNULL, + ) + 
time.sleep(0.5) + + +# --------------------------------------------------------------------------- +# Benchmark core +# --------------------------------------------------------------------------- + +def hugepages_free(): + """Read HugePages_Free from /proc/meminfo.""" + try: + for line in open("/proc/meminfo"): + if line.startswith("HugePages_Free:"): + return int(line.split()[1]) + except OSError: + pass + return 0 + + +def spawn_sandboxes(base_url, tag, n, hugepages): + """POST /v1/sandboxes; return (sandbox_ids, spawn_ms).""" + body = { + "snapshot_tag": tag, + "n": n, + "live_fork": True, + "hugepages": hugepages, + "per_child_netns": True, + } + t0 = time.time() + status, resp = http(base_url, "POST", "/v1/sandboxes", body) + spawn_ms = (time.time() - t0) * 1000 + if status != 201: + raise RuntimeError(f"spawn HTTP {status}: {resp!r}") + ids = [s["id"] for s in resp] + return ids, spawn_ms + + +def branch_sandbox(base_url, sandbox_id, iteration, hugepages_label, mode): + """BRANCH sandbox_id with the given mode; return pause_ms and branch tag.""" + tag = f"bench-hp{hugepages_label}-{mode}-{iteration:03d}-{int(time.time() * 1000)}" + body = {"tag": tag, "mode": mode} + if mode == "live": + body["wait"] = True + status, resp = http( + base_url, + "POST", + f"/v1/sandboxes/{sandbox_id}/branch", + body, + timeout=60, + ) + if status not in (201, 202): + raise RuntimeError(f"branch HTTP {status}: {resp!r}") + return resp.get("pause_ms"), tag + + +def kill_sandboxes(base_url, ids): + for sid in ids: + http(base_url, "DELETE", f"/v1/sandboxes/{sid}") + + +def delete_snapshot(base_url, tag): + status, _ = http(base_url, "DELETE", f"/v1/snapshots/{tag}") + if status not in (200, 204): + print(f" warn: DELETE snapshot {tag} -> HTTP {status}", file=sys.stderr) + + +def run_iteration(base_url, tag, n, hugepages, iteration, branch_mode): + """One full iteration: spawn N → branch first → kill all. 
Returns row dict.""" + label = "true" if hugepages else "false" + + # Spawn N sandboxes. + ids, spawn_ms = spawn_sandboxes(base_url, tag, n, hugepages) + + # Branch the first sandbox to get pause_ms. + pause_ms, branch_tag = branch_sandbox(base_url, ids[0], iteration, label, branch_mode) + + # Cleanup. + kill_sandboxes(base_url, ids) + delete_snapshot(base_url, branch_tag) + + return { + "hugepages": label, + "n": n, + "iteration": iteration, + "spawn_ms": round(spawn_ms, 2), + "ms_per_child": round(spawn_ms / n, 2), + "pause_ms": pause_ms, + } + + +# --------------------------------------------------------------------------- +# Reporting +# --------------------------------------------------------------------------- + +COLS = ["hugepages", "n", "iteration", "spawn_ms", "ms_per_child", "pause_ms"] + + +def write_csv(rows, path): + with open(path, "w") as f: + f.write(",".join(COLS) + "\n") + for r in rows: + f.write(",".join("" if r[c] is None else str(r[c]) for c in COLS) + "\n") + + +def pct(vals, p): + if not vals: + return float("nan") + if len(vals) == 1: + return vals[0] + return statistics.quantiles(vals, n=100)[p - 1] + + +def summarize(rows, n, csv_path): + write_csv(rows, csv_path) + + by_hp = {"false": [], "true": []} + for r in rows: + by_hp[r["hugepages"]].append(r) + + print(f"\n=== SUMMARY n={n} ===") + header = ( + f" {'config':<16} {'iters':>5} " + f"{'spawn_ms p50':>13} {'p99':>7} {'max':>7} " + f"{'ms/child p50':>13} {'p99':>7} " + f"{'pause_ms p50':>13} {'p99':>7} {'max':>7}" + ) + print(header) + print(" " + "-" * (len(header) - 2)) + + for label in ("false", "true"): + rs = by_hp[label] + if not rs: + continue + spawns = [r["spawn_ms"] for r in rs] + per_child = [r["ms_per_child"] for r in rs] + pauses = [r["pause_ms"] for r in rs if r["pause_ms"] is not None] + + print( + f" {'hugepages='+label:<16} {len(rs):>5} " + f"{statistics.median(spawns):>13.1f} {pct(spawns,99):>7.1f} {max(spawns):>7.1f} " + f"{statistics.median(per_child):>13.2f} 
{pct(per_child,99):>7.2f} " + f"{statistics.median(pauses) if pauses else float('nan'):>13.1f} " + f"{pct(pauses,99) if pauses else float('nan'):>7.1f} " + f"{max(pauses) if pauses else float('nan'):>7.1f}" + ) + + # Headline speedup ratios. + base_rows = by_hp["false"] + hp_rows = by_hp["true"] + if base_rows and hp_rows: + base_p50 = statistics.median(r["spawn_ms"] for r in base_rows) + hp_p50 = statistics.median(r["spawn_ms"] for r in hp_rows) + base_p99 = pct([r["spawn_ms"] for r in base_rows], 99) + hp_p99 = pct([r["spawn_ms"] for r in hp_rows], 99) + if hp_p50 > 0: + print(f"\n spawn speedup p50: {base_p50:.0f}ms → {hp_p50:.0f}ms ({base_p50/hp_p50:.2f}×)") + if hp_p99 > 0: + print(f" spawn speedup p99: {base_p99:.0f}ms → {hp_p99:.0f}ms ({base_p99/hp_p99:.2f}×)") + + print(f"\n CSV written to: {csv_path}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--source-tag", default="python-numpy", + help="snapshot tag to spawn from (default: python-numpy)") + parser.add_argument("--snap-root", default=DEFAULT_SNAP_ROOT, + help="directory containing snapshot subdirs") + parser.add_argument("--controller-bin", default=DEFAULT_BIN, + help="path to forkd-controller binary") + parser.add_argument("--port", type=int, default=8892, + help="port for the isolated controller instance") + parser.add_argument("--n", type=int, default=100, + help="sandboxes to spawn per iteration (default: 100)") + parser.add_argument("--iterations", type=int, default=10, + help="iterations per configuration (default: 10)") + parser.add_argument("--out-csv", default=CSV_PATH, + help="path for the output CSV") + parser.add_argument("--branch-mode", default="diff", + choices=["full", "diff", "live"], + help="BRANCH mode to 
use for pause_ms measurement (default: diff)") + args = parser.parse_args() + + bind = f"127.0.0.1:{args.port}" + base_url = f"http://{bind}" + + source_dir = os.path.join(args.snap_root, args.source_tag) + if not os.path.isdir(source_dir): + sys.exit(f"source snapshot not found: {source_dir}\n" + f"run: forkd pull deeplethe/{args.source_tag}") + + # Warn if hugepage pool looks too small for the requested N. + free_hp = hugepages_free() + hugepage_bytes_needed = args.n * 2 # in MiB; lower bound of one 2 MiB page per sandbox (real need: memory.bin rounded up to 2 MiB each) + if free_hp > 0 and free_hp * 2 < hugepage_bytes_needed: + print( + f"[!] warning: HugePages_Free={free_hp} ({free_hp * 2} MiB) may be " + f"insufficient for --n {args.n}. Consider reducing --n or increasing " + f"/proc/sys/vm/nr_hugepages.", + file=sys.stderr, + ) + + src_mem = os.path.join(source_dir, "memory.bin") + src_bytes = os.path.getsize(src_mem) if os.path.exists(src_mem) else None + + print(f"[*] source: {source_dir}") + if src_bytes: + print(f" memory.bin: {src_bytes} bytes ({src_bytes // (1024 * 1024)} MiB)") + print(f"[*] n={args.n} iterations={args.iterations} branch-mode={args.branch_mode} controller={bind}") + print(f"[*] HugePages_Free={free_hp} ({free_hp * 2} MiB available)") + + kill_leftovers(bind) + setup_workdir(args.source_tag, source_dir) + + print("[*] starting daemon") + daemon = start_daemon(args.controller_bin, bind) + rows = [] + + try: + wait_for_healthy(base_url, args.port) + print("[+] daemon healthy\n") + + # Interleave baseline and hugepages iterations so thermal / + # cache effects average out across both configurations. 
+ for i in range(args.iterations): + for hugepages in (False, True): + label = "true" if hugepages else "false" + print(f" [hugepages={label} iter={i}] running...", flush=True) + row = run_iteration(base_url, args.source_tag, args.n, hugepages, i, args.branch_mode) + rows.append(row) + print( + f" [hugepages={label} iter={i}] done " + f"spawn={row['spawn_ms']:.0f}ms " + f"({row['ms_per_child']:.1f}ms/child) " + f"pause={row['pause_ms']}ms" + ) + + summarize(rows, args.n, args.out_csv) + + finally: + print("\n[*] tearing down") + subprocess.run(["sudo", "kill", str(daemon.pid)], stderr=subprocess.DEVNULL) + subprocess.run( + ["sudo", "pkill", "-9", "-f", "/usr/local/bin/firecracker"], + stderr=subprocess.DEVNULL, + ) + time.sleep(0.5) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + print(f"\n[!] FAIL: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/crates/forkd-cli/src/doctor.rs b/crates/forkd-cli/src/doctor.rs index 31b25b6..c5068fa 100644 --- a/crates/forkd-cli/src/doctor.rs +++ b/crates/forkd-cli/src/doctor.rs @@ -98,6 +98,7 @@ pub fn run(daemon_url: &str, daemon_token: Option) -> anyhow::Result<()> // unprivileged_userfaultfd / runs an older kernel. 
check_uffd_wp(), check_memfd_create(), + check_hugepages(), ]; print_report(&checks); @@ -597,6 +598,60 @@ fn check_memfd_create() -> Check { } } +fn check_hugepages() -> Check { + #[cfg(target_os = "linux")] + { + let meminfo = match std::fs::read_to_string("/proc/meminfo") { + Ok(s) => s, + Err(e) => { + return Check::warn( + "hugepages", + format!("read /proc/meminfo: {e}"), + "expected on Linux", + ) + } + }; + + let parse = |key: &str| -> Option { + meminfo + .lines() + .find(|l| l.starts_with(key)) + .and_then(|l| l.split_whitespace().nth(1)) + .and_then(|v| v.parse().ok()) + }; + + let total = parse("HugePages_Total:").unwrap_or(0); + let free = parse("HugePages_Free:").unwrap_or(0); + + if total == 0 { + return Check::warn( + "hugepages", + "HugePages_Total=0 (none reserved)", + "echo 512 | sudo tee /proc/sys/vm/nr_hugepages \ + (or rerun scripts/setup-host.sh); needed for --hugepages spawns", + ); + } + if free == 0 { + return Check::warn( + "hugepages", + format!("HugePages_Total={total} but HugePages_Free=0 (pool exhausted)"), + "increase /proc/sys/vm/nr_hugepages or stop other hugepage consumers", + ); + } + Check::pass( + "hugepages", + format!( + "{free}/{total} 2 MiB pages free ({} MiB available)", + free * 2 + ), + ) + } + #[cfg(not(target_os = "linux"))] + { + Check::skip("hugepages", "not Linux") + } +} + fn check_daemon(daemon_url: &str, token: Option<&str>) -> Check { let url = format!("{}/v1/snapshots", daemon_url.trim_end_matches('/')); let agent = ureq::AgentBuilder::new() diff --git a/crates/forkd-cli/src/main.rs b/crates/forkd-cli/src/main.rs index 1c599ef..9d9874e 100644 --- a/crates/forkd-cli/src/main.rs +++ b/crates/forkd-cli/src/main.rs @@ -204,6 +204,14 @@ enum Cmd { /// backend swap; cost shows up on the first live BRANCH. #[arg(long)] live_fork: bool, + /// Back the memfd with 2 MiB hugepages (`MFD_HUGETLB | MFD_HUGE_2MB`). + /// Only meaningful with `--live-fork`. 
Reduces TLB pressure during + /// spawn-many and live BRANCH bulk-copy. Requires hugepages to be + /// reserved on the host (`echo N > /proc/sys/vm/nr_hugepages`); + /// `forkd doctor` checks availability. Falls back to normal pages + /// with a warning if the pool is exhausted. + #[arg(long, requires = "live_fork")] + hugepages: bool, /// Keep `/tmp/forkd-fork-/` after shutdown (default: remove). /// Useful for post-mortem inspection of child console logs and /// Firecracker API sockets. @@ -727,6 +735,7 @@ fn main() -> Result<()> { per_child_netns, memory_limit_mib, live_fork, + hugepages, keep_workdir, } => fork_cmd( tag, @@ -735,6 +744,7 @@ fn main() -> Result<()> { per_child_netns, memory_limit_mib, live_fork, + hugepages, keep_workdir, ), Cmd::Exec { @@ -2275,6 +2285,7 @@ fn branch_snapshot_via_daemon( Ok(()) } +#[allow(clippy::too_many_arguments)] fn fork_cmd( tag: String, n: usize, @@ -2282,6 +2293,7 @@ fn fork_cmd( per_child_netns: bool, memory_limit_mib: Option, live_fork: bool, + hugepages: bool, keep_workdir: bool, ) -> Result<()> { validate_tag(&tag)?; @@ -2325,7 +2337,9 @@ fn fork_cmd( // it can arm UFFD_WP on the shmem-backed VMA. Default // stays File for backward compat with v0.3.x flows. memory_backend: if live_fork { - forkd_vmm::MemoryBackend::MemfdShared + forkd_vmm::MemoryBackend::MemfdShared { + use_hugepages: hugepages, + } } else { forkd_vmm::MemoryBackend::File }, diff --git a/crates/forkd-controller/src/api.rs b/crates/forkd-controller/src/api.rs index 4446eda..21b8be4 100644 --- a/crates/forkd-controller/src/api.rs +++ b/crates/forkd-controller/src/api.rs @@ -273,6 +273,15 @@ pub struct CreateSandboxRequest { /// sandbox without going through the CLI's surface. #[serde(default)] pub live_fork: bool, + /// Back the sandbox's memfd with 2 MiB hugepages (`MFD_HUGETLB | + /// MFD_HUGE_2MB`). Reduces TLB pressure during spawn-many and the + /// BRANCH bulk-copy pass. Only meaningful when `live_fork: true`; + /// ignored otherwise. 
Requires non-zero `HugePages_Free` in + /// `/proc/meminfo` — `forkd doctor` checks this. Falls back to + /// normal 4 KiB pages with a warning if the pool is exhausted at + /// spawn time. + #[serde(default)] + pub hugepages: bool, } fn default_one() -> usize { diff --git a/crates/forkd-controller/src/http.rs b/crates/forkd-controller/src/http.rs index ba37071..706bcc4 100644 --- a/crates/forkd-controller/src/http.rs +++ b/crates/forkd-controller/src/http.rs @@ -605,7 +605,9 @@ async fn create_sandbox( // memfd-backed RAM so the Phase 6 mode=live BRANCH path can // arm UFFD_WP on it. Default stays File for backward compat. memory_backend: if req.live_fork { - forkd_vmm::MemoryBackend::MemfdShared + forkd_vmm::MemoryBackend::MemfdShared { + use_hugepages: req.hugepages, + } } else { forkd_vmm::MemoryBackend::File }, diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index 589c167..add6e4e 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -454,7 +454,16 @@ pub enum MemoryBackend { /// silently falls back to `MAP_PRIVATE`, breaking the WP-capture /// invariant. `forkd doctor` (Phase 8) will check for the patched /// binary at daemon start. - MemfdShared, + /// + /// `use_hugepages`: when true, the memfd is backed with 2 MiB + /// hugepages (`MFD_HUGETLB | MFD_HUGE_2MB`). Reduces TLB pressure + /// during spawn-many and the BRANCH bulk-copy pass. Requires the + /// host hugepage pool to be non-empty (`HugePages_Free > 0` in + /// `/proc/meminfo`); `forkd doctor` checks this. Falls back to + /// normal 4 KiB pages with a warning if the pool is exhausted. + MemfdShared { + use_hugepages: bool, + }, } /// Options controlling a fork-many operation. @@ -1335,7 +1344,7 @@ impl Snapshot { // v0.4 MemfdShared (Phase 5b) IS wired below. Anything else // fails loudly so callers don't silently get File semantics. 
match opts.memory_backend { - MemoryBackend::File | MemoryBackend::MemfdShared => {} + MemoryBackend::File | MemoryBackend::MemfdShared { .. } => {} MemoryBackend::Userfault { .. } => bail!( "MemoryBackend::Userfault is v0.3 scaffolding and not yet \ implemented — see docs/design/userfaultfd.md for status" @@ -1418,11 +1427,18 @@ impl Snapshot { // request goes out. The memfd holds the FC-visible RAM pages; // forkd-controller keeps an mmap on the same memfd so Phase 6 // can arm UFFDIO_WRITEPROTECT on the shared VMA. - if matches!(opts.memory_backend, MemoryBackend::MemfdShared) { + if matches!(opts.memory_backend, MemoryBackend::MemfdShared { .. }) { + let use_hugepages = matches!( + opts.memory_backend, + MemoryBackend::MemfdShared { + use_hugepages: true + } + ); for (i, child) in children.iter_mut().enumerate() { let region = memfd::create_and_populate( &self.memory, &format!("forkd-source-mem-{}", opts.netns_offset + i + 1), + use_hugepages, ) .with_context(|| { format!( diff --git a/crates/forkd-vmm/src/memfd.rs b/crates/forkd-vmm/src/memfd.rs index 4d2164b..7813183 100644 --- a/crates/forkd-vmm/src/memfd.rs +++ b/crates/forkd-vmm/src/memfd.rs @@ -34,6 +34,14 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; +// log2(2 MiB) = 21, shifted into the hugepage-size field per . +#[cfg(target_os = "linux")] +const MFD_HUGE_2MB: libc::c_uint = 21 << libc::MFD_HUGE_SHIFT; + +// 2 MiB in bytes — ftruncate on a hugetlb memfd requires size to be a multiple of this. +#[cfg(target_os = "linux")] +const HUGE_PAGE_2MB: u64 = 2 * 1024 * 1024; + /// A memfd populated from a snapshot's memory file. Dropping the value /// closes the fd and releases the backing pages. /// @@ -86,10 +94,15 @@ impl MemfdRegion { /// `/proc/self/fd/` -> `target`); keep it short and ASCII. The /// kernel limit is 249 bytes plus the `memfd:` prefix. 
/// +/// `use_hugepages`: when true, the memfd is created with +/// `MFD_HUGETLB | MFD_HUGE_2MB`, backing the guest RAM with 2 MiB +/// pages. If that `memfd_create` call fails with `ENOMEM`, a warning is +/// logged and we fall back to a normal 4 KiB-page memfd. +/// /// Returns `Err` immediately if the source is missing or unreadable — /// no partial memfd is created in that case. #[cfg(target_os = "linux")] -pub fn create_and_populate(source: &Path, name: &str) -> Result { +pub fn create_and_populate(source: &Path, name: &str, use_hugepages: bool) -> Result { use std::io::copy; use std::os::unix::io::FromRawFd; @@ -101,22 +114,66 @@ pub fn create_and_populate(source: &Path, name: &str) -> Result { .len(); let cname = CString::new(name).context("memfd name must not contain null bytes")?; - // SAFETY: `cname` is a valid C string for the duration of the call; - // memfd_create either returns a fresh owned fd or -1. Flags are a - // literal bitfield. No aliasing concerns. - let fd = unsafe { libc::memfd_create(cname.as_ptr(), libc::MFD_CLOEXEC) }; - if fd < 0 { - return Err(io::Error::last_os_error()).context("memfd_create"); - } + + // Attempt hugepage-backed allocation if requested; normal pages otherwise. + let (fd, alloc_size, backed_by_hugepages) = if use_hugepages { + let aligned_size = (size_bytes + HUGE_PAGE_2MB - 1) & !(HUGE_PAGE_2MB - 1); + // SAFETY: `cname` is a valid C string for the duration of the call; + // memfd_create returns a fresh owned fd or -1. Flags are a literal bitfield. + let fd = unsafe { + libc::memfd_create( + cname.as_ptr(), + libc::MFD_CLOEXEC | libc::MFD_HUGETLB | MFD_HUGE_2MB, + ) + }; + (fd, aligned_size, true) + } else { + // SAFETY: same as above. + let fd = unsafe { libc::memfd_create(cname.as_ptr(), libc::MFD_CLOEXEC) }; + (fd, size_bytes, false) + }; + + // Handle allocation failure. For hugepages, ENOMEM means the pool is + // exhausted - warn and retry with normal 4 KiB pages. 
Any other error, + // or a failure on the normal path, is fatal. + let (fd, alloc_size, backed_by_hugepages) = if fd < 0 { + let err = io::Error::last_os_error(); + if backed_by_hugepages && err.raw_os_error() == Some(libc::ENOMEM) { + // hugepage allocation failure + tracing::warn!( + "hugepage pool exhausted (HugePages_Free=0?); \ + falling back to normal 4 KiB pages for memfd '{name}'. \ + Increase /proc/sys/vm/nr_hugepages to suppress this." + ); + // SAFETY: same as above — fresh syscall, no aliasing. + let fd = unsafe { libc::memfd_create(cname.as_ptr(), libc::MFD_CLOEXEC) }; + if fd < 0 { + return Err(io::Error::last_os_error()).context("memfd_create (fallback)"); + } + (fd, size_bytes, false) + } else { + // some other unknown error + return Err(err).context("memfd_create"); + } + } else { + // no error + (fd, alloc_size, backed_by_hugepages) + }; + // SAFETY: `fd` is freshly returned by memfd_create above and not // shared with any other File. `File::from_raw_fd` takes ownership. let mut memfd = unsafe { File::from_raw_fd(fd) }; memfd - .set_len(size_bytes) - .with_context(|| format!("ftruncate memfd to {size_bytes} B"))?; + .set_len(alloc_size) + .with_context(|| format!("ftruncate memfd to {alloc_size} B"))?; - let copied = copy(&mut src, &mut memfd) - .with_context(|| format!("copy {} -> memfd", source.display()))?; + // Hugetlb-backed memfds don't support write(), must use copy_via_mmap + let copied = if backed_by_hugepages { + copy_via_mmap(&src, &memfd, size_bytes, alloc_size) + .with_context(|| format!("copy (mmap) {} -> memfd", source.display()))? + } else { + copy(&mut src, &mut memfd).with_context(|| format!("copy {} -> memfd", source.display()))? + }; if copied != size_bytes { anyhow::bail!( "short copy: source {} is {size_bytes} B but copied {copied}", @@ -130,11 +187,104 @@ pub fn create_and_populate(source: &Path, name: &str) -> Result { }) } +/// Populate a hugetlb-backed memfd from a source file via mmap + memcpy. 
+/// +/// HugeTLB files don't support write(). This function populates the pages +/// by mmap-ing the memfd MAP_SHARED and the source file MAP_PRIVATE, then +/// memcpy-ing between the two mappings. +/// +/// # Parameters +/// - `src`: the source file (`memory.bin`) to copy from. +/// - `dst`: the hugetlb-backed memfd to copy into. Must already be sized +/// via `set_len(alloc_size)` before calling. +/// - `size_bytes`: exact number of bytes to copy from `src`. Must be <= +/// `alloc_size`. +/// - `alloc_size`: the memfd's `ftruncate`'d size (hugepage-aligned, +/// >= `size_bytes`). Used as the mmap length for `dst`. +/// +/// # Returns +/// The number of bytes copied (always equal to `size_bytes` on success). +#[cfg(target_os = "linux")] +fn copy_via_mmap(src: &File, dst: &File, size_bytes: u64, alloc_size: u64) -> io::Result { + use std::os::fd::AsRawFd; + + if size_bytes > alloc_size { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("size_bytes ({size_bytes}) must be <= alloc_size ({alloc_size})"), + )); + } + + // dst is mapped at alloc_size (hugepage-aligned) but only size_bytes of + // actual data is copied into it. The tail (alloc_size - size_bytes) stays + // as the post-ftruncate zero-fill. FC never reads past size_bytes since + // the VMM API call passes size_bytes as the memory region length. + // + // SAFETY: dst is an open memfd sized to alloc_size via set_len; we own + // it and munmap before returning. MAP_SHARED so the memcpy below lands + // in the fd's backing pages rather than a private anonymous copy. + let dst_ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + alloc_size as usize, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED, + dst.as_raw_fd(), + 0, + ) + }; + if dst_ptr == libc::MAP_FAILED { + return Err(io::Error::last_os_error()); + } + + // SAFETY: src is an open readable file of at least size_bytes; we own + // it and munmap before returning. MAP_PRIVATE so reads don't affect + // the source file's contents. 
+ let src_ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + size_bytes as usize, + libc::PROT_READ, + libc::MAP_PRIVATE, + src.as_raw_fd(), + 0, + ) + }; + if src_ptr == libc::MAP_FAILED { + // SAFETY: dst_ptr is a valid mapping we created above. + unsafe { libc::munmap(dst_ptr, alloc_size as usize) }; + return Err(io::Error::last_os_error()); + } + + // SAFETY: both pointers are valid for the given lengths and don't overlap + // (they came from separate mmap calls on different fds). + unsafe { + std::ptr::copy_nonoverlapping( + src_ptr as *const u8, + dst_ptr as *mut u8, + size_bytes as usize, + ); + } + + // SAFETY: both pointers are valid mappings we created above with the + // exact lengths passed here. + unsafe { + libc::munmap(src_ptr, size_bytes as usize); + libc::munmap(dst_ptr, alloc_size as usize); + } + + Ok(size_bytes) +} + /// Non-Linux stub. `memfd_create` is a Linux-only syscall; building /// forkd on other platforms is a configuration error for the v0.4 /// live-fork path. 
#[cfg(not(target_os = "linux"))] -pub fn create_and_populate(_source: &Path, _name: &str) -> Result { +pub fn create_and_populate( + _source: &Path, + _name: &str, + _use_hugepages: bool, +) -> Result { anyhow::bail!( "memfd_create is Linux-only; v0.4 live-fork requires a Linux host with kernel >= 5.7" ) @@ -157,7 +307,7 @@ mod tests { #[test] fn create_and_populate_succeeds_for_small_file() { let src = write_temp_file("small", &vec![0xAAu8; 4096]); - let region = create_and_populate(&src, "forkd-test-small").unwrap(); + let region = create_and_populate(&src, "forkd-test-small", false).unwrap(); assert_eq!(region.size_bytes(), 4096); let p = region.backend_path(); let s = p.to_str().unwrap(); @@ -179,7 +329,7 @@ mod tests { let pattern: Vec = (0..8192).map(|i| (i % 256) as u8).collect(); let src = write_temp_file("match", &pattern); - let region = create_and_populate(&src, "forkd-test-match").unwrap(); + let region = create_and_populate(&src, "forkd-test-match", false).unwrap(); assert_eq!(region.size_bytes(), 8192); let mut reader = region.try_clone().unwrap(); @@ -196,6 +346,7 @@ mod tests { let result = create_and_populate( Path::new("/nonexistent/forkd-memfd-test/this-must-not-exist"), "forkd-test-missing", + false, ); assert!( result.is_err(), @@ -209,4 +360,142 @@ mod tests { "error must include source path; got: {msg}" ); } + + // --- copy_via_mmap unit tests (no hugepages required) --- + + #[test] + fn copy_via_mmap_size_guard_rejects_oversized_request() { + // size_bytes > alloc_size must return an error immediately. + // Use /dev/zero as a stand-in fd — we never reach the mmap calls. 
+ let zero = File::open("/dev/zero").unwrap(); + let err = copy_via_mmap(&zero, &zero, 8192, 4096).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let msg = err.to_string(); + assert!( + msg.contains("size_bytes") && msg.contains("alloc_size"), + "error must name both fields; got: {msg}" + ); + } + + #[test] + fn copy_via_mmap_content_matches() { + use std::os::unix::io::FromRawFd; + + // Build a source file with a known pattern. + let pattern: Vec = (0..4096).map(|i| (i % 256) as u8).collect(); + let src = write_temp_file("mmap-src", &pattern); + let src_file = File::open(&src).unwrap(); + + // Create a plain (non-hugetlb) memfd as the destination. + let name = std::ffi::CString::new("forkd-mmap-test").unwrap(); + let fd = unsafe { libc::memfd_create(name.as_ptr(), libc::MFD_CLOEXEC) }; + assert!(fd >= 0, "memfd_create failed"); + let dst_file = unsafe { File::from_raw_fd(fd) }; + dst_file.set_len(4096).unwrap(); + + let copied = copy_via_mmap(&src_file, &dst_file, 4096, 4096).unwrap(); + assert_eq!(copied, 4096); + + // Read back through the fd and verify content. 
+ let mut reader = dst_file.try_clone().unwrap(); + reader.seek(SeekFrom::Start(0)).unwrap(); + let mut buf = vec![0u8; 4096]; + reader.read_exact(&mut buf).unwrap(); + assert_eq!(buf, pattern, "mmap copy must produce identical bytes"); + + let _ = std::fs::remove_file(&src); + } + + // --- hugepages tests (skipped gracefully when pool unavailable) --- + + fn hugepages_available() -> bool { + std::fs::read_to_string("/proc/meminfo") + .ok() + .and_then(|s| { + s.lines() + .find(|l| l.starts_with("HugePages_Free:")) + .and_then(|l| l.split_whitespace().nth(1)) + .and_then(|v| v.parse::().ok()) + }) + .map(|n| n > 0) + .unwrap_or(false) + } + + #[test] + fn hugepages_metadata_correct() { + if !hugepages_available() { + eprintln!( + "skipping hugepages_metadata_correct: HugePages_Free=0 \ + (run `echo 512 | sudo tee /proc/sys/vm/nr_hugepages` to enable)" + ); + return; + } + let src = write_temp_file("hp-meta", &vec![0xBBu8; 4096]); + let region = create_and_populate(&src, "forkd-test-hp-meta", true).unwrap(); + + assert_eq!( + region.size_bytes(), + 4096, + "size_bytes must reflect source, not alloc_size" + ); + + let p = region.backend_path(); + let s = p.to_str().unwrap(); + let expected_prefix = format!("/proc/{}/fd/", std::process::id()); + assert!( + s.starts_with(&expected_prefix), + "expected {expected_prefix}N path, got: {s}" + ); + let _ = std::fs::remove_file(&src); + } + + #[test] + fn hugepages_content_matches_source() { + if !hugepages_available() { + eprintln!( + "skipping hugepages_content_matches_source: HugePages_Free=0 \ + (run `echo 512 | sudo tee /proc/sys/vm/nr_hugepages` to enable)" + ); + return; + } + let pattern: Vec = (0..8192).map(|i| (i % 256) as u8).collect(); + let src = write_temp_file("hp-content", &pattern); + + let region = create_and_populate(&src, "forkd-test-hp-content", true).unwrap(); + assert_eq!(region.size_bytes(), 8192); + + let mut reader = region.try_clone().unwrap(); + reader.seek(SeekFrom::Start(0)).unwrap(); + let mut 
buf = vec![0u8; 8192];
+        reader.read_exact(&mut buf).unwrap();
+        assert_eq!(
+            buf, pattern,
+            "hugepage-backed memfd content must match source"
+        );
+
+        let _ = std::fs::remove_file(&src);
+    }
+
+    #[test]
+    fn hugepages_size_bytes_is_source_size_not_aligned() {
+        if !hugepages_available() {
+            eprintln!(
+                "skipping hugepages_size_bytes_is_source_size_not_aligned: HugePages_Free=0 \
+                 (run `echo 512 | sudo tee /proc/sys/vm/nr_hugepages` to enable)"
+            );
+            return; // early return: the harness records a pass, not an ignored test
+        }
+        // 4096 bytes is well below 2 MiB — alloc_size will be rounded up to
+        // 2 MiB, but size_bytes() must still return 4096.
+        let src = write_temp_file("hp-align", &vec![0xCCu8; 4096]);
+        let region = create_and_populate(&src, "forkd-test-hp-align", true).unwrap(); // presumably `true` selects the hugepage backend; confirm against create_and_populate
+
+        assert_eq!(
+            region.size_bytes(),
+            4096,
+            "size_bytes must be the source size (4096), not the hugepage-aligned alloc_size ({})",
+            HUGE_PAGE_2MB,
+        );
+        let _ = std::fs::remove_file(&src); // best-effort cleanup; not reached if an assertion above panics
+    }
 }
diff --git a/sdk/python/forkd/controller.py b/sdk/python/forkd/controller.py index 365094f..c0d7962 100644 --- a/sdk/python/forkd/controller.py +++ b/sdk/python/forkd/controller.py @@ -117,6 +117,7 @@ def spawn_sandboxes( memory_limit_mib: Optional[int] = None, prewarm: bool = False, live_fork: bool = False, + hugepages: bool = False, ) -> list[dict]: """``POST /v1/sandboxes`` — fork N children from a snapshot tag. @@ -136,6 +137,14 @@ def spawn_sandboxes( see ``docs/VENDORED-FIRECRACKER.md``. No effect at spawn time beyond the backend swap; cost shows up on the first live BRANCH. + hugepages: + v0.4+. Back the memfd with 2 MiB hugepages + (``MFD_HUGETLB | MFD_HUGE_2MB``). Only meaningful with + ``live_fork=True``. Reduces TLB pressure during spawn-many + and live BRANCH bulk-copy. Requires non-zero + ``HugePages_Free`` in ``/proc/meminfo`` — ``forkd doctor`` + checks availability. Falls back to normal 4 KiB pages with + a warning if the pool is exhausted.
Returns the list of SandboxInfo dicts (id, snapshot_tag, netns, guest_addr, created_at_unix, pid, memory_limit_mib). @@ -151,6 +160,8 @@ def spawn_sandboxes( body["prewarm"] = True if live_fork: body["live_fork"] = True + if hugepages: + body["hugepages"] = True return self._request("POST", "/v1/sandboxes", body) def list_sandboxes(self) -> list[dict]: diff --git a/sdk/typescript/package-lock.json b/sdk/typescript/package-lock.json index a94d00b..4cfac31 100644 --- a/sdk/typescript/package-lock.json +++ b/sdk/typescript/package-lock.json @@ -1,12 +1,12 @@ { "name": "@deeplethe/forkd", - "version": "0.3.3", + "version": "0.3.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@deeplethe/forkd", - "version": "0.3.3", + "version": "0.3.4", "license": "Apache-2.0", "devDependencies": { "@types/node": "^20.0.0", diff --git a/sdk/typescript/src/controller.ts b/sdk/typescript/src/controller.ts index 678b156..7d67d69 100644 --- a/sdk/typescript/src/controller.ts +++ b/sdk/typescript/src/controller.ts @@ -131,6 +131,8 @@ export class Controller { memoryLimitMib?: number; prewarm?: boolean; liveFork?: boolean; + /** v0.4+: back the memfd with 2 MiB hugepages. Only meaningful with `liveFork: true`. */ + hugepages?: boolean; }): Promise { const body: SpawnOptions = { snapshot_tag: options.snapshotTag, @@ -146,6 +148,9 @@ export class Controller { if (options.liveFork !== undefined) { body.live_fork = options.liveFork; } + if (options.hugepages !== undefined) { + body.hugepages = options.hugepages; + } return this.request("POST", "/v1/sandboxes", body); } diff --git a/sdk/typescript/src/types.ts b/sdk/typescript/src/types.ts index 99be5de..f0d665a 100644 --- a/sdk/typescript/src/types.ts +++ b/sdk/typescript/src/types.ts @@ -57,6 +57,15 @@ export interface SpawnOptions { * `docs/VENDORED-FIRECRACKER.md`). */ live_fork?: boolean; + /** + * v0.4+: back the memfd with 2 MiB hugepages (`MFD_HUGETLB | + * MFD_HUGE_2MB`). Only meaningful with `live_fork: true`. 
Reduces + * TLB pressure during spawn-many and live BRANCH bulk-copy. Requires + * non-zero `HugePages_Free` in `/proc/meminfo` — `forkd doctor` + * checks availability. Falls back to normal 4 KiB pages with a + * warning if the pool is exhausted. + */ + hugepages?: boolean; } /**