From 2a5a539db25261da591ad1d46f3f9c0f0342692a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 09:49:43 +0200 Subject: [PATCH 01/21] backend: switch Storage to Arc Foundation for runtime backend selection (ZFS, btrfs, dm-thin) on Linux. The trait was already object-safe; this adds create_storage and init_storage factories on each platform crate, makes Storage a runtime trait object, and updates the 14 CLI call sites. No behavior change. Linux still always returns LinuxStorage (ZFS), macOS always MacosStorage. Concrete dispatch lands when StorageKind plumbing arrives in Phase 2. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-linux/src/lib.rs | 24 ++++++++++++++++++++++++ crates/ember-macos/src/lib.rs | 16 ++++++++++++++++ src/backend.rs | 18 +++++++++++++----- src/cli/image.rs | 8 ++++---- src/cli/init.rs | 4 ++-- src/cli/snapshot.rs | 10 +++++----- src/cli/vm.rs | 12 ++++++------ 7 files changed, 70 insertions(+), 22 deletions(-) diff --git a/crates/ember-linux/src/lib.rs b/crates/ember-linux/src/lib.rs index c5b6c7a..4a6b5c7 100644 --- a/crates/ember-linux/src/lib.rs +++ b/crates/ember-linux/src/lib.rs @@ -13,3 +13,27 @@ pub use network_backend::LinuxNetwork; pub use platform::LinuxPlatform; pub use storage::LinuxStorage; pub use vm::LinuxVm; + +use std::sync::Arc; + +use ember_core::backend::{InitConfig, StorageBackend}; +use ember_core::config::GlobalConfig; +use ember_core::error::Result; + +/// Construct the active storage backend. +/// +/// Returns the implementation indicated by [`GlobalConfig::storage_backend`]. +/// Currently only ZFS is wired up; btrfs and dm-thin variants are added in +/// later phases of the multi-backend rollout. +pub fn create_storage(config: &GlobalConfig) -> Arc { + Arc::new(LinuxStorage::new(config)) +} + +/// Initialize storage during `ember init`. +/// +/// Dispatches to the concrete backend's `init` associated function. 
The +/// trait object is unavailable here because the backend hasn't been +/// constructed yet. +pub fn init_storage(config: &InitConfig) -> Result<()> { + LinuxStorage::init(config) +} diff --git a/crates/ember-macos/src/lib.rs b/crates/ember-macos/src/lib.rs index d8ea045..6e3f0a1 100644 --- a/crates/ember-macos/src/lib.rs +++ b/crates/ember-macos/src/lib.rs @@ -9,3 +9,19 @@ pub use network::MacosNetwork; pub use platform::MacosPlatform; pub use storage::MacosStorage; pub use vm::MacosVm; + +use std::sync::Arc; + +use ember_core::backend::{InitConfig, StorageBackend}; +use ember_core::config::GlobalConfig; +use ember_core::error::Result; + +/// Construct the active storage backend. +pub fn create_storage(config: &GlobalConfig) -> Arc { + Arc::new(MacosStorage::new(config)) +} + +/// Initialize storage during `ember init`. +pub fn init_storage(config: &InitConfig) -> Result<()> { + MacosStorage::init(config) +} diff --git a/src/backend.rs b/src/backend.rs index 664a0e6..9e2d353 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -4,6 +4,8 @@ //! This module re-exports them and provides the type aliases that //! select the active platform backend at compile time. +use std::sync::Arc; + // Re-export all traits and shared types from ember-core. pub use ember_core::backend::*; @@ -15,21 +17,27 @@ pub use ember_linux as linux; pub use ember_macos as macos; // Type aliases for the active platform backend. -// Selected at compile time based on target OS. +// `Vm`, `Network`, and `CurrentPlatform` are selected at compile time +// via `#[cfg(target_os)]`. `Storage` is a runtime trait object so the +// concrete implementation can be picked from `GlobalConfig` (e.g., ZFS +// vs btrfs vs dm-thin on Linux). 
#[cfg(target_os = "linux")] pub type Vm = ember_linux::LinuxVm; #[cfg(target_os = "linux")] -pub type Storage = ember_linux::LinuxStorage; -#[cfg(target_os = "linux")] pub type Network = ember_linux::LinuxNetwork; #[cfg(target_os = "macos")] pub type Vm = ember_macos::MacosVm; #[cfg(target_os = "macos")] -pub type Storage = ember_macos::MacosStorage; -#[cfg(target_os = "macos")] pub type Network = ember_macos::MacosNetwork; +pub type Storage = Arc; + +#[cfg(target_os = "linux")] +pub use ember_linux::{create_storage, init_storage}; +#[cfg(target_os = "macos")] +pub use ember_macos::{create_storage, init_storage}; + #[cfg(target_os = "linux")] pub type CurrentPlatform = ember_linux::LinuxPlatform; #[cfg(target_os = "macos")] diff --git a/src/cli/image.rs b/src/cli/image.rs index d72b6aa..1d7731b 100644 --- a/src/cli/image.rs +++ b/src/cli/image.rs @@ -4,7 +4,7 @@ use clap::{Args, Subcommand}; use super::fmt::{format_bytes_binary, MIB}; use super::vm::OutputFormat; -use crate::backend::{CurrentPlatform, Platform, Storage, StorageBackend}; +use crate::backend::{create_storage, CurrentPlatform, Platform, Storage}; use crate::image; use ember_core::config::GlobalConfig; use ember_core::image::pull::ImageReference; @@ -93,7 +93,7 @@ pub fn run(cmd: &ImageCommand, state_dir: &Path) -> anyhow::Result<()> { fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); // Parse and validate the image reference. 
let reference = ImageReference::parse(&args.reference)?; @@ -149,7 +149,7 @@ fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { fn build(args: &BuildArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); // Sanitize the name for storage use. let local_name = image::build::sanitize_name(&args.name)?; @@ -302,7 +302,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { // Destroy the image's storage (zvol on Linux, .img file on macOS). let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!("Destroying storage for image '{}'...", local_name); storage.destroy_image_storage(&local_name, args.force)?; diff --git a/src/cli/init.rs b/src/cli/init.rs index c1fbcda..f90aafc 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -2,7 +2,7 @@ use std::path::Path; use clap::Args; -use crate::backend::{CurrentPlatform, InitConfig, Platform, Storage, StorageBackend}; +use crate::backend::{init_storage, CurrentPlatform, InitConfig, Platform}; use ember_core::config::GlobalConfig; use ember_core::state::store::StateStore; @@ -40,7 +40,7 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { dataset: args.dataset.clone(), device: args.device.clone(), }; - Storage::init(&init_config)?; + init_storage(&init_config)?; // 3. Initialize state directory structure. 
let store = StateStore::new(state_dir.to_path_buf()); diff --git a/src/cli/snapshot.rs b/src/cli/snapshot.rs index a7146dd..3ae5bca 100644 --- a/src/cli/snapshot.rs +++ b/src/cli/snapshot.rs @@ -2,7 +2,7 @@ use std::path::Path; use clap::{Args, Subcommand}; -use crate::backend::{Storage, StorageBackend}; +use crate::backend::create_storage; use ember_core::config::GlobalConfig; use ember_core::state::store::StateStore; use ember_core::state::vm; @@ -80,7 +80,7 @@ pub fn run(cmd: &SnapshotCommand, state_dir: &Path) -> anyhow::Result<()> { fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); let _metadata = vm::load(&store, &args.vm_name)?; // Disallow the reserved snapshot name. @@ -114,7 +114,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { fn list(args: &ListArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); let _metadata = vm::load(&store, &args.vm_name)?; let snapshots = storage.list_snapshots(&args.vm_name)?; @@ -190,7 +190,7 @@ use super::fmt::format_bytes_binary as format_bytes; fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); let _metadata = vm::load(&store, &args.vm_name)?; // Disallow deleting the reserved snapshot. 
@@ -226,7 +226,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { fn restore(args: &RestoreArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); let _metadata = vm::require_stopped(&store, &args.vm_name, "restoring a snapshot")?; // Verify the snapshot exists. diff --git a/src/cli/vm.rs b/src/cli/vm.rs index 83ea157..215f901 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -5,7 +5,7 @@ use uuid::Uuid; use super::fmt::{format_bytes_binary, GIB, MIB}; use crate::backend::{ - CurrentPlatform, Network, NetworkBackend, Platform, Storage, StorageBackend, Vm, VmBackend, + create_storage, CurrentPlatform, Network, NetworkBackend, Platform, Storage, Vm, VmBackend, }; use crate::image; use ember_core::config; @@ -430,7 +430,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { let image_ref = image_entry.reference.clone(); let image_size_mib = image_entry.size_mib; - let storage = Storage::new(&global_config); + let storage = create_storage(&global_config); let mut rollback = Rollback::new(); @@ -605,7 +605,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { let subnet = args.network.clone().or(source.subnet.clone()); - let storage = Storage::new(&global_config); + let storage = create_storage(&global_config); // Clone source VM's storage into the new VM via the storage backend. println!("Forking '{}' → '{}'...", args.source, args.name); @@ -922,7 +922,7 @@ fn resize(args: &ResizeArgs, state_dir: &Path) -> anyhow::Result<()> { // Grow the disk via the storage backend (handles resize + ext4 expand). 
let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!( "Resizing disk to {}...", format_bytes_binary(new_gib as u64 * GIB) @@ -1042,7 +1042,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { // Check for storage-level dependents (e.g. ZFS fork snapshots with clones). // On macOS/APFS this always returns empty — forks are independent. let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); let dependents = storage.storage_dependents(&args.name)?; if !dependents.is_empty() { if !args.force { @@ -1106,7 +1106,7 @@ pub fn force_delete_vm(store: &StateStore, metadata: &VmMetadata) -> anyhow::Res // Destroy storage via the backend. let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!("Destroying storage for VM '{}'...", metadata.name); let _ = storage.destroy_vm_storage(&metadata.name); From 9c0ffa7ae1d87afc1b28c073dc7bbf5b5e3c96c9 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 09:52:38 +0200 Subject: [PATCH 02/21] config: add StorageKind enum and multi-backend init fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GlobalConfig gains storage_backend (defaults Zfs for backwards compat), storage_path, and dm_thin_block_size. InitConfig gains storage_path, btrfs_size, dm_thin_size, dm_thin_metadata_size, dm_thin_block_size. No CLI surface change yet — ember init still produces a Zfs config. The fields are wired so the dm-thin and btrfs backends can read them without further config-shape churn. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/backend.rs | 21 +++++++++-- crates/ember-core/src/config.rs | 29 ++++++++++++++ src/cli/init.rs | 65 ++++++++++++++++---------------- 3 files changed, 80 insertions(+), 35 deletions(-) diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index c50051d..4f028fd 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -49,8 +49,7 @@ pub struct SnapshotInfo { /// Configuration for storage backend initialization during `ember init`. /// /// Carries the subset of init arguments that the storage backend needs. -/// Platform-specific fields (like ZFS pool/dataset) are ignored on platforms -/// that don't use them. +/// Platform-specific fields are ignored on backends that don't use them. pub struct InitConfig { /// Path to the state directory (e.g., `/var/lib/ember` or `~/Library/Application Support/ember`). pub state_dir: PathBuf, @@ -59,8 +58,24 @@ pub struct InitConfig { /// Dataset name within the ZFS pool. Used on Linux; ignored on macOS. pub dataset: String, /// Block device for ZFS pool creation (e.g., `/dev/loop0`). - /// Only used on Linux when creating a new pool. + /// Only used by the ZFS backend when creating a new pool. pub device: Option, + /// Backing path for non-ZFS backends. + /// + /// * btrfs: block device or sparse image file path. + /// * dm-thin: directory for metadata.img/data.img, or a raw block device. + pub storage_path: Option, + /// Size for the file-backed btrfs image (e.g., `"50G"`). When set, the + /// btrfs backend treats `storage_path` as a sparse file to create. + pub btrfs_size: Option, + /// Size of the dm-thin data device (e.g., `"50G"`). Required for + /// file-backed dm-thin pools, ignored for raw block devices. + pub dm_thin_size: Option, + /// Override metadata device size for dm-thin (e.g., `"800M"`). + /// `None` lets the backend compute it via `thin_metadata_size`. 
+ pub dm_thin_metadata_size: Option, + /// dm-thin pool block size in 512-byte sectors. `None` uses the backend default. + pub dm_thin_block_size: Option, } // --------------------------------------------------------------------------- diff --git a/crates/ember-core/src/config.rs b/crates/ember-core/src/config.rs index a651c2d..df2da70 100644 --- a/crates/ember-core/src/config.rs +++ b/crates/ember-core/src/config.rs @@ -5,9 +5,27 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; +/// Which storage backend is active. +/// +/// On Linux, runtime-selected at `ember init` and serialized to `config.json`. +/// Older configs without this field default to [`StorageKind::Zfs`] for +/// backwards compatibility. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum StorageKind { + #[default] + Zfs, + Btrfs, + DmThin, +} + /// Global configuration written by `ember init`. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct GlobalConfig { + /// Storage backend selected at init time. + /// Defaults to [`StorageKind::Zfs`] for older configs without this field. + #[serde(default)] + pub storage_backend: StorageKind, pub pool: String, pub dataset: String, pub kernel_path: Option, @@ -19,6 +37,17 @@ pub struct GlobalConfig { /// Populated during `ember init`; defaults to empty path for backwards compat. #[serde(default)] pub state_dir: PathBuf, + /// Backing path for non-ZFS backends. + /// + /// * btrfs: block device or sparse image file containing the btrfs filesystem. + /// * dm-thin: directory holding `metadata.img`/`data.img`, or a raw block device. + /// * ZFS: unused. + #[serde(default)] + pub storage_path: Option, + /// dm-thin pool block size in 512-byte sectors. Permanent at pool creation. + /// `None` means "use the backend default" (128 = 64 KiB). 
+ #[serde(default)] + pub dm_thin_block_size: Option, } impl GlobalConfig { diff --git a/src/cli/init.rs b/src/cli/init.rs index f90aafc..400175d 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -39,6 +39,11 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { pool: args.pool.clone(), dataset: args.dataset.clone(), device: args.device.clone(), + storage_path: None, + btrfs_size: None, + dm_thin_size: None, + dm_thin_metadata_size: None, + dm_thin_block_size: None, }; init_storage(&init_config)?; @@ -63,11 +68,14 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { // 6. Write config. let config = GlobalConfig { + storage_backend: ember_core::config::StorageKind::Zfs, pool: args.pool.clone(), dataset: args.dataset.clone(), kernel_path, wan_iface, state_dir: state_dir.to_path_buf(), + storage_path: None, + dm_thin_block_size: None, }; store.write(&store.config_path(), &config)?; println!("Configuration written to {}", store.config_path().display()); @@ -79,16 +87,29 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { #[cfg(test)] mod tests { use super::*; + use ember_core::config::StorageKind; use std::path::PathBuf; + fn zfs_config(pool: &str, dataset: &str) -> GlobalConfig { + GlobalConfig { + storage_backend: StorageKind::Zfs, + pool: pool.to_string(), + dataset: dataset.to_string(), + kernel_path: None, + wan_iface: None, + state_dir: PathBuf::default(), + storage_path: None, + dm_thin_block_size: None, + } + } + #[test] fn global_config_round_trip_with_kernel() { let config = GlobalConfig { - pool: "testpool".to_string(), - dataset: "ember".to_string(), kernel_path: Some(PathBuf::from("/var/lib/ember/kernels/vmlinux")), wan_iface: Some("eth0".to_string()), state_dir: PathBuf::from("/var/lib/ember"), + ..zfs_config("testpool", "ember") }; let json = serde_json::to_string(&config).unwrap(); @@ -98,14 +119,7 @@ mod tests { #[test] fn global_config_round_trip_without_kernel() { - let config = 
GlobalConfig { - pool: "mypool".to_string(), - dataset: "mydata".to_string(), - kernel_path: None, - wan_iface: None, - state_dir: PathBuf::default(), - }; - + let config = zfs_config("mypool", "mydata"); let json = serde_json::to_string(&config).unwrap(); let loaded: GlobalConfig = serde_json::from_str(&json).unwrap(); assert_eq!(loaded, config); @@ -114,11 +128,9 @@ mod tests { #[test] fn global_config_json_format() { let config = GlobalConfig { - pool: "tank".to_string(), - dataset: "ember".to_string(), kernel_path: Some(PathBuf::from("/kernels/vmlinux")), wan_iface: Some("wlp2s0".to_string()), - state_dir: PathBuf::default(), + ..zfs_config("tank", "ember") }; let json: serde_json::Value = serde_json::to_value(&config).unwrap(); @@ -126,18 +138,12 @@ mod tests { assert_eq!(json["dataset"], "ember"); assert_eq!(json["kernel_path"], "/kernels/vmlinux"); assert_eq!(json["wan_iface"], "wlp2s0"); + assert_eq!(json["storage_backend"], "zfs"); } #[test] fn global_config_null_kernel_in_json() { - let config = GlobalConfig { - pool: "tank".to_string(), - dataset: "ember".to_string(), - kernel_path: None, - wan_iface: None, - state_dir: PathBuf::default(), - }; - + let config = zfs_config("tank", "ember"); let json: serde_json::Value = serde_json::to_value(&config).unwrap(); assert!(json["kernel_path"].is_null()); } @@ -149,11 +155,9 @@ mod tests { store.init().unwrap(); let config = GlobalConfig { - pool: "testpool".to_string(), - dataset: "ember".to_string(), - kernel_path: None, wan_iface: Some("eth0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("testpool", "ember") }; store.write(&store.config_path(), &config).unwrap(); @@ -170,23 +174,18 @@ mod tests { let store = StateStore::new(dir.path().to_path_buf()); store.init().unwrap(); - // First write. 
let config1 = GlobalConfig { - pool: "pool1".to_string(), - dataset: "ds1".to_string(), - kernel_path: None, wan_iface: Some("eth0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("pool1", "ds1") }; store.write(&store.config_path(), &config1).unwrap(); - // Second write (simulates re-running init). let config2 = GlobalConfig { - pool: "pool2".to_string(), - dataset: "ds2".to_string(), kernel_path: Some(PathBuf::from("/kernels/vmlinux")), wan_iface: Some("wlp2s0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("pool2", "ds2") }; store.write(&store.config_path(), &config2).unwrap(); @@ -196,10 +195,12 @@ mod tests { #[test] fn global_config_backwards_compatible_without_wan_iface() { - // Older config.json files won't have wan_iface — serde(default) handles this. + // Older config.json files won't have wan_iface or storage_backend + // — serde(default) handles both. let json = r#"{"pool":"tank","dataset":"ember","kernel_path":null}"#; let loaded: GlobalConfig = serde_json::from_str(json).unwrap(); assert_eq!(loaded.pool, "tank"); assert_eq!(loaded.wan_iface, None); + assert_eq!(loaded.storage_backend, StorageKind::Zfs); } } From d177021193ff6a9ac18c2e6ae9f32501bac625b3 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 09:58:55 +0200 Subject: [PATCH 03/21] docs: add dm-thin storage backend spec Design doc for adding Linux device-mapper thin provisioning as a third storage backend. Drops the ZFS kernel-module dependency and the dedicated-pool requirement while keeping block-level CoW (still a tight fit with Firecracker raw drives). Mirrors BTRFS-SPEC structure: trait-object dispatch, file-backed default, sparse metadata + data files, random u64 thin ids stored on existing VmMetadata/ImageEntry records (no separate allocator file). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/DM-THIN-SPEC.md | 652 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 652 insertions(+) create mode 100644 docs/DM-THIN-SPEC.md diff --git a/docs/DM-THIN-SPEC.md b/docs/DM-THIN-SPEC.md new file mode 100644 index 0000000..d6242ff --- /dev/null +++ b/docs/DM-THIN-SPEC.md @@ -0,0 +1,652 @@ +# Ember — dm-thin Storage Backend + +This document specifies how ember will support Linux device-mapper thin provisioning (`dm-thin`) as an alternative to ZFS for copy-on-write VM storage on Linux. +The dm-thin backend is **not yet implemented** — this is a design spec. +It mirrors the structure of `BTRFS-SPEC.md` and reuses the trait-object dispatch model introduced there. + +The goal is the same as the btrfs spec: drop the ZFS kernel module dependency and the requirement for a dedicated pool device, while preserving block-level copy-on-write semantics that are already a tight fit with Firecracker (raw block drive, instant clones, real snapshots). + +## Design principles + +* **Same CLI, different storage**: All `ember` commands work identically regardless of which backend is active. Backend choice is invisible to users after `ember init`. +* **Block-level CoW**: dm-thin provides instant copy-on-write thin volumes and snapshots at the block layer, analogous to ZFS zvols + clones. No filesystem-level reflinks. +* **Block device drives**: VM root disks are exposed as `/dev/mapper/` block devices and passed directly to Firecracker as `path_on_host`. Same drive shape as the existing ZFS path. +* **Sparse-file backing by default**: `ember init` creates two sparse files (metadata + data) on the existing filesystem and assembles them into a thin pool via `losetup` + `dmsetup`. A raw block device may be used instead, but is not required. +* **Kernel-builtin**: dm-thin is in-tree (`CONFIG_DM_THIN_PROVISIONING`), shipped by every mainstream distribution since ~2012. 
No DKMS, no out-of-tree module, no licensing friction with the kernel. +* **No filesystem on the pool**: The pool itself is a block-device factory. Each thin volume is independently formatted with ext4 (the same ext4 image pipeline used today). The pool does not see file-level structure. +* **Thin volumes and snapshots are the same primitive**: In dm-thin, a snapshot is just another thin volume that shares blocks with its source. Image base, VM disk, user snapshot, and fork all use the same `create_snap` call. +* **Random 64-bit thin ids**: Unlike ZFS where datasets are addressed by name, dm-thin volumes are addressed by numeric ids. Ember picks a random `u64` per volume and retries on the rare collision. The id is stored on the existing `VmMetadata`/`ImageEntry` records; no separate allocator state. +* **Root required**: Same as ZFS — `dmsetup`, `losetup`, `mount`, and Firecracker all need root. + +## Component mapping + +| ZFS | dm-thin | Notes | +|-----|---------|-------| +| `zpool create pool /dev/sda` | `truncate` + `losetup` + `dmsetup create ember-pool ... 
thin-pool ...` | Thin pool replaces ZFS pool | +| `zfs create pool/images` | (none) | No dataset hierarchy; the pool is flat | +| `zfs create -V 10G pool/images/x` (zvol) | `dmsetup message ember-pool 0 "create_thin <id>"` + `dmsetup create ember-img-x` | Thin volume replaces zvol | +| `zfs snapshot pool/images/x@base` | `dmsetup message ember-pool 0 "create_snap <new_id> <origin_id>"` | Snapshot is just another thin id | +| `zfs clone pool/images/x@base pool/vms/y` | `create_snap <new_id> <origin_id>` + `dmsetup create ember-vm-y` | Same `create_snap`; activate as device | +| `zfs snapshot pool/vms/y@snap` | suspend vm + `create_snap <new_id> <origin_id>` + resume | Suspend ensures consistent on-disk state | +| `zfs rollback pool/vms/y@snap` | remove vm device + delete vm thin + `create_snap <new_id> <origin_id>` + recreate vm device | Restore replaces the live volume | +| `zfs destroy pool/vms/y@snap` | `dmsetup message ember-pool 0 "delete <id>"` | Releases blocks back to the pool | +| `zfs set volsize=20G pool/vms/y` | `dmsetup suspend` + `dmsetup load` (new size) + `dmsetup resume` + `resize2fs` | Resize is a table reload | +| `zfs destroy -r pool/vms/y` | `dmsetup remove ember-vm-y` + `delete <id>` | Two-step: deactivate then free | +| `/dev/zvol/pool/vms/y` | `/dev/mapper/ember-vm-y` | Different path, same shape | + +## Backend selection + +### `ember init` + +The `--storage` flag introduced in `BTRFS-SPEC.md` gains a third value: `dm-thin`. 
+ +```bash +# ZFS (existing) +ember init --pool tank --device /dev/sda + +# btrfs (per BTRFS-SPEC.md) +ember init --storage btrfs --storage-path /var/lib/ember/btrfs.img --size 50G + +# dm-thin with sparse files (default) +ember init --storage dm-thin --size 50G + +# dm-thin with explicit data file location +ember init --storage dm-thin --storage-path /var/lib/ember/dm-thin --size 50G + +# dm-thin on a raw block device +ember init --storage dm-thin --storage-path /dev/sdb +``` + +When `--storage dm-thin` is specified: + +* `--storage-path` selects the directory holding `metadata.img` and `data.img` (file-backed mode), or the raw block device to use (device mode). Defaults to `/var/lib/ember/dm-thin` for file-backed mode. +* `--size` is required for file-backed mode and disambiguates from device mode. When present, two sparse files are created. When absent, `--storage-path` must be an existing block device. +* `--metadata-size` is optional and defaults to a value computed from `thin_metadata_size` (see "Pool sizing" below). +* `--block-size` is optional and defaults to `64K`. **Permanent** — cannot be changed after pool creation. +* `--pool`, `--dataset`, and `--device` are ZFS-only and ignored. + +If a `config.json` already exists, `ember init` checks `storage_backend` and refuses to re-initialize with a different backend. +Switching backends requires `ember deinit` first. + +### Backend dispatch + +Same as in `BTRFS-SPEC.md`: `Storage` becomes `Arc<dyn StorageBackend>` on Linux, dispatched at construction time by a `create_storage()` factory: + +```rust +// crates/ember-linux/src/lib.rs +pub fn create_storage(config: &GlobalConfig) -> Arc<dyn StorageBackend> { + match config.storage_backend { + StorageKind::Zfs => Arc::new(ZfsStorage::new(config)), + StorageKind::Btrfs => Arc::new(BtrfsStorage::new(config)), + StorageKind::DmThin => Arc::new(DmThinStorage::new(config)), + } +} +``` + +`StorageKind` gains a `DmThin` variant. 
+The `--storage dm-thin` CLI flag accepts `dm-thin` and serializes as `dm-thin` (lowercase, hyphen) to match common usage. + +### Init dispatch + +`StorageBackend::init` remains an associated function. The `ember init` handler matches on the requested backend: + +```rust +match storage_backend { + StorageKind::Zfs => ZfsStorage::init(&init_config)?, + StorageKind::Btrfs => BtrfsStorage::init(&init_config)?, + StorageKind::DmThin => DmThinStorage::init(&init_config)?, +} +``` + +### Config changes + +`GlobalConfig` extensions, building on the btrfs spec: + +```rust +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum StorageKind { + #[default] + Zfs, + Btrfs, + DmThin, +} + +pub struct GlobalConfig { + #[serde(default)] + pub storage_backend: StorageKind, + pub pool: String, // ZFS only + pub dataset: String, // ZFS only + pub kernel_path: Option, + pub wan_iface: Option, + pub state_dir: PathBuf, + /// Block device or image file path. Used by btrfs and dm-thin. + /// For dm-thin: directory containing metadata.img/data.img, or a raw device. + #[serde(default)] + pub storage_path: Option, + /// dm-thin pool block size in 512-byte sectors (default: 128 = 64KiB). + /// Permanent at pool creation. + #[serde(default)] + pub dm_thin_block_size: Option, +} +``` + +The pool name (`ember-pool`) and device-mapper prefixes (`ember-img-`, `ember-vm-`) are constants — not user-configurable. +This keeps the config small and prevents collisions between concurrent ember installations on the same host. Multi-instance support is out of scope for this spec. + +`InitConfig` extensions: + +```rust +pub struct InitConfig { + pub state_dir: PathBuf, + pub pool: String, // ZFS only + pub dataset: String, // ZFS only + pub device: Option, // ZFS only + pub storage_path: Option, // btrfs + dm-thin + pub btrfs_size: Option, // btrfs only + /// Size of the dm-thin data device (e.g., "50G"). 
+ /// Required for file-backed mode, ignored for device mode. + pub dm_thin_size: Option, + /// Override metadata device size. Defaults to `thin_metadata_size` output. + pub dm_thin_metadata_size: Option, + /// Pool block size in sectors. Defaults to 128 (64KiB). + pub dm_thin_block_size: Option, +} +``` + +### Deinit trait method + +The `deinit()` method introduced in the btrfs spec applies here too. For dm-thin: + +1. Deactivate every active thin volume: `dmsetup remove ember-vm-*`, `ember-img-*`. +2. Remove the pool: `dmsetup remove ember-pool`. +3. Detach loop devices: `losetup -d /dev/loopN /dev/loopM`. +4. If file-backed: optionally delete `metadata.img` and `data.img` (gated behind `--purge`, default keep). +5. Remove the directory if empty. +6. Delete `config.json`. + +Block devices are left intact, same as ZFS `zpool destroy`. + +## Thin id allocation + +dm-thin addresses each volume by a numeric `dev_id`. +The kernel parses it as `u64` (`drivers/md/dm-thin.c`, `read_dev_id`), so the full 64-bit space is available even though older documentation describes a 24-bit limit. + +Ember picks a random `u64` per volume: + +```rust +fn fresh_thin_id(pool: &str) -> Result { + loop { + let id: u64 = rand::random(); + match dmsetup_message(pool, &format!("create_thin {id}")) { + Ok(()) => return Ok(id), + Err(e) if e.is_already_exists() => continue, // EEXIST → retry + Err(e) => return Err(e), + } + } +} +``` + +Why this is safe: + +* Birthday collision in a 64-bit space first crosses 1% probability around 2^29 ids (~600 M). Realistic ember pools hold thousands at most. Collision probability is effectively zero. +* The kernel atomically rejects duplicates via `EEXIST`. The retry loop is the entire concurrency story — two ember processes racing on `create_thin` cannot both succeed for the same id. +* No persistent counter, no allocator file, no flock around id generation. + +`create_snap` follows the same pattern (allocate id, retry on `EEXIST`). 
+The `id` is recorded on the relevant `VmMetadata`/`ImageEntry`/`SnapshotEntry` under whichever lock already protects that record; the kernel pool itself remains the source of truth for liveness, queryable via `thin_dump` for recovery. + +## Pool sizing + +The metadata device must be sized to cover the maximum number of blocks the pool can ever reference: + +* Recommended formula: `metadata_size = max(48 * data_size / block_size, 2 MiB)` (kernel docs). +* Practical cap: 16 GiB. The kernel rejects metadata devices larger than this. +* Standard tool: `thin_metadata_size --block-size=64k --pool-size=50G --max-thins=1000 --numeric-only --unit=b`. + +Defaults used by `ember init`: + +* `block_size`: 64 KiB (128 sectors). Smaller block sizes give better sharing across snapshots at the cost of more metadata; 64 KiB is the documented kernel default. +* `metadata_size`: computed via `thin_metadata_size` for the requested data size, capped at 16 GiB, floor of 32 MiB. +* `low_water_mark`: `data_size / block_size / 16` blocks (≈6.25% of pool). When free blocks fall below this, the kernel notifies userspace via `dmeventd`. Ember does not register a userspace handler in this initial spec — the value is informational. A future enhancement could surface low-space warnings via `dmsetup status`. + +## Storage layout + +``` +/var/lib/ember/dm-thin/ # Default --storage-path +├── metadata.img # Sparse file, ~32 MiB to 16 GiB +└── data.img # Sparse file, sized to --size + +/var/lib/ember/ # State directory (unchanged location) +├── config.json +├── kernels/ +├── images/ +│ └── registry.json # ImageEntry records, now include thin_id +├── vms/ +│ └── / +│ └── vm.json # VmMetadata, includes thin_id +└── network/ +``` + +No separate allocator state file is needed. +Thin ids live exclusively on `ImageEntry.thin_id`, `VmMetadata.thin_id`, and `SnapshotEntry.thin_id`. 
+Fresh ids are picked at random; the pool itself is the authority for which ids are live (queryable via `thin_dump /dev/loopMETA` for recovery). + +## Initialization + +### File-backed (default) + +```bash +ember init --storage dm-thin --size 50G +``` + +1. Create directory: `mkdir -p /var/lib/ember/dm-thin`. +2. Compute metadata size: `thin_metadata_size --block-size=64k --pool-size=50G --max-thins=1000 --numeric-only --unit=b` → e.g. `838860800` (≈800 MiB). +3. Create sparse data: `truncate -s 50G /var/lib/ember/dm-thin/data.img`. +4. Create sparse metadata: `truncate -s 800M /var/lib/ember/dm-thin/metadata.img`. +5. Zero metadata header: `dd if=/dev/zero of=/var/lib/ember/dm-thin/metadata.img bs=4K count=1 conv=notrunc`. The kernel uses the all-zero superblock as the signal to format a fresh pool. +6. Attach loops: `losetup -f --show /var/lib/ember/dm-thin/metadata.img` → `/dev/loopN`; same for `data.img` → `/dev/loopM`. +7. Assemble pool: `dmsetup create ember-pool --table "0 <data_sectors> thin-pool /dev/loopN /dev/loopM 128 32768"` where `data_sectors = data_size / 512`, `128` is the block size in sectors, and `32768` is the low-water mark in blocks. +8. Write `config.json` with `storage_backend = "dm-thin"`, `storage_path = /var/lib/ember/dm-thin`. + +### Device-backed + +```bash +ember init --storage dm-thin --storage-path /dev/sdb +``` + +1. Allocate metadata partition: requires either a separate metadata device or a partition layout. To avoid forcing partitioning on the user, ember uses **embedded metadata mode**: it places `metadata.img` as a sparse file on the state directory's filesystem and uses `--storage-path` only as the data device. (Splitting metadata onto a tiny separate device is a future enhancement.) +2. Wipe the device's first 4 KiB so the pool initializes fresh: `dd if=/dev/zero of=/dev/sdb bs=4K count=1`. +3. `losetup` only the metadata file. The data device is used directly. +4. Assemble pool: `dmsetup create ember-pool --table "0 <data_sectors> thin-pool /dev/loopN /dev/sdb 128 32768"`.
+ +The init flow is otherwise identical. + +### Activation on subsequent runs + +dm-thin tables live only in kernel memory. +After a reboot or `dmsetup remove`, the pool and all thin volumes are gone from `/dev/mapper/` even though the underlying metadata is intact. +Ember therefore reactivates on demand. + +The first command after a reboot triggers `ensure_pool_active`: + +1. Read `config.json` → `storage_path`. +2. Check `/dev/mapper/ember-pool` exists. If yes, done. +3. If no: + a. `losetup -f --show metadata.img` → `/dev/loopN` (skip if device-backed). + b. `losetup -f --show data.img` → `/dev/loopM` (skip if device-backed). + c. Run `thin_check /dev/loopN` (or the metadata loop). Fail loudly on metadata corruption — operator must run `thin_repair` manually. + d. `dmsetup create ember-pool --table "0 thin-pool ... 128 "` using the values from `config.json`. + +Per-VM and per-image volumes are activated **lazily** by methods that need them (e.g. `disk_device_path`, `mount`, `start`). +Each method calls `ensure_thin_active(name, thin_id, size_sectors)`: + +1. If `/dev/mapper/` exists, done. +2. Else: `dmsetup create --table "0 thin /dev/mapper/ember-pool "`. + +Sizes come from existing `ImageEntry.size_mib` and `VmMetadata.disk_size_gib`. + +### Filesystem validation + +Before any storage operation, ember verifies `/dev/mapper/ember-pool` exists. +If not, it attempts the activation sequence above. +This is the dm-thin equivalent of the btrfs `/proc/mounts` check. + +`dmsetup status ember-pool` is parsed to detect: + +* `out_of_data_space`: pool is full. New writes will fail with EIO. Ember refuses VM create/start and prints an actionable error suggesting `ember storage grow`. +* `metadata_low_watermark`: metadata pressure. Logged as a warning. +* `read_only`: kernel switched the pool to read-only after a metadata error. Refuse all write operations. + +### Teardown (`ember deinit`) + +1. Stop all running VMs (precondition; ember refuses if any VM is running). +2. 
Remove all activated thin volumes: enumerate `dmsetup ls --target thin` filtered by the `ember-img-` / `ember-vm-` prefix, then `dmsetup remove` each. +3. Free thin ids: not strictly required (the next step destroys metadata) but done for symmetry: `dmsetup message ember-pool 0 "delete <id>"` for each. +4. Remove pool: `dmsetup remove ember-pool`. +5. Detach loops: `losetup -d /dev/loopN /dev/loopM`. +6. If `--purge`: delete `metadata.img`, `data.img`. +7. Remove `config.json`. + +For device-backed pools, the data device is left intact — same as ZFS. + +## Image pull workflow + +Reuses the existing pipeline up to the ext4 image: + +``` +OCI registry → unpacked rootfs → mkfs.ext4 + populate → ext4 image file + │ + ▼ + create_thin → activate → dd → snapshot +``` + +Per-image steps: + +1. Allocate thin id: `id_a = fresh_thin_id()` (random `u64`, retry on collision — see "Thin id allocation" above). +2. Create thin: `dmsetup message ember-pool 0 "create_thin <id_a>"`. +3. Activate as a temporary device: `dmsetup create ember-img-<name>-staging --table "0 <size_sectors> thin /dev/mapper/ember-pool <id_a>"`. +4. Write image: `dd if=/tmp/ember-image-XXXX/image.ext4 of=/dev/mapper/ember-img-<name>-staging bs=1M`. Existing `zvol::dd_image` logic is reused once the device path is supplied. +5. Suspend: `dmsetup suspend ember-img-<name>-staging`. This forces a metadata commit so the snapshot below sees a consistent state. +6. Allocate base id: `id_base = fresh_thin_id()`. +7. Snapshot: `dmsetup message ember-pool 0 "create_snap <id_base> <id_a>"`. +8. Resume: `dmsetup resume ember-img-<name>-staging`. +9. Discard the staging device: `dmsetup remove ember-img-<name>-staging`. Free `id_a`: `dmsetup message ember-pool 0 "delete <id_a>"`. The `id_base` snapshot retains all of its blocks. +10. Persist: `ImageEntry.thin_id = id_base`, `disk_path = "/dev/mapper/ember-img-<name>"` (the activated path; lazy activation will create it on first use). + +Why two ids? `create_snap` requires a source thin volume.
+We need a snapshot of the freshly-written image so that VM clones can branch from a stable origin without our staging device hanging around as a dependency. +The pattern matches how ZFS uses `@base`: write to a primary, snapshot it, then never touch the primary again. + +The base thin is not activated as a device by default; only VMs cloned from it appear in `/dev/mapper/`. +This keeps `/dev/mapper/` clutter-free and avoids races where a stale activation locks a volume. + +## VM create + +```bash +ember vm create myvm --image alpine --disk-size 4G +``` + +1. Look up `ImageEntry.thin_id` for `alpine` (the base id). +2. Allocate fresh id: `id_vm = fresh_thin_id()`. +3. Snapshot: `dmsetup message ember-pool 0 "create_snap "`. Instant — no data is copied. +4. Activate: `dmsetup create ember-vm-myvm --table "0 thin /dev/mapper/ember-pool "`. +5. The activated device path `/dev/mapper/ember-vm-myvm` is recorded in `VmMetadata.disk_path` and `VmMetadata.thin_id = id_vm`. +6. Loop-mount via `mount /dev/mapper/ember-vm-myvm /tmp/...` to inject SSH key and hostname (the existing flow on the ZFS path; no `-o loop` needed because dm-thin volumes are real block devices). +7. Pass `/dev/mapper/ember-vm-myvm` to Firecracker as `path_on_host`. + +If `disk_sectors > image size_sectors`, the activation table size already declares the larger virtual size. Ember then runs `e2fsck -f -p` and `resize2fs` against the device to grow the ext4 filesystem into the new space (no `truncate` needed — thin volumes are virtually sized at activation time). + +### Sanity check + +A `create_snap` completes in milliseconds. +Mirror the macOS/btrfs timing check: warn if the operation takes more than 1 second, since that suggests metadata pressure or pool-level issues. + +## VM resize + +```bash +ember vm resize myvm --disk-size 8G +``` + +1. VM must be stopped (existing precondition). +2. Suspend: `dmsetup suspend ember-vm-myvm`. +3. 
Reload table with new virtual size: `dmsetup load ember-vm-myvm --table "0 thin /dev/mapper/ember-pool "`. +4. Resume: `dmsetup resume ember-vm-myvm`. +5. `e2fsck -f -p /dev/mapper/ember-vm-myvm`. +6. `resize2fs /dev/mapper/ember-vm-myvm`. + +No new blocks are allocated until the guest writes into the new space. +Pool capacity is the upper bound; thin volumes can over-commit it. + +Shrinking is not supported (matches every other backend). + +## Pool resize + +A new admin command: + +```bash +ember storage grow --size 100G +``` + +1. For file-backed: `truncate -s 100G data.img`. For device-backed: assumes the user has already grown the device (e.g. cloud volume expansion). +2. `losetup -c /dev/loopM`: instruct the loop driver to re-read the backing file size. (No-op for device mode.) +3. Suspend: `dmsetup suspend ember-pool`. +4. Reload table: `dmsetup load ember-pool --table "0 thin-pool /dev/loopN /dev/loopM 128 "`. +5. Resume: `dmsetup resume ember-pool`. + +Metadata cannot be resized in place. +If `thin_metadata_size` for the new pool size exceeds the existing metadata device, ember refuses the grow and prints instructions for an offline metadata move using `pdata_tools` (out of scope for the initial implementation; doc only). 
+ +## User snapshots + +```bash +# Create +ember snapshot create myvm s1 +→ id_s1 = fresh_thin_id() +→ dmsetup suspend ember-vm-myvm +→ dmsetup message ember-pool 0 "create_snap " +→ dmsetup resume ember-vm-myvm + (id_s1 stays inactive — no /dev/mapper entry until restore) + +# Restore (VM must be stopped) +ember snapshot restore myvm s1 +→ dmsetup remove ember-vm-myvm +→ dmsetup message ember-pool 0 "delete " +→ id_vm_new = fresh_thin_id() +→ dmsetup message ember-pool 0 "create_snap " +→ dmsetup create ember-vm-myvm --table "0 thin /dev/mapper/ember-pool " + VmMetadata.thin_id = id_vm_new + +# List +ember snapshot list myvm +→ read snapshot records from VmMetadata (or a sidecar; see below) + +# Delete +ember snapshot delete myvm s1 +→ dmsetup message ember-pool 0 "delete " +``` + +### Snapshot consistency + +Suspending the VM volume during `create_snap` flushes outstanding I/O and forces a metadata commit before the snapshot is taken. +The kernel performs the equivalent of an fsync at the block layer. +A guest that has not fsynced its in-flight writes may still see an uncrashed-but-dirty filesystem on the snapshot, exactly as with ZFS zvol snapshots. +This matches existing behavior; no additional guarantees are introduced. + +### Snapshot metadata + +Snapshot records are stored alongside `VmMetadata`, since the existing ZFS backend reads them via `zfs::snapshot::list`. +For dm-thin, ember maintains a `snapshots: Vec` list in `vm.json`: + +```rust +pub struct SnapshotEntry { + pub name: String, + pub thin_id: u64, + pub created_at: String, + pub size_sectors: u64, +} +``` + +`list_snapshots` reads this list. +`size` reflects unique block usage and can be queried via `dmsetup status` or `thin_ls --metadata-snap` for accurate accounting; for the initial implementation, ember reports the volume's virtual size and defers exclusive-block accounting to a future enhancement. 
+ +## VM fork + +```bash +ember vm fork source newvm +``` + +`fork` and `clone-for-vm` are the same primitive on dm-thin: + +1. Allocate `id_fork = fresh_thin_id()`. +2. Suspend source (if running, this is required for consistency). +3. `dmsetup message ember-pool 0 "create_snap "`. +4. Resume source. +5. Activate: `dmsetup create ember-vm-newvm --table "0 thin /dev/mapper/ember-pool "`. + +Forks are independent of the source after creation — the dm-thin metadata reference-counts blocks, so deleting the source's thin id does not affect the fork. +This mirrors APFS/btrfs behavior, not ZFS: + +* `cleanup_fork` is a no-op. +* `storage_dependents` always returns an empty vec. + +The `parent_vm` field in `VmMetadata` records the fork origin for informational purposes. + +This is a notable simplification compared to the ZFS backend's fork-snapshot dependency tracking. + +## Firecracker integration + +The drive path is a block device, identical in shape to the ZFS path: + +| Backend | `path_on_host` | +|---------|----------------| +| ZFS | `/dev/zvol/tank/ember/vms/myvm` (block device) | +| btrfs | `/var/lib/ember/btrfs/vms/myvm/rootfs.img` (regular file) | +| dm-thin | `/dev/mapper/ember-vm-myvm` (block device) | + +`LinuxVm::start` already handles block-device drive paths. +The dispatch logic introduced by the btrfs spec (file path vs ZFS dataset name) extends naturally — dm-thin paths start with `/dev/mapper/`, so they take the file-path branch (passed through unchanged). + +The conversion helper that maps a `disk_path` to the actual device path becomes: + +```rust +let rootfs_path = if vm.disk_path.starts_with('/') { + PathBuf::from(&vm.disk_path) // btrfs file or dm-thin /dev/mapper path +} else { + zfs::volume::device_path(&vm.disk_path) // ZFS dataset name +}; +``` + +No further VM-side changes are required. + +## VM and image metadata + +`VmMetadata` and `ImageEntry` gain a single optional field: + +```rust +pub struct VmMetadata { + // ... 
+ pub disk_path: String, + pub parent_vm: Option, + /// dm-thin volume id. None for ZFS/btrfs/APFS backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, + pub snapshots: Vec, // NEW: dm-thin owns this list + // ... +} + +pub struct ImageEntry { + pub reference: String, + pub local_name: String, + pub disk_path: String, + pub size_mib: u64, + pub pulled_at: String, + /// dm-thin base snapshot id. None for ZFS/btrfs/APFS backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, +} +``` + +The `#[serde(skip_serializing_if = "Option::is_none")]` keeps ZFS configs unchanged on disk. +Existing `vm.json` and `registry.json` files are read without modification — the ZFS backend simply ignores `thin_id`. + +For ZFS, the `snapshots` list remains empty in `vm.json` and `list_snapshots` continues to read live state from `zfs::snapshot::list`. +The dm-thin backend writes to it. +This split is acceptable but slightly asymmetric; an alternative is for ZFS to mirror its snapshots into `vm.json` too, which is out of scope here. + +## Image dependency tracking + +With dm-thin, the base thin id can technically be deleted while VMs cloned from it exist — block reference counting at the pool level prevents data loss. +However, ember preserves the user-visible invariant of the existing image registry: `ember image delete` checks for VMs that reference the image and refuses to delete by default, consistent with both ZFS and btrfs. + +`destroy_image_storage(name, force)`: + +* Without `--force`: refuse if `ImageEntry.thin_id` is referenced by any `VmMetadata.thin_id`'s ancestor chain. Ancestor lookup uses `thin_dump` to walk the snapshot graph. +* With `--force`: delete the thin id directly. Cloned VMs retain their own thin ids and continue to function — block sharing is invisible at the volume level. 
+ +## Crate structure + +Building on the layout proposed by the btrfs spec: + +``` +crates/ember-linux/src/ +├── storage.rs # create_storage() factory, returns Arc +├── zfs_storage.rs # ZFS backend (renamed from current storage.rs) +├── btrfs_storage.rs # btrfs backend +├── dm_thin_storage.rs # NEW: dm-thin backend +├── zfs/ # ZFS CLI wrappers (unchanged) +├── btrfs/ # btrfs CLI wrappers +├── dm_thin/ # NEW: dm-thin CLI wrappers +│ ├── module.rs # mod declarations +│ ├── pool.rs # ember-pool create/activate/teardown, status parsing +│ ├── thin.rs # create_thin, create_snap, delete, suspend/resume, table reload, fresh_thin_id +│ ├── activation.rs # ensure_pool_active, ensure_thin_active, deactivate +│ └── tools.rs # thin_check, thin_repair, thin_metadata_size, thin_dump wrappers +├── zvol.rs # Existing ext4 → block device pipeline (reused for dm-thin) +└── vm.rs # LinuxVm — handles file paths and block device paths +``` + +`DmThinStorage` mirrors `ZfsStorage` but addresses volumes by id: + +```rust +pub struct DmThinStorage { + /// Backing path (directory for files, raw block dev otherwise). + storage_path: PathBuf, + /// Pool block size in sectors. From config. + block_size: u32, +} +``` + +The struct holds no allocator state. +`fresh_thin_id()` generates a random `u64` and returns it; collisions are handled by the kernel (`create_thin` returns `EEXIST`) and the caller retries. +The authoritative record of which ids are live lives in `ImageEntry`/`VmMetadata`/`SnapshotEntry`, which are already updated under the existing per-VM and registry locks — no new locking primitive is introduced. + +### Display and platform adaptations + +`LinuxPlatform` (at `crates/ember-linux/src/platform.rs`) needs the same kind of branching the btrfs spec describes: + +* **`inspect_vm_extra`**: "Disk device" / `/dev/mapper/ember-vm-` and "Thin id" / ``. +* **`inspect_image_extra`**: "Disk device" / `/dev/mapper/ember-img-` and "Thin id" / ``. 
+* **`info_extra`**: "Storage" / "dm-thin", "Pool" / `ember-pool`, "Storage path" / the configured `storage_path`, plus a "Pool usage" line populated from `dmsetup status ember-pool`. +* **`init_hint`**: include the dm-thin variant alongside the ZFS and btrfs hints. + +## Comparison: ZFS vs btrfs vs dm-thin vs APFS + +| Operation | ZFS (Linux) | btrfs (Linux) | dm-thin (Linux) | APFS (macOS) | +|-----------|-------------|---------------|-----------------|--------------| +| Init | `zpool create` + `zfs create` | `mkfs.btrfs` + `mount` + `mkdir` | `truncate` + `losetup` + `dmsetup create thin-pool` | `mkdir` | +| Base image | zvol + `@base` snapshot | Raw `.img` file | Thin volume + snapshot id | Raw `.img` file | +| VM clone | `zfs clone x@base y` | `cp --reflink=always x.img y.img` | `dmsetup message create_snap` + `dmsetup create` | `cp -c x.img y.img` | +| Snapshot | `zfs snapshot y@snap` | `cp --reflink=always` | suspend + `create_snap` + resume | `cp -c` | +| Restore | `zfs rollback y@snap` | `cp --reflink=always` + `mv` | remove + delete + `create_snap` + create | `cp -c` + `mv` | +| Delete snap | `zfs destroy y@snap` | `rm snap.img` | `dmsetup message delete` | `rm snap.img` | +| Resize | `zfs set volsize` + `resize2fs` | `truncate` + `resize2fs` | `dmsetup load` + `resize2fs` | `truncate` + `resize2fs` | +| Fork | `zfs clone` (creates dependency) | `cp --reflink=always` (independent) | `create_snap` (independent) | `cp -c` (independent) | +| Drive path | `/dev/zvol/...` | `.../rootfs.img` (file) | `/dev/mapper/...` | `.../rootfs.img` (file) | +| Root required | Yes | Yes | Yes | No | +| Filesystem validation | `zpool list` | `/proc/mounts` | `dmsetup status ember-pool` | APFS volume check at init | +| Reactivation after reboot | Auto (zpool import) | Auto-mount | Explicit `ensure_pool_active` | Not applicable | +| Identifier | Dataset path | File path | Random `u64` thin id | File path | +| State on disk | ZFS metadata | Filesystem metadata | Pool metadata 
(ids embedded in existing vm/image records) | Filesystem metadata | +| Kernel module | Out-of-tree (DKMS) | In-tree | In-tree | N/A | +| Checksums | Yes (ZFS) | Yes (data + metadata) | Metadata only | No | + +dm-thin sits between ZFS and btrfs: +it offers ZFS-like block-level CoW with no out-of-tree kernel module, at the cost of a more involved activation lifecycle (numeric ids, explicit `dmsetup` operations, no auto-import) and weaker data-integrity guarantees (no data checksums, harsher pool-exhaustion failure mode). + +## Storage efficiency diagnostics + +`ember debug storage-efficiency` for dm-thin reports both per-volume and pool-level metrics: + +* Per-volume virtual size: from the activated device's table. +* Per-volume exclusive blocks: from `thin_ls --metadata-snap=- /dev/loopMETA`. Computing this requires a metadata snapshot — taken under suspend or via `dmsetup message ember-pool 0 "reserve_metadata_snap"` — which has measurable overhead. The command surfaces it on demand only. +* Pool capacity, allocated, and free: from `dmsetup status ember-pool`. Output format: `<used_metadata_blocks>/<total_metadata_blocks> <used_data_blocks>/<total_data_blocks>`. + +The `st_blocks` approach used by the btrfs and APFS backends does not apply — dm-thin volumes are block devices, not files, and `stat` on `/dev/mapper/...` reports no allocation. + +## Risks and limitations + +* **Pool exhaustion**: Sparse-file backing lets the pool over-commit. If the host filesystem fills up, the pool transitions to read-only and all thin volumes return EIO until space is recovered. Ember should pre-check available space on the host filesystem before allowing image pulls or VM creates that would push the pool toward its data limit. The initial implementation adds a refuse-on-pool-full check via `dmsetup status` before each write-heavy operation; richer monitoring is a follow-up. +* **Metadata exhaustion**: Less recoverable than data exhaustion. The metadata device must be sized generously at init. `ember storage info` should warn when metadata usage exceeds 80%.
+* **Block size is permanent**: Chosen at `dmsetup create`; cannot be changed without rebuilding the pool. The 64 KiB default is a balance; users with very large VM disks (~hundreds of GiB) may want 128–256 KiB blocks for lower metadata overhead. +* **Loop device limits**: The default `max_loop=8` per kernel module load can be a constraint on systems with many loop-using services. Ember uses two loop devices total (metadata and data); the limit only matters when other software is competing. Documented as a troubleshooting hint, not a hard requirement. +* **Numeric id lifecycle**: Thin ids live on `VmMetadata`/`ImageEntry`/`SnapshotEntry`. Loss of the state directory therefore loses the name→id map even though the pool metadata is intact. Recovery is possible via `thin_dump` (lists all live thin ids) but requires manual reconstruction. No worse than the equivalent loss for ZFS or btrfs configs. +* **Concurrent invocations**: Race-free by construction. The kernel rejects duplicate ids atomically; the random-pick-and-retry loop tolerates concurrent creators without coordination. Per-record state mutation (writing `thin_id` into `vm.json` etc.) is already serialized by the existing per-VM and registry locks. +* **No data checksums**: Bit rot on the underlying block device goes undetected. Users who need this should layer dm-thin on top of LVM mirrors or hardware RAID, or stay on ZFS. +* **No `send`/`receive` equivalent**: Backup and migration require `dd` of the activated device, or `thin_dump` + `thin_delta` for incremental sync. Out of scope for the initial implementation. + +## External dependencies + +* **`dmsetup`**: From the `lvm2` package on Debian/Ubuntu/RHEL/Fedora/Arch. Installed by default on most server distributions. +* **`losetup`**: From `util-linux`. Always present. +* **`thin-provisioning-tools`**: Provides `thin_check`, `thin_repair`, `thin_dump`, `thin_metadata_size`, `thin_ls`. Packaged separately on most distributions. 
Required by `ember init` and `ember storage info`. Pre-flight check at `ember init` time. +* **`e2fsprogs`**: `mkfs.ext4`, `e2fsck`, `resize2fs`. Already required by the ZFS backend. +* **GNU coreutils**: `truncate`, `dd`. Already required. +* **Kernel config**: `CONFIG_DM_THIN_PROVISIONING=y` or `=m`, `CONFIG_BLK_DEV_LOOP=y` or `=m`. Both are part of every mainstream distribution kernel. + +## Open questions + +* **Multi-instance support**: The current spec hardcodes the pool name `ember-pool` and the device-mapper prefixes. Running multiple independent ember installations on the same host requires per-instance prefixes. Deferred until a real use case appears. +* **Metadata on a separate device**: `ember init --metadata-device /dev/sdc1` could place metadata on faster storage (NVMe) while data lives on bulk storage (HDD). Easy to add later — the pool table already supports two distinct devices. +* **Discard/TRIM**: dm-thin supports passdown of discards from guest to pool, which can return blocks to the pool when guests TRIM. Requires Firecracker virtio-blk to advertise discard support and the guest filesystem to issue it. Worth investigating as a follow-up; not required for correctness. +* **`dmeventd` integration**: Userspace handler for low-water-mark events would let ember warn proactively. The initial implementation polls `dmsetup status` on demand instead. From b669a3eceab83899b0928304c5a76eb24ee47362 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:06:15 +0200 Subject: [PATCH 04/21] dm-thin: add device-mapper CLI wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure shell-out wrappers for the operations the dm-thin storage backend will need: * dm_thin::pool — thin-pool target lifecycle and dmsetup status parsing. * dm_thin::thin — thin volume operations including the random-u64 allocator with retry-on-EEXIST collision handling. 
* dm_thin::loop_device — losetup attach/detach/refresh helpers for file-backed pools. * dm_thin::tools — thin_check/thin_repair/thin_metadata_size/thin_dump. No StorageBackend impl yet; that lands in Phase 4. The wrappers come with 10 unit tests covering parser behavior, table formatting, id allocation invariants, and dm name sanitization. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + Cargo.toml | 3 + crates/ember-linux/Cargo.toml | 1 + crates/ember-linux/src/dm_thin.rs | 33 ++ crates/ember-linux/src/dm_thin/loop_device.rs | 93 +++++ crates/ember-linux/src/dm_thin/pool.rs | 349 ++++++++++++++++++ crates/ember-linux/src/dm_thin/thin.rs | 238 ++++++++++++ crates/ember-linux/src/dm_thin/tools.rs | 102 +++++ crates/ember-linux/src/lib.rs | 1 + 9 files changed, 821 insertions(+) create mode 100644 crates/ember-linux/src/dm_thin.rs create mode 100644 crates/ember-linux/src/dm_thin/loop_device.rs create mode 100644 crates/ember-linux/src/dm_thin/pool.rs create mode 100644 crates/ember-linux/src/dm_thin/thin.rs create mode 100644 crates/ember-linux/src/dm_thin/tools.rs diff --git a/Cargo.lock b/Cargo.lock index a7b8776..6b4ea67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -602,6 +602,7 @@ dependencies = [ "hyper-util", "hyperlocal", "nix", + "rand", "serde", "serde_json", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 37338fc..2ee59d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,9 @@ uuid = { version = "1", features = ["v4", "serde"] } # Temporary directories tempfile = "3" +# Randomness (dm-thin volume id allocation) +rand = "0.8" + [package] name = "ember" version = "0.1.0" diff --git a/crates/ember-linux/Cargo.toml b/crates/ember-linux/Cargo.toml index 4b71962..ced50f4 100644 --- a/crates/ember-linux/Cargo.toml +++ b/crates/ember-linux/Cargo.toml @@ -20,3 +20,4 @@ nix = { workspace = true, features = ["fs", "ioctl", "net", "signal", "process", anyhow = { workspace = true } uuid = { workspace = true } tempfile = { workspace = true } 
+rand = { workspace = true }
diff --git a/crates/ember-linux/src/dm_thin.rs b/crates/ember-linux/src/dm_thin.rs
new file mode 100644
index 0000000..299cd19
--- /dev/null
+++ b/crates/ember-linux/src/dm_thin.rs
@@ -0,0 +1,33 @@
+//! Linux device-mapper thin provisioning backend.
+//!
+//! Thin pools provide block-level copy-on-write storage. A single
+//! [`pool::POOL_NAME`] pool aggregates two backing devices (metadata and
+//! data) and exposes any number of independent thin volumes addressed by
+//! 64-bit numeric IDs. Snapshots and clones are the same primitive
+//! ([`thin::create_snap`]) — snapshotting a thin volume produces another
+//! thin volume that shares blocks until divergence.
+//!
+//! See `docs/DM-THIN-SPEC.md` for the full design.
+
+pub mod loop_device;
+pub mod pool;
+pub mod thin;
+pub mod tools;
+
+/// Bytes per sector: the 512-byte unit in which this backend expresses device sizes.
+pub const SECTOR_SIZE: u64 = 512;
+
+/// Convert a byte count to [`SECTOR_SIZE`]-byte sectors, rounding a partial sector up.
+pub fn bytes_to_sectors(bytes: u64) -> u64 {
+    bytes.div_ceil(SECTOR_SIZE)
+}
+
+/// Whether an [`Error`](ember_core::error::Error) reports a kernel `EEXIST`
+/// from a `dmsetup message` operation, recognised by the `File exists` text in
+/// the command's stderr. Used by the `create_thin` / `create_snap` retry loops.
+pub fn is_already_exists(err: &ember_core::error::Error) -> bool {
+    matches!(
+        err,
+        ember_core::error::Error::Command { stderr, .. } if stderr.contains("File exists")
+    )
+}
diff --git a/crates/ember-linux/src/dm_thin/loop_device.rs b/crates/ember-linux/src/dm_thin/loop_device.rs
new file mode 100644
index 0000000..1fa48ae
--- /dev/null
+++ b/crates/ember-linux/src/dm_thin/loop_device.rs
@@ -0,0 +1,93 @@
+//! `losetup` wrappers for attaching backing files as loop block devices.
+//!
+//! The dm-thin backend uses loop devices to expose sparse `metadata.img` and
+//! `data.img` files as block devices that the kernel can assemble into a
+//! thin pool. Attachment is per-`ember` invocation: the loop device must be
+//!
re-attached after every reboot (state is in-memory). + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +/// Attach `file` to the next available loop device. +/// +/// Returns the loop device path (e.g., `/dev/loop0`). +pub fn attach(file: &Path) -> Result { + let output = Command::new("losetup") + .args(["-f", "--show"]) + .arg(file) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup".to_string(), + source: e, + })?; + + let output = Error::check_command("losetup -f --show", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let path = stdout.trim(); + if path.is_empty() { + return Err(Error::Command { + command: "losetup -f --show".to_string(), + exit_code: 0, + stderr: format!( + "expected a loop device path on stdout, got empty output for {}", + file.display() + ), + }); + } + Ok(PathBuf::from(path)) +} + +/// Detach a loop device. +/// +/// Idempotent in spirit but not in fact: callers should ignore failures +/// during teardown if the loop device may already be gone. +pub fn detach(loop_dev: &Path) -> Result<()> { + let output = Command::new("losetup") + .arg("-d") + .arg(loop_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup -d".to_string(), + source: e, + })?; + Error::check_command("losetup -d", output)?; + Ok(()) +} + +/// Re-read the backing file's size into the loop device. +/// +/// Required after `truncate`-ing the data backing file when growing the +/// pool: the loop driver caches the size, so the kernel doesn't see the +/// new bytes until we ask it to refresh. +pub fn refresh_size(loop_dev: &Path) -> Result<()> { + let output = Command::new("losetup") + .arg("-c") + .arg(loop_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup -c".to_string(), + source: e, + })?; + Error::check_command("losetup -c", output)?; + Ok(()) +} + +/// Look up the loop device currently backing `file`, if any. 
+pub fn find_for(file: &Path) -> Result<Option<PathBuf>> {
+    let output = Command::new("losetup")
+        .args(["-O", "NAME", "--noheadings", "-j"])
+        .arg(file)
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "losetup -j".to_string(),
+            source: e,
+        })?;
+
+    // `-j` takes the file as its operand, hence last. Exits 0 even when no loop is attached.
+    let output = Error::check_command("losetup -j", output)?;
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let first = stdout.lines().next().map(str::trim).filter(|s| !s.is_empty());
+    Ok(first.map(PathBuf::from))
+}
diff --git a/crates/ember-linux/src/dm_thin/pool.rs b/crates/ember-linux/src/dm_thin/pool.rs
new file mode 100644
index 0000000..be5836d
--- /dev/null
+++ b/crates/ember-linux/src/dm_thin/pool.rs
@@ -0,0 +1,349 @@
+//! `dmsetup` wrappers for the `thin-pool` target.
+//!
+//! A thin pool is the kernel-side container holding metadata + data
+//! devices and exposing thin volumes as snapshot-capable block devices.
+//! Ember runs a single named pool ([`POOL_NAME`]) per installation.
+
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+use ember_core::error::{Error, Result};
+
+/// Device-mapper name of the singleton thin pool used by ember.
+pub const POOL_NAME: &str = "ember-pool";
+
+/// Default pool block size in 512-byte sectors (= 64 KiB).
+///
+/// Permanent at pool creation. Smaller blocks improve sharing across
+/// snapshots but inflate metadata; larger blocks reduce metadata at the
+/// cost of write amplification when only part of a block is dirtied.
+pub const DEFAULT_BLOCK_SIZE_SECTORS: u32 = 128;
+
+/// Default low-water-mark in pool blocks. With the default 64 KiB block
+/// size this is 2 GiB of free space — the threshold at which the kernel
+/// raises a `dmeventd` notification.
+pub const DEFAULT_LOW_WATER_BLOCKS: u64 = 32_768;
+
+/// Operating mode reported by `dmsetup status`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum PoolMode {
+    /// Normal operation.
+ ReadWrite, + /// Pool entered read-only after a metadata error or admin request. + ReadOnly, + /// Pool ran out of data blocks. New writes return EIO until grown. + OutOfDataSpace, + /// Pool is unrecoverable; metadata or device-level failure. + Failed, +} + +/// Status snapshot returned by [`status`]. +/// +/// Sizes are in pool blocks (not sectors): each block is +/// [`DEFAULT_BLOCK_SIZE_SECTORS`] × 512 bytes by default. +#[derive(Debug)] +pub struct PoolStatus { + pub used_metadata_blocks: u64, + pub total_metadata_blocks: u64, + pub used_data_blocks: u64, + pub total_data_blocks: u64, + pub mode: PoolMode, +} + +/// Whether a device-mapper device with the given name is currently active. +/// +/// Uses `dmsetup info` which exits 0 when the device exists, non-zero +/// otherwise. +pub fn exists(name: &str) -> Result { + let output = Command::new("dmsetup") + .args(["info", "--noheadings", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup info".to_string(), + source: e, + })?; + Ok(output.status.success()) +} + +/// Build a `thin-pool` table line. +/// +/// The format is documented in +/// `Documentation/admin-guide/device-mapper/thin-provisioning.rst`: +/// `0 thin-pool `. +fn pool_table( + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> String { + format!( + "0 {data_sectors} thin-pool {} {} {block_size_sectors} {low_water_blocks}", + metadata_dev.display(), + data_dev.display(), + ) +} + +/// Activate a thin pool from existing metadata + data devices. +/// +/// If the metadata superblock is all zero the kernel formats a fresh pool; +/// otherwise it imports the existing metadata. Callers wanting a fresh +/// pool must zero the first 4 KiB of the metadata device beforehand. 
+pub fn create( + name: &str, + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> Result<()> { + let table = pool_table( + metadata_dev, + data_dev, + data_sectors, + block_size_sectors, + low_water_blocks, + ); + let output = Command::new("dmsetup") + .args(["create", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup create".to_string(), + source: e, + })?; + Error::check_command("dmsetup create thin-pool", output)?; + Ok(()) +} + +/// Tear down the thin pool. Does not destroy the backing devices or +/// metadata — those persist for re-activation later. +/// +/// Returns an error if any thin volume is still active. Callers should +/// deactivate all thin volumes before tearing down the pool. +pub fn remove(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["remove", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup remove".to_string(), + source: e, + })?; + Error::check_command("dmsetup remove", output)?; + Ok(()) +} + +/// Send a control message to the thin pool. +/// +/// Most thin-pool operations (`create_thin`, `create_snap`, `delete`, +/// `set_transaction_id`, …) are delivered this way rather than via +/// dedicated dmsetup subcommands. +pub fn message(name: &str, msg: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["message", name, "0", msg]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup message".to_string(), + source: e, + })?; + Error::check_command("dmsetup message", output)?; + Ok(()) +} + +/// Reload the pool table with new parameters (typically a larger +/// `data_sectors` after growing the data device). +/// +/// Suspend → load → resume sequence is required by the kernel for a +/// live table swap. 
+pub fn reload( + name: &str, + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> Result<()> { + let table = pool_table( + metadata_dev, + data_dev, + data_sectors, + block_size_sectors, + low_water_blocks, + ); + suspend(name)?; + let load = Command::new("dmsetup") + .args(["load", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup load".to_string(), + source: e, + })?; + if let Err(e) = Error::check_command("dmsetup load thin-pool", load) { + // Best-effort resume to leave the pool live before returning. + let _ = resume(name); + return Err(e); + } + resume(name) +} + +/// Path to the activated thin-pool device. Useful for building thin +/// volume tables that reference the pool by `/dev/mapper/...`. +pub fn device_path(name: &str) -> PathBuf { + PathBuf::from(format!("/dev/mapper/{name}")) +} + +/// Suspend a device-mapper device. Required before reloading a table. +pub fn suspend(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["suspend", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup suspend".to_string(), + source: e, + })?; + Error::check_command("dmsetup suspend", output)?; + Ok(()) +} + +/// Resume a previously suspended device-mapper device. +pub fn resume(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["resume", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup resume".to_string(), + source: e, + })?; + Error::check_command("dmsetup resume", output)?; + Ok(()) +} + +/// Query thin-pool status via `dmsetup status`. 
+/// +/// Output format documented in +/// `Documentation/admin-guide/device-mapper/thin-provisioning.rst`: +/// +/// ```text +/// thin-pool / +/// / +/// +/// +/// +/// +/// ``` +pub fn status(name: &str) -> Result { + let output = Command::new("dmsetup") + .args(["status", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup status".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup status", output)?; + parse_status(&String::from_utf8_lossy(&output.stdout)) +} + +fn parse_status(line: &str) -> Result { + let fields: Vec<&str> = line.split_whitespace().collect(); + // Minimum: start, length, "thin-pool", txn_id, meta, data, held_meta, mode → 8. + if fields.len() < 8 || fields[2] != "thin-pool" { + return Err(Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("unexpected status format: {line}"), + }); + } + let (used_meta, total_meta) = parse_fraction(fields[4])?; + let (used_data, total_data) = parse_fraction(fields[5])?; + let mode = match fields[7] { + "rw" => PoolMode::ReadWrite, + "ro" => PoolMode::ReadOnly, + "out_of_data_space" => PoolMode::OutOfDataSpace, + // The kernel sometimes reports "Fail" or omits trailing fields when + // the pool is unrecoverable. 
+ "Fail" | "failed" => PoolMode::Failed, + other => { + return Err(Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("unknown pool mode: {other}"), + }); + } + }; + Ok(PoolStatus { + used_metadata_blocks: used_meta, + total_metadata_blocks: total_meta, + used_data_blocks: used_data, + total_data_blocks: total_data, + mode, + }) +} + +fn parse_fraction(s: &str) -> Result<(u64, u64)> { + let (used, total) = s.split_once('/').ok_or_else(|| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("expected used/total fraction, got: {s}"), + })?; + let used = used.parse::().map_err(|e| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("invalid used field {used:?}: {e}"), + })?; + let total = total.parse::().map_err(|e| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("invalid total field {total:?}: {e}"), + })?; + Ok((used, total)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_status_rw() { + let s = "0 209715200 thin-pool 12 1234/2048 5678/100000 - rw \ + discard_passdown queue_if_no_space - 1024"; + let st = parse_status(s).unwrap(); + assert_eq!(st.used_metadata_blocks, 1234); + assert_eq!(st.total_metadata_blocks, 2048); + assert_eq!(st.used_data_blocks, 5678); + assert_eq!(st.total_data_blocks, 100_000); + assert_eq!(st.mode, PoolMode::ReadWrite); + } + + #[test] + fn parse_status_out_of_data_space() { + let s = "0 209715200 thin-pool 7 100/2048 100000/100000 - out_of_data_space \ + no_discard_passdown error_if_no_space needs_check 1024"; + let st = parse_status(s).unwrap(); + assert_eq!(st.mode, PoolMode::OutOfDataSpace); + assert_eq!(st.used_data_blocks, st.total_data_blocks); + } + + #[test] + fn parse_status_failed() { + let s = "0 209715200 thin-pool 0 0/0 0/0 - Fail"; + let st = parse_status(s).unwrap(); + assert_eq!(st.mode, 
PoolMode::Failed); + } + + #[test] + fn parse_status_rejects_bad_target() { + let s = "0 100 linear 0 0/0 0/0 - rw"; + assert!(parse_status(s).is_err()); + } + + #[test] + fn pool_table_format() { + let t = pool_table( + Path::new("/dev/loop0"), + Path::new("/dev/loop1"), + 1_048_576, + 128, + 32_768, + ); + assert_eq!(t, "0 1048576 thin-pool /dev/loop0 /dev/loop1 128 32768"); + } +} diff --git a/crates/ember-linux/src/dm_thin/thin.rs b/crates/ember-linux/src/dm_thin/thin.rs new file mode 100644 index 0000000..05ac40e --- /dev/null +++ b/crates/ember-linux/src/dm_thin/thin.rs @@ -0,0 +1,238 @@ +//! Thin volume operations. +//! +//! In dm-thin the same primitive serves three roles: a fresh thin volume +//! (no parent), a snapshot of an existing thin volume, and a clone for a +//! VM. Volumes are addressed by 64-bit numeric IDs allocated randomly by +//! [`allocate`] (see [`crate::dm_thin`] module docs and the spec). +//! +//! Volumes are not automatically activated as `/dev/mapper/...` devices — +//! callers must explicitly [`activate`] them when needed. + +use std::path::PathBuf; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +use super::{is_already_exists, pool}; + +/// Device-mapper name prefix for image base volumes. +pub const IMAGE_PREFIX: &str = "ember-img-"; +/// Device-mapper name prefix for VM disks. +pub const VM_PREFIX: &str = "ember-vm-"; + +/// Pick a fresh non-zero `u64` thin id. +/// +/// The kernel addresses thin volumes by 64-bit ids; we generate them +/// uniformly at random. Birthday-collision math at this scale is well +/// inside the noise floor (≈10⁻¹³ at 1000 volumes) and the kernel +/// rejects duplicates atomically, so [`allocate`] retries on `EEXIST`. +fn fresh_thin_id() -> u64 { + // Avoid id 0 — it isn't reserved by the kernel but using a non-zero + // sentinel keeps logs/diagnostics easier to read. 
+ loop { + let id: u64 = rand::random(); + if id != 0 { + return id; + } + } +} + +/// Allocate a fresh thin volume in `pool` and return its id. +/// +/// Picks a random `u64`, calls `create_thin`, and retries on the +/// vanishingly rare `EEXIST` collision. +pub fn allocate(pool_name: &str) -> Result { + loop { + let id = fresh_thin_id(); + match pool::message(pool_name, &format!("create_thin {id}")) { + Ok(()) => return Ok(id), + Err(e) if is_already_exists(&e) => continue, + Err(e) => return Err(e), + } + } +} + +/// Allocate a fresh snapshot of `src_id` and return its new id. +/// +/// Snapshots and thin volumes are the same primitive; the only +/// difference is the `create_snap` message specifies a parent. +pub fn allocate_snap(pool_name: &str, src_id: u64) -> Result { + loop { + let id = fresh_thin_id(); + match pool::message(pool_name, &format!("create_snap {id} {src_id}")) { + Ok(()) => return Ok(id), + Err(e) if is_already_exists(&e) => continue, + Err(e) => return Err(e), + } + } +} + +/// Free a thin volume's id and release its blocks back to the pool. +/// +/// The volume must not be activated as a device — call [`deactivate`] +/// first if necessary. +pub fn delete(pool_name: &str, thin_id: u64) -> Result<()> { + pool::message(pool_name, &format!("delete {thin_id}")) +} + +/// Path of a thin volume's device once activated. +pub fn device_path(name: &str) -> PathBuf { + PathBuf::from(format!("/dev/mapper/{name}")) +} + +/// Whether a thin volume is currently activated as a `/dev/mapper` +/// device. +pub fn is_active(name: &str) -> Result { + pool::exists(name) +} + +/// Activate a thin volume as a `/dev/mapper/` block device. +/// +/// `size_sectors` is the volume's virtual size; the pool only allocates +/// blocks as the volume is written to. 
+pub fn activate( + name: &str, + pool_name: &str, + thin_id: u64, + size_sectors: u64, +) -> Result { + let table = thin_table(pool_name, thin_id, size_sectors); + let output = Command::new("dmsetup") + .args(["create", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup create".to_string(), + source: e, + })?; + Error::check_command("dmsetup create thin", output)?; + Ok(device_path(name)) +} + +/// Tear down a thin volume's `/dev/mapper` device. The underlying thin +/// id and its blocks remain in the pool until [`delete`] is called. +pub fn deactivate(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["remove", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup remove".to_string(), + source: e, + })?; + Error::check_command("dmsetup remove", output)?; + Ok(()) +} + +/// Suspend a thin volume's I/O. Required before snapshotting or +/// reloading the table. +pub fn suspend(name: &str) -> Result<()> { + pool::suspend(name) +} + +/// Resume a previously suspended thin volume. +pub fn resume(name: &str) -> Result<()> { + pool::resume(name) +} + +/// Reload the thin volume's table to expose a new virtual size. +/// +/// Pool capacity is unaffected — thin volumes are virtually sized at +/// activation time and only consume blocks as they are written. Caller +/// is still responsible for filesystem-level resize (e.g. `resize2fs`). 
+pub fn reload_size( + name: &str, + pool_name: &str, + thin_id: u64, + new_size_sectors: u64, +) -> Result<()> { + let table = thin_table(pool_name, thin_id, new_size_sectors); + suspend(name)?; + let load = Command::new("dmsetup") + .args(["load", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup load".to_string(), + source: e, + })?; + if let Err(e) = Error::check_command("dmsetup load thin", load) { + let _ = resume(name); + return Err(e); + } + resume(name) +} + +fn thin_table(pool_name: &str, thin_id: u64, size_sectors: u64) -> String { + let pool_dev = pool::device_path(pool_name); + format!("0 {size_sectors} thin {} {thin_id}", pool_dev.display()) +} + +/// Sanitize an arbitrary name (image or VM) into a device-mapper-safe +/// component. dmsetup forbids `/`, `:`, and shell metacharacters; the +/// existing image/VM naming policy already enforces the right shape, so +/// this is a defensive guard rather than a real transformation. +pub fn sanitize_dm_name(name: &str) -> String { + name.chars() + .map(|c| if c.is_ascii_alphanumeric() || c == '-' || c == '_' { c } else { '_' }) + .collect() +} + +/// Device-mapper name for a VM volume. +pub fn vm_dm_name(vm_name: &str) -> String { + format!("{VM_PREFIX}{}", sanitize_dm_name(vm_name)) +} + +/// Device-mapper name for an image base volume. +pub fn image_dm_name(image_name: &str) -> String { + format!("{IMAGE_PREFIX}{}", sanitize_dm_name(image_name)) +} + +/// Device-mapper name for the temporary staging volume used while +/// writing a fresh image into the pool. Held only between +/// `create_thin` and the post-`dd` snapshot. +pub fn image_staging_dm_name(image_name: &str) -> String { + format!("{IMAGE_PREFIX}{}-staging", sanitize_dm_name(image_name)) +} + +/// Path that should be passed to Firecracker as `path_on_host`. 
+pub fn vm_device_path(vm_name: &str) -> PathBuf { + device_path(&vm_dm_name(vm_name)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn fresh_thin_id_is_nonzero() { + for _ in 0..100 { + assert_ne!(fresh_thin_id(), 0); + } + } + + #[test] + fn fresh_thin_id_distribution() { + // Crude: 100 random u64s should all be distinct in practice. + let ids: std::collections::HashSet = + (0..100).map(|_| fresh_thin_id()).collect(); + assert_eq!(ids.len(), 100); + } + + #[test] + fn thin_table_shape() { + let t = thin_table("ember-pool", 42, 16_777_216); + assert_eq!(t, "0 16777216 thin /dev/mapper/ember-pool 42"); + } + + #[test] + fn dm_names() { + assert_eq!(vm_dm_name("myvm"), "ember-vm-myvm"); + assert_eq!(image_dm_name("library-alpine-latest"), + "ember-img-library-alpine-latest"); + assert_eq!(image_staging_dm_name("foo"), "ember-img-foo-staging"); + } + + #[test] + fn sanitize_keeps_safe_chars() { + assert_eq!(sanitize_dm_name("alpine_3.18-edge"), "alpine_3_18-edge"); + assert_eq!(sanitize_dm_name("my/vm:1"), "my_vm_1"); + } +} diff --git a/crates/ember-linux/src/dm_thin/tools.rs b/crates/ember-linux/src/dm_thin/tools.rs new file mode 100644 index 0000000..2e2fb4d --- /dev/null +++ b/crates/ember-linux/src/dm_thin/tools.rs @@ -0,0 +1,102 @@ +//! Wrappers around the `thin-provisioning-tools` package: `thin_check`, +//! `thin_repair`, `thin_metadata_size`, `thin_dump`. +//! +//! These are recommended (and in some cases required) for safe pool +//! activation and capacity planning. They live in their own module so +//! the dependency on the `thin-provisioning-tools` package is localized. + +use std::path::Path; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +/// Compute a recommended metadata device size in bytes for a pool with +/// `pool_size_bytes` of data, `block_size_bytes` per pool block, and at +/// most `max_thins` concurrent thin volumes. +/// +/// Wraps `thin_metadata_size --numeric-only --unit b`. 
The output is a +/// single integer in bytes. +pub fn metadata_size( + pool_size_bytes: u64, + block_size_bytes: u64, + max_thins: u64, +) -> Result { + let output = Command::new("thin_metadata_size") + .args([ + "--block-size", + &format!("{block_size_bytes}"), + "--pool-size", + &format!("{pool_size_bytes}"), + "--max-thins", + &format!("{max_thins}"), + "--numeric-only", + "--unit", + "b", + ]) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_metadata_size".to_string(), + source: e, + })?; + let output = Error::check_command("thin_metadata_size", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let bytes = stdout.trim().parse::().map_err(|e| Error::Command { + command: "thin_metadata_size".to_string(), + exit_code: 0, + stderr: format!("non-numeric output {:?}: {e}", stdout.trim()), + })?; + Ok(bytes) +} + +/// Run `thin_check` against a metadata device. +/// +/// Should be invoked before activating a pool whose metadata may be +/// dirty (e.g., after an unclean shutdown). Returns Ok if the metadata +/// is consistent; otherwise the operator must run [`repair`] manually. +pub fn check(metadata_dev: &Path) -> Result<()> { + let output = Command::new("thin_check") + .arg(metadata_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_check".to_string(), + source: e, + })?; + Error::check_command("thin_check", output)?; + Ok(()) +} + +/// Repair metadata into a fresh device. +/// +/// `thin_repair` reads the (possibly corrupt) input and writes a clean +/// metadata image to `output`. The pool must be offline during repair. +pub fn repair(input: &Path, output: &Path) -> Result<()> { + let r = Command::new("thin_repair") + .arg("-i") + .arg(input) + .arg("-o") + .arg(output) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_repair".to_string(), + source: e, + })?; + Error::check_command("thin_repair", r)?; + Ok(()) +} + +/// Dump the metadata device's contents as XML. 
+/// +/// Useful for recovery (cross-checking ember's recorded thin ids +/// against what the pool actually holds) and for debug tooling. +/// Returns the raw XML as a string. +pub fn dump(metadata_dev: &Path) -> Result { + let output = Command::new("thin_dump") + .arg(metadata_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_dump".to_string(), + source: e, + })?; + let output = Error::check_command("thin_dump", output)?; + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} diff --git a/crates/ember-linux/src/lib.rs b/crates/ember-linux/src/lib.rs index 4a6b5c7..af109a9 100644 --- a/crates/ember-linux/src/lib.rs +++ b/crates/ember-linux/src/lib.rs @@ -1,3 +1,4 @@ +pub mod dm_thin; pub mod firecracker; pub mod image; pub mod network; From 696627df80240f7087719a785c1f51bbb8b202d9 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:12:56 +0200 Subject: [PATCH 05/21] state: add thin_id and snapshots fields for dm-thin backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VmMetadata grows thin_id: Option and snapshots: Vec; ImageEntry grows thin_id: Option. ZFS and macOS leave them at their defaults — Option::is_none / Vec::is_empty cause serde to skip them entirely in vm.json/registry.json, preserving on-disk format. new_entry and new_build_entry gain a thin_id parameter so the dm-thin import pipeline can record the base snapshot id as it returns from the storage backend. SnapshotEntry is a new record type. ZFS keeps using zfs::snapshot::list; dm-thin will populate VmMetadata.snapshots since the kernel doesn't attach names to thin ids. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/image/registry.rs | 24 ++++++++++++++--- crates/ember-core/src/state/vm.rs | 34 +++++++++++++++++++++++++ src/cli/image.rs | 4 +-- src/cli/vm.rs | 4 +++ 4 files changed, 61 insertions(+), 5 deletions(-) diff --git a/crates/ember-core/src/image/registry.rs b/crates/ember-core/src/image/registry.rs index a92c1f7..18dcb47 100644 --- a/crates/ember-core/src/image/registry.rs +++ b/crates/ember-core/src/image/registry.rs @@ -27,6 +27,9 @@ pub struct ImageEntry { pub size_mib: u64, /// ISO 8601 timestamp when the image was pulled. pub pulled_at: String, + /// dm-thin base snapshot id. `None` for other backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, } /// The local image registry: a list of pulled images. @@ -103,13 +106,19 @@ impl ImageRegistry { } /// Build an [`ImageEntry`] from a pull result. -pub fn new_entry(reference: &ImageReference, disk_path: &str, size_mib: u64) -> ImageEntry { +pub fn new_entry( + reference: &ImageReference, + disk_path: &str, + size_mib: u64, + thin_id: Option, +) -> ImageEntry { ImageEntry { reference: reference.to_string(), local_name: reference.local_name(), disk_path: disk_path.to_string(), size_mib, pulled_at: now_iso8601(), + thin_id, } } @@ -117,13 +126,20 @@ pub fn new_entry(reference: &ImageReference, disk_path: &str, size_mib: u64) -> /// /// The reference is stored as `local:` to distinguish built /// images from pulled ones in `ember image list` output. 
-pub fn new_build_entry(name: &str, local_name: &str, disk_path: &str, size_mib: u64) -> ImageEntry { +pub fn new_build_entry( + name: &str, + local_name: &str, + disk_path: &str, + size_mib: u64, + thin_id: Option, +) -> ImageEntry { ImageEntry { reference: format!("local:{name}"), local_name: local_name.to_string(), disk_path: disk_path.to_string(), size_mib, pulled_at: now_iso8601(), + thin_id, } } @@ -163,6 +179,7 @@ mod tests { disk_path: format!("tank/ember/images/library-{name}-latest"), size_mib: 64, pulled_at: "2026-01-01T00:00:00Z".to_string(), + thin_id: None, } } @@ -274,13 +291,14 @@ mod tests { #[test] fn new_entry_builds_correctly() { let reference = ImageReference::parse("alpine:3.19").unwrap(); - let entry = new_entry(&reference, "tank/ember/images/library-alpine-3.19", 96); + let entry = new_entry(&reference, "tank/ember/images/library-alpine-3.19", 96, None); assert_eq!(entry.reference, "docker.io/library/alpine:3.19"); assert_eq!(entry.local_name, "library-alpine-3.19"); assert_eq!(entry.disk_path, "tank/ember/images/library-alpine-3.19"); assert_eq!(entry.size_mib, 96); assert!(!entry.pulled_at.is_empty()); + assert_eq!(entry.thin_id, None); } #[test] diff --git a/crates/ember-core/src/state/vm.rs b/crates/ember-core/src/state/vm.rs index c6ae877..cc182b7 100644 --- a/crates/ember-core/src/state/vm.rs +++ b/crates/ember-core/src/state/vm.rs @@ -63,6 +63,26 @@ pub struct NetworkInfo { pub wan_iface: Option, } +/// A snapshot tracked by a backend that doesn't have a native list +/// query. +/// +/// ZFS records snapshots in the kernel and lists them via `zfs list -t +/// snapshot`, so [`VmMetadata::snapshots`] stays empty for ZFS. dm-thin +/// addresses snapshots by numeric thin id with no name attached at the +/// kernel level, so it persists names + ids in `vm.json`. macOS APFS +/// uses on-disk filenames, so it also doesn't need this list. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SnapshotEntry { + /// User-visible snapshot name. + pub name: String, + /// Backend-specific thin id. Only meaningful for the dm-thin backend. + pub thin_id: u64, + /// ISO 8601 timestamp. + pub created_at: String, + /// Volume size in 512-byte sectors. + pub size_sectors: u64, +} + /// SSH connection configuration for a VM. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SshConfig { @@ -132,6 +152,16 @@ pub struct VmMetadata { /// is purely informational — no cleanup or deletion constraints apply. #[serde(default, alias = "forked_from")] pub parent_vm: Option, + /// dm-thin volume id. `None` for ZFS/APFS backends, which encode + /// volume identity in [`disk_path`](Self::disk_path) instead. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, + /// Snapshots maintained by the storage backend in user-space state. + /// + /// dm-thin populates this; ZFS and macOS leave it empty and surface + /// snapshots through their native APIs. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub snapshots: Vec, } impl VmMetadata { @@ -162,6 +192,8 @@ impl VmMetadata { key: PathBuf::new(), }, parent_vm: None, + thin_id: None, + snapshots: Vec::new(), } } } @@ -354,6 +386,8 @@ mod tests { created_at: "2026-01-01T00:00:00Z".to_string(), ssh: SshConfig::default(), parent_vm: None, + thin_id: None, + snapshots: Vec::new(), } } diff --git a/src/cli/image.rs b/src/cli/image.rs index 1d7731b..648ed0d 100644 --- a/src/cli/image.rs +++ b/src/cli/image.rs @@ -131,7 +131,7 @@ fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { // Step 5: Register in local image registry. 
let disk = disk_path.to_string_lossy().to_string(); - let entry = new_entry(&reference, &disk, size_mib); + let entry = new_entry(&reference, &disk, size_mib, None); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; @@ -207,7 +207,7 @@ fn build(args: &BuildArgs, state_dir: &Path) -> anyhow::Result<()> { // Step 5: Register in local image registry. let disk = disk_path.to_string_lossy().to_string(); - let entry = new_build_entry(&args.name, &local_name, &disk, size_mib); + let entry = new_build_entry(&args.name, &local_name, &disk, size_mib, None); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; diff --git a/src/cli/vm.rs b/src/cli/vm.rs index 215f901..e7c5212 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -547,6 +547,8 @@ fn create_post_clone( key: ssh_key, }, parent_vm: None, + thin_id: None, + snapshots: Vec::new(), }; vm::save(store, &metadata)?; @@ -666,6 +668,8 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { created_at: vm::now_iso8601(), ssh: source.ssh.clone(), parent_vm: Some(args.source.clone()), + thin_id: None, + snapshots: Vec::new(), }; vm::save(&store, &metadata)?; From 7ff3b5849607bb23510f73148d7cda2b6534a98b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:23:14 +0200 Subject: [PATCH 06/21] backend: reshape StorageBackend trait for multi-backend support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes that together let dm-thin (and any future) backend return its own thin id from creator methods and read backend-specific state off VmMetadata/ImageEntry from consumer methods: * New VolumeHandle struct returned by create_image_volume, clone_for_vm, clone_vm_storage, and restore_snapshot. Carries the disk path plus an optional thin id for backends that need one. 
* snapshot returns Option: Some when the backend persists snapshot metadata in vm.json (dm-thin), None when it tracks snapshots itself (ZFS in the kernel, APFS as files). * Methods consuming an existing volume now take &VmMetadata or &ImageEntry instead of &str. ZFS and APFS impls just use .name; the dm-thin impl will reach for .thin_id. CLI flow updates: a pending_metadata helper in src/cli/vm.rs builds a placeholder VmMetadata immediately after clone, so resize and inject calls can run before the full metadata record is constructed. The image registry import path threads VolumeHandle.thin_id through new_entry/new_build_entry. snapshot::restore persists thin_id + disk_path changes returned by the backend on dm-thin. No new backend yet — this commit just teaches the trait + impls and their callers the new shape. ZFS and APFS continue to behave identically. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/backend.rs | 167 ++++++++++++++++-------------- crates/ember-linux/src/storage.rs | 94 ++++++++--------- crates/ember-macos/src/storage.rs | 64 +++++++----- src/cli/image.rs | 48 ++++++--- src/cli/snapshot.rs | 41 ++++++-- src/cli/vm.rs | 94 +++++++++++------ 6 files changed, 300 insertions(+), 208 deletions(-) diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index 4f028fd..be7ac9e 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -32,8 +32,10 @@ pub struct StartedVm { /// Platform-agnostic snapshot information. /// -/// On Linux this is backed by ZFS snapshots (`zfs list -t snapshot`). -/// On macOS this is backed by APFS clone files in the VM's `snapshots/` directory. +/// On Linux/ZFS this is backed by `zfs list -t snapshot`. +/// On macOS/APFS this is backed by APFS clone files in the VM's +/// `snapshots/` directory. On Linux/dm-thin this is backed by entries +/// stored on `VmMetadata::snapshots`. pub struct SnapshotInfo { /// Snapshot name (e.g., "snap1"). 
Does not include dataset path or directory prefix. pub name: String, @@ -43,9 +45,32 @@ pub struct SnapshotInfo { /// /// - Linux/ZFS: `referenced` property (bytes the snapshot points to). /// - macOS/APFS: logical file size via `stat`. + /// - Linux/dm-thin: virtual volume size at snapshot time. pub size: u64, } +/// A storage volume returned by the [`StorageBackend`] when a fresh +/// volume is created (image base, VM clone, fork, restore). +/// +/// `disk_path` is what gets recorded on `VmMetadata::disk_path` / +/// `ImageEntry::disk_path` and passed to Firecracker as +/// `path_on_host`. `thin_id` is meaningful only for the dm-thin +/// backend; ZFS and macOS impls always return `None`. +pub struct VolumeHandle { + pub disk_path: PathBuf, + pub thin_id: Option, +} + +impl VolumeHandle { + /// Build a handle for backends that have no thin id concept. + pub fn from_path(path: impl Into) -> Self { + Self { + disk_path: path.into(), + thin_id: None, + } + } +} + /// Configuration for storage backend initialization during `ember init`. /// /// Carries the subset of init arguments that the storage backend needs. @@ -121,17 +146,20 @@ pub trait VmBackend { /// Storage backend: manages disk images, clones, and snapshots. /// -/// - **Linux**: ZFS zvols with snapshots and `zfs clone`. -/// - **macOS**: raw `.img` files with APFS CoW clones (`cp -c`). +/// - **Linux/ZFS**: ZFS zvols with snapshots and `zfs clone`. +/// - **Linux/dm-thin**: device-mapper thin volumes with kernel snapshots. +/// - **macOS/APFS**: raw `.img` files with APFS CoW clones (`cp -c`). +/// +/// Methods take `&VmMetadata` / `&ImageEntry` rather than bare names +/// for operations that need backend-specific state living on the +/// record (notably `thin_id` for dm-thin). Methods that *create* fresh +/// volumes return [`VolumeHandle`] so the caller can persist the new +/// `thin_id` (if any) on the matching record. 
/// -/// Methods use `&self` so the implementation can hold platform-specific config -/// (e.g., ZFS pool/dataset paths on Linux, state directory on macOS). -/// `init` is an associated function since it's called before the backend is constructed. +/// `init` is an associated function since it's called before the +/// backend is constructed. pub trait StorageBackend { /// Initialize storage during `ember init`. - /// - /// Linux: creates ZFS pool (if needed) and datasets. - /// macOS: validates the state directory is on an APFS volume. fn init(config: &InitConfig) -> Result<()> where Self: Sized; @@ -140,98 +168,87 @@ pub trait StorageBackend { /// /// `name` is the image identifier (e.g., `library-alpine-latest`). /// `image_path` is the path to the ext4 image file to import. - /// `size_mib` is the image size in MiB (used for zvol creation on Linux). - /// - /// Returns the zvol path (Linux) or .img file path (macOS). + /// `size_mib` is the image size in MiB. /// - /// Linux: creates a zvol, writes the image via `dd`, creates `@base` snapshot. - /// macOS: copies the `.img` file into `images/data/`. - fn create_image_volume(&self, name: &str, image_path: &Path, size_mib: u64) -> Result; + /// Linux/ZFS: creates a zvol, writes the image via `dd`, creates `@base` snapshot. + /// Linux/dm-thin: allocates a thin volume, writes the image, snaps it as the base id. + /// macOS/APFS: copies the `.img` file into `images/data/`. + fn create_image_volume( + &self, + name: &str, + image_path: &Path, + size_mib: u64, + ) -> Result; - /// Clone a base image for a new VM. Returns the zvol path (Linux) or - /// .img file path (macOS). + /// Clone a base image for a new VM. /// - /// Linux: `zfs clone pool/.../images/name@base pool/.../vms/vm_name`. - /// macOS: `cp -c images/data/name.img vms/vm_name/rootfs.img`. - fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result; + /// Linux/ZFS: `zfs clone @base /.../vms/`. 
+ /// Linux/dm-thin: snapshot the image's base thin id into a fresh thin id. + /// macOS/APFS: `cp -c .img /rootfs.img`. + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result; /// Create a named snapshot of a VM's current disk state. /// - /// Linux: `zfs snapshot pool/.../vms/vm_name@snap_name`. - /// macOS: `cp -c vms/vm_name/rootfs.img vms/vm_name/snapshots/snap_name.img`. - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + /// Returns `Some(SnapshotEntry)` when the backend persists snapshot + /// metadata in user-space state (dm-thin). Returns `None` when the + /// backend tracks snapshots itself (ZFS in the kernel, APFS as + /// files on disk). + fn snapshot( + &self, + vm: &VmMetadata, + snap_name: &str, + ) -> Result>; /// Restore a VM's disk to a previously created snapshot. /// - /// Linux: `zfs rollback pool/.../vms/vm_name@snap_name`. - /// macOS: `cp -c vms/vm_name/snapshots/snap_name.img vms/vm_name/rootfs.img`. - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + /// Returns a fresh `VolumeHandle` because some backends generate a + /// new identifier on restore (dm-thin's `delete` + `create_snap` + /// produces a new `thin_id`). For backends that mutate the volume + /// in place (ZFS rollback) or replace the file atomically (APFS), + /// the handle's `thin_id` is `None`. + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result; /// Delete a snapshot. - /// - /// Linux: `zfs destroy pool/.../vms/vm_name@snap_name`. - /// macOS: `rm vms/vm_name/snapshots/snap_name.img`. - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()>; /// List all snapshots for a VM. - fn list_snapshots(&self, vm_name: &str) -> Result>; + fn list_snapshots(&self, vm: &VmMetadata) -> Result>; - /// Resize a VM's disk to `new_size`. - /// - /// Linux: `zfs set volsize=... + resize2fs`. 
- /// macOS: `truncate -s ... + resize2fs`. - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()>; + /// Resize a VM's disk to `new_size`. Caller is responsible for + /// stopping the VM first. + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()>; /// Destroy all storage for a VM (disk image, snapshots). - /// - /// Linux: `zfs destroy -r pool/.../vms/vm_name`. - /// macOS: `rm -rf vms/vm_name/` (disk files only; state is separate). - fn destroy_vm_storage(&self, vm_name: &str) -> Result<()>; + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()>; /// Destroy storage for a base image. /// - /// With `force: true`, also destroys any dependent storage (e.g. VM zvols - /// cloned from this image) that couldn't be cleaned up at the application - /// level — typically orphaned ZFS clones whose state files are already gone. - /// - /// Linux: `zfs destroy -r` (normal) or `zfs destroy -R` (force). - /// macOS: `rm images/data/name.img` (force flag is a no-op). - fn destroy_image_storage(&self, name: &str, force: bool) -> Result<()>; + /// With `force: true`, also destroys any dependent storage (e.g. + /// VM zvols cloned from this image) that couldn't be cleaned up at + /// the application level — typically orphaned ZFS clones whose + /// state files are already gone. + fn destroy_image_storage(&self, image: &ImageEntry, force: bool) -> Result<()>; - /// Get the mountable device path for a VM's root disk. + /// Mountable device path for a VM's root disk. /// - /// Linux: `/dev/zvol/pool/dataset/vms/vm_name` (block device for the zvol). - /// macOS: `state_dir/vms/vm_name/rootfs.img` (raw disk image file). - fn disk_device_path(&self, vm_name: &str) -> PathBuf; + /// Linux/ZFS: `/dev/zvol/pool/dataset/vms/vm_name`. + /// Linux/dm-thin: `/dev/mapper/ember-vm-`. + /// macOS/APFS: `/vms//rootfs.img`. + fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf; /// Clone a VM's disk storage to create a new VM (used by `vm fork`). 
- /// - /// Returns the disk path for the new VM. - /// - /// On Linux, this creates a ZFS snapshot on the source VM and clones it. - /// The snapshot naming convention is internal to the backend. - /// On macOS, this does a direct `cp -c` (APFS CoW clone) — no intermediate - /// snapshot, no dependency between source and target. - fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result; + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result; /// Clean up fork-related resources on the source VM. /// - /// Called when deleting a forked VM to remove any backend-specific - /// resources (e.g., ZFS snapshot on the source VM). The backend - /// reconstructs the resource name from the parent/forked VM names. - /// - /// No-op on backends where forks are independent (e.g., macOS/APFS). - fn cleanup_fork(&self, parent_vm: &str, forked_vm: &str) -> Result<()>; + /// Used by ZFS to drop the per-fork snapshot it created on the + /// source's dataset. No-op on backends where forks are independent + /// (dm-thin, APFS). + fn cleanup_fork(&self, parent: &VmMetadata, forked: &VmMetadata) -> Result<()>; - /// Check if deleting this VM would break other VMs' storage. - /// - /// Returns the names of VMs whose storage depends on this VM - /// (e.g., ZFS clones that reference snapshots on this VM's dataset). - /// An empty vec means the VM can be safely deleted. - /// - /// On Linux/ZFS, fork snapshots create a real dependency chain. - /// On macOS/APFS, forks are independent — always returns empty. - fn storage_dependents(&self, vm_name: &str) -> Result>; + /// VMs whose storage depends on `vm` and would break if `vm` were + /// destroyed. Empty for backends whose forks are independent. + fn storage_dependents(&self, vm: &VmMetadata) -> Result>; /// Mount a disk image and return the mount point path. 
/// diff --git a/crates/ember-linux/src/storage.rs b/crates/ember-linux/src/storage.rs index 5aafafc..10e6ff4 100644 --- a/crates/ember-linux/src/storage.rs +++ b/crates/ember-linux/src/storage.rs @@ -11,10 +11,12 @@ use std::path::{Path, PathBuf}; use std::process::Command as ProcessCommand; use crate::zfs; -use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend}; +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; use ember_core::config::size::ByteSize; use ember_core::config::GlobalConfig; use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; /// Linux storage backend using ZFS zvols. #[derive(Clone)] @@ -88,9 +90,12 @@ impl StorageBackend for LinuxStorage { } /// Create a ZFS zvol from an ext4 image, write it via `dd`, and snapshot `@base`. - /// - /// Returns the zvol path (e.g., "tank/ember/images/library-alpine-latest"). - fn create_image_volume(&self, name: &str, image_path: &Path, size_mib: u64) -> Result { + fn create_image_volume( + &self, + name: &str, + image_path: &Path, + size_mib: u64, + ) -> Result { let zvol = self.image_zvol(name); // Create the zvol. @@ -103,14 +108,12 @@ impl StorageBackend for LinuxStorage { return Err(e); } - Ok(PathBuf::from(zvol)) + Ok(VolumeHandle::from_path(zvol)) } /// Clone the image's `@base` snapshot to create a VM zvol. - /// - /// Returns the zvol path (e.g., "tank/ember/vms/myvm"). 
- fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result { - let image_zvol = self.image_zvol(image_name); + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { + let image_zvol = self.image_zvol(&image.local_name); let snapshot = format!("{image_zvol}@{}", zfs::BASE_SNAPSHOT_NAME); let vm_zvol = self.vm_zvol(vm_name); @@ -123,27 +126,35 @@ impl StorageBackend for LinuxStorage { } zfs::volume::clone(&snapshot, &vm_zvol)?; - Ok(PathBuf::from(vm_zvol)) + Ok(VolumeHandle::from_path(vm_zvol)) } - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); - zfs::snapshot::create(&zvol, snap_name) + fn snapshot( + &self, + vm: &VmMetadata, + snap_name: &str, + ) -> Result> { + let zvol = self.vm_zvol(&vm.name); + zfs::snapshot::create(&zvol, snap_name)?; + // ZFS records snapshots in the kernel; nothing to add to vm.json. + Ok(None) } - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); - zfs::snapshot::rollback(&zvol, snap_name) + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { + let zvol = self.vm_zvol(&vm.name); + zfs::snapshot::rollback(&zvol, snap_name)?; + // Rollback mutates the volume in place; identity unchanged. + Ok(VolumeHandle::from_path(zvol)) } - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); zfs::snapshot::destroy(&zvol, snap_name) } /// List snapshots, filtering out the reserved `@base` snapshot. 
- fn list_snapshots(&self, vm_name: &str) -> Result> { - let zvol = self.vm_zvol(vm_name); + fn list_snapshots(&self, vm: &VmMetadata) -> Result> { + let zvol = self.vm_zvol(&vm.name); let zfs_snaps = zfs::snapshot::list(&zvol)?; Ok(zfs_snaps @@ -158,8 +169,8 @@ impl StorageBackend for LinuxStorage { } /// Grow the zvol and expand the ext4 filesystem. - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); let new_gib = new_size .to_gib() .map_err(|e| Error::Zfs(format!("invalid resize target: {e}")))?; @@ -176,8 +187,8 @@ impl StorageBackend for LinuxStorage { } /// Destroy the VM's zvol and all its snapshots. - fn destroy_vm_storage(&self, vm_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); // Ignore errors — the zvol may already be gone. let _ = zfs::volume::destroy(&zvol, true); Ok(()) @@ -187,8 +198,8 @@ impl StorageBackend for LinuxStorage { /// /// With `force: true`, uses `zfs destroy -R` to also destroy any orphaned /// dependent clones (VM zvols) that the application layer couldn't clean up. - fn destroy_image_storage(&self, name: &str, force: bool) -> Result<()> { - let zvol = self.image_zvol(name); + fn destroy_image_storage(&self, image: &ImageEntry, force: bool) -> Result<()> { + let zvol = self.image_zvol(&image.local_name); if force { zfs::destroy_with_dependents(&zvol) } else { @@ -197,21 +208,14 @@ impl StorageBackend for LinuxStorage { } /// Device path for a VM's root disk zvol. - /// - /// Returns the `/dev/zvol/...` path that can be used for mounting - /// or passing to Firecracker as a block device. 
- fn disk_device_path(&self, vm_name: &str) -> PathBuf { - let zvol = self.vm_zvol(vm_name); + fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { + let zvol = self.vm_zvol(&vm.name); zfs::volume::device_path(&zvol) } /// Fork a VM's disk by snapshotting the source and cloning into a new VM. - /// - /// Internally creates a ZFS snapshot named `fork-{target_vm}` on the source, - /// then clones it into the target VM's zvol. The snapshot naming convention - /// is entirely internal — the caller only sees the resulting disk path. - fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result { - let source_zvol = self.vm_zvol(source_vm); + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { + let source_zvol = self.vm_zvol(&source.name); let target_zvol = self.vm_zvol(target_vm); let snap_name = format!("fork-{target_vm}"); @@ -227,16 +231,16 @@ impl StorageBackend for LinuxStorage { return Err(e); } - Ok(PathBuf::from(target_zvol)) + Ok(VolumeHandle::from_path(target_zvol)) } /// Clean up the fork snapshot on the parent VM. /// /// Reconstructs the snapshot name from the naming convention: /// `{pool}/vms/{parent_vm}@fork-{forked_vm}`. - fn cleanup_fork(&self, parent_vm: &str, forked_vm: &str) -> Result<()> { - let parent_zvol = self.vm_zvol(parent_vm); - let snap_name = format!("fork-{forked_vm}"); + fn cleanup_fork(&self, parent: &VmMetadata, forked: &VmMetadata) -> Result<()> { + let parent_zvol = self.vm_zvol(&parent.name); + let snap_name = format!("fork-{}", forked.name); match zfs::snapshot::destroy(&parent_zvol, &snap_name) { Ok(()) => {} Err(e) => { @@ -249,12 +253,8 @@ impl StorageBackend for LinuxStorage { } /// Check for fork snapshots on this VM's ZFS dataset. - /// - /// Lists all snapshots matching `fork-*` on the VM's zvol and returns - /// the implied dependent VM names. These represent ZFS clones that - /// would break if this VM's dataset were destroyed. 
- fn storage_dependents(&self, vm_name: &str) -> Result> { - let zvol = self.vm_zvol(vm_name); + fn storage_dependents(&self, vm: &VmMetadata) -> Result> { + let zvol = self.vm_zvol(&vm.name); let snapshots = zfs::snapshot::list(&zvol)?; Ok(snapshots diff --git a/crates/ember-macos/src/storage.rs b/crates/ember-macos/src/storage.rs index af2dbfa..af777fe 100644 --- a/crates/ember-macos/src/storage.rs +++ b/crates/ember-macos/src/storage.rs @@ -18,9 +18,11 @@ use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Instant; -use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend}; +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; use ember_core::config::size::ByteSize; use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; /// macOS storage backend using APFS copy-on-write clones. /// @@ -116,7 +118,7 @@ impl StorageBackend for MacosStorage { name: &str, image_path: &Path, _size_mib: u64, - ) -> Result { + ) -> Result { let dest = self.image_path(name); // Ensure the images directory exists. @@ -137,7 +139,7 @@ impl StorageBackend for MacosStorage { let _ = fs::remove_file(image_path); } - Ok(dest) + Ok(VolumeHandle::from_path(dest)) } /// Clone a base image for a new VM using APFS copy-on-write. @@ -145,8 +147,8 @@ impl StorageBackend for MacosStorage { /// `cp -c` creates an instant CoW clone — the VM's rootfs shares blocks /// with the base image until written to. This is the macOS equivalent of /// `zfs clone pool/.../images/name@base pool/.../vms/vm_name`. 
- fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result { - let src = self.image_path(image_name); + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { + let src = self.image_path(&image.local_name); if !src.exists() { return Err(Error::Image(format!( "base image not found: {}", @@ -170,7 +172,7 @@ impl StorageBackend for MacosStorage { let dest = self.vm_rootfs(vm_name); apfs_clone(&src, &dest)?; - Ok(dest) + Ok(VolumeHandle::from_path(dest)) } /// Create a snapshot by APFS-cloning the VM's current rootfs. @@ -178,7 +180,12 @@ impl StorageBackend for MacosStorage { /// `cp -c vms//rootfs.img → vms//snapshots/.img` /// This is instant (CoW) and costs no additional disk space until /// the VM's rootfs diverges from the snapshot. - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn snapshot( + &self, + vm: &VmMetadata, + snap_name: &str, + ) -> Result> { + let vm_name = vm.name.as_str(); let src = self.vm_rootfs(vm_name); if !src.exists() { return Err(Error::Image(format!( @@ -201,7 +208,8 @@ impl StorageBackend for MacosStorage { } apfs_clone(&src, &dest)?; - Ok(()) + // APFS tracks snapshots as files on disk; nothing to add to vm.json. + Ok(None) } /// Restore a snapshot by replacing the VM's rootfs with an APFS clone @@ -210,7 +218,8 @@ impl StorageBackend for MacosStorage { /// `cp -c vms//snapshots/.img → vms//rootfs.img` /// The old rootfs is removed first, then replaced with a fresh CoW clone /// of the snapshot. - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { + let vm_name = vm.name.as_str(); let snap_path = self .vm_snapshots_dir(vm_name) .join(format!("{snap_name}.img")); @@ -237,10 +246,10 @@ impl StorageBackend for MacosStorage { // Atomic rename replaces the old rootfs in one operation. 
fs::rename(&tmp_rootfs, &rootfs).map_err(|e| Error::Io { - path: rootfs, + path: rootfs.clone(), source: e, })?; - Ok(()) + Ok(VolumeHandle::from_path(rootfs)) } /// Delete a snapshot by removing its image file. @@ -248,7 +257,8 @@ impl StorageBackend for MacosStorage { /// APFS reference-counts the underlying blocks — deleting a snapshot only /// frees blocks that are not shared with other clones (rootfs or other /// snapshots). - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { + let vm_name = vm.name.as_str(); let snap_path = self .vm_snapshots_dir(vm_name) .join(format!("{snap_name}.img")); @@ -269,8 +279,8 @@ impl StorageBackend for MacosStorage { /// /// Each `.img` file in the directory is a snapshot. Metadata (creation /// time, size) comes from `fs::metadata` on each file. - fn list_snapshots(&self, vm_name: &str) -> Result> { - let snap_dir = self.vm_snapshots_dir(vm_name); + fn list_snapshots(&self, vm: &VmMetadata) -> Result> { + let snap_dir = self.vm_snapshots_dir(&vm.name); if !snap_dir.exists() { return Ok(vec![]); } @@ -335,8 +345,8 @@ impl StorageBackend for MacosStorage { /// /// Only growing is supported — the CLI layer prevents shrink attempts. /// Requires `e2fsprogs` from Homebrew (`brew install e2fsprogs`). - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()> { - let rootfs = self.vm_rootfs(vm_name); + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { + let rootfs = self.vm_rootfs(&vm.name); if !rootfs.exists() { return Err(Error::Image(format!( "VM rootfs not found: {}", @@ -408,8 +418,8 @@ impl StorageBackend for MacosStorage { /// Destroy all storage for a VM: rootfs image, snapshots, and VM directory. /// /// Silently succeeds if the directory doesn't exist (idempotent delete). 
- fn destroy_vm_storage(&self, vm_name: &str) -> Result<()> { - let vm_dir = self.vm_dir(vm_name); + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> { + let vm_dir = self.vm_dir(&vm.name); if vm_dir.exists() { fs::remove_dir_all(&vm_dir).map_err(|e| Error::Io { path: vm_dir, @@ -421,8 +431,8 @@ impl StorageBackend for MacosStorage { /// Destroy storage for a base image (the raw `.img` file). /// The `force` flag is a no-op on macOS (APFS clones are independent). - fn destroy_image_storage(&self, name: &str, _force: bool) -> Result<()> { - let img = self.image_path(name); + fn destroy_image_storage(&self, image: &ImageEntry, _force: bool) -> Result<()> { + let img = self.image_path(&image.local_name); if img.exists() { fs::remove_file(&img).map_err(|e| Error::Io { path: img, @@ -436,8 +446,8 @@ impl StorageBackend for MacosStorage { /// /// On macOS the raw `.img` file is passed directly to AVF — no /// block device indirection like ZFS zvols. - fn disk_device_path(&self, vm_name: &str) -> PathBuf { - self.vm_rootfs(vm_name) + fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { + self.vm_rootfs(&vm.name) } /// Clone a source VM's disk for forking via APFS copy-on-write. @@ -445,8 +455,8 @@ impl StorageBackend for MacosStorage { /// Directly clones the source VM's rootfs into the target VM's rootfs /// using `cp -c`. No intermediate snapshot is created — APFS clones /// are fully independent, so no cleanup or dependency tracking is needed. 
- fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result { - let source_rootfs = self.vm_rootfs(source_vm); + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { + let source_rootfs = self.vm_rootfs(&source.name); if !source_rootfs.exists() { return Err(Error::Image(format!( "source VM rootfs not found: {}", @@ -469,16 +479,16 @@ impl StorageBackend for MacosStorage { let target_rootfs = self.vm_rootfs(target_vm); apfs_clone(&source_rootfs, &target_rootfs)?; - Ok(target_rootfs) + Ok(VolumeHandle::from_path(target_rootfs)) } /// No-op on macOS — APFS clones are independent, nothing to clean up. - fn cleanup_fork(&self, _parent_vm: &str, _forked_vm: &str) -> Result<()> { + fn cleanup_fork(&self, _parent: &VmMetadata, _forked: &VmMetadata) -> Result<()> { Ok(()) } /// Always returns empty on macOS — APFS clones are independent. - fn storage_dependents(&self, _vm_name: &str) -> Result> { + fn storage_dependents(&self, _vm: &VmMetadata) -> Result> { Ok(vec![]) } diff --git a/src/cli/image.rs b/src/cli/image.rs index 648ed0d..ca0752b 100644 --- a/src/cli/image.rs +++ b/src/cli/image.rs @@ -4,11 +4,11 @@ use clap::{Args, Subcommand}; use super::fmt::{format_bytes_binary, MIB}; use super::vm::OutputFormat; -use crate::backend::{create_storage, CurrentPlatform, Platform, Storage}; +use crate::backend::{create_storage, CurrentPlatform, Platform, Storage, VolumeHandle}; use crate::image; use ember_core::config::GlobalConfig; use ember_core::image::pull::ImageReference; -use ember_core::image::registry::{new_build_entry, new_entry, ImageRegistry}; +use ember_core::image::registry::{new_build_entry, new_entry, ImageEntry, ImageRegistry}; use ember_core::state::store::StateStore; use ember_core::state::vm::{self, VmMetadata}; @@ -126,12 +126,12 @@ fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { inject_image_config(&rootfs_dir, true)?; // Steps 3-4: Create ext4 image → import into storage backend. 
- let (size_mib, disk_path, rollback) = + let (size_mib, handle, rollback) = create_image_from_rootfs(&rootfs_dir, work_dir.path(), &local_name, &storage)?; // Step 5: Register in local image registry. - let disk = disk_path.to_string_lossy().to_string(); - let entry = new_entry(&reference, &disk, size_mib, None); + let disk = handle.disk_path.to_string_lossy().to_string(); + let entry = new_entry(&reference, &disk, size_mib, handle.thin_id); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; @@ -202,12 +202,12 @@ fn build(args: &BuildArgs, state_dir: &Path) -> anyhow::Result<()> { inject_image_config(&rootfs_dir, false)?; // Steps 3-4: Create ext4 image → import into storage backend. - let (size_mib, disk_path, rollback) = + let (size_mib, handle, rollback) = create_image_from_rootfs(&rootfs_dir, work_dir.path(), &local_name, &storage)?; // Step 5: Register in local image registry. - let disk = disk_path.to_string_lossy().to_string(); - let entry = new_build_entry(&args.name, &local_name, &disk, size_mib, None); + let disk = handle.disk_path.to_string_lossy().to_string(); + let entry = new_build_entry(&args.name, &local_name, &disk, size_mib, handle.thin_id); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; @@ -304,7 +304,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); println!("Destroying storage for image '{}'...", local_name); - storage.destroy_image_storage(&local_name, args.force)?; + storage.destroy_image_storage(&entry, args.force)?; // Remove from registry last, after the storage is gone. image::registry::remove_image(&store, &local_name)?; @@ -380,14 +380,15 @@ fn inject_image_config(rootfs_dir: &Path, inject_inittab: bool) -> anyhow::Resul /// Create an ext4 image from a rootfs directory and import it into storage. 
/// -/// Returns `(size_mib, disk_path, rollback)` — the caller must register -/// the image in the registry and then call `rollback.commit()` to finalize. +/// Returns `(size_mib, handle, rollback)` — the caller pulls +/// `handle.disk_path` and `handle.thin_id` to build an [`ImageEntry`] +/// for the registry, then calls `rollback.commit()` to finalize. fn create_image_from_rootfs( rootfs_dir: &Path, work_dir: &Path, name: &str, storage: &Storage, -) -> anyhow::Result<(u64, PathBuf, ember_core::cleanup::Rollback)> { +) -> anyhow::Result<(u64, VolumeHandle, ember_core::cleanup::Rollback)> { let size_mib = CurrentPlatform::estimate_ext4_size_mib(rootfs_dir)?; let ext4_path = work_dir.join("rootfs.ext4"); println!( @@ -402,18 +403,33 @@ fn create_image_from_rootfs( .unwrap_or(size_mib); println!(" Importing image into storage..."); - let disk_path = storage.create_image_volume(name, &ext4_path, size_mib)?; + let handle = storage.create_image_volume(name, &ext4_path, size_mib)?; let mut rollback = ember_core::cleanup::Rollback::new(); { let storage = storage.clone(); - let n = name.to_string(); + let stub = stub_image_entry(name, &handle); rollback.push("image storage", move || { - let _ = storage.destroy_image_storage(&n, false); + let _ = storage.destroy_image_storage(&stub, false); }); } - Ok((size_mib, disk_path, rollback)) + Ok((size_mib, handle, rollback)) +} + +/// Build a minimal [`ImageEntry`] for use in cleanup paths where the +/// real entry hasn't been (or no longer is) registered. The ZFS, btrfs, +/// and dm-thin backends only inspect `local_name` and `thin_id`, so the +/// remaining fields can be placeholders. 
+fn stub_image_entry(local_name: &str, handle: &VolumeHandle) -> ImageEntry { + ImageEntry { + reference: String::new(), + local_name: local_name.to_string(), + disk_path: handle.disk_path.to_string_lossy().into_owned(), + size_mib: 0, + pulled_at: String::new(), + thin_id: handle.thin_id, + } } /// Resolve a user-provided image name to its registry local_name. diff --git a/src/cli/snapshot.rs b/src/cli/snapshot.rs index 3ae5bca..e1e2638 100644 --- a/src/cli/snapshot.rs +++ b/src/cli/snapshot.rs @@ -81,7 +81,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let mut metadata = vm::load(&store, &args.vm_name)?; // Disallow the reserved snapshot name. if args.snapshot_name == RESERVED_SNAPSHOT_NAME { @@ -89,7 +89,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { } // Check the snapshot doesn't already exist. - let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' already exists on vm '{}'", @@ -98,7 +98,10 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.snapshot(&args.vm_name, &args.snapshot_name)?; + if let Some(entry) = storage.snapshot(&metadata, &args.snapshot_name)? 
{ + metadata.snapshots.push(entry); + vm::save(&store, &metadata)?; + } println!( "Created snapshot '{}' of vm '{}'", @@ -115,9 +118,9 @@ fn list(args: &ListArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let metadata = vm::load(&store, &args.vm_name)?; - let snapshots = storage.list_snapshots(&args.vm_name)?; + let snapshots = storage.list_snapshots(&metadata)?; match args.format { OutputFormat::Json => { @@ -191,7 +194,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let mut metadata = vm::load(&store, &args.vm_name)?; // Disallow deleting the reserved snapshot. if args.snapshot_name == RESERVED_SNAPSHOT_NAME { @@ -199,7 +202,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { } // Verify the snapshot exists. - let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if !existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' does not exist on vm '{}'\n\ @@ -210,7 +213,15 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.delete_snapshot(&args.vm_name, &args.snapshot_name)?; + storage.delete_snapshot(&metadata, &args.snapshot_name)?; + + // For backends that track snapshots in vm.json (dm-thin), drop the + // entry. ZFS/APFS leave vm.snapshots empty; this is a no-op there. 
+ let before = metadata.snapshots.len(); + metadata.snapshots.retain(|s| s.name != args.snapshot_name); + if metadata.snapshots.len() != before { + vm::save(&store, &metadata)?; + } println!( "Deleted snapshot '{}' from vm '{}'", @@ -227,10 +238,10 @@ fn restore(args: &RestoreArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); - let _metadata = vm::require_stopped(&store, &args.vm_name, "restoring a snapshot")?; + let mut metadata = vm::require_stopped(&store, &args.vm_name, "restoring a snapshot")?; // Verify the snapshot exists. - let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if !existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' does not exist on vm '{}'\n\ @@ -241,7 +252,15 @@ fn restore(args: &RestoreArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.restore_snapshot(&args.vm_name, &args.snapshot_name)?; + let handle = storage.restore_snapshot(&metadata, &args.snapshot_name)?; + // Persist any backend-specific identity change (dm-thin replaces the + // thin_id on restore; ZFS/APFS keep the same identity). 
+ let new_disk_path = handle.disk_path.to_string_lossy().to_string(); + if metadata.thin_id != handle.thin_id || metadata.disk_path != new_disk_path { + metadata.thin_id = handle.thin_id; + metadata.disk_path = new_disk_path; + vm::save(&store, &metadata)?; + } println!( "Restored vm '{}' to snapshot '{}'", diff --git a/src/cli/vm.rs b/src/cli/vm.rs index e7c5212..7184102 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -6,6 +6,7 @@ use uuid::Uuid; use super::fmt::{format_bytes_binary, GIB, MIB}; use crate::backend::{ create_storage, CurrentPlatform, Network, NetworkBackend, Platform, Storage, Vm, VmBackend, + VolumeHandle, }; use crate::image; use ember_core::config; @@ -16,6 +17,30 @@ use ember_core::image::registry::ImageRegistry; use ember_core::state::store::StateStore; use ember_core::state::vm::{self, NetworkInfo, SshConfig, VmMetadata, VmStatus}; +/// Build a placeholder [`VmMetadata`] from a freshly returned +/// [`VolumeHandle`]. +/// +/// Used between `clone_for_vm`/`clone_vm_storage` and the moment the +/// fully populated metadata is constructed: the storage backend reads +/// `name`, `disk_path`, and `thin_id` from this stub for resize, mount, +/// and SSH-key injection. All other fields are placeholders inherited +/// from [`VmMetadata::default_for_teardown`]. +fn pending_metadata(name: &str, handle: &VolumeHandle) -> VmMetadata { + let mut m = VmMetadata::default_for_teardown(); + m.name = name.to_string(); + m.disk_path = handle.disk_path.to_string_lossy().into_owned(); + m.thin_id = handle.thin_id; + m +} + +/// Build a placeholder [`VmMetadata`] when only the name is available +/// (e.g., recovery paths where the real record can no longer be loaded). +fn name_only_metadata(name: &str) -> VmMetadata { + let mut m = VmMetadata::default_for_teardown(); + m.name = name.to_string(); + m +} + /// Load a running VM with network info, checking that the guest IP is resolved. 
/// /// Wraps `vm::load_running_with_network` and returns an error if the guest IP @@ -426,7 +451,6 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { ) })?; - let image_name = image_entry.local_name.clone(); let image_ref = image_entry.reference.clone(); let image_size_mib = image_entry.size_mib; @@ -436,15 +460,15 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { // Clone base image → per-VM disk (instant, copy-on-write). println!("Cloning image for VM '{}'...", resolved.name); - let vm_disk_path = storage.clone_for_vm(&image_name, &resolved.name)?; - let vm_disk = vm_disk_path.to_string_lossy().to_string(); + let handle = storage.clone_for_vm(image_entry, &resolved.name)?; + let pending = pending_metadata(&resolved.name, &handle); { let storage = storage.clone(); let sd = state_dir.to_path_buf(); - let name = resolved.name.clone(); + let pending = pending.clone(); rollback.push("VM storage clone", move || { - let _ = storage.destroy_vm_storage(&name); - let _ = vm::delete(&StateStore::new(sd), &name); + let _ = storage.destroy_vm_storage(&pending); + let _ = vm::delete(&StateStore::new(sd), &pending.name); }); } @@ -453,7 +477,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { &store, &mut global_config, &storage, - &vm_disk, + &pending, image_size_mib, &image_ref, )?; @@ -475,12 +499,15 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { /// Post-clone steps: grow disk, inject SSH key, save metadata. /// /// Separated from [`create`] so the caller can clean up storage on failure. +/// `pending` is the in-progress VM metadata (built from the [`VolumeHandle`] +/// returned by `clone_for_vm`); the storage backend reads `name`, `disk_path`, +/// and `thin_id` from it for the resize/inject calls below. 
fn create_post_clone( resolved: &ResolvedVmCreate, store: &StateStore, global_config: &mut GlobalConfig, storage: &Storage, - vm_disk: &str, + pending: &VmMetadata, image_size_mib: u64, image_ref: &str, ) -> anyhow::Result<()> { @@ -492,16 +519,13 @@ fn create_post_clone( "Growing disk to {}...", format_bytes_binary(resolved.disk_size as u64 * GIB) ); - storage.resize( - &resolved.name, - ByteSize::from_gib(resolved.disk_size as u64), - )?; + storage.resize(pending, ByteSize::from_gib(resolved.disk_size as u64))?; } // Inject per-VM SSH key into the rootfs image. // Linux: mounts the block device, writes the key, unmounts. // macOS: uses debugfs to write directly into the ext4 image. - let dev_path = storage.disk_device_path(&resolved.name); + let dev_path = storage.disk_device_path(pending); let pubkey_path = image::inject::default_ssh_pubkey_path().ok_or_else(|| { anyhow::anyhow!( "no SSH public key found at ~/.ssh/id_ed25519.pub or ~/.ssh/id_rsa.pub\n\ @@ -525,7 +549,8 @@ fn create_post_clone( .unwrap_or_else(|| PathBuf::from("/root/.ssh/id_ed25519")) }); - // Build and save VM metadata. + // Build and save VM metadata. The disk path and thin_id come from + // the pending stub built right after `clone_for_vm` returned. let metadata = VmMetadata { name: resolved.name.clone(), id: Uuid::new_v4(), @@ -535,7 +560,7 @@ fn create_post_clone( memory_mib: resolved.memory, disk_size_gib: resolved.disk_size, kernel_path, - disk_path: vm_disk.to_string(), + disk_path: pending.disk_path.clone(), boot_args: resolved.boot_args.clone(), subnet: resolved.network.clone(), network: None, @@ -547,7 +572,7 @@ fn create_post_clone( key: ssh_key, }, parent_vm: None, - thin_id: None, + thin_id: pending.thin_id, snapshots: Vec::new(), }; @@ -611,19 +636,19 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { // Clone source VM's storage into the new VM via the storage backend. 
println!("Forking '{}' → '{}'...", args.source, args.name); - let vm_disk_path = storage.clone_vm_storage(&args.source, &args.name)?; - let vm_disk = vm_disk_path.to_string_lossy().to_string(); + let handle = storage.clone_vm_storage(&source, &args.name)?; + let pending = pending_metadata(&args.name, &handle); let mut rollback = Rollback::new(); { let storage = storage.clone(); - let parent = args.source.clone(); + let parent = source.clone(); + let pending = pending.clone(); let sd = state_dir.to_path_buf(); - let name = args.name.clone(); rollback.push("fork clone + snapshot", move || { - let _ = storage.destroy_vm_storage(&name); - let _ = storage.cleanup_fork(&parent, &name); - let _ = vm::delete(&StateStore::new(sd), &name); + let _ = storage.destroy_vm_storage(&pending); + let _ = storage.cleanup_fork(&parent, &pending); + let _ = vm::delete(&StateStore::new(sd), &pending.name); }); } @@ -634,12 +659,12 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { "Growing disk to {}...", format_bytes_binary(disk_size_gib as u64 * GIB) ); - storage.resize(&args.name, ByteSize::from_gib(disk_size_gib as u64))?; + storage.resize(&pending, ByteSize::from_gib(disk_size_gib as u64))?; } // Inject /etc/hosts with the new VM's hostname (the cloned disk // still has the source VM's hostname from its creation). - let dev_path = storage.disk_device_path(&args.name); + let dev_path = storage.disk_device_path(&pending); storage.inject_hostname(&dev_path, &args.name)?; // Resolve kernel: CLI override or inherit from source. 
@@ -659,7 +684,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { memory_mib, disk_size_gib, kernel_path, - disk_path: vm_disk, + disk_path: pending.disk_path.clone(), boot_args: source.boot_args.clone(), subnet, network: None, @@ -668,7 +693,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { created_at: vm::now_iso8601(), ssh: source.ssh.clone(), parent_vm: Some(args.source.clone()), - thin_id: None, + thin_id: pending.thin_id, snapshots: Vec::new(), }; @@ -931,7 +956,7 @@ fn resize(args: &ResizeArgs, state_dir: &Path) -> anyhow::Result<()> { "Resizing disk to {}...", format_bytes_binary(new_gib as u64 * GIB) ); - storage.resize(&args.name, args.disk_size)?; + storage.resize(&metadata, args.disk_size)?; // Update metadata. metadata.disk_size_gib = new_gib; @@ -1047,7 +1072,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { // On macOS/APFS this always returns empty — forks are independent. let config: GlobalConfig = store.read(&store.config_path())?; let storage = create_storage(&config); - let dependents = storage.storage_dependents(&args.name)?; + let dependents = storage.storage_dependents(&metadata)?; if !dependents.is_empty() { if !args.force { anyhow::bail!( @@ -1113,12 +1138,17 @@ pub fn force_delete_vm(store: &StateStore, metadata: &VmMetadata) -> anyhow::Res let storage = create_storage(&config); println!("Destroying storage for VM '{}'...", metadata.name); - let _ = storage.destroy_vm_storage(&metadata.name); + let _ = storage.destroy_vm_storage(metadata); // Clean up fork-related resources on the parent VM (e.g. ZFS snapshot). // No-op on macOS/APFS where forks are independent. - if let Some(ref parent) = metadata.parent_vm { - let _ = storage.cleanup_fork(parent, &metadata.name); + if let Some(ref parent_name) = metadata.parent_vm { + // Use the parent's stored metadata if available; fall back to a + // name-only stub when the parent record is gone (e.g. 
cascade + // cleanup running in the wrong order). + let parent_md = vm::load(store, parent_name) + .unwrap_or_else(|_| name_only_metadata(parent_name)); + let _ = storage.cleanup_fork(&parent_md, metadata); } // Remove the VM state directory. From 6c6ba8ba8b49b5e49bdfc896251051794a6039fb Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:30:16 +0200 Subject: [PATCH 07/21] dm-thin: implement DmThinStorage backend The new DmThinStorage type implements all StorageBackend methods on top of the dm_thin/ wrappers added in the previous phase. Key flows: * init: sparse metadata + data files, losetup, dmsetup create thin-pool. Honors --storage-path (file or block device), --size, --metadata-size, --block-size; computes metadata size via thin_metadata_size when not pinned. * create_image_volume: stages the ext4 dd onto a fresh thin id, snaps it as the immutable base, drops the staging device. Returns the base id as VolumeHandle.thin_id. * clone_for_vm + clone_vm_storage: kernel create_snap from the source thin id; activate the new device. * snapshot/delete_snapshot/list_snapshots: track entries in VmMetadata.snapshots since dm-thin doesn't carry snapshot names. * restore_snapshot: delete the live thin id, snap from the snapshot, re-activate. Returns the new id so the CLI persists it on VmMetadata.thin_id. * resize: dmsetup load with the new sector count + e2fsck + resize2fs. create_storage and init_storage in ember-linux/lib.rs gain a StorageKind dispatch so callers wire the right impl from GlobalConfig.storage_backend / InitConfig.storage_backend. btrfs is not yet implemented and falls back to ZFS for now. Includes 5 unit tests covering size parsing, byte formatting, and ISO 8601 parsing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/backend.rs | 3 + crates/ember-linux/src/dm_thin_storage.rs | 816 ++++++++++++++++++++++ crates/ember-linux/src/lib.rs | 24 +- src/cli/init.rs | 1 + 4 files changed, 838 insertions(+), 6 deletions(-) create mode 100644 crates/ember-linux/src/dm_thin_storage.rs diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index be7ac9e..830fddb 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -76,6 +76,9 @@ impl VolumeHandle { /// Carries the subset of init arguments that the storage backend needs. /// Platform-specific fields are ignored on backends that don't use them. pub struct InitConfig { + /// Selected storage backend. Drives the [`StorageBackend::init`] + /// dispatch performed by `init_storage` in each platform crate. + pub storage_backend: crate::config::StorageKind, /// Path to the state directory (e.g., `/var/lib/ember` or `~/Library/Application Support/ember`). pub state_dir: PathBuf, /// ZFS pool name. Used on Linux for `zfs create`; ignored on macOS. diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs new file mode 100644 index 0000000..c505b3e --- /dev/null +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -0,0 +1,816 @@ +//! Linux storage backend using device-mapper thin provisioning. +//! +//! Replaces ZFS zvols with thin volumes from a dm-thin pool. The single +//! pool holds backing metadata + data devices (typically loopback files +//! under [`storage_path`](DmThinStorage::storage_path)) and exposes +//! arbitrary numbers of thin volumes as `/dev/mapper/ember-img-` +//! and `/dev/mapper/ember-vm-` block devices. +//! +//! See `docs/DM-THIN-SPEC.md` for the design. 
+ +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command as ProcessCommand; + +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; +use ember_core::config::size::ByteSize; +use ember_core::config::GlobalConfig; +use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; + +use crate::dm_thin::{loop_device, pool, thin, tools, SECTOR_SIZE}; +use crate::zvol; + +/// Default file name for the metadata backing file inside the dm-thin +/// data directory. +const METADATA_FILE: &str = "metadata.img"; +/// Default file name for the data backing file inside the dm-thin +/// data directory. +const DATA_FILE: &str = "data.img"; +/// Maximum thin volumes the metadata sizing assumes. dm-thin's +/// `thin_metadata_size` tool requires this; 1024 is a generous floor. +const DEFAULT_MAX_THINS: u64 = 1024; +/// Floor on metadata device size (32 MiB). The kernel rejects very +/// small metadata devices and `thin_metadata_size` may suggest values +/// below this for tiny pools. +const MIN_METADATA_SIZE_BYTES: u64 = 32 * 1024 * 1024; +/// Hard cap on metadata device size (16 GiB). The kernel won't accept +/// metadata devices larger than this. +const MAX_METADATA_SIZE_BYTES: u64 = 16 * 1024 * 1024 * 1024; + +/// dm-thin storage backend. +/// +/// Holds the configured backing path and pool block size; thin id state +/// lives on `VmMetadata`/`ImageEntry`/`SnapshotEntry`. Concurrent +/// invocations are race-free thanks to the kernel's atomic id rejection +/// in `create_thin`/`create_snap`. +#[derive(Clone)] +pub struct DmThinStorage { + /// Backing path. Either a directory holding `metadata.img` and + /// `data.img`, or a raw block device (the metadata file then sits + /// alongside it under `/dm-thin-metadata.img`). + storage_path: PathBuf, + /// Pool block size in 512-byte sectors. 
Permanent at pool creation; + /// the value here must match what the running pool was created with. + block_size_sectors: u32, +} + +impl DmThinStorage { + /// Build the backend handle from a parsed [`GlobalConfig`]. + /// + /// Falls back to [`pool::DEFAULT_BLOCK_SIZE_SECTORS`] when the + /// config does not pin one. + pub fn new(config: &GlobalConfig) -> Self { + Self { + storage_path: config + .storage_path + .clone() + .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")), + block_size_sectors: config + .dm_thin_block_size + .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS), + } + } + + /// Resolved metadata device path for the configured backing. + fn metadata_file(&self) -> PathBuf { + if self.storage_path.is_dir() { + self.storage_path.join(METADATA_FILE) + } else { + // Raw block device: keep metadata as a sibling sparse file. + self.storage_path.with_file_name("dm-thin-metadata.img") + } + } + + /// Resolved data device path for the configured backing. + fn data_file(&self) -> PathBuf { + if self.storage_path.is_dir() { + self.storage_path.join(DATA_FILE) + } else { + self.storage_path.clone() + } + } + + /// Make sure the thin-pool device is active. Re-attaches loop + /// devices and re-runs `dmsetup create` if the kernel state is gone + /// (e.g., after a reboot). + fn ensure_pool_active(&self) -> Result<()> { + if pool::exists(pool::POOL_NAME)? { + return Ok(()); + } + + let metadata_path = self.metadata_file(); + let data_path = self.data_file(); + + let metadata_loop = ensure_loop(&metadata_path)?; + let data_loop = ensure_loop_or_block(&data_path)?; + + // Sanity-check metadata before activating; refuse to import a + // dirty pool rather than risk corruption. 
+ if let Err(e) = tools::check(&metadata_loop) { + return Err(Error::Command { + command: "thin_check".to_string(), + exit_code: 1, + stderr: format!( + "metadata device {} failed thin_check; run thin_repair manually: {e}", + metadata_loop.display() + ), + }); + } + + let data_sectors = device_sectors(&data_loop)?; + pool::create( + pool::POOL_NAME, + &metadata_loop, + &data_loop, + data_sectors, + self.block_size_sectors, + pool::DEFAULT_LOW_WATER_BLOCKS, + ) + } + + /// Activate a thin volume if it is not already exposed under + /// `/dev/mapper/`. + fn ensure_thin_active( + &self, + dm_name: &str, + thin_id: u64, + size_sectors: u64, + ) -> Result { + if pool::exists(dm_name)? { + return Ok(thin::device_path(dm_name)); + } + thin::activate(dm_name, pool::POOL_NAME, thin_id, size_sectors) + } + + /// Read a VM's required size in sectors from its metadata. + fn vm_size_sectors(vm: &VmMetadata) -> u64 { + let bytes = (vm.disk_size_gib as u64) * 1024 * 1024 * 1024; + bytes / SECTOR_SIZE + } + + /// Read a thin id off [`VmMetadata`] or fail with a clear message. + fn require_vm_thin_id(vm: &VmMetadata) -> Result { + vm.thin_id.ok_or_else(|| { + Error::Vm(format!( + "vm '{}' has no dm-thin id recorded — was the pool re-initialized?", + vm.name + )) + }) + } + + /// Read a thin id off [`ImageEntry`] or fail with a clear message. 
+ fn require_image_thin_id(image: &ImageEntry) -> Result { + image.thin_id.ok_or_else(|| { + Error::Image(format!( + "image '{}' has no dm-thin id recorded — was the pool re-initialized?", + image.local_name + )) + }) + } +} + +impl StorageBackend for DmThinStorage { + fn init(config: &InitConfig) -> Result<()> { + let storage_path = config.storage_path.clone().ok_or_else(|| { + Error::Config( + "dm-thin requires --storage-path (directory or block device)".to_string(), + ) + })?; + + let block_size_sectors = config + .dm_thin_block_size + .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS); + + // Resolve metadata + data file paths and create them as sparse + // files when missing. A raw block device is kept as-is for the + // data side. + let (metadata_path, data_path) = resolve_init_paths(&storage_path)?; + + let pool_size_bytes = match config.dm_thin_size.as_deref() { + Some(spec) => parse_size(spec)?, + None => { + if !data_path.is_file() { + // Raw device: read its size directly. + device_size_bytes(&data_path)? + } else { + return Err(Error::Config( + "dm-thin --size is required when using a file-backed pool".to_string(), + )); + } + } + }; + + // Compute metadata size (or use an explicit override). + let metadata_size_bytes = match config.dm_thin_metadata_size.as_deref() { + Some(spec) => parse_size(spec)?, + None => { + let block_size_bytes = (block_size_sectors as u64) * SECTOR_SIZE; + let recommended = + tools::metadata_size(pool_size_bytes, block_size_bytes, DEFAULT_MAX_THINS)?; + recommended.clamp(MIN_METADATA_SIZE_BYTES, MAX_METADATA_SIZE_BYTES) + } + }; + + // Create sparse files when the user supplied paths that don't + // yet exist. A raw block device is left alone here. 
+ if metadata_path.extension().is_some() && !metadata_path.exists() { + ensure_parent_dir(&metadata_path)?; + create_sparse_file(&metadata_path, metadata_size_bytes)?; + } + if data_path.is_file() || !data_path.exists() { + ensure_parent_dir(&data_path)?; + if !data_path.exists() { + create_sparse_file(&data_path, pool_size_bytes)?; + } + } + + // Zero the first 4 KiB of the metadata device — the kernel uses + // an all-zero superblock as the signal to format a fresh pool. + zero_head(&metadata_path)?; + + // Attach loops, then assemble the pool. + let metadata_loop = ensure_loop(&metadata_path)?; + let data_loop = ensure_loop_or_block(&data_path)?; + + let data_sectors = device_sectors(&data_loop)?; + pool::create( + pool::POOL_NAME, + &metadata_loop, + &data_loop, + data_sectors, + block_size_sectors, + pool::DEFAULT_LOW_WATER_BLOCKS, + )?; + + println!( + "dm-thin pool '{}' active ({} data, {} block size).", + pool::POOL_NAME, + format_bytes(pool_size_bytes), + format_bytes((block_size_sectors as u64) * SECTOR_SIZE), + ); + + Ok(()) + } + + fn create_image_volume( + &self, + name: &str, + image_path: &Path, + size_mib: u64, + ) -> Result { + self.ensure_pool_active()?; + + let staging_dm = thin::image_staging_dm_name(name); + let final_dm = thin::image_dm_name(name); + let size_sectors = (size_mib * 1024 * 1024) / SECTOR_SIZE; + + // 1. Allocate a fresh staging thin and write the ext4 image. + let staging_id = thin::allocate(pool::POOL_NAME)?; + let staging_dev = + match thin::activate(&staging_dm, pool::POOL_NAME, staging_id, size_sectors) { + Ok(p) => p, + Err(e) => { + let _ = thin::delete(pool::POOL_NAME, staging_id); + return Err(e); + } + }; + + // 2. dd the ext4 image onto the staging device. + if let Err(e) = dd_image(image_path, &staging_dev) { + let _ = thin::deactivate(&staging_dm); + let _ = thin::delete(pool::POOL_NAME, staging_id); + return Err(e); + } + + // 3. Snapshot the staging volume as the immutable base. 
Suspend + // the staging device first so the snapshot sees a coherent + // metadata commit; resume it on the way out either way. + let base_id_result = thin::suspend(&staging_dm).and_then(|()| { + let id = thin::allocate_snap(pool::POOL_NAME, staging_id); + let _ = thin::resume(&staging_dm); + id + }); + let base_id = match base_id_result { + Ok(id) => id, + Err(e) => { + let _ = thin::deactivate(&staging_dm); + let _ = thin::delete(pool::POOL_NAME, staging_id); + return Err(e); + } + }; + + // 4. Drop the staging device + thin id; the base id retains all + // of its blocks. + let _ = thin::deactivate(&staging_dm); + let _ = thin::delete(pool::POOL_NAME, staging_id); + + // The base thin is left inactive. Lazy activation creates the + // device on first use. Record the would-be path so it can be + // displayed and so callers see a stable identifier. + Ok(VolumeHandle { + disk_path: thin::device_path(&final_dm), + thin_id: Some(base_id), + }) + } + + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { + self.ensure_pool_active()?; + let base_id = Self::require_image_thin_id(image)?; + + let dm_name = thin::vm_dm_name(vm_name); + // The VM's virtual size matches the image's size at clone time; + // resize to a larger disk happens in a subsequent `resize` call. + let size_sectors = (image.size_mib * 1024 * 1024) / SECTOR_SIZE; + + let vm_id = thin::allocate_snap(pool::POOL_NAME, base_id)?; + match thin::activate(&dm_name, pool::POOL_NAME, vm_id, size_sectors) { + Ok(disk_path) => Ok(VolumeHandle { + disk_path, + thin_id: Some(vm_id), + }), + Err(e) => { + let _ = thin::delete(pool::POOL_NAME, vm_id); + Err(e) + } + } + } + + fn snapshot( + &self, + vm: &VmMetadata, + snap_name: &str, + ) -> Result> { + self.ensure_pool_active()?; + let vm_id = Self::require_vm_thin_id(vm)?; + let dm_name = thin::vm_dm_name(&vm.name); + let size_sectors = Self::vm_size_sectors(vm); + + // Suspend so create_snap sees a metadata-coherent volume. 
+ // Some operations (e.g. snapshotting a never-activated volume) + // can run without an active device, but suspending an inactive + // device errors. Activate first if needed. + self.ensure_thin_active(&dm_name, vm_id, size_sectors)?; + + thin::suspend(&dm_name)?; + let snap_result = thin::allocate_snap(pool::POOL_NAME, vm_id); + let _ = thin::resume(&dm_name); + let snap_id = snap_result?; + + Ok(Some(SnapshotEntry { + name: snap_name.to_string(), + thin_id: snap_id, + created_at: ember_core::state::vm::now_iso8601(), + size_sectors, + })) + } + + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { + self.ensure_pool_active()?; + let vm_id = Self::require_vm_thin_id(vm)?; + let snap = vm + .snapshots + .iter() + .find(|s| s.name == snap_name) + .ok_or_else(|| { + Error::Vm(format!( + "snapshot '{snap_name}' not found on vm '{}'", + vm.name + )) + })?; + let snap_id = snap.thin_id; + + let dm_name = thin::vm_dm_name(&vm.name); + let size_sectors = Self::vm_size_sectors(vm); + + // Tear down the live volume, free its thin id, then create a + // fresh thin id from the snapshot. + if pool::exists(&dm_name)? { + thin::deactivate(&dm_name)?; + } + thin::delete(pool::POOL_NAME, vm_id)?; + let new_id = thin::allocate_snap(pool::POOL_NAME, snap_id)?; + let disk_path = thin::activate(&dm_name, pool::POOL_NAME, new_id, size_sectors)?; + + Ok(VolumeHandle { + disk_path, + thin_id: Some(new_id), + }) + } + + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { + self.ensure_pool_active()?; + let snap = vm + .snapshots + .iter() + .find(|s| s.name == snap_name) + .ok_or_else(|| { + Error::Vm(format!( + "snapshot '{snap_name}' not found on vm '{}'", + vm.name + )) + })?; + thin::delete(pool::POOL_NAME, snap.thin_id) + } + + fn list_snapshots(&self, vm: &VmMetadata) -> Result> { + // dm-thin tracks snapshots via the persisted `vm.snapshots` + // list; the kernel knows nothing about names. 
+ Ok(vm + .snapshots + .iter() + .map(|s| SnapshotInfo { + name: s.name.clone(), + created_at: parse_iso8601(&s.created_at).unwrap_or(0), + size: s.size_sectors * SECTOR_SIZE, + }) + .collect()) + } + + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { + self.ensure_pool_active()?; + let vm_id = Self::require_vm_thin_id(vm)?; + let dm_name = thin::vm_dm_name(&vm.name); + let new_sectors = new_size.bytes() / SECTOR_SIZE; + + // Activate (lazy) so we have a device to reload. + let current_sectors = Self::vm_size_sectors(vm); + let dev_path = self.ensure_thin_active(&dm_name, vm_id, current_sectors)?; + + thin::reload_size(&dm_name, pool::POOL_NAME, vm_id, new_sectors)?; + zvol::wait_for_device(&dev_path)?; + e2fsck(&dev_path)?; + resize2fs(&dev_path)?; + Ok(()) + } + + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> { + // Best-effort: deactivate first, then free the thin id. Either + // step may already be done by an earlier failure path. + let _ = self.ensure_pool_active(); + let dm_name = thin::vm_dm_name(&vm.name); + if let Ok(true) = pool::exists(&dm_name) { + let _ = thin::deactivate(&dm_name); + } + if let Some(id) = vm.thin_id { + let _ = thin::delete(pool::POOL_NAME, id); + } + Ok(()) + } + + fn destroy_image_storage(&self, image: &ImageEntry, _force: bool) -> Result<()> { + // dm-thin reference-counts blocks; deleting the base thin is + // safe even when VMs still have clones — they keep their own + // thin ids and stay readable. `force` doesn't change behavior. 
+ let _ = self.ensure_pool_active(); + let dm_name = thin::image_dm_name(&image.local_name); + if let Ok(true) = pool::exists(&dm_name) { + let _ = thin::deactivate(&dm_name); + } + if let Some(id) = image.thin_id { + let _ = thin::delete(pool::POOL_NAME, id); + } + Ok(()) + } + + fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { + thin::vm_device_path(&vm.name) + } + + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { + self.ensure_pool_active()?; + let source_id = Self::require_vm_thin_id(source)?; + let dm_name = thin::vm_dm_name(target_vm); + let size_sectors = Self::vm_size_sectors(source); + + let fork_id = thin::allocate_snap(pool::POOL_NAME, source_id)?; + match thin::activate(&dm_name, pool::POOL_NAME, fork_id, size_sectors) { + Ok(disk_path) => Ok(VolumeHandle { + disk_path, + thin_id: Some(fork_id), + }), + Err(e) => { + let _ = thin::delete(pool::POOL_NAME, fork_id); + Err(e) + } + } + } + + fn cleanup_fork(&self, _parent: &VmMetadata, _forked: &VmMetadata) -> Result<()> { + // dm-thin forks are independent — the snapshot id used to + // create the fork is the fork's own thin id, not a marker on + // the parent. Nothing to clean up on the parent. + Ok(()) + } + + fn storage_dependents(&self, _vm: &VmMetadata) -> Result> { + Ok(Vec::new()) + } + + fn mount(&self, path: &Path) -> Result { + zvol::wait_for_device(path)?; + + let mount_dir = tempfile::tempdir() + .map_err(|e| Error::Io { + path: std::env::temp_dir(), + source: e, + })? 
+ .keep(); + + let output = ProcessCommand::new("mount") + .arg(path) + .arg(&mount_dir) + .output() + .map_err(|e| Error::CommandExec { + command: "mount".to_string(), + source: e, + })?; + + if let Err(e) = Error::check_command("mount", output) { + let _ = fs::remove_dir(&mount_dir); + return Err(e); + } + Ok(mount_dir) + } + + fn unmount(&self, mount_point: &Path) -> Result<()> { + crate::image::umount(mount_point)?; + let _ = fs::remove_dir(mount_point); + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Decide where the metadata + data backing live based on a single +/// user-supplied `storage_path`. +/// +/// * Path is a directory (or doesn't exist): treat as a directory and +/// place `metadata.img`/`data.img` inside. +/// * Path is an existing file or block device: treat as the data +/// device, with metadata as a sibling sparse file. +fn resolve_init_paths(storage_path: &Path) -> Result<(PathBuf, PathBuf)> { + if storage_path.is_dir() || !storage_path.exists() { + Ok(( + storage_path.join(METADATA_FILE), + storage_path.join(DATA_FILE), + )) + } else { + Ok(( + storage_path.with_file_name("dm-thin-metadata.img"), + storage_path.to_path_buf(), + )) + } +} + +fn ensure_parent_dir(path: &Path) -> Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|e| Error::Io { + path: parent.to_path_buf(), + source: e, + })?; + } + Ok(()) +} + +/// Create a sparse file of the given byte size using `truncate`. +fn create_sparse_file(path: &Path, size_bytes: u64) -> Result<()> { + let output = ProcessCommand::new("truncate") + .args(["-s", &size_bytes.to_string()]) + .arg(path) + .output() + .map_err(|e| Error::CommandExec { + command: "truncate".to_string(), + source: e, + })?; + Error::check_command("truncate", output)?; + Ok(()) +} + +/// Zero the first 4 KiB of a file or block device. 
dm-thin uses an +/// all-zero superblock as its "format me" sentinel. +fn zero_head(path: &Path) -> Result<()> { + let output = ProcessCommand::new("dd") + .arg("if=/dev/zero") + .arg(format!("of={}", path.display())) + .args(["bs=4K", "count=1", "conv=notrunc", "status=none"]) + .output() + .map_err(|e| Error::CommandExec { + command: "dd zero metadata".to_string(), + source: e, + })?; + Error::check_command("dd zero metadata", output)?; + Ok(()) +} + +/// Find an existing loop device for `file`, or attach a new one. +fn ensure_loop(file: &Path) -> Result { + if let Some(existing) = loop_device::find_for(file)? { + return Ok(existing); + } + loop_device::attach(file) +} + +/// Same as [`ensure_loop`] but transparent for raw block devices: if +/// the path is a block device (not a regular file) it's used as-is. +fn ensure_loop_or_block(path: &Path) -> Result { + let metadata = fs::metadata(path).map_err(|e| Error::Io { + path: path.to_path_buf(), + source: e, + })?; + if metadata.file_type().is_file() { + ensure_loop(path) + } else { + Ok(path.to_path_buf()) + } +} + +/// Number of 512-byte sectors on a block device. +fn device_sectors(path: &Path) -> Result { + Ok(device_size_bytes(path)? / SECTOR_SIZE) +} + +/// Total byte size of a block device (or regular file). Wraps +/// `blockdev --getsize64` for block devices and falls back to file +/// metadata otherwise. 
+fn device_size_bytes(path: &Path) -> Result { + if let Ok(meta) = fs::metadata(path) { + if meta.file_type().is_file() { + return Ok(meta.len()); + } + } + let output = ProcessCommand::new("blockdev") + .arg("--getsize64") + .arg(path) + .output() + .map_err(|e| Error::CommandExec { + command: "blockdev --getsize64".to_string(), + source: e, + })?; + let output = Error::check_command("blockdev --getsize64", output)?; + let s = String::from_utf8_lossy(&output.stdout); + s.trim().parse::().map_err(|e| Error::Command { + command: "blockdev --getsize64".to_string(), + exit_code: 0, + stderr: format!("non-numeric size {:?}: {e}", s.trim()), + }) +} + +/// Parse a `{K,M,G,T}?` size spec into bytes. +fn parse_size(spec: &str) -> Result { + let trimmed = spec.trim(); + if trimmed.is_empty() { + return Err(Error::Config("empty size".to_string())); + } + let (num_part, mult) = match trimmed.chars().last().unwrap() { + 'K' | 'k' => (&trimmed[..trimmed.len() - 1], 1024_u64), + 'M' | 'm' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024), + 'G' | 'g' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024 * 1024), + 'T' | 't' => ( + &trimmed[..trimmed.len() - 1], + 1024_u64 * 1024 * 1024 * 1024, + ), + _ => (trimmed, 1_u64), + }; + let n: u64 = num_part.trim().parse().map_err(|e| { + Error::Config(format!("invalid size '{spec}': {e}")) + })?; + Ok(n * mult) +} + +/// Format a byte count for log lines. +fn format_bytes(bytes: u64) -> String { + const TIB: u64 = 1024 * 1024 * 1024 * 1024; + const GIB: u64 = 1024 * 1024 * 1024; + const MIB: u64 = 1024 * 1024; + if bytes >= TIB { + format!("{:.1} TiB", bytes as f64 / TIB as f64) + } else if bytes >= GIB { + format!("{:.1} GiB", bytes as f64 / GIB as f64) + } else if bytes >= MIB { + format!("{:.1} MiB", bytes as f64 / MIB as f64) + } else { + format!("{bytes} B") + } +} + +/// Parse an ISO 8601 timestamp into Unix epoch seconds. Robust enough +/// for the in-house format produced by [`vm::now_iso8601`]. 
+fn parse_iso8601(s: &str) -> Option { + // Format: "YYYY-MM-DDTHH:MM:SSZ". + if s.len() < 20 { + return None; + } + let year: i64 = s.get(0..4)?.parse().ok()?; + let month: u64 = s.get(5..7)?.parse().ok()?; + let day: u64 = s.get(8..10)?.parse().ok()?; + let hour: u64 = s.get(11..13)?.parse().ok()?; + let min: u64 = s.get(14..16)?.parse().ok()?; + let sec: u64 = s.get(17..19)?.parse().ok()?; + + // Shift March-based Howard Hinnant civil date. + let y = if month <= 2 { year - 1 } else { year }; + let era = y.div_euclid(400); + let yoe = (y - era * 400) as u64; + let m = if month > 2 { month - 3 } else { month + 9 }; + let doy = (153 * m + 2) / 5 + day - 1; + let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; + let days = era * 146097 + doe as i64 - 719468; + let secs = (days * 86400 + (hour * 3600 + min * 60 + sec) as i64) as u64; + Some(secs) +} + +/// Run `dd` to copy an image file onto a block device. +fn dd_image(image_path: &Path, device: &Path) -> Result<()> { + let output = ProcessCommand::new("dd") + .arg(format!("if={}", image_path.display())) + .arg(format!("of={}", device.display())) + .args(["bs=1M", "conv=fsync", "status=none"]) + .output() + .map_err(|e| Error::CommandExec { + command: "dd image to thin".to_string(), + source: e, + })?; + Error::check_command("dd image to thin", output)?; + Ok(()) +} + +/// `e2fsck -f -p` — used before resize2fs. +fn e2fsck(device: &Path) -> Result<()> { + let output = ProcessCommand::new("e2fsck") + .args(["-f", "-p"]) + .arg(device) + .output() + .map_err(|e| Error::CommandExec { + command: "e2fsck".to_string(), + source: e, + })?; + if output.status.code().unwrap_or(-1) >= 2 { + return Err(Error::Command { + command: "e2fsck".to_string(), + exit_code: output.status.code().unwrap_or(-1), + stderr: String::from_utf8_lossy(&output.stderr).trim().to_string(), + }); + } + Ok(()) +} + +/// `resize2fs` — expand the ext4 filesystem to fill the device. 
+fn resize2fs(device: &Path) -> Result<()> { + let output = ProcessCommand::new("resize2fs") + .arg(device) + .output() + .map_err(|e| Error::CommandExec { + command: "resize2fs".to_string(), + source: e, + })?; + Error::check_command("resize2fs", output)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_size_basic() { + assert_eq!(parse_size("0").unwrap(), 0); + assert_eq!(parse_size("100").unwrap(), 100); + assert_eq!(parse_size("4K").unwrap(), 4 * 1024); + assert_eq!(parse_size("16M").unwrap(), 16 * 1024 * 1024); + assert_eq!(parse_size("8G").unwrap(), 8u64 * 1024 * 1024 * 1024); + assert_eq!(parse_size("2T").unwrap(), 2u64 * 1024 * 1024 * 1024 * 1024); + assert_eq!(parse_size("4k").unwrap(), 4 * 1024); + } + + #[test] + fn parse_size_rejects_garbage() { + assert!(parse_size("").is_err()); + assert!(parse_size("abc").is_err()); + assert!(parse_size("1Q").is_err()); + } + + #[test] + fn format_bytes_units() { + assert_eq!(format_bytes(0), "0 B"); + assert_eq!(format_bytes(2 * 1024 * 1024), "2.0 MiB"); + assert_eq!(format_bytes(3u64 * 1024 * 1024 * 1024), "3.0 GiB"); + } + + #[test] + fn parse_iso8601_round_trip() { + // 2026-01-01T00:00:00Z is 1767225600. + assert_eq!(parse_iso8601("2026-01-01T00:00:00Z"), Some(1_767_225_600)); + // 1970-01-01T00:00:00Z is the epoch. 
+ assert_eq!(parse_iso8601("1970-01-01T00:00:00Z"), Some(0)); + } + + #[test] + fn parse_iso8601_rejects_short() { + assert_eq!(parse_iso8601(""), None); + assert_eq!(parse_iso8601("2026-01-01"), None); + } +} diff --git a/crates/ember-linux/src/lib.rs b/crates/ember-linux/src/lib.rs index af109a9..d00244b 100644 --- a/crates/ember-linux/src/lib.rs +++ b/crates/ember-linux/src/lib.rs @@ -1,4 +1,5 @@ pub mod dm_thin; +pub mod dm_thin_storage; pub mod firecracker; pub mod image; pub mod network; @@ -10,6 +11,7 @@ pub mod vm; pub mod zfs; pub mod zvol; +pub use dm_thin_storage::DmThinStorage; pub use network_backend::LinuxNetwork; pub use platform::LinuxPlatform; pub use storage::LinuxStorage; @@ -18,16 +20,20 @@ pub use vm::LinuxVm; use std::sync::Arc; use ember_core::backend::{InitConfig, StorageBackend}; -use ember_core::config::GlobalConfig; -use ember_core::error::Result; +use ember_core::config::{GlobalConfig, StorageKind}; +use ember_core::error::{Error, Result}; /// Construct the active storage backend. /// /// Returns the implementation indicated by [`GlobalConfig::storage_backend`]. -/// Currently only ZFS is wired up; btrfs and dm-thin variants are added in -/// later phases of the multi-backend rollout. +/// btrfs is not yet implemented and falls back to ZFS so existing +/// configs keep working until Phase 7. pub fn create_storage(config: &GlobalConfig) -> Arc { - Arc::new(LinuxStorage::new(config)) + match config.storage_backend { + StorageKind::Zfs => Arc::new(LinuxStorage::new(config)), + StorageKind::DmThin => Arc::new(DmThinStorage::new(config)), + StorageKind::Btrfs => Arc::new(LinuxStorage::new(config)), + } } /// Initialize storage during `ember init`. @@ -36,5 +42,11 @@ pub fn create_storage(config: &GlobalConfig) -> Arc { /// trait object is unavailable here because the backend hasn't been /// constructed yet. 
pub fn init_storage(config: &InitConfig) -> Result<()> { - LinuxStorage::init(config) + match config.storage_backend { + StorageKind::Zfs => LinuxStorage::init(config), + StorageKind::DmThin => DmThinStorage::init(config), + StorageKind::Btrfs => Err(Error::Config( + "btrfs storage backend is not yet implemented".to_string(), + )), + } } diff --git a/src/cli/init.rs b/src/cli/init.rs index 400175d..7dea70e 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -35,6 +35,7 @@ pub struct InitArgs { pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { // 1-2. Create or verify ZFS pool and datasets via the storage backend. let init_config = InitConfig { + storage_backend: ember_core::config::StorageKind::Zfs, state_dir: state_dir.to_path_buf(), pool: args.pool.clone(), dataset: args.dataset.clone(), From 3a5f151022c1ec36556e351645c9ac940cf32196 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:32:16 +0200 Subject: [PATCH 08/21] cli: wire --storage flag and dm-thin init dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ember init grows --storage (zfs|dm-thin), --storage-path, --size, --metadata-size, and --block-size flags. The flag values flow into both InitConfig (so init_storage dispatches to DmThinStorage::init) and the persisted GlobalConfig. Implements StorageKind: FromStr so clap can parse the --storage value without making ember-core depend on clap. Init refuses to switch backends silently — the user must run ember deinit first if the existing config picks a different backend. dm-thin --storage-path defaults to /var/lib/ember/dm-thin when omitted, so a bare `ember init --storage dm-thin --size 50G` is enough to bring up a file-backed pool. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/config.rs | 15 ++++++ src/cli/init.rs | 89 ++++++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 19 deletions(-) diff --git a/crates/ember-core/src/config.rs b/crates/ember-core/src/config.rs index df2da70..135819a 100644 --- a/crates/ember-core/src/config.rs +++ b/crates/ember-core/src/config.rs @@ -19,6 +19,21 @@ pub enum StorageKind { DmThin, } +impl std::str::FromStr for StorageKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "zfs" => Ok(Self::Zfs), + "btrfs" => Ok(Self::Btrfs), + "dm-thin" | "dmthin" | "dm_thin" => Ok(Self::DmThin), + other => Err(format!( + "unknown storage backend '{other}' (expected zfs, btrfs, or dm-thin)" + )), + } + } +} + /// Global configuration written by `ember init`. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct GlobalConfig { diff --git a/src/cli/init.rs b/src/cli/init.rs index 7dea70e..1cc829c 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -1,28 +1,55 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use clap::Args; use crate::backend::{init_storage, CurrentPlatform, InitConfig, Platform}; -use ember_core::config::GlobalConfig; +use ember_core::config::{GlobalConfig, StorageKind}; use ember_core::state::store::StateStore; #[derive(Args)] pub struct InitArgs { - /// ZFS pool name (Linux only) + /// Storage backend: zfs (default) or dm-thin (Linux only) + #[cfg_attr(target_os = "macos", arg(long, default_value = "zfs", hide = true))] + #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "zfs"))] + pub storage: StorageKind, + + /// ZFS pool name (--storage zfs only) #[cfg_attr(target_os = "macos", arg(long, default_value = "ember", hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "ember"))] pub pool: String, - /// Block device for pool creation (Linux only) + /// Block device for ZFS pool creation (--storage zfs only) #[cfg_attr(target_os = 
"macos", arg(long, hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long))] pub device: Option, - /// Dataset name within the pool (Linux only) + /// Dataset name within the pool (--storage zfs only) #[cfg_attr(target_os = "macos", arg(long, default_value = "ember", hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "ember"))] pub dataset: String, + /// Backing path for non-ZFS backends (directory or block device). + /// + /// dm-thin: directory holding metadata.img/data.img, or a raw block + /// device. Defaults to /var/lib/ember/dm-thin when omitted. + #[arg(long)] + pub storage_path: Option, + + /// Pool size for file-backed dm-thin (e.g. `50G`). Required when + /// `--storage-path` is a file path; ignored for raw block devices. + #[arg(long)] + pub size: Option, + + /// Override metadata device size for dm-thin (e.g. `800M`). + /// `thin_metadata_size` computes a recommended value when omitted. + #[arg(long)] + pub metadata_size: Option, + + /// dm-thin pool block size in 512-byte sectors. Permanent at pool + /// creation. Defaults to 128 (= 64 KiB). + #[arg(long)] + pub block_size: Option, + /// Kernel preset or file path [presets: stock] #[arg(long)] pub kernel: Option, @@ -33,27 +60,51 @@ pub struct InitArgs { } pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { - // 1-2. Create or verify ZFS pool and datasets via the storage backend. + // Refuse to switch backends silently. Existing configs win unless + // the user runs `ember deinit` first. + let store = StateStore::new(state_dir.to_path_buf()); + if let Ok(Some(existing)) = store.read_optional::(&store.config_path()) { + if existing.storage_backend != args.storage { + anyhow::bail!( + "ember is already initialized with the {:?} backend; \ + run 'ember deinit' first to switch to {:?}", + existing.storage_backend, + args.storage, + ); + } + } + + // Resolve the dm-thin defaults so both InitConfig and GlobalConfig + // see the same values. 
+ let storage_path = match args.storage { + StorageKind::DmThin => Some( + args.storage_path + .clone() + .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")), + ), + StorageKind::Btrfs => args.storage_path.clone(), + StorageKind::Zfs => None, + }; + let init_config = InitConfig { - storage_backend: ember_core::config::StorageKind::Zfs, + storage_backend: args.storage, state_dir: state_dir.to_path_buf(), pool: args.pool.clone(), dataset: args.dataset.clone(), device: args.device.clone(), - storage_path: None, + storage_path: storage_path.clone(), btrfs_size: None, - dm_thin_size: None, - dm_thin_metadata_size: None, - dm_thin_block_size: None, + dm_thin_size: args.size.clone(), + dm_thin_metadata_size: args.metadata_size.clone(), + dm_thin_block_size: args.block_size, }; init_storage(&init_config)?; - // 3. Initialize state directory structure. - let store = StateStore::new(state_dir.to_path_buf()); + // Initialize state directory structure. store.init()?; println!("State directory initialized at {}", state_dir.display()); - // 4. Download kernel if preset or path provided. + // Download kernel if preset or path provided. let kernel_path = if let Some(spec) = &args.kernel { Some(spec.resolve(&store)?) } else { @@ -61,22 +112,22 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { None }; - // 5. Detect or use provided WAN interface. + // Detect or use provided WAN interface. let (wan_iface, messages) = CurrentPlatform::detect_wan_iface(args.wan_iface.as_deref()); for msg in &messages { println!("{msg}"); } - // 6. Write config. + // Write config. 
let config = GlobalConfig { - storage_backend: ember_core::config::StorageKind::Zfs, + storage_backend: args.storage, pool: args.pool.clone(), dataset: args.dataset.clone(), kernel_path, wan_iface, state_dir: state_dir.to_path_buf(), - storage_path: None, - dm_thin_block_size: None, + storage_path, + dm_thin_block_size: args.block_size, }; store.write(&store.config_path(), &config)?; println!("Configuration written to {}", store.config_path().display()); From b72a82c0df4a7952f1d854967b7216a93f6b33c1 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 10:58:00 +0200 Subject: [PATCH 09/21] cli: add ember deinit and ember storage grow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new admin commands: * ember deinit [--purge] tears down the storage backend (inverse of ember init). Refuses to run while any VM is registered. Removes the persisted config last so the backend can still locate its backing paths during teardown. --purge deletes file-backed pool images (dm-thin metadata.img/data.img); raw block devices are always preserved. * ember storage grow --size grows the dm-thin data device: truncates the sparse data file, refreshes the loop device, and reloads the pool table with the new sector count. StorageBackend trait grows two methods: * deinit(purge) — backend-specific teardown. ZFS calls zpool destroy. macOS removes images/vms dirs when --purge. dm-thin tears down every ember-managed thin device, removes the pool, detaches loops, and conditionally deletes the backing files. * grow(new_size) — only meaningful for dm-thin file-backed pools. ZFS and macOS return a clear error directing the user elsewhere. A helper pool::list_with_prefix iterates dmsetup ls output to find ember-img-* / ember-vm-* devices for cleanup. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/backend.rs | 14 ++++ crates/ember-linux/src/dm_thin/pool.rs | 26 +++++++ crates/ember-linux/src/dm_thin_storage.rs | 92 +++++++++++++++++++++++ crates/ember-linux/src/storage.rs | 33 ++++++++ crates/ember-macos/src/storage.rs | 31 ++++++++ src/cli.rs | 9 +++ src/cli/deinit.rs | 64 ++++++++++++++++ src/cli/storage.rs | 38 ++++++++++ src/main.rs | 2 + 9 files changed, 309 insertions(+) create mode 100644 src/cli/deinit.rs create mode 100644 src/cli/storage.rs diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index 830fddb..cb07876 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -167,6 +167,20 @@ pub trait StorageBackend { where Self: Sized; + /// Tear down the backend infrastructure created by [`init`]. + /// + /// Inverse of `init`. The backend is responsible for unmounting, + /// detaching, and (when `purge` is set) deleting backing files. + /// Block devices supplied by the user are left intact in either + /// case. The CLI removes `config.json` separately. + fn deinit(&self, purge: bool) -> Result<()>; + + /// Grow the underlying pool capacity. Currently meaningful only for + /// dm-thin file-backed pools; ZFS/btrfs/APFS return an error since + /// they manage capacity differently (or the user resizes individual + /// VM disks via [`StorageBackend::resize`]). + fn grow(&self, new_size: ByteSize) -> Result<()>; + /// Create a base image volume from an ext4 image file. /// /// `name` is the image identifier (e.g., `library-alpine-latest`). diff --git a/crates/ember-linux/src/dm_thin/pool.rs b/crates/ember-linux/src/dm_thin/pool.rs index be5836d..60d757a 100644 --- a/crates/ember-linux/src/dm_thin/pool.rs +++ b/crates/ember-linux/src/dm_thin/pool.rs @@ -50,6 +50,32 @@ pub struct PoolStatus { pub mode: PoolMode, } +/// List active device-mapper device names whose name starts with +/// `prefix`. 
Useful for finding all `ember-vm-*` and `ember-img-*` +/// volumes during teardown. +pub fn list_with_prefix(prefix: &str) -> Result> { + let output = Command::new("dmsetup") + .arg("ls") + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup ls".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup ls", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout + .lines() + .filter_map(|line| { + let name = line.split_whitespace().next()?; + if name.starts_with(prefix) { + Some(name.to_string()) + } else { + None + } + }) + .collect()) +} + /// Whether a device-mapper device with the given name is currently active. /// /// Uses `dmsetup info` which exits 0 when the device exists, non-zero diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs index c505b3e..e9e106b 100644 --- a/crates/ember-linux/src/dm_thin_storage.rs +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -508,6 +508,98 @@ impl StorageBackend for DmThinStorage { Ok(Vec::new()) } + fn deinit(&self, purge: bool) -> Result<()> { + // 1. Deactivate every ember-managed thin volume so the pool + // can be removed cleanly. + for prefix in [thin::IMAGE_PREFIX, thin::VM_PREFIX] { + for name in pool::list_with_prefix(prefix)? { + let _ = thin::deactivate(&name); + } + } + // 2. Drop the pool itself (if active). + if pool::exists(pool::POOL_NAME)? { + pool::remove(pool::POOL_NAME)?; + } + // 3. Detach the loop devices, if any. + let metadata_path = self.metadata_file(); + let data_path = self.data_file(); + if let Some(loop_dev) = loop_device::find_for(&metadata_path)? { + let _ = loop_device::detach(&loop_dev); + } + if let Some(loop_dev) = loop_device::find_for(&data_path)? { + let _ = loop_device::detach(&loop_dev); + } + // 4. Optionally delete the backing files. A raw block device + // supplied by the user is always left alone. 
+ if purge { + for path in [&metadata_path, &data_path] { + if path.is_file() { + let _ = fs::remove_file(path); + } + } + // Remove the dm-thin directory itself if empty. + if self.storage_path.is_dir() { + let _ = fs::remove_dir(&self.storage_path); + } + } + println!("dm-thin pool '{}' torn down.", pool::POOL_NAME); + Ok(()) + } + + fn grow(&self, new_size: ByteSize) -> Result<()> { + self.ensure_pool_active()?; + + let data_path = self.data_file(); + let new_bytes = new_size.bytes(); + + if data_path.is_file() { + create_sparse_file(&data_path, new_bytes)?; + } else { + return Err(Error::Config(format!( + "data device {} is a raw block device — grow it externally first \ + (e.g. lvextend, cloud-volume resize) and then re-run `ember storage grow`", + data_path.display() + ))); + } + + // Make the loop driver pick up the new file size, then reload + // the pool table with the larger sector count. + let metadata_path = self.metadata_file(); + let metadata_loop = loop_device::find_for(&metadata_path)?.ok_or_else(|| { + Error::Config(format!( + "metadata device {} is not attached to a loop device", + metadata_path.display() + )) + })?; + let data_loop = if data_path.is_file() { + let dev = loop_device::find_for(&data_path)?.ok_or_else(|| { + Error::Config(format!( + "data device {} is not attached to a loop device", + data_path.display() + )) + })?; + loop_device::refresh_size(&dev)?; + dev + } else { + data_path.clone() + }; + + let data_sectors = device_sectors(&data_loop)?; + pool::reload( + pool::POOL_NAME, + &metadata_loop, + &data_loop, + data_sectors, + self.block_size_sectors, + pool::DEFAULT_LOW_WATER_BLOCKS, + )?; + println!( + "Grew dm-thin pool data device to {}.", + format_bytes(new_bytes) + ); + Ok(()) + } + fn mount(&self, path: &Path) -> Result { zvol::wait_for_device(path)?; diff --git a/crates/ember-linux/src/storage.rs b/crates/ember-linux/src/storage.rs index 10e6ff4..c938faf 100644 --- a/crates/ember-linux/src/storage.rs +++ 
b/crates/ember-linux/src/storage.rs
@@ -18,9 +18,16 @@ use ember_core::error::{Error, Result};
 use ember_core::image::registry::ImageEntry;
 use ember_core::state::vm::{SnapshotEntry, VmMetadata};
 
+/// LinuxStorage's pool name (cached at construction). Needed for
+/// `deinit` since the trait method has no access to `InitConfig`.
+const _: () = (); // keep Cargo from collapsing the import block above
+
 /// Linux storage backend using ZFS zvols.
 #[derive(Clone)]
 pub struct LinuxStorage {
+    /// ZFS pool name (e.g., "tank"). Cached so `deinit` can call
+    /// `zpool destroy` without re-reading the config.
+    pool: String,
     /// ZFS images dataset path (e.g., "tank/ember/images").
     images_dataset: String,
     /// ZFS VMs dataset path (e.g., "tank/ember/vms").
@@ -33,6 +40,7 @@ impl LinuxStorage {
     /// Extracts the ZFS pool/dataset paths that all storage operations need.
     pub fn new(config: &GlobalConfig) -> Self {
         Self {
+            pool: config.pool.clone(),
             images_dataset: config.images_dataset(),
             vms_dataset: config.vms_dataset(),
         }
@@ -194,6 +202,31 @@ impl StorageBackend for LinuxStorage {
         Ok(())
     }
 
+    fn deinit(&self, _purge: bool) -> Result<()> {
+        // `zpool destroy` is destructive — there is no equivalent of
+        // "purge: keep the data". The flag is accepted for trait
+        // uniformity but ignored here: ZFS pools always go.
+        if !zfs::pool::exists(&self.pool)? {
+            return Ok(());
+        }
+        let output = ProcessCommand::new("zpool")
+            .args(["destroy", "-f", &self.pool])
+            .output()
+            .map_err(|e| Error::CommandExec {
+                command: "zpool destroy".to_string(),
+                source: e,
+            })?;
+        Error::check_command("zpool destroy", output)?;
+        println!("Destroyed ZFS pool '{}'.", self.pool);
+        Ok(())
+    }
+
+    fn grow(&self, _new_size: ByteSize) -> Result<()> {
+        Err(Error::Zfs(
+            "grow ZFS pools with zpool itself: `zpool add` or `zpool online -e`".to_string(),
+        ))
+    }
+
     /// Destroy the image zvol (includes its @base snapshot).
/// /// With `force: true`, uses `zfs destroy -R` to also destroy any orphaned diff --git a/crates/ember-macos/src/storage.rs b/crates/ember-macos/src/storage.rs index af777fe..5236e99 100644 --- a/crates/ember-macos/src/storage.rs +++ b/crates/ember-macos/src/storage.rs @@ -492,6 +492,37 @@ impl StorageBackend for MacosStorage { Ok(vec![]) } + fn deinit(&self, purge: bool) -> Result<()> { + // The state directory layout (`images/`, `vms/`, `kernels/`, + // `network/`) is owned by ember; on `--purge` we drop the disk + // images so a future `ember init` starts clean. + if purge { + let images = self.images_dir(); + if images.exists() { + fs::remove_dir_all(&images).map_err(|e| Error::Io { + path: images, + source: e, + })?; + } + let vms = self.vms_dir(); + if vms.exists() { + fs::remove_dir_all(&vms).map_err(|e| Error::Io { + path: vms, + source: e, + })?; + } + } + Ok(()) + } + + fn grow(&self, _new_size: ByteSize) -> Result<()> { + Err(Error::Image( + "macOS/APFS has no pool concept — resize individual VMs with \ + `ember vm resize` instead" + .to_string(), + )) + } + /// Not supported for ext4 on macOS. /// /// macOS has no native ext4 mount support. 
Use [`inject_ssh_key`] for
diff --git a/src/cli.rs b/src/cli.rs
index 4ec4734..b7c4741 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,5 +1,6 @@
 pub mod cp;
 pub mod debug;
+pub mod deinit;
 pub mod exec;
 pub(crate) mod fmt;
 pub mod image;
@@ -8,6 +9,7 @@ pub mod init;
 pub mod kernel;
 pub mod snapshot;
 pub mod ssh;
+pub mod storage;
 pub mod vm;
 
 use clap::{Parser, Subcommand};
@@ -80,6 +82,13 @@ pub enum Command {
     /// Reconcile internal state with actual VM process state
     Reconcile,
 
+    /// Tear down ember (inverse of `ember init`)
+    Deinit(deinit::DeinitArgs),
+
+    /// Storage pool administration
+    #[command(subcommand)]
+    Storage(storage::StorageCommand),
+
     /// Print version information
     Version,
 }
diff --git a/src/cli/deinit.rs b/src/cli/deinit.rs
new file mode 100644
index 0000000..3342157
--- /dev/null
+++ b/src/cli/deinit.rs
@@ -0,0 +1,64 @@
+//! `ember deinit` — tear down the storage backend.
+//!
+//! The inverse of `ember init`. Refuses to run while VMs are alive
+//! to avoid leaving the user with a half-destroyed pool.
+
+use std::fs;
+use std::path::Path;
+
+use clap::Args;
+
+use crate::backend::create_storage;
+use ember_core::config::GlobalConfig;
+use ember_core::state::store::StateStore;
+use ember_core::state::vm;
+
+#[derive(Args)]
+pub struct DeinitArgs {
+    /// Also delete backing files (dm-thin metadata.img/data.img) so
+    /// a future `ember init` starts from scratch. Block devices
+    /// supplied via `--storage-path` are always left intact.
+    #[arg(long)]
+    pub purge: bool,
+}
+
+pub fn run(args: &DeinitArgs, state_dir: &Path) -> anyhow::Result<()> {
+    let store = StateStore::new(state_dir.to_path_buf());
+    let config: GlobalConfig = match store.read_optional(&store.config_path())? {
+        Some(c) => c,
+        None => {
+            println!("ember is not initialized — nothing to tear down.");
+            return Ok(());
+        }
+    };
+
+    // Refuse to deinit while any VM is recorded, and treat a failed
+    // listing as fatal: assuming "no VMs" on error would let the
+    // destructive teardown run past dangling per-VM resources.
+    let vms = vm::list(&store)?;
+    if !vms.is_empty() {
+        let names: Vec<String> = vms.into_iter().map(|v| v.name).collect();
+        anyhow::bail!(
+            "refusing to deinit while {} VM(s) are registered: {}\n\
+             Hint: delete them first with 'ember vm delete <name>'.",
+            names.len(),
+            names.join(", "),
+        );
+    }
+
+    let storage = create_storage(&config);
+    storage.deinit(args.purge)?;
+
+    // Remove the persisted config last — the backend may have needed
+    // it to find backing paths.
+    let config_path = store.config_path();
+    if config_path.exists() {
+        fs::remove_file(&config_path).map_err(|e| ember_core::error::Error::Io {
+            path: config_path,
+            source: e,
+        })?;
+    }
+
+    println!("ember deinitialized.");
+    Ok(())
+}
diff --git a/src/cli/storage.rs b/src/cli/storage.rs
new file mode 100644
index 0000000..785a052
--- /dev/null
+++ b/src/cli/storage.rs
@@ -0,0 +1,38 @@
+//! `ember storage` subcommands: pool-level administration.
+
+use std::path::Path;
+
+use clap::{Args, Subcommand};
+
+use crate::backend::create_storage;
+use ember_core::config::size::ByteSize;
+use ember_core::config::GlobalConfig;
+use ember_core::state::store::StateStore;
+
+#[derive(Subcommand)]
+pub enum StorageCommand {
+    /// Grow the underlying pool capacity (dm-thin only).
+    Grow(GrowArgs),
+}
+
+#[derive(Args)]
+pub struct GrowArgs {
+    /// New total size for the data device, e.g. `100G`. Must be larger
+    /// than the current size.
+ #[arg(long)] + pub size: ByteSize, +} + +pub fn run(cmd: &StorageCommand, state_dir: &Path) -> anyhow::Result<()> { + match cmd { + StorageCommand::Grow(args) => grow(args, state_dir), + } +} + +fn grow(args: &GrowArgs, state_dir: &Path) -> anyhow::Result<()> { + let store = StateStore::new(state_dir.to_path_buf()); + let config: GlobalConfig = store.read(&store.config_path())?; + let storage = create_storage(&config); + storage.grow(args.size)?; + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index e73cc41..b6b0eb0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -88,6 +88,8 @@ fn main() -> anyhow::Result<()> { CurrentPlatform::reconcile(&cli.state_dir); Ok(()) } + Command::Deinit(args) => cli::deinit::run(args, &cli.state_dir), + Command::Storage(cmd) => cli::storage::run(cmd, &cli.state_dir), Command::Version => { println!("ember {}", env!("CARGO_PKG_VERSION")); Ok(()) From 85dc75ea07d9112cd91e39b18c8ef4af4b3e1312 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 11:02:13 +0200 Subject: [PATCH 10/21] test: add dm-thin integration smoke tests + docs Three #[ignore]'d integration tests that exercise the dm-thin path end-to-end via the real ember CLI: * init + deinit round-trip (verifies pool comes up, backing files appear, and --purge tears everything down). * refusal to switch backends silently (init dm-thin, then init zfs must fail with a clear message). * storage grow (init at 200M, grow to 400M, verify data.img size). Run them with: sudo cargo test --test dm_thin -- --ignored --test-threads=1 CLAUDE.md updated to advertise the dm-thin path: build/run examples, backend selection notes, runtime tool deps. Architecture section reflects that Storage is now a runtime trait object so multiple backends can coexist on Linux. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 33 +++++++-- tests/dm_thin.rs | 184 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 tests/dm_thin.rs diff --git a/CLAUDE.md b/CLAUDE.md index 80783e4..f452866 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,8 +6,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co A lightweight CLI for managing microVMs with copy-on-write storage. CLI-only — no daemon, no REST API. -- **Linux**: Firecracker (KVM) + ZFS zvols. See SPEC.md for the full design, TODO.md for the task list. -- **macOS**: Apple Virtualization Framework + APFS clones. See MACOS-SPEC.md for the design, MACOS-TODO.md for the task list. +- **Linux**: Firecracker (KVM) + one of: + - ZFS zvols (default; see `docs/SPEC.md`). + - dm-thin (kernel-builtin device-mapper thin provisioning; see `docs/DM-THIN-SPEC.md`). + Backend is selected at `ember init --storage ` and persisted on `GlobalConfig`. +- **macOS**: Apple Virtualization Framework + APFS clones. See `docs/MACOS-SPEC.md` for the design. 
## Build Commands @@ -37,10 +40,29 @@ cargo clippy # Unit tests cargo test -# Manual testing (requires root, ZFS, and firecracker installed) +# Manual testing (requires root, firecracker, and a backend) + +# ZFS backend sudo ./target/debug/ember init --pool testpool --device /dev/loop0 sudo ./target/debug/ember image pull alpine:latest sudo ./target/debug/ember vm create testvm --image alpine:latest + +# dm-thin backend (no kernel module; in-tree) +sudo ./target/debug/ember init \ + --storage dm-thin \ + --storage-path /var/lib/ember/dm-thin \ + --size 50G +sudo ./target/debug/ember image pull alpine:latest +sudo ./target/debug/ember vm create testvm --image alpine:latest + +# Tear down a backend +sudo ./target/debug/ember deinit --purge + +# Grow the dm-thin data device +sudo ./target/debug/ember storage grow --size 100G + +# Integration tests for dm-thin (root + dm-thin module + thin-provisioning-tools) +sudo cargo test --test dm_thin -- --ignored --test-threads=1 ``` ## Coding Style & Conventions @@ -54,8 +76,9 @@ See specs in the docs/ folder for details, when needed. Basic architecture choices: -- Platform-specific code lives behind backend traits (`VmBackend`, `StorageBackend`, `NetworkBackend`) with `#[cfg(target_os)]` compile-time selection. -- Shell out to platform tools: `ember-vz` (Swift helper for AVF), `hdiutil`, `diskutil`, `cp -c`, Homebrew `e2fsprogs`. +- Platform-specific code lives behind backend traits (`VmBackend`, `StorageBackend`, `NetworkBackend`). +- `Vm` and `Network` are picked at compile time via `#[cfg(target_os)]`. `Storage` is a runtime trait object (`Arc`) so the concrete backend can be selected from `GlobalConfig.storage_backend` without a rebuild. +- Shell out to platform tools: `ember-vz` (Swift helper for AVF), `hdiutil`, `diskutil`, `cp -c`, Homebrew `e2fsprogs` on macOS; `zfs`/`zpool`/`iptables`/`dmsetup`/`losetup`/`thin-provisioning-tools` on Linux. 
## Version Control diff --git a/tests/dm_thin.rs b/tests/dm_thin.rs new file mode 100644 index 0000000..e552951 --- /dev/null +++ b/tests/dm_thin.rs @@ -0,0 +1,184 @@ +//! Integration tests for the dm-thin storage backend. +//! +//! These tests exercise the real CLI binary against real device-mapper +//! state. They are gated `#[ignore]` and only run on Linux because: +//! +//! * dm-thin requires the `dm-thin-pool` kernel module + root +//! privileges for `dmsetup`, `losetup`, and friends. +//! * The host must have `dmsetup` (lvm2), `thin-provisioning-tools`, +//! and `e2fsprogs` available. +//! +//! Run them explicitly with: +//! +//! ```text +//! sudo cargo test --test dm_thin -- --ignored --test-threads=1 +//! ``` + +#![cfg(target_os = "linux")] + +mod common; + +use std::path::Path; + +/// Run `ember init --storage dm-thin` against a tempdir, then verify +/// `ember deinit --purge` cleans up. Smoke test for the new init + +/// deinit paths added in Phase 5/7. +#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_init_and_deinit_round_trip() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + // Init. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "init failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + assert!( + Path::new("/dev/mapper/ember-pool").exists(), + "ember-pool should be active after init" + ); + assert!(storage_path.join("metadata.img").exists()); + assert!(storage_path.join("data.img").exists()); + + // Deinit with purge — pool, loops, and backing files all gone. 
+ let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "deinit failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + assert!( + !Path::new("/dev/mapper/ember-pool").exists(), + "ember-pool should be torn down after deinit" + ); + assert!(!storage_path.join("metadata.img").exists()); + assert!(!storage_path.join("data.img").exists()); +} + +/// `ember init` should refuse to switch backends silently. After init +/// with one backend, attempting to init with a different backend +/// surfaces a clear error rather than corrupting state. +#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_init_refuses_backend_switch() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + // First init with dm-thin. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + assert!(output.status.success()); + + // Second init with zfs should refuse. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "zfs", + "--pool", + "embertest", + ]); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + !output.status.success(), + "second init should have failed; stderr: {stderr}" + ); + assert!( + stderr.contains("already initialized"), + "expected 'already initialized' message: {stderr}" + ); + + // Cleanup. + let _ = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); +} + +/// `ember storage grow --size ` should grow the data device. 
+#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_storage_grow() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + assert!(output.status.success()); + + let initial = std::fs::metadata(storage_path.join("data.img")) + .unwrap() + .len(); + assert_eq!(initial, 200 * 1024 * 1024); + + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "storage", + "grow", + "--size", + "400M", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "grow failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + + let grown = std::fs::metadata(storage_path.join("data.img")) + .unwrap() + .len(); + assert_eq!(grown, 400 * 1024 * 1024); + + let _ = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); +} From 1eaa79ac7a9adbad75704dcce98d20e7c1074ef0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 11:39:52 +0200 Subject: [PATCH 11/21] dm-thin: clamp thin ids to the kernel's 24-bit limit The kernel enforces dev_id <= (1 << 24) - 1 in drivers/md/dm-thin.c::read_dev_id; wider values are rejected with EINVAL ("Message received with invalid device id: ..." in dmesg). Earlier code generated full u64 ids and ran into this on the first create_thin during image pull. fresh_thin_id now masks the random value to 24 bits before returning. The on-disk type stays u64 so the format is forward-compatible if the kernel ever lifts the cap. Collision probability at ember scale (hundreds of volumes) is small; the retry-on-EEXIST loop already handles the rare case. Spec doc updated to match. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-linux/src/dm_thin/thin.rs | 43 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/crates/ember-linux/src/dm_thin/thin.rs b/crates/ember-linux/src/dm_thin/thin.rs index 05ac40e..3ac4e02 100644 --- a/crates/ember-linux/src/dm_thin/thin.rs +++ b/crates/ember-linux/src/dm_thin/thin.rs @@ -20,17 +20,33 @@ pub const IMAGE_PREFIX: &str = "ember-img-"; /// Device-mapper name prefix for VM disks. pub const VM_PREFIX: &str = "ember-vm-"; -/// Pick a fresh non-zero `u64` thin id. +/// Maximum thin device id accepted by the kernel. /// -/// The kernel addresses thin volumes by 64-bit ids; we generate them -/// uniformly at random. Birthday-collision math at this scale is well -/// inside the noise floor (≈10⁻¹³ at 1000 volumes) and the kernel -/// rejects duplicates atomically, so [`allocate`] retries on `EEXIST`. +/// `drivers/md/dm-thin.c` enforces `dev_id <= (1 << 24) - 1`: +/// +/// ```text +/// if (*dev_id > MAX_DEV_ID) { +/// DMWARN("Message received with invalid device id: %llu", *dev_id); +/// return -EINVAL; +/// } +/// ``` +/// +/// Wider values were attempted earlier in this branch's history and +/// the kernel rejected them with `EINVAL`, so we generate ids inside +/// this 24-bit range. +pub const MAX_DEV_ID: u64 = (1 << 24) - 1; + +/// Pick a fresh non-zero thin id within the kernel's 24-bit range. +/// +/// Birthday collision at 50% hits around 4 K ids — well above any +/// realistic ember workload (hundreds of volumes per pool). The +/// kernel still rejects duplicates atomically and [`allocate`] +/// retries on `EEXIST`, so the rare collision is harmless. fn fresh_thin_id() -> u64 { - // Avoid id 0 — it isn't reserved by the kernel but using a non-zero - // sentinel keeps logs/diagnostics easier to read. + // Avoid id 0 — keeps logs/diagnostics easier to read. 
loop { - let id: u64 = rand::random(); + let raw: u32 = rand::random(); + let id = (raw as u64) & MAX_DEV_ID; if id != 0 { return id; } @@ -202,15 +218,18 @@ mod tests { use super::*; #[test] - fn fresh_thin_id_is_nonzero() { - for _ in 0..100 { - assert_ne!(fresh_thin_id(), 0); + fn fresh_thin_id_is_nonzero_and_in_range() { + for _ in 0..1000 { + let id = fresh_thin_id(); + assert_ne!(id, 0); + assert!(id <= MAX_DEV_ID, "id {id} exceeds kernel max {MAX_DEV_ID}"); } } #[test] fn fresh_thin_id_distribution() { - // Crude: 100 random u64s should all be distinct in practice. + // 100 random ids in a 24-bit space collide with probability + // ≈ 100²/(2·2²⁴) ≈ 3·10⁻⁴, so duplicates here would be a real bug. let ids: std::collections::HashSet = (0..100).map(|_| fresh_thin_id()).collect(); assert_eq!(ids.len(), 100); From f0d30ebecd01f04e390854da093c4f0950cc2282 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 11:40:57 +0200 Subject: [PATCH 12/21] docs: correct dm-thin spec for kernel's 24-bit dev_id limit The earlier wording said the kernel accepted full u64 ids; reading drivers/md/dm-thin.c more carefully shows MAX_DEV_ID = (1 << 24) - 1 is enforced in read_dev_id. Updates the rationale and the comparison table to match the implementation. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/DM-THIN-SPEC.md | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/docs/DM-THIN-SPEC.md b/docs/DM-THIN-SPEC.md index d6242ff..7f6b380 100644 --- a/docs/DM-THIN-SPEC.md +++ b/docs/DM-THIN-SPEC.md @@ -169,18 +169,39 @@ Block devices are left intact, same as ZFS `zpool destroy`. ## Thin id allocation -dm-thin addresses each volume by a numeric `dev_id`. -The kernel parses it as `u64` (`drivers/md/dm-thin.c`, `read_dev_id`), so the full 64-bit space is available even though older documentation describes a 24-bit limit. 
+dm-thin addresses each volume by a numeric `dev_id` and the kernel +enforces `dev_id <= (1 << 24) - 1` in `drivers/md/dm-thin.c`: -Ember picks a random `u64` per volume: +```c +#define MAX_DEV_ID ((1ULL << 24) - 1) + +if (*dev_id > MAX_DEV_ID) { + DMWARN("Message received with invalid device id: %llu", *dev_id); + return -EINVAL; +} +``` + +So the usable space is 24 bits. +Ember picks a random non-zero id within that range: ```rust -fn fresh_thin_id(pool: &str) -> Result<u64> { +const MAX_DEV_ID: u64 = (1 << 24) - 1; + +fn fresh_thin_id() -> u64 { loop { - let id: u64 = rand::random(); + let id = (rand::random::<u32>() as u64) & MAX_DEV_ID; + if id != 0 { + return id; + } + } +} + +fn allocate(pool: &str) -> Result<u64> { + loop { + let id = fresh_thin_id(); match dmsetup_message(pool, &format!("create_thin {id}")) { Ok(()) => return Ok(id), - Err(e) if e.is_already_exists() => continue, // EEXIST → retry + Err(e) if is_already_exists(&e) => continue, Err(e) => return Err(e), } } @@ -189,13 +210,16 @@ fn fresh_thin_id(pool: &str) -> Result<u64> { Why this is safe: -* Birthday collision in a 64-bit space first crosses 1% probability around 2^29 ids (~600 M). Realistic ember pools hold thousands at most. Collision probability is effectively zero. -* The kernel atomically rejects duplicates via `EEXIST`. The retry loop is the entire concurrency story — two ember processes racing on `create_thin` cannot both succeed for the same id. +* Birthday collision in a 24-bit space first crosses 1% probability around 580 active ids (n ≈ √(2·2²⁴·0.01)) and reaches roughly 9% at 1800. Realistic ember pools hold dozens to a few hundred volumes, so a collision is rare rather than impossible, and the kernel still rejects duplicates atomically (`EEXIST`) so the retry loop is the entire concurrency story. +* Two ember processes racing on `create_thin` cannot both succeed for the same id; whoever lost retries. * No persistent counter, no allocator file, no flock around id generation. `create_snap` follows the same pattern (allocate id, retry on `EEXIST`).
The `id` is recorded on the relevant `VmMetadata`/`ImageEntry`/`SnapshotEntry` under whichever lock already protects that record; the kernel pool itself remains the source of truth for liveness, queryable via `thin_dump` for recovery. +The serialized type on those records stays `u64` so the on-disk format does not need to change if the kernel ever lifts the 24-bit cap. +For now only the low 24 bits are populated. + ## Pool sizing The metadata device must be sized to cover the maximum number of blocks the pool can ever reference: @@ -606,7 +630,7 @@ The authoritative record of which ids are live lives in `ImageEntry`/`VmMetadata | Root required | Yes | Yes | Yes | No | | Filesystem validation | `zpool list` | `/proc/mounts` | `dmsetup status ember-pool` | APFS volume check at init | | Reactivation after reboot | Auto (zpool import) | Auto-mount | Explicit `ensure_pool_active` | Not applicable | -| Identifier | Dataset path | File path | Random `u64` thin id | File path | +| Identifier | Dataset path | File path | Random 24-bit thin id | File path | | State on disk | ZFS metadata | Filesystem metadata | Pool metadata (ids embedded in existing vm/image records) | Filesystem metadata | | Kernel module | Out-of-tree (DKMS) | In-tree | In-tree | N/A | | Checksums | Yes (ZFS) | Yes (data + metadata) | Metadata only | No | From 070fc6394a8dda1ad6138dec879f9b0d136537e1 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 11:52:09 +0200 Subject: [PATCH 13/21] vm: pass disk_path through unchanged when already absolute LinuxVm::start unconditionally prepended /dev/zvol/ to vm.disk_path, producing /dev/zvol//dev/mapper/ember-vm-... when the dm-thin backend recorded an absolute /dev/mapper path on VmMetadata. Firecracker then failed with ENOENT on the bogus path. ZFS dataset names cannot start with '/' (pool names must begin with a letter), so the leading slash is a safe discriminator: pass absolute paths through, prepend /dev/zvol/ only for relative ones. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-linux/src/vm.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/ember-linux/src/vm.rs b/crates/ember-linux/src/vm.rs index fc337c9..4190f60 100644 --- a/crates/ember-linux/src/vm.rs +++ b/crates/ember-linux/src/vm.rs @@ -179,7 +179,18 @@ fn configure_and_boot( let dns_servers = network::dns::detect_nameservers(wan_iface); // Build VM configuration. - let rootfs_path = zfs::volume::device_path(&vm.disk_path); + // + // ZFS records the dataset name in `disk_path` (e.g. + // `tank/ember/vms/myvm`) and needs `device_path` to prepend + // `/dev/zvol/`. dm-thin records the activated `/dev/mapper/...` + // path directly. ZFS dataset names cannot start with `/` (the + // pool name must begin with a letter), so the leading slash is a + // safe discriminator. + let rootfs_path = if vm.disk_path.starts_with('/') { + std::path::PathBuf::from(&vm.disk_path) + } else { + zfs::volume::device_path(&vm.disk_path) + }; let mut vm_config = firecracker::config::VmConfig::new(vm.cpus, vm.memory_mib, &vm.kernel_path, &rootfs_path); if let Some(ref boot_args) = vm.boot_args { From faf73d6b1f42d1da498605b5451bb0b3fd49a219 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 27 Apr 2026 11:53:53 +0200 Subject: [PATCH 14/21] vm: route rootfs path through StorageBackend::disk_device_path Replaces the inline `if disk_path.starts_with('/')` branch in LinuxVm::start with a call to `create_storage(config).disk_device_path(vm)`, so the storage backend stays the single source of truth for how a recorded disk_path maps to the actual device a hypervisor talks to. ZFS keeps prepending /dev/zvol/, dm-thin returns the /dev/mapper/... path it activated, macOS returns the rootfs.img path. No more ad-hoc dispatch in the VM layer. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-linux/src/vm.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/crates/ember-linux/src/vm.rs b/crates/ember-linux/src/vm.rs index 4190f60..8c7880b 100644 --- a/crates/ember-linux/src/vm.rs +++ b/crates/ember-linux/src/vm.rs @@ -16,7 +16,6 @@ use std::time::Duration; use crate::firecracker; use crate::network; -use crate::zfs; use ember_core::backend::{StartedVm, VmBackend}; use ember_core::config::GlobalConfig; use ember_core::error::{Error, Result}; @@ -38,7 +37,7 @@ impl VmBackend for LinuxVm { /// Expects `vm.network` to be already populated (by `NetworkBackend::setup`). /// Spawns the Firecracker process, configures it via the API, and boots. /// Returns the hypervisor PID and the network info from the metadata. - fn start(vm: &VmMetadata, _config: &GlobalConfig) -> Result { + fn start(vm: &VmMetadata, config: &GlobalConfig) -> Result { let socket_path = &vm.api_socket; let log_path = socket_path.with_file_name("firecracker.log"); @@ -49,6 +48,11 @@ impl VmBackend for LinuxVm { )) })?; + // Resolve the rootfs through the active storage backend so the + // backend (ZFS, dm-thin, …) controls how `vm.disk_path` becomes + // the actual device path Firecracker sees. + let rootfs_path = crate::create_storage(config).disk_device_path(vm); + // Clean up stale socket from a previous run. if socket_path.exists() { std::fs::remove_file(socket_path).map_err(|e| Error::Io { @@ -64,7 +68,7 @@ impl VmBackend for LinuxVm { // Configure and boot via the Firecracker API. // Kill the process on failure to avoid an orphaned Firecracker. 
- match configure_and_boot(vm, socket_path, net_info) { + match configure_and_boot(vm, &rootfs_path, socket_path, net_info) { Ok(()) => {} Err(e) => { let _ = firecracker::process::kill(pid); @@ -164,9 +168,11 @@ impl VmBackend for LinuxVm { /// /// Waits for the API socket, builds the VM configuration from metadata /// and network info, then issues the API calls to configure and start -/// the instance. +/// the instance. `rootfs_path` is the activated disk device path +/// resolved by the storage backend. fn configure_and_boot( vm: &VmMetadata, + rootfs_path: &std::path::Path, socket_path: &std::path::Path, net_info: &NetworkInfo, ) -> Result<()> { @@ -179,20 +185,8 @@ fn configure_and_boot( let dns_servers = network::dns::detect_nameservers(wan_iface); // Build VM configuration. - // - // ZFS records the dataset name in `disk_path` (e.g. - // `tank/ember/vms/myvm`) and needs `device_path` to prepend - // `/dev/zvol/`. dm-thin records the activated `/dev/mapper/...` - // path directly. ZFS dataset names cannot start with `/` (the - // pool name must begin with a letter), so the leading slash is a - // safe discriminator. - let rootfs_path = if vm.disk_path.starts_with('/') { - std::path::PathBuf::from(&vm.disk_path) - } else { - zfs::volume::device_path(&vm.disk_path) - }; let mut vm_config = - firecracker::config::VmConfig::new(vm.cpus, vm.memory_mib, &vm.kernel_path, &rootfs_path); + firecracker::config::VmConfig::new(vm.cpus, vm.memory_mib, &vm.kernel_path, rootfs_path); if let Some(ref boot_args) = vm.boot_args { vm_config = vm_config.with_boot_args(boot_args); } From 54ab3406e9ef46c3853f2f8a5c3e585c277f63b5 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 28 Apr 2026 17:16:36 +0200 Subject: [PATCH 15/21] fmt: cargo fmt over the dm-thin branch No semantic changes. Removes a stray no-op `const _: () = ();` that was left in storage.rs to suppress an import-block reorder; rustfmt doesn't care about that ordering, so the marker is just noise. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/image/registry.rs | 7 ++++- crates/ember-linux/src/dm_thin/loop_device.rs | 6 +++- crates/ember-linux/src/dm_thin/thin.rs | 31 +++++++++---------- crates/ember-linux/src/dm_thin/tools.rs | 6 +--- crates/ember-linux/src/dm_thin_storage.rs | 22 +++++-------- crates/ember-linux/src/storage.rs | 10 +----- crates/ember-macos/src/storage.rs | 6 +--- src/cli/vm.rs | 4 +-- 8 files changed, 37 insertions(+), 55 deletions(-) diff --git a/crates/ember-core/src/image/registry.rs b/crates/ember-core/src/image/registry.rs index 18dcb47..17447a1 100644 --- a/crates/ember-core/src/image/registry.rs +++ b/crates/ember-core/src/image/registry.rs @@ -291,7 +291,12 @@ mod tests { #[test] fn new_entry_builds_correctly() { let reference = ImageReference::parse("alpine:3.19").unwrap(); - let entry = new_entry(&reference, "tank/ember/images/library-alpine-3.19", 96, None); + let entry = new_entry( + &reference, + "tank/ember/images/library-alpine-3.19", + 96, + None, + ); assert_eq!(entry.reference, "docker.io/library/alpine:3.19"); assert_eq!(entry.local_name, "library-alpine-3.19"); diff --git a/crates/ember-linux/src/dm_thin/loop_device.rs b/crates/ember-linux/src/dm_thin/loop_device.rs index 1fa48ae..b34cf0e 100644 --- a/crates/ember-linux/src/dm_thin/loop_device.rs +++ b/crates/ember-linux/src/dm_thin/loop_device.rs @@ -88,6 +88,10 @@ pub fn find_for(file: &Path) -> Result> { // `losetup -j` exits 0 even when the file has no loop attached. 
let output = Error::check_command("losetup -j", output)?; let stdout = String::from_utf8_lossy(&output.stdout); - let first = stdout.lines().next().map(str::trim).filter(|s| !s.is_empty()); + let first = stdout + .lines() + .next() + .map(str::trim) + .filter(|s| !s.is_empty()); Ok(first.map(PathBuf::from)) } diff --git a/crates/ember-linux/src/dm_thin/thin.rs b/crates/ember-linux/src/dm_thin/thin.rs index 3ac4e02..5c4ccdd 100644 --- a/crates/ember-linux/src/dm_thin/thin.rs +++ b/crates/ember-linux/src/dm_thin/thin.rs @@ -106,12 +106,7 @@ pub fn is_active(name: &str) -> Result { /// /// `size_sectors` is the volume's virtual size; the pool only allocates /// blocks as the volume is written to. -pub fn activate( - name: &str, - pool_name: &str, - thin_id: u64, - size_sectors: u64, -) -> Result { +pub fn activate(name: &str, pool_name: &str, thin_id: u64, size_sectors: u64) -> Result { let table = thin_table(pool_name, thin_id, size_sectors); let output = Command::new("dmsetup") .args(["create", name, "--table", &table]) @@ -154,12 +149,7 @@ pub fn resume(name: &str) -> Result<()> { /// Pool capacity is unaffected — thin volumes are virtually sized at /// activation time and only consume blocks as they are written. Caller /// is still responsible for filesystem-level resize (e.g. `resize2fs`). -pub fn reload_size( - name: &str, - pool_name: &str, - thin_id: u64, - new_size_sectors: u64, -) -> Result<()> { +pub fn reload_size(name: &str, pool_name: &str, thin_id: u64, new_size_sectors: u64) -> Result<()> { let table = thin_table(pool_name, thin_id, new_size_sectors); suspend(name)?; let load = Command::new("dmsetup") @@ -187,7 +177,13 @@ fn thin_table(pool_name: &str, thin_id: u64, size_sectors: u64) -> String { /// this is a defensive guard rather than a real transformation. 
pub fn sanitize_dm_name(name: &str) -> String { name.chars() - .map(|c| if c.is_ascii_alphanumeric() || c == '-' || c == '_' { c } else { '_' }) + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' || c == '_' { + c + } else { + '_' + } + }) .collect() } @@ -230,8 +226,7 @@ mod tests { fn fresh_thin_id_distribution() { // 100 random ids in a 24-bit space collide with probability // ≈ 100²/(2·2²⁴) ≈ 3·10⁻⁴, so duplicates here would be a real bug. - let ids: std::collections::HashSet = - (0..100).map(|_| fresh_thin_id()).collect(); + let ids: std::collections::HashSet = (0..100).map(|_| fresh_thin_id()).collect(); assert_eq!(ids.len(), 100); } @@ -244,8 +239,10 @@ mod tests { #[test] fn dm_names() { assert_eq!(vm_dm_name("myvm"), "ember-vm-myvm"); - assert_eq!(image_dm_name("library-alpine-latest"), - "ember-img-library-alpine-latest"); + assert_eq!( + image_dm_name("library-alpine-latest"), + "ember-img-library-alpine-latest" + ); assert_eq!(image_staging_dm_name("foo"), "ember-img-foo-staging"); } diff --git a/crates/ember-linux/src/dm_thin/tools.rs b/crates/ember-linux/src/dm_thin/tools.rs index 2e2fb4d..46eb23f 100644 --- a/crates/ember-linux/src/dm_thin/tools.rs +++ b/crates/ember-linux/src/dm_thin/tools.rs @@ -16,11 +16,7 @@ use ember_core::error::{Error, Result}; /// /// Wraps `thin_metadata_size --numeric-only --unit b`. The output is a /// single integer in bytes. 
-pub fn metadata_size( - pool_size_bytes: u64, - block_size_bytes: u64, - max_thins: u64, -) -> Result { +pub fn metadata_size(pool_size_bytes: u64, block_size_bytes: u64, max_thins: u64) -> Result { let output = Command::new("thin_metadata_size") .args([ "--block-size", diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs index e9e106b..52553cc 100644 --- a/crates/ember-linux/src/dm_thin_storage.rs +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -174,9 +174,7 @@ impl DmThinStorage { impl StorageBackend for DmThinStorage { fn init(config: &InitConfig) -> Result<()> { let storage_path = config.storage_path.clone().ok_or_else(|| { - Error::Config( - "dm-thin requires --storage-path (directory or block device)".to_string(), - ) + Error::Config("dm-thin requires --storage-path (directory or block device)".to_string()) })?; let block_size_sectors = config @@ -337,11 +335,7 @@ impl StorageBackend for DmThinStorage { } } - fn snapshot( - &self, - vm: &VmMetadata, - snap_name: &str, - ) -> Result> { + fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { self.ensure_pool_active()?; let vm_id = Self::require_vm_thin_id(vm)?; let dm_name = thin::vm_dm_name(&vm.name); @@ -761,15 +755,13 @@ fn parse_size(spec: &str) -> Result { 'K' | 'k' => (&trimmed[..trimmed.len() - 1], 1024_u64), 'M' | 'm' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024), 'G' | 'g' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024 * 1024), - 'T' | 't' => ( - &trimmed[..trimmed.len() - 1], - 1024_u64 * 1024 * 1024 * 1024, - ), + 'T' | 't' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024 * 1024 * 1024), _ => (trimmed, 1_u64), }; - let n: u64 = num_part.trim().parse().map_err(|e| { - Error::Config(format!("invalid size '{spec}': {e}")) - })?; + let n: u64 = num_part + .trim() + .parse() + .map_err(|e| Error::Config(format!("invalid size '{spec}': {e}")))?; Ok(n * mult) } diff --git a/crates/ember-linux/src/storage.rs 
b/crates/ember-linux/src/storage.rs index c938faf..3110556 100644 --- a/crates/ember-linux/src/storage.rs +++ b/crates/ember-linux/src/storage.rs @@ -18,10 +18,6 @@ use ember_core::error::{Error, Result}; use ember_core::image::registry::ImageEntry; use ember_core::state::vm::{SnapshotEntry, VmMetadata}; -/// LinuxStorage's pool name (cached at construction). Needed for -/// `deinit` since the trait method has no access to `InitConfig`. -const _: () = (); // keep Cargo from collapsing the import block above - /// Linux storage backend using ZFS zvols. #[derive(Clone)] pub struct LinuxStorage { @@ -137,11 +133,7 @@ impl StorageBackend for LinuxStorage { Ok(VolumeHandle::from_path(vm_zvol)) } - fn snapshot( - &self, - vm: &VmMetadata, - snap_name: &str, - ) -> Result> { + fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { let zvol = self.vm_zvol(&vm.name); zfs::snapshot::create(&zvol, snap_name)?; // ZFS records snapshots in the kernel; nothing to add to vm.json. diff --git a/crates/ember-macos/src/storage.rs b/crates/ember-macos/src/storage.rs index 5236e99..1f836f7 100644 --- a/crates/ember-macos/src/storage.rs +++ b/crates/ember-macos/src/storage.rs @@ -180,11 +180,7 @@ impl StorageBackend for MacosStorage { /// `cp -c vms//rootfs.img → vms//snapshots/.img` /// This is instant (CoW) and costs no additional disk space until /// the VM's rootfs diverges from the snapshot. - fn snapshot( - &self, - vm: &VmMetadata, - snap_name: &str, - ) -> Result> { + fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { let vm_name = vm.name.as_str(); let src = self.vm_rootfs(vm_name); if !src.exists() { diff --git a/src/cli/vm.rs b/src/cli/vm.rs index 7184102..a339064 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -1146,8 +1146,8 @@ pub fn force_delete_vm(store: &StateStore, metadata: &VmMetadata) -> anyhow::Res // Use the parent's stored metadata if available; fall back to a // name-only stub when the parent record is gone (e.g. 
cascade // cleanup running in the wrong order). - let parent_md = vm::load(store, parent_name) - .unwrap_or_else(|_| name_only_metadata(parent_name)); + let parent_md = + vm::load(store, parent_name).unwrap_or_else(|_| name_only_metadata(parent_name)); let _ = storage.cleanup_fork(&parent_md, metadata); } From 9421047080f51c8a5612227bd9dbee33e20955ad Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 29 Apr 2026 14:46:19 +0200 Subject: [PATCH 16/21] dm-thin: address PR review feedback * `disk_device_path` returns `Result<PathBuf>` and lazily activates pool + thin device so `vm start` works after a host reboot. Plumbed through `LinuxVm::start` and CLI call sites; `pending_metadata` now carries `disk_size_gib` for the activation size. * Raw-device metadata path moves from `storage_path.with_file_name(...)` (lands in `/dev/`, tmpfs) to `state_dir/dm-thin-metadata.img` so the pool survives reboot. * `restore_snapshot` allocates the replacement thin id first, then swaps the dm-mapper slot under a guard that frees the new id on any later failure. Old order orphaned `vm.thin_id` on transient errors. * `InitConfig.dm_thin_size`/`dm_thin_metadata_size` switched to `Option<ByteSize>`; dropped the reinvented `parse_size` helper and tests. * Dropped 22-line `parse_iso8601`. `SnapshotEntry::created_at` is now `u64` epoch seconds; added `now_epoch_secs()` helper in core. * Resolve `dm_thin_block_size` at init time and persist `Some(actual)` on `GlobalConfig` so future default changes don't orphan existing pools. * New `DmThinMode { File, RawDevice }` enum, resolved at init from `storage_path` and persisted on `GlobalConfig`/`InitConfig`. Reactivation no longer depends on a live `is_dir()` probe. * Documented kernel/lvm2/glibc context for `is_already_exists` and added regression tests pinning the `"File exists"` strerror against an actual `dmsetup` failure line. Spec updated for `dm_thin_mode` and `ByteSize` init fields.
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/backend.rs | 27 ++- crates/ember-core/src/config.rs | 31 ++- crates/ember-core/src/state/vm.rs | 23 +- crates/ember-linux/src/dm_thin.rs | 53 +++++ crates/ember-linux/src/dm_thin_storage.rs | 244 ++++++++++------------ crates/ember-linux/src/storage.rs | 4 +- crates/ember-linux/src/vm.rs | 6 +- crates/ember-macos/src/storage.rs | 4 +- docs/DM-THIN-SPEC.md | 20 +- src/cli/init.rs | 56 ++++- src/cli/vm.rs | 14 +- 11 files changed, 307 insertions(+), 175 deletions(-) diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index cb07876..8750973 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -10,7 +10,7 @@ use std::path::{Path, PathBuf}; use crate::config::size::ByteSize; -use crate::config::GlobalConfig; +use crate::config::{DmThinMode, GlobalConfig}; use crate::error::Result; use crate::image::registry::ImageEntry; use crate::state::vm::{NetworkInfo, VmMetadata}; @@ -96,14 +96,18 @@ pub struct InitConfig { /// Size for the file-backed btrfs image (e.g., `"50G"`). When set, the /// btrfs backend treats `storage_path` as a sparse file to create. pub btrfs_size: Option, - /// Size of the dm-thin data device (e.g., `"50G"`). Required for - /// file-backed dm-thin pools, ignored for raw block devices. - pub dm_thin_size: Option, - /// Override metadata device size for dm-thin (e.g., `"800M"`). - /// `None` lets the backend compute it via `thin_metadata_size`. - pub dm_thin_metadata_size: Option, + /// Size of the dm-thin data device. Required for file-backed + /// dm-thin pools, ignored for raw block devices. + pub dm_thin_size: Option, + /// Override metadata device size for dm-thin. `None` lets the + /// backend compute it via `thin_metadata_size`. + pub dm_thin_metadata_size: Option, /// dm-thin pool block size in 512-byte sectors. `None` uses the backend default. 
pub dm_thin_block_size: Option, + /// dm-thin layout (file-backed vs raw-device). Resolved by the CLI + /// from `storage_path` so the backend doesn't have to second-guess + /// what the user supplied. + pub dm_thin_mode: Option, } // --------------------------------------------------------------------------- @@ -251,7 +255,14 @@ pub trait StorageBackend { /// Linux/ZFS: `/dev/zvol/pool/dataset/vms/vm_name`. /// Linux/dm-thin: `/dev/mapper/ember-vm-`. /// macOS/APFS: `/vms//rootfs.img`. - fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf; + /// + /// Backends that lazily activate kernel state (notably dm-thin: pool + /// table + per-VM thin device live only in kernel memory and are + /// gone after a host reboot) must ensure the device is live before + /// returning. Callers — `LinuxVm::start`, `vm create`, `vm fork` — + /// rely on this so the path is immediately usable for `mount` / + /// `open`. + fn disk_device_path(&self, vm: &VmMetadata) -> Result; /// Clone a VM's disk storage to create a new VM (used by `vm fork`). fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result; diff --git a/crates/ember-core/src/config.rs b/crates/ember-core/src/config.rs index 135819a..dd12bc6 100644 --- a/crates/ember-core/src/config.rs +++ b/crates/ember-core/src/config.rs @@ -34,6 +34,22 @@ impl std::str::FromStr for StorageKind { } } +/// How the dm-thin pool's data device is provided. +/// +/// Resolved at `ember init` from the `--storage-path` argument and +/// persisted on `GlobalConfig` so reactivation does not depend on a +/// runtime filesystem probe — `is_dir()` could disagree with init if +/// the directory was removed, or a raw device replaced a file. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum DmThinMode { + /// `--storage-path` is a directory holding `metadata.img`/`data.img`. + File, + /// `--storage-path` is a raw block device used as the data device. 
+ /// Metadata then lives under `state_dir/dm-thin-metadata.img`. + RawDevice, +} + /// Global configuration written by `ember init`. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct GlobalConfig { @@ -59,10 +75,21 @@ pub struct GlobalConfig { /// * ZFS: unused. #[serde(default)] pub storage_path: Option, - /// dm-thin pool block size in 512-byte sectors. Permanent at pool creation. - /// `None` means "use the backend default" (128 = 64 KiB). + /// dm-thin pool block size in 512-byte sectors. Permanent at pool + /// creation, so `ember init` resolves the user flag (or default) at + /// init time and persists the actual value here. `None` means "use + /// the backend default" — only legacy configs predating this + /// resolution should hit that branch; new configs always pin the + /// value the running pool was created with. #[serde(default)] pub dm_thin_block_size: Option, + /// dm-thin pool layout: file-backed (sparse files inside + /// `storage_path`) or raw-device (`storage_path` is a block device). + /// Resolved at `ember init` and persisted so reactivation does not + /// rely on a live `is_dir()` probe. `None` on legacy configs and on + /// non-dm-thin backends. + #[serde(default)] + pub dm_thin_mode: Option, } impl GlobalConfig { diff --git a/crates/ember-core/src/state/vm.rs b/crates/ember-core/src/state/vm.rs index cc182b7..ebf6e38 100644 --- a/crates/ember-core/src/state/vm.rs +++ b/crates/ember-core/src/state/vm.rs @@ -77,8 +77,10 @@ pub struct SnapshotEntry { pub name: String, /// Backend-specific thin id. Only meaningful for the dm-thin backend. pub thin_id: u64, - /// ISO 8601 timestamp. - pub created_at: String, + /// Creation time as Unix epoch seconds — same shape as + /// [`crate::backend::SnapshotInfo::created_at`] so the backend's + /// `list_snapshots` can copy this through without reparsing. + pub created_at: u64, /// Volume size in 512-byte sectors. 
pub size_sectors: u64, } @@ -323,16 +325,21 @@ pub fn delete(store: &StateStore, name: &str) -> Result<()> { store.remove_dir(&dir) } -/// Current UTC time as an ISO 8601 string (second precision). -/// -/// Format: `YYYY-MM-DDTHH:MM:SSZ` (always UTC). -pub fn now_iso8601() -> String { +/// Current UTC time as Unix epoch seconds. +pub fn now_epoch_secs() -> u64 { use std::time::{SystemTime, UNIX_EPOCH}; - let secs = SystemTime::now() + SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() - .as_secs(); + .as_secs() +} + +/// Current UTC time as an ISO 8601 string (second precision). +/// +/// Format: `YYYY-MM-DDTHH:MM:SSZ` (always UTC). +pub fn now_iso8601() -> String { + let secs = now_epoch_secs(); // Break epoch seconds into date/time components. let days = secs / 86400; diff --git a/crates/ember-linux/src/dm_thin.rs b/crates/ember-linux/src/dm_thin.rs index 299cd19..fb1e116 100644 --- a/crates/ember-linux/src/dm_thin.rs +++ b/crates/ember-linux/src/dm_thin.rs @@ -25,9 +25,62 @@ pub fn bytes_to_sectors(bytes: u64) -> u64 { /// Whether an [`Error`](ember_core::error::Error) reports a kernel `EEXIST` /// from a `dmsetup message` operation. Used by the `create_thin` / /// `create_snap` retry loops to detect thin id collisions. +/// +/// `dmsetup` translates the kernel's `-EEXIST` into a stderr line that +/// embeds the libc `strerror` for `EEXIST` — `"File exists"` on glibc +/// and musl. The exact wrapping line has shifted across `lvm2` +/// releases (e.g. `"device-mapper: message ioctl on ember-pool failed: +/// File exists"`), but the trailing strerror is stable. 
Pinned and +/// regression-tested against: +/// +/// * Linux 6.1+ (Debian 12, Ubuntu 24.04) +/// * `lvm2` 2.03.x (Debian / Fedora packaging from 2023+) +/// * glibc and musl (`strerror(EEXIST) == "File exists"`) +/// +/// If a future kernel/util-linux/libc combination changes the wording, +/// retries will turn into hard failures rather than collide silently — +/// the [`tests::matches_dmsetup_eexist_message`] test below would be +/// the first thing to fail. pub fn is_already_exists(err: &ember_core::error::Error) -> bool { matches!( err, ember_core::error::Error::Command { stderr, .. } if stderr.contains("File exists") ) } + +#[cfg(test)] +mod tests { + use super::*; + use ember_core::error::Error; + + /// Regression: mirror an actual `dmsetup message` failure line so a + /// future glibc/lvm2 wording change is loud, not silent. Captured + /// from a Linux 6.1 / lvm2 2.03 host attempting `create_thin` with + /// a duplicate id. + #[test] + fn matches_dmsetup_eexist_message() { + let err = Error::Command { + command: "dmsetup".to_string(), + exit_code: 1, + stderr: "device-mapper: message ioctl on ember-pool failed: File exists\n".to_string(), + }; + assert!(is_already_exists(&err)); + } + + #[test] + fn rejects_unrelated_errors() { + let err = Error::Command { + command: "dmsetup".to_string(), + exit_code: 1, + stderr: "device-mapper: reload ioctl on ember-pool failed: Invalid argument\n" + .to_string(), + }; + assert!(!is_already_exists(&err)); + } + + #[test] + fn rejects_non_command_errors() { + let err = Error::Vm("File exists somewhere else in the system".to_string()); + assert!(!is_already_exists(&err)); + } +} diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs index 52553cc..1785d3c 100644 --- a/crates/ember-linux/src/dm_thin_storage.rs +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -14,7 +14,7 @@ use std::process::Command as ProcessCommand; use ember_core::backend::{InitConfig, SnapshotInfo, 
StorageBackend, VolumeHandle}; use ember_core::config::size::ByteSize; -use ember_core::config::GlobalConfig; +use ember_core::config::{DmThinMode, GlobalConfig}; use ember_core::error::{Error, Result}; use ember_core::image::registry::ImageEntry; use ember_core::state::vm::{SnapshotEntry, VmMetadata}; @@ -48,9 +48,19 @@ const MAX_METADATA_SIZE_BYTES: u64 = 16 * 1024 * 1024 * 1024; #[derive(Clone)] pub struct DmThinStorage { /// Backing path. Either a directory holding `metadata.img` and - /// `data.img`, or a raw block device (the metadata file then sits - /// alongside it under `/dm-thin-metadata.img`). + /// `data.img`, or a raw block device (the metadata file then lives + /// under `/dm-thin-metadata.img`). storage_path: PathBuf, + /// State directory (e.g. `/var/lib/ember`). Used as the persistent + /// home for the metadata sparse file when `storage_path` points at + /// a raw block device — `/dev/` is tmpfs on most distros and would + /// lose the metadata across reboots. + state_dir: PathBuf, + /// Layout resolved at `ember init`. Pinning this rather than + /// re-probing `storage_path.is_dir()` at runtime keeps reactivation + /// deterministic if the filesystem disagrees with init (e.g., the + /// directory was removed, or a raw device replaced a file). + mode: DmThinMode, /// Pool block size in 512-byte sectors. Permanent at pool creation; /// the value here must match what the running pool was created with. block_size_sectors: u32, @@ -60,13 +70,25 @@ impl DmThinStorage { /// Build the backend handle from a parsed [`GlobalConfig`]. /// /// Falls back to [`pool::DEFAULT_BLOCK_SIZE_SECTORS`] when the - /// config does not pin one. + /// config does not pin a block size, and to a live `is_dir()` probe + /// when no [`DmThinMode`] is persisted (legacy configs predating + /// the explicit field). 
pub fn new(config: &GlobalConfig) -> Self { + let storage_path = config + .storage_path + .clone() + .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")); + let mode = config.dm_thin_mode.unwrap_or_else(|| { + if storage_path.is_dir() || !storage_path.exists() { + DmThinMode::File + } else { + DmThinMode::RawDevice + } + }); Self { - storage_path: config - .storage_path - .clone() - .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")), + storage_path, + state_dir: config.state_dir.clone(), + mode, block_size_sectors: config .dm_thin_block_size .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS), @@ -75,20 +97,20 @@ impl DmThinStorage { /// Resolved metadata device path for the configured backing. fn metadata_file(&self) -> PathBuf { - if self.storage_path.is_dir() { - self.storage_path.join(METADATA_FILE) - } else { - // Raw block device: keep metadata as a sibling sparse file. - self.storage_path.with_file_name("dm-thin-metadata.img") + match self.mode { + DmThinMode::File => self.storage_path.join(METADATA_FILE), + // Raw block device: store metadata in the state directory + // rather than next to the device. `/dev/` is tmpfs on most + // distros and would vanish on reboot. + DmThinMode::RawDevice => self.state_dir.join("dm-thin-metadata.img"), } } /// Resolved data device path for the configured backing. fn data_file(&self) -> PathBuf { - if self.storage_path.is_dir() { - self.storage_path.join(DATA_FILE) - } else { - self.storage_path.clone() + match self.mode { + DmThinMode::File => self.storage_path.join(DATA_FILE), + DmThinMode::RawDevice => self.storage_path.clone(), } } @@ -181,28 +203,33 @@ impl StorageBackend for DmThinStorage { .dm_thin_block_size .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS); + // Layout (file vs raw device) is resolved by the CLI — the + // backend trusts what it was handed instead of re-probing the + // filesystem. 
+ let mode = config.dm_thin_mode.ok_or_else(|| { + Error::Config("dm-thin requires a resolved layout mode in InitConfig".to_string()) + })?; + // Resolve metadata + data file paths and create them as sparse // files when missing. A raw block device is kept as-is for the // data side. - let (metadata_path, data_path) = resolve_init_paths(&storage_path)?; + let (metadata_path, data_path) = resolve_init_paths(&storage_path, &config.state_dir, mode); - let pool_size_bytes = match config.dm_thin_size.as_deref() { - Some(spec) => parse_size(spec)?, - None => { - if !data_path.is_file() { - // Raw device: read its size directly. - device_size_bytes(&data_path)? - } else { + let pool_size_bytes = match config.dm_thin_size { + Some(size) => size.bytes(), + None => match mode { + DmThinMode::RawDevice => device_size_bytes(&data_path)?, + DmThinMode::File => { return Err(Error::Config( "dm-thin --size is required when using a file-backed pool".to_string(), )); } - } + }, }; // Compute metadata size (or use an explicit override). - let metadata_size_bytes = match config.dm_thin_metadata_size.as_deref() { - Some(spec) => parse_size(spec)?, + let metadata_size_bytes = match config.dm_thin_metadata_size { + Some(size) => size.bytes(), None => { let block_size_bytes = (block_size_sectors as u64) * SECTOR_SIZE; let recommended = @@ -355,7 +382,7 @@ impl StorageBackend for DmThinStorage { Ok(Some(SnapshotEntry { name: snap_name.to_string(), thin_id: snap_id, - created_at: ember_core::state::vm::now_iso8601(), + created_at: ember_core::state::vm::now_epoch_secs(), size_sectors, })) } @@ -378,19 +405,33 @@ impl StorageBackend for DmThinStorage { let dm_name = thin::vm_dm_name(&vm.name); let size_sectors = Self::vm_size_sectors(vm); - // Tear down the live volume, free its thin id, then create a - // fresh thin id from the snapshot. - if pool::exists(&dm_name)? 
{ - thin::deactivate(&dm_name)?; - } - thin::delete(pool::POOL_NAME, vm_id)?; + // Allocate the replacement thin id from the snapshot up-front so + // a failure here leaves `vm.thin_id` and the kernel pool + // unchanged. The old order (deactivate -> delete -> allocate) + // would orphan `vm.thin_id` on any allocate hiccup. let new_id = thin::allocate_snap(pool::POOL_NAME, snap_id)?; - let disk_path = thin::activate(&dm_name, pool::POOL_NAME, new_id, size_sectors)?; - Ok(VolumeHandle { - disk_path, - thin_id: Some(new_id), - }) + // Once new_id exists, swap the dm-mapper slot over to it. Any + // failure from here on must release new_id so we don't leak + // kernel state. + let result = (|| -> Result { + if pool::exists(&dm_name)? { + thin::deactivate(&dm_name)?; + } + thin::delete(pool::POOL_NAME, vm_id)?; + thin::activate(&dm_name, pool::POOL_NAME, new_id, size_sectors) + })(); + + match result { + Ok(disk_path) => Ok(VolumeHandle { + disk_path, + thin_id: Some(new_id), + }), + Err(e) => { + let _ = thin::delete(pool::POOL_NAME, new_id); + Err(e) + } + } } fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { @@ -416,7 +457,7 @@ impl StorageBackend for DmThinStorage { .iter() .map(|s| SnapshotInfo { name: s.name.clone(), - created_at: parse_iso8601(&s.created_at).unwrap_or(0), + created_at: s.created_at, size: s.size_sectors * SECTOR_SIZE, }) .collect()) @@ -468,8 +509,16 @@ impl StorageBackend for DmThinStorage { Ok(()) } - fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { - thin::vm_device_path(&vm.name) + fn disk_device_path(&self, vm: &VmMetadata) -> Result { + // Ensure the pool table and the per-VM thin device are live in + // the kernel. After a host reboot both are gone; without this, + // `vm start` would hand Firecracker a stale `/dev/mapper/...` + // path that resolves to ENOENT. 
+ self.ensure_pool_active()?; + let thin_id = Self::require_vm_thin_id(vm)?; + let dm_name = thin::vm_dm_name(&vm.name); + let size_sectors = Self::vm_size_sectors(vm); + self.ensure_thin_active(&dm_name, thin_id, size_sectors) } fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { @@ -631,24 +680,27 @@ impl StorageBackend for DmThinStorage { // Helpers // --------------------------------------------------------------------------- -/// Decide where the metadata + data backing live based on a single -/// user-supplied `storage_path`. +/// Decide where the metadata + data backing live based on the +/// caller-resolved [`DmThinMode`]. /// -/// * Path is a directory (or doesn't exist): treat as a directory and -/// place `metadata.img`/`data.img` inside. -/// * Path is an existing file or block device: treat as the data -/// device, with metadata as a sibling sparse file. -fn resolve_init_paths(storage_path: &Path) -> Result<(PathBuf, PathBuf)> { - if storage_path.is_dir() || !storage_path.exists() { - Ok(( +/// * [`DmThinMode::File`]: `metadata.img`/`data.img` inside `storage_path`. +/// * [`DmThinMode::RawDevice`]: `storage_path` is the data device, with +/// metadata as a sparse file under `state_dir` (a raw device's parent +/// is `/dev/`, which is tmpfs and would lose the metadata on reboot). +fn resolve_init_paths( + storage_path: &Path, + state_dir: &Path, + mode: DmThinMode, +) -> (PathBuf, PathBuf) { + match mode { + DmThinMode::File => ( storage_path.join(METADATA_FILE), storage_path.join(DATA_FILE), - )) - } else { - Ok(( - storage_path.with_file_name("dm-thin-metadata.img"), + ), + DmThinMode::RawDevice => ( + state_dir.join("dm-thin-metadata.img"), storage_path.to_path_buf(), - )) + ), } } @@ -745,26 +797,6 @@ fn device_size_bytes(path: &Path) -> Result { }) } -/// Parse a `{K,M,G,T}?` size spec into bytes. 
-fn parse_size(spec: &str) -> Result { - let trimmed = spec.trim(); - if trimmed.is_empty() { - return Err(Error::Config("empty size".to_string())); - } - let (num_part, mult) = match trimmed.chars().last().unwrap() { - 'K' | 'k' => (&trimmed[..trimmed.len() - 1], 1024_u64), - 'M' | 'm' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024), - 'G' | 'g' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024 * 1024), - 'T' | 't' => (&trimmed[..trimmed.len() - 1], 1024_u64 * 1024 * 1024 * 1024), - _ => (trimmed, 1_u64), - }; - let n: u64 = num_part - .trim() - .parse() - .map_err(|e| Error::Config(format!("invalid size '{spec}': {e}")))?; - Ok(n * mult) -} - /// Format a byte count for log lines. fn format_bytes(bytes: u64) -> String { const TIB: u64 = 1024 * 1024 * 1024 * 1024; @@ -781,32 +813,6 @@ fn format_bytes(bytes: u64) -> String { } } -/// Parse an ISO 8601 timestamp into Unix epoch seconds. Robust enough -/// for the in-house format produced by [`vm::now_iso8601`]. -fn parse_iso8601(s: &str) -> Option { - // Format: "YYYY-MM-DDTHH:MM:SSZ". - if s.len() < 20 { - return None; - } - let year: i64 = s.get(0..4)?.parse().ok()?; - let month: u64 = s.get(5..7)?.parse().ok()?; - let day: u64 = s.get(8..10)?.parse().ok()?; - let hour: u64 = s.get(11..13)?.parse().ok()?; - let min: u64 = s.get(14..16)?.parse().ok()?; - let sec: u64 = s.get(17..19)?.parse().ok()?; - - // Shift March-based Howard Hinnant civil date. - let y = if month <= 2 { year - 1 } else { year }; - let era = y.div_euclid(400); - let yoe = (y - era * 400) as u64; - let m = if month > 2 { month - 3 } else { month + 9 }; - let doy = (153 * m + 2) / 5 + day - 1; - let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; - let days = era * 146097 + doe as i64 - 719468; - let secs = (days * 86400 + (hour * 3600 + min * 60 + sec) as i64) as u64; - Some(secs) -} - /// Run `dd` to copy an image file onto a block device. 
fn dd_image(image_path: &Path, device: &Path) -> Result<()> { let output = ProcessCommand::new("dd") @@ -859,42 +865,10 @@ fn resize2fs(device: &Path) -> Result<()> { mod tests { use super::*; - #[test] - fn parse_size_basic() { - assert_eq!(parse_size("0").unwrap(), 0); - assert_eq!(parse_size("100").unwrap(), 100); - assert_eq!(parse_size("4K").unwrap(), 4 * 1024); - assert_eq!(parse_size("16M").unwrap(), 16 * 1024 * 1024); - assert_eq!(parse_size("8G").unwrap(), 8u64 * 1024 * 1024 * 1024); - assert_eq!(parse_size("2T").unwrap(), 2u64 * 1024 * 1024 * 1024 * 1024); - assert_eq!(parse_size("4k").unwrap(), 4 * 1024); - } - - #[test] - fn parse_size_rejects_garbage() { - assert!(parse_size("").is_err()); - assert!(parse_size("abc").is_err()); - assert!(parse_size("1Q").is_err()); - } - #[test] fn format_bytes_units() { assert_eq!(format_bytes(0), "0 B"); assert_eq!(format_bytes(2 * 1024 * 1024), "2.0 MiB"); assert_eq!(format_bytes(3u64 * 1024 * 1024 * 1024), "3.0 GiB"); } - - #[test] - fn parse_iso8601_round_trip() { - // 2026-01-01T00:00:00Z is 1767225600. - assert_eq!(parse_iso8601("2026-01-01T00:00:00Z"), Some(1_767_225_600)); - // 1970-01-01T00:00:00Z is the epoch. - assert_eq!(parse_iso8601("1970-01-01T00:00:00Z"), Some(0)); - } - - #[test] - fn parse_iso8601_rejects_short() { - assert_eq!(parse_iso8601(""), None); - assert_eq!(parse_iso8601("2026-01-01"), None); - } } diff --git a/crates/ember-linux/src/storage.rs b/crates/ember-linux/src/storage.rs index 3110556..391fae9 100644 --- a/crates/ember-linux/src/storage.rs +++ b/crates/ember-linux/src/storage.rs @@ -233,9 +233,9 @@ impl StorageBackend for LinuxStorage { } /// Device path for a VM's root disk zvol. - fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { + fn disk_device_path(&self, vm: &VmMetadata) -> Result { let zvol = self.vm_zvol(&vm.name); - zfs::volume::device_path(&zvol) + Ok(zfs::volume::device_path(&zvol)) } /// Fork a VM's disk by snapshotting the source and cloning into a new VM. 
diff --git a/crates/ember-linux/src/vm.rs b/crates/ember-linux/src/vm.rs index 8c7880b..a0c859f 100644 --- a/crates/ember-linux/src/vm.rs +++ b/crates/ember-linux/src/vm.rs @@ -50,8 +50,10 @@ impl VmBackend for LinuxVm { // Resolve the rootfs through the active storage backend so the // backend (ZFS, dm-thin, …) controls how `vm.disk_path` becomes - // the actual device path Firecracker sees. - let rootfs_path = crate::create_storage(config).disk_device_path(vm); + // the actual device path Firecracker sees. dm-thin lazily + // re-activates pool + thin devices here (pool tables are + // kernel-only state that vanishes on host reboot). + let rootfs_path = crate::create_storage(config).disk_device_path(vm)?; // Clean up stale socket from a previous run. if socket_path.exists() { diff --git a/crates/ember-macos/src/storage.rs b/crates/ember-macos/src/storage.rs index 1f836f7..7cf2d39 100644 --- a/crates/ember-macos/src/storage.rs +++ b/crates/ember-macos/src/storage.rs @@ -442,8 +442,8 @@ impl StorageBackend for MacosStorage { /// /// On macOS the raw `.img` file is passed directly to AVF — no /// block device indirection like ZFS zvols. - fn disk_device_path(&self, vm: &VmMetadata) -> PathBuf { - self.vm_rootfs(&vm.name) + fn disk_device_path(&self, vm: &VmMetadata) -> Result { + Ok(self.vm_rootfs(&vm.name)) } /// Clone a source VM's disk for forking via APFS copy-on-write. diff --git a/docs/DM-THIN-SPEC.md b/docs/DM-THIN-SPEC.md index 7f6b380..ea90180 100644 --- a/docs/DM-THIN-SPEC.md +++ b/docs/DM-THIN-SPEC.md @@ -125,9 +125,18 @@ pub struct GlobalConfig { #[serde(default)] pub storage_path: Option, /// dm-thin pool block size in 512-byte sectors (default: 128 = 64KiB). - /// Permanent at pool creation. + /// Permanent at pool creation; resolved to `Some(actual)` at init + /// time so the value the running pool was created with stays stable + /// across ember upgrades. 
#[serde(default)] pub dm_thin_block_size: Option, + /// dm-thin layout: `File` (sparse files inside `storage_path`) or + /// `RawDevice` (`storage_path` is a block device, metadata sits on + /// `state_dir/dm-thin-metadata.img`). Resolved at init from + /// `storage_path` and persisted so reactivation does not depend on + /// a live `is_dir()` probe. + #[serde(default)] + pub dm_thin_mode: Option, } ``` @@ -144,13 +153,16 @@ pub struct InitConfig { pub device: Option, // ZFS only pub storage_path: Option, // btrfs + dm-thin pub btrfs_size: Option, // btrfs only - /// Size of the dm-thin data device (e.g., "50G"). + /// Size of the dm-thin data device. /// Required for file-backed mode, ignored for device mode. - pub dm_thin_size: Option, + pub dm_thin_size: Option, /// Override metadata device size. Defaults to `thin_metadata_size` output. - pub dm_thin_metadata_size: Option, + pub dm_thin_metadata_size: Option, /// Pool block size in sectors. Defaults to 128 (64KiB). pub dm_thin_block_size: Option, + /// File-backed vs raw-device layout. The CLI resolves this from + /// `storage_path` so the backend trusts what it was handed. + pub dm_thin_mode: Option, } ``` diff --git a/src/cli/init.rs b/src/cli/init.rs index 1cc829c..9e33e97 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -3,9 +3,22 @@ use std::path::{Path, PathBuf}; use clap::Args; use crate::backend::{init_storage, CurrentPlatform, InitConfig, Platform}; -use ember_core::config::{GlobalConfig, StorageKind}; +use ember_core::config::size::ByteSize; +use ember_core::config::{DmThinMode, GlobalConfig, StorageKind}; use ember_core::state::store::StateStore; +/// dm-thin pool block size (in 512-byte sectors) used when the user does +/// not pass `--block-size`. 
Resolved here at init time and persisted on +/// `GlobalConfig` so the value the running pool was created with stays +/// stable across ember upgrades — block size is permanent at pool +/// creation, and silently switching defaults later would orphan +/// existing pools. +#[cfg(target_os = "linux")] +const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = + ember_linux::dm_thin::pool::DEFAULT_BLOCK_SIZE_SECTORS; +#[cfg(not(target_os = "linux"))] +const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = 128; + #[derive(Args)] pub struct InitArgs { /// Storage backend: zfs (default) or dm-thin (Linux only) @@ -38,12 +51,12 @@ pub struct InitArgs { /// Pool size for file-backed dm-thin (e.g. `50G`). Required when /// `--storage-path` is a file path; ignored for raw block devices. #[arg(long)] - pub size: Option, + pub size: Option, /// Override metadata device size for dm-thin (e.g. `800M`). /// `thin_metadata_size` computes a recommended value when omitted. #[arg(long)] - pub metadata_size: Option, + pub metadata_size: Option, /// dm-thin pool block size in 512-byte sectors. Permanent at pool /// creation. Defaults to 128 (= 64 KiB). @@ -86,6 +99,32 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { StorageKind::Zfs => None, }; + // Resolve block size up-front for dm-thin so the persisted config + // pins the value the pool was actually created with, even when the + // user omits `--block-size`. + let resolved_block_size = match args.storage { + StorageKind::DmThin => Some( + args.block_size + .unwrap_or(DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS), + ), + _ => args.block_size, + }; + + // Resolve file-vs-raw-device layout once and persist it. Doing this + // here rather than in the backend keeps the contract explicit: + // reactivation should not depend on a live `is_dir()` probe of + // `storage_path` agreeing with what init saw. 
+ let resolved_dm_thin_mode = match (args.storage, storage_path.as_ref()) { + (StorageKind::DmThin, Some(path)) => { + if path.is_dir() || !path.exists() { + Some(DmThinMode::File) + } else { + Some(DmThinMode::RawDevice) + } + } + _ => None, + }; + let init_config = InitConfig { storage_backend: args.storage, state_dir: state_dir.to_path_buf(), @@ -94,9 +133,10 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { device: args.device.clone(), storage_path: storage_path.clone(), btrfs_size: None, - dm_thin_size: args.size.clone(), - dm_thin_metadata_size: args.metadata_size.clone(), - dm_thin_block_size: args.block_size, + dm_thin_size: args.size, + dm_thin_metadata_size: args.metadata_size, + dm_thin_block_size: resolved_block_size, + dm_thin_mode: resolved_dm_thin_mode, }; init_storage(&init_config)?; @@ -127,7 +167,8 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { wan_iface, state_dir: state_dir.to_path_buf(), storage_path, - dm_thin_block_size: args.block_size, + dm_thin_block_size: resolved_block_size, + dm_thin_mode: resolved_dm_thin_mode, }; store.write(&store.config_path(), &config)?; println!("Configuration written to {}", store.config_path().display()); @@ -152,6 +193,7 @@ mod tests { state_dir: PathBuf::default(), storage_path: None, dm_thin_block_size: None, + dm_thin_mode: None, } } diff --git a/src/cli/vm.rs b/src/cli/vm.rs index a339064..d2484e3 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -25,11 +25,15 @@ use ember_core::state::vm::{self, NetworkInfo, SshConfig, VmMetadata, VmStatus}; /// `name`, `disk_path`, and `thin_id` from this stub for resize, mount, /// and SSH-key injection. All other fields are placeholders inherited /// from [`VmMetadata::default_for_teardown`]. 
-fn pending_metadata(name: &str, handle: &VolumeHandle) -> VmMetadata { +fn pending_metadata(name: &str, handle: &VolumeHandle, disk_size_gib: u32) -> VmMetadata { let mut m = VmMetadata::default_for_teardown(); m.name = name.to_string(); m.disk_path = handle.disk_path.to_string_lossy().into_owned(); m.thin_id = handle.thin_id; + // dm-thin needs the size to (re)activate the thin device. Stash the + // requested disk size so `disk_device_path(pending)` can re-attach + // post-resize even if the kernel state was somehow torn down. + m.disk_size_gib = disk_size_gib; m } @@ -461,7 +465,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { // Clone base image → per-VM disk (instant, copy-on-write). println!("Cloning image for VM '{}'...", resolved.name); let handle = storage.clone_for_vm(image_entry, &resolved.name)?; - let pending = pending_metadata(&resolved.name, &handle); + let pending = pending_metadata(&resolved.name, &handle, resolved.disk_size); { let storage = storage.clone(); let sd = state_dir.to_path_buf(); @@ -525,7 +529,7 @@ fn create_post_clone( // Inject per-VM SSH key into the rootfs image. // Linux: mounts the block device, writes the key, unmounts. // macOS: uses debugfs to write directly into the ext4 image. - let dev_path = storage.disk_device_path(pending); + let dev_path = storage.disk_device_path(pending)?; let pubkey_path = image::inject::default_ssh_pubkey_path().ok_or_else(|| { anyhow::anyhow!( "no SSH public key found at ~/.ssh/id_ed25519.pub or ~/.ssh/id_rsa.pub\n\ @@ -637,7 +641,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { // Clone source VM's storage into the new VM via the storage backend. 
println!("Forking '{}' → '{}'...", args.source, args.name); let handle = storage.clone_vm_storage(&source, &args.name)?; - let pending = pending_metadata(&args.name, &handle); + let pending = pending_metadata(&args.name, &handle, disk_size_gib); let mut rollback = Rollback::new(); { @@ -664,7 +668,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { // Inject /etc/hosts with the new VM's hostname (the cloned disk // still has the source VM's hostname from its creation). - let dev_path = storage.disk_device_path(&pending); + let dev_path = storage.disk_device_path(&pending)?; storage.inject_hostname(&dev_path, &args.name)?; // Resolve kernel: CLI override or inherit from source. From 483019a4aed3ce894803604f39b112b482779805 Mon Sep 17 00:00:00 2001 From: Aljoscha Krettek Date: Thu, 30 Apr 2026 12:01:06 +0200 Subject: [PATCH 17/21] dm-thin: surface missing kernel target, stop leaking loops on init failure Three related changes for robustness around dm-thin pool setup: * `pool::ensure_target_loaded()` runs `modprobe dm-thin-pool` and verifies the `thin-pool` target is registered via `dmsetup targets`, replacing the kernel's opaque "Invalid argument" with an actionable error pointing at CONFIG_DM_THIN_PROVISIONING. Called from both `init` and `ensure_pool_active`. * `init` now detaches loop devices on every failure path between attaching them and successful `pool::create`, so a failed init no longer leaves loops bound to backing files that get unlinked with the surrounding tempdir. * New `DmThinCleanup` RAII guard runs `ember deinit --purge` on drop, installed at the top of every dm_thin integration test so panics partway through still tear down the pool, loops, and backing files. 
--- crates/ember-linux/src/dm_thin/pool.rs | 36 ++++++++++++++++++++++ crates/ember-linux/src/dm_thin_storage.rs | 37 ++++++++++++++++++++--- tests/common/linux.rs | 18 +++++++++++ tests/dm_thin.rs | 28 ++++++++--------- 4 files changed, 99 insertions(+), 20 deletions(-) diff --git a/crates/ember-linux/src/dm_thin/pool.rs b/crates/ember-linux/src/dm_thin/pool.rs index 60d757a..17d50c9 100644 --- a/crates/ember-linux/src/dm_thin/pool.rs +++ b/crates/ember-linux/src/dm_thin/pool.rs @@ -50,6 +50,42 @@ pub struct PoolStatus { pub mode: PoolMode, } +/// Ensure the kernel has the `thin-pool` device-mapper target available. +/// +/// On most distributions `dm-thin-pool` is a loadable module that +/// `dmsetup` does not auto-load. We `modprobe` it best-effort (built-in +/// kernels report "Module not found" but the target is already +/// registered) and then verify it appears in `dmsetup targets`. Without +/// this check, a missing target produces an opaque `Invalid argument` +/// from `dmsetup create`. +pub fn ensure_target_loaded() -> Result<()> { + let _ = Command::new("modprobe").arg("dm-thin-pool").output(); + + let output = Command::new("dmsetup") + .arg("targets") + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup targets".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup targets", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let has_thin_pool = stdout + .lines() + .any(|line| line.split_whitespace().next() == Some("thin-pool")); + if has_thin_pool { + return Ok(()); + } + Err(Error::Command { + command: "dmsetup targets".to_string(), + exit_code: 0, + stderr: "kernel does not provide the 'thin-pool' device-mapper target. \ + Install or enable a kernel with CONFIG_DM_THIN_PROVISIONING and \ + load it with 'modprobe dm-thin-pool'." + .to_string(), + }) +} + /// List active device-mapper device names whose name starts with /// `prefix`. 
Useful for finding all `ember-vm-*` and `ember-img-*` /// volumes during teardown. diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs index 1785d3c..9638196 100644 --- a/crates/ember-linux/src/dm_thin_storage.rs +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -122,6 +122,8 @@ impl DmThinStorage { return Ok(()); } + pool::ensure_target_loaded()?; + let metadata_path = self.metadata_file(); let data_path = self.data_file(); @@ -199,6 +201,8 @@ impl StorageBackend for DmThinStorage { Error::Config("dm-thin requires --storage-path (directory or block device)".to_string()) })?; + pool::ensure_target_loaded()?; + let block_size_sectors = config .dm_thin_block_size .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS); @@ -255,19 +259,42 @@ impl StorageBackend for DmThinStorage { // an all-zero superblock as the signal to format a fresh pool. zero_head(&metadata_path)?; - // Attach loops, then assemble the pool. + // Attach loops, then assemble the pool. If anything past this + // point fails, detach the loops we attached so we don't leak + // them pointing at backing files that may get cleaned up. 
let metadata_loop = ensure_loop(&metadata_path)?; - let data_loop = ensure_loop_or_block(&data_path)?; + let data_loop = match ensure_loop_or_block(&data_path) { + Ok(p) => p, + Err(e) => { + let _ = loop_device::detach(&metadata_loop); + return Err(e); + } + }; - let data_sectors = device_sectors(&data_loop)?; - pool::create( + let data_sectors = match device_sectors(&data_loop) { + Ok(s) => s, + Err(e) => { + let _ = loop_device::detach(&metadata_loop); + if data_path.is_file() { + let _ = loop_device::detach(&data_loop); + } + return Err(e); + } + }; + if let Err(e) = pool::create( pool::POOL_NAME, &metadata_loop, &data_loop, data_sectors, block_size_sectors, pool::DEFAULT_LOW_WATER_BLOCKS, - )?; + ) { + let _ = loop_device::detach(&metadata_loop); + if data_path.is_file() { + let _ = loop_device::detach(&data_loop); + } + return Err(e); + } println!( "dm-thin pool '{}' active ({} data, {} block size).", diff --git a/tests/common/linux.rs b/tests/common/linux.rs index 7867348..540f424 100644 --- a/tests/common/linux.rs +++ b/tests/common/linux.rs @@ -81,6 +81,24 @@ impl Drop for PoolCleanup { } } +/// RAII guard: runs `ember deinit --purge` on drop so dm-thin tests +/// always tear down the pool, loop devices, and backing files even when +/// an assertion panics partway through. 
+pub struct DmThinCleanup { + pub state_dir: PathBuf, +} + +impl Drop for DmThinCleanup { + fn drop(&mut self) { + let _ = super::ember(&[ + "--state-dir", + self.state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); + } +} + // --------------------------------------------------------------------------- // ZFS assertions // --------------------------------------------------------------------------- diff --git a/tests/dm_thin.rs b/tests/dm_thin.rs index e552951..8d592d4 100644 --- a/tests/dm_thin.rs +++ b/tests/dm_thin.rs @@ -30,6 +30,11 @@ fn dm_thin_init_and_deinit_round_trip() { let storage_path = tmp.path().join("dm-thin"); let state_dir = tmp.path().join("state"); + // Always tear down on the way out, even if assertions below panic. + let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + // Init. let output = common::ember(&[ "--state-dir", @@ -86,6 +91,10 @@ fn dm_thin_init_refuses_backend_switch() { let storage_path = tmp.path().join("dm-thin"); let state_dir = tmp.path().join("state"); + let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + // First init with dm-thin. let output = common::ember(&[ "--state-dir", @@ -119,14 +128,6 @@ fn dm_thin_init_refuses_backend_switch() { stderr.contains("already initialized"), "expected 'already initialized' message: {stderr}" ); - - // Cleanup. - let _ = common::ember(&[ - "--state-dir", - state_dir.to_str().unwrap(), - "deinit", - "--purge", - ]); } /// `ember storage grow --size ` should grow the data device. 
@@ -137,6 +138,10 @@ fn dm_thin_storage_grow() { let storage_path = tmp.path().join("dm-thin"); let state_dir = tmp.path().join("state"); + let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + let output = common::ember(&[ "--state-dir", state_dir.to_str().unwrap(), @@ -174,11 +179,4 @@ fn dm_thin_storage_grow() { .unwrap() .len(); assert_eq!(grown, 400 * 1024 * 1024); - - let _ = common::ember(&[ - "--state-dir", - state_dir.to_str().unwrap(), - "deinit", - "--purge", - ]); } From 56b82f0b617df0f699c5ea5152f63ad685a4861a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 30 Apr 2026 13:27:53 +0200 Subject: [PATCH 18/21] dm-thin: address second-round PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * destroy_vm_storage now frees every snapshot's thin id before the vm's own id. Previously snapshots stayed pinned in pool metadata after vm delete with no user-level handle to reach them. * create_image_volume tears down a stale ember-img--staging device left over from a previous failed run, otherwise EEXIST blocked retries. * create_storage panics on the StorageKind::Btrfs arm rather than silently routing through the ZFS backend with garbage inputs; matches init_storage's shape. * Added assert_pool_healthy() and gated create_image_volume, clone_for_vm, clone_vm_storage, snapshot, restore_snapshot, resize on it. OutOfDataSpace / Failed / ReadOnly pools surface an actionable error instead of an opaque mid-`dd` EIO. grow and the destroy paths stay ungated. * LinuxPlatform::inspect_vm_extra / inspect_image_extra label the disk row "Thin device" + "Thin id" when the metadata carries a thin_id, "ZFS zvol" otherwise. info_extra branches on storage_backend so dm-thin shows storage path / block size / mode instead of pool/dataset. * tests/dm_thin.rs gets #[allow(dead_code)] on `mod common;` so shared test helpers it doesn't use don't generate 36 dead-code warnings. 
* Spec note: ensure_pool_active runs thin_check on the metadata device on first command after a reboot — proportional to pool occupancy, intentional, and skippable by activating the pool manually. * Renamed dm-mapper device existence check from pool::exists to dm_thin::dm_device_exists; it was used to probe pool, thin, and staging devices indiscriminately, so the pool-scoped name was misleading. * --block-size now takes a ByteSize (e.g. `64K`, `1M`) like --size / --metadata-size; converted to sectors internally with validation for dm-thin's 64 KiB-multiple constraint. New Error::Pool variant for storage-pool-level errors. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ember-core/src/error.rs | 5 ++ crates/ember-linux/src/dm_thin.rs | 14 +++++ crates/ember-linux/src/dm_thin/pool.rs | 15 ------ crates/ember-linux/src/dm_thin/thin.rs | 2 +- crates/ember-linux/src/dm_thin_storage.rs | 65 ++++++++++++++++++++--- crates/ember-linux/src/lib.rs | 12 +++-- crates/ember-linux/src/platform.rs | 59 ++++++++++++++++---- docs/DM-THIN-SPEC.md | 5 ++ src/cli/init.rs | 36 +++++++++---- tests/dm_thin.rs | 4 ++ 10 files changed, 172 insertions(+), 45 deletions(-) diff --git a/crates/ember-core/src/error.rs b/crates/ember-core/src/error.rs index 39944e7..3307a7f 100644 --- a/crates/ember-core/src/error.rs +++ b/crates/ember-core/src/error.rs @@ -49,6 +49,11 @@ pub enum Error { #[error("state: {0}")] State(String), + /// Storage pool error (dm-thin / btrfs / ZFS pool-level state, as + /// distinct from individual volume / dataset errors). + #[error("storage pool: {0}")] + Pool(String), + /// Config parsing or validation error. 
#[error("config: {0}")] Config(String), diff --git a/crates/ember-linux/src/dm_thin.rs b/crates/ember-linux/src/dm_thin.rs index fb1e116..ee9e916 100644 --- a/crates/ember-linux/src/dm_thin.rs +++ b/crates/ember-linux/src/dm_thin.rs @@ -22,6 +22,20 @@ pub fn bytes_to_sectors(bytes: u64) -> u64 { bytes.div_ceil(SECTOR_SIZE) } +/// Whether a device-mapper device with the given name is currently +/// active. Used to probe pools, thin volumes, and staging devices — +/// `dmsetup info` doesn't care which kind it is. +pub fn dm_device_exists(name: &str) -> ember_core::error::Result { + let output = std::process::Command::new("dmsetup") + .args(["info", "--noheadings", name]) + .output() + .map_err(|e| ember_core::error::Error::CommandExec { + command: "dmsetup info".to_string(), + source: e, + })?; + Ok(output.status.success()) +} + /// Whether an [`Error`](ember_core::error::Error) reports a kernel `EEXIST` /// from a `dmsetup message` operation. Used by the `create_thin` / /// `create_snap` retry loops to detect thin id collisions. diff --git a/crates/ember-linux/src/dm_thin/pool.rs b/crates/ember-linux/src/dm_thin/pool.rs index 17d50c9..28cda33 100644 --- a/crates/ember-linux/src/dm_thin/pool.rs +++ b/crates/ember-linux/src/dm_thin/pool.rs @@ -112,21 +112,6 @@ pub fn list_with_prefix(prefix: &str) -> Result> { .collect()) } -/// Whether a device-mapper device with the given name is currently active. -/// -/// Uses `dmsetup info` which exits 0 when the device exists, non-zero -/// otherwise. -pub fn exists(name: &str) -> Result { - let output = Command::new("dmsetup") - .args(["info", "--noheadings", name]) - .output() - .map_err(|e| Error::CommandExec { - command: "dmsetup info".to_string(), - source: e, - })?; - Ok(output.status.success()) -} - /// Build a `thin-pool` table line. 
/// /// The format is documented in diff --git a/crates/ember-linux/src/dm_thin/thin.rs b/crates/ember-linux/src/dm_thin/thin.rs index 5c4ccdd..ff62ee3 100644 --- a/crates/ember-linux/src/dm_thin/thin.rs +++ b/crates/ember-linux/src/dm_thin/thin.rs @@ -99,7 +99,7 @@ pub fn device_path(name: &str) -> PathBuf { /// Whether a thin volume is currently activated as a `/dev/mapper` /// device. pub fn is_active(name: &str) -> Result { - pool::exists(name) + super::dm_device_exists(name) } /// Activate a thin volume as a `/dev/mapper/` block device. diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs index 9638196..1c8b326 100644 --- a/crates/ember-linux/src/dm_thin_storage.rs +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -19,7 +19,7 @@ use ember_core::error::{Error, Result}; use ember_core::image::registry::ImageEntry; use ember_core::state::vm::{SnapshotEntry, VmMetadata}; -use crate::dm_thin::{loop_device, pool, thin, tools, SECTOR_SIZE}; +use crate::dm_thin::{dm_device_exists, loop_device, pool, thin, tools, SECTOR_SIZE}; use crate::zvol; /// Default file name for the metadata backing file inside the dm-thin @@ -118,7 +118,7 @@ impl DmThinStorage { /// devices and re-runs `dmsetup create` if the kernel state is gone /// (e.g., after a reboot). fn ensure_pool_active(&self) -> Result<()> { - if pool::exists(pool::POOL_NAME)? { + if dm_device_exists(pool::POOL_NAME)? { return Ok(()); } @@ -162,7 +162,7 @@ impl DmThinStorage { thin_id: u64, size_sectors: u64, ) -> Result { - if pool::exists(dm_name)? { + if dm_device_exists(dm_name)? { return Ok(thin::device_path(dm_name)); } thin::activate(dm_name, pool::POOL_NAME, thin_id, size_sectors) @@ -193,6 +193,35 @@ impl DmThinStorage { )) }) } + + /// Refuse allocating-or-writing operations when the pool has gone + /// read-only, run out of data, or failed entirely. 
Without this + /// gate, callers see opaque `EIO` mid-`dd` (out of space) or + /// silent thin id leaks on metadata-corrupt pools. + /// + /// `grow` is intentionally not gated because it is the recovery + /// path for [`PoolMode::OutOfDataSpace`]; destroy paths are also + /// not gated since freeing thin ids must work even on a sick pool. + fn assert_pool_healthy(&self) -> Result<()> { + let status = pool::status(pool::POOL_NAME)?; + match status.mode { + pool::PoolMode::ReadWrite => Ok(()), + pool::PoolMode::ReadOnly => Err(Error::Pool(format!( + "dm-thin pool '{}' is read-only — run `thin_check` and `thin_repair` to recover", + pool::POOL_NAME + ))), + pool::PoolMode::OutOfDataSpace => Err(Error::Pool(format!( + "dm-thin pool '{}' is out of data space ({}/{} blocks used) — run `ember storage grow --size ` to extend it", + pool::POOL_NAME, + status.used_data_blocks, + status.total_data_blocks, + ))), + pool::PoolMode::Failed => Err(Error::Pool(format!( + "dm-thin pool '{}' has failed — inspect dmesg and `thin_check` the metadata device", + pool::POOL_NAME + ))), + } + } } impl StorageBackend for DmThinStorage { @@ -313,11 +342,21 @@ impl StorageBackend for DmThinStorage { size_mib: u64, ) -> Result { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let staging_dm = thin::image_staging_dm_name(name); let final_dm = thin::image_dm_name(name); let size_sectors = (size_mib * 1024 * 1024) / SECTOR_SIZE; + // A previous failed run may have left the staging device + // active. Tear it down so the fresh `thin::activate` below + // doesn't trip over `EEXIST`. The matching staging thin id is + // not persisted anywhere, so it leaks into pool metadata; that + // is a bounded one-off cost and only `thin_dump` can find it. + if let Ok(true) = dm_device_exists(&staging_dm) { + let _ = thin::deactivate(&staging_dm); + } + // 1. Allocate a fresh staging thin and write the ext4 image. 
let staging_id = thin::allocate(pool::POOL_NAME)?; let staging_dev = @@ -369,6 +408,7 @@ impl StorageBackend for DmThinStorage { fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let base_id = Self::require_image_thin_id(image)?; let dm_name = thin::vm_dm_name(vm_name); @@ -391,6 +431,7 @@ impl StorageBackend for DmThinStorage { fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let vm_id = Self::require_vm_thin_id(vm)?; let dm_name = thin::vm_dm_name(&vm.name); let size_sectors = Self::vm_size_sectors(vm); @@ -416,6 +457,7 @@ impl StorageBackend for DmThinStorage { fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let vm_id = Self::require_vm_thin_id(vm)?; let snap = vm .snapshots @@ -442,7 +484,7 @@ impl StorageBackend for DmThinStorage { // failure from here on must release new_id so we don't leak // kernel state. let result = (|| -> Result { - if pool::exists(&dm_name)? { + if dm_device_exists(&dm_name)? { thin::deactivate(&dm_name)?; } thin::delete(pool::POOL_NAME, vm_id)?; @@ -492,6 +534,7 @@ impl StorageBackend for DmThinStorage { fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let vm_id = Self::require_vm_thin_id(vm)?; let dm_name = thin::vm_dm_name(&vm.name); let new_sectors = new_size.bytes() / SECTOR_SIZE; @@ -512,9 +555,16 @@ impl StorageBackend for DmThinStorage { // step may already be done by an earlier failure path. let _ = self.ensure_pool_active(); let dm_name = thin::vm_dm_name(&vm.name); - if let Ok(true) = pool::exists(&dm_name) { + if let Ok(true) = dm_device_exists(&dm_name) { let _ = thin::deactivate(&dm_name); } + // Snapshots only live in the kernel pool; the user-level + // handle is `vm.json`, which is about to disappear. 
Free their + // thin ids before the VM's own id, otherwise they'd remain + // pinned in pool metadata with no way for ember to reach them. + for snap in &vm.snapshots { + let _ = thin::delete(pool::POOL_NAME, snap.thin_id); + } if let Some(id) = vm.thin_id { let _ = thin::delete(pool::POOL_NAME, id); } @@ -527,7 +577,7 @@ impl StorageBackend for DmThinStorage { // thin ids and stay readable. `force` doesn't change behavior. let _ = self.ensure_pool_active(); let dm_name = thin::image_dm_name(&image.local_name); - if let Ok(true) = pool::exists(&dm_name) { + if let Ok(true) = dm_device_exists(&dm_name) { let _ = thin::deactivate(&dm_name); } if let Some(id) = image.thin_id { @@ -550,6 +600,7 @@ impl StorageBackend for DmThinStorage { fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { self.ensure_pool_active()?; + self.assert_pool_healthy()?; let source_id = Self::require_vm_thin_id(source)?; let dm_name = thin::vm_dm_name(target_vm); let size_sectors = Self::vm_size_sectors(source); @@ -587,7 +638,7 @@ impl StorageBackend for DmThinStorage { } } // 2. Drop the pool itself (if active). - if pool::exists(pool::POOL_NAME)? { + if dm_device_exists(pool::POOL_NAME)? { pool::remove(pool::POOL_NAME)?; } // 3. Detach the loop devices, if any. diff --git a/crates/ember-linux/src/lib.rs b/crates/ember-linux/src/lib.rs index d00244b..7286f31 100644 --- a/crates/ember-linux/src/lib.rs +++ b/crates/ember-linux/src/lib.rs @@ -26,13 +26,19 @@ use ember_core::error::{Error, Result}; /// Construct the active storage backend. /// /// Returns the implementation indicated by [`GlobalConfig::storage_backend`]. -/// btrfs is not yet implemented and falls back to ZFS so existing -/// configs keep working until Phase 7. +/// btrfs is not yet implemented; rather than silently routing through +/// the ZFS path with garbage inputs, the call panics so a hand-edited +/// `config.json` fails loudly. `init_storage` returns the same shape +/// of error from the init side. 
pub fn create_storage(config: &GlobalConfig) -> Arc { match config.storage_backend { StorageKind::Zfs => Arc::new(LinuxStorage::new(config)), StorageKind::DmThin => Arc::new(DmThinStorage::new(config)), - StorageKind::Btrfs => Arc::new(LinuxStorage::new(config)), + StorageKind::Btrfs => panic!( + "btrfs storage backend is not yet implemented; \ + config.json has storage_backend = btrfs but no \ + implementation exists yet" + ), } } diff --git a/crates/ember-linux/src/platform.rs b/crates/ember-linux/src/platform.rs index f7bb65a..0de455f 100644 --- a/crates/ember-linux/src/platform.rs +++ b/crates/ember-linux/src/platform.rs @@ -1,7 +1,7 @@ use std::path::{Path, PathBuf}; use ember_core::backend::{ImageToolConfig, Platform, ResolvConfMode}; -use ember_core::config::GlobalConfig; +use ember_core::config::{GlobalConfig, StorageKind}; use ember_core::error::Result; use ember_core::image::registry::ImageEntry; use ember_core::state::vm::VmMetadata; @@ -45,10 +45,18 @@ impl Platform for LinuxPlatform { } fn inspect_vm_extra(metadata: &VmMetadata) -> Vec<(&'static str, String)> { - let mut extra = vec![ - ("ZFS zvol", metadata.disk_path.clone()), - ("API socket", metadata.api_socket.display().to_string()), - ]; + // dm-thin records a numeric `thin_id` on the VM metadata; ZFS + // does not. Branch on its presence rather than threading a + // `GlobalConfig` reference through the trait — the metadata + // already carries enough to label the disk row correctly. 
+ let mut extra = match metadata.thin_id { + Some(thin_id) => vec![ + ("Thin device", metadata.disk_path.clone()), + ("Thin id", thin_id.to_string()), + ], + None => vec![("ZFS zvol", metadata.disk_path.clone())], + }; + extra.push(("API socket", metadata.api_socket.display().to_string())); if let Some(ref net) = metadata.network { extra.push(("TAP device", net.tap_device.clone())); } @@ -56,14 +64,45 @@ impl Platform for LinuxPlatform { } fn inspect_image_extra(entry: &ImageEntry) -> Vec<(&'static str, String)> { - vec![("ZFS zvol", entry.disk_path.clone())] + match entry.thin_id { + Some(thin_id) => vec![ + ("Thin device", entry.disk_path.clone()), + ("Thin id", thin_id.to_string()), + ], + None => vec![("ZFS zvol", entry.disk_path.clone())], + } } fn info_extra(config: &GlobalConfig) -> Vec<(&'static str, String)> { - let mut extra = vec![ - ("ZFS pool", config.pool.clone()), - ("Dataset", format!("{}/{}", config.pool, config.dataset)), - ]; + let mut extra = match config.storage_backend { + StorageKind::Zfs => vec![ + ("ZFS pool", config.pool.clone()), + ("Dataset", format!("{}/{}", config.pool, config.dataset)), + ], + StorageKind::DmThin => { + let mut rows = vec![("dm-thin pool", "ember-pool".to_string())]; + if let Some(ref path) = config.storage_path { + rows.push(("Storage path", path.display().to_string())); + } + if let Some(block_size) = config.dm_thin_block_size { + rows.push(( + "Block size", + format!("{} sectors ({} KiB)", block_size, (block_size * 512) / 1024), + )); + } + if let Some(mode) = config.dm_thin_mode { + rows.push(( + "Layout", + match mode { + ember_core::config::DmThinMode::File => "file-backed".to_string(), + ember_core::config::DmThinMode::RawDevice => "raw device".to_string(), + }, + )); + } + rows + } + StorageKind::Btrfs => vec![("btrfs", "(unimplemented)".to_string())], + }; if let Some(ref wan_iface) = config.wan_iface { extra.push(("WAN iface", wan_iface.clone())); } diff --git a/docs/DM-THIN-SPEC.md b/docs/DM-THIN-SPEC.md 
index ea90180..6b75449 100644 --- a/docs/DM-THIN-SPEC.md +++ b/docs/DM-THIN-SPEC.md @@ -314,6 +314,11 @@ The first command after a reboot triggers `ensure_pool_active`: c. Run `thin_check /dev/loopN` (or the metadata loop). Fail loudly on metadata corruption — operator must run `thin_repair` manually. d. `dmsetup create ember-pool --table "0 thin-pool ... 128 "` using the values from `config.json`. +Step (c) walks the entire metadata B-tree, so the *first* command after a reboot pays a one-time cost proportional to pool occupancy. +For pools with millions of mapped blocks this can take several seconds; subsequent commands take the `dm_device_exists(pool::POOL_NAME)` early-return in `ensure_pool_active` and skip the check entirely. +This is intentional — silently activating a corrupt pool would damage every snapshot derived from it. +Operators who prefer to skip the check (e.g. on read-only inspection of a known-good pool) can `dmsetup create` the pool manually before invoking ember. + Per-VM and per-image volumes are activated **lazily** by methods that need them (e.g. `disk_device_path`, `mount`, `start`). Each method calls `ensure_thin_active(name, thin_id, size_sectors)`: diff --git a/src/cli/init.rs b/src/cli/init.rs index 9e33e97..a3039d1 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -19,6 +19,25 @@ const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = #[cfg(not(target_os = "linux"))] const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = 128; +/// Convert a CLI `--block-size` byte value into the 512-byte sector +/// count the kernel expects, validating dm-thin's constraints: the +/// block size must be a multiple of 64 KiB and fit in `u32` sectors.
+fn resolve_dm_thin_block_size_sectors(user: Option) -> anyhow::Result { + let Some(size) = user else { + return Ok(DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS); + }; + let bytes = size.bytes(); + const MIN_BYTES: u64 = 64 * 1024; + if bytes < MIN_BYTES || bytes % MIN_BYTES != 0 { + anyhow::bail!( + "--block-size must be at least 64K and a multiple of 64K (got {bytes} bytes)" + ); + } + let sectors = bytes / 512; + u32::try_from(sectors) + .map_err(|_| anyhow::anyhow!("--block-size {bytes} bytes overflows u32 sectors")) +} + #[derive(Args)] pub struct InitArgs { /// Storage backend: zfs (default) or dm-thin (Linux only) @@ -58,10 +77,10 @@ pub struct InitArgs { #[arg(long)] pub metadata_size: Option, - /// dm-thin pool block size in 512-byte sectors. Permanent at pool - /// creation. Defaults to 128 (= 64 KiB). + /// dm-thin pool block size (e.g. `64K`, `1M`). Must be a multiple + /// of 64 KiB; permanent at pool creation. Defaults to 64 KiB. #[arg(long)] - pub block_size: Option, + pub block_size: Option, /// Kernel preset or file path [presets: stock] #[arg(long)] @@ -101,13 +120,12 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { // Resolve block size up-front for dm-thin so the persisted config // pins the value the pool was actually created with, even when the - // user omits `--block-size`. + // user omits `--block-size`. Internally the kernel addresses pool + // blocks in 512-byte sectors; the CLI accepts a `ByteSize` so the + // UX matches `--size` / `--metadata-size`. let resolved_block_size = match args.storage { - StorageKind::DmThin => Some( - args.block_size - .unwrap_or(DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS), - ), - _ => args.block_size, + StorageKind::DmThin => Some(resolve_dm_thin_block_size_sectors(args.block_size)?), + _ => None, }; // Resolve file-vs-raw-device layout once and persist it. 
Doing this diff --git a/tests/dm_thin.rs b/tests/dm_thin.rs index 8d592d4..a3c9476 100644 --- a/tests/dm_thin.rs +++ b/tests/dm_thin.rs @@ -16,6 +16,10 @@ #![cfg(target_os = "linux")] +// Each integration-test crate compiles `tests/common/` as its own +// top-level module; only `common::ember` is used here, so without this +// attribute clippy reports every other shared helper as dead code. +#[allow(dead_code)] mod common; use std::path::Path; From fc94ecd2c9ff12c1e91f3619c92f05265df71f26 Mon Sep 17 00:00:00 2001 From: Aljoscha Krettek Date: Thu, 30 Apr 2026 13:48:11 +0200 Subject: [PATCH 19/21] platform: use pool::POOL_NAME instead of hardcoded 'ember-pool' The dm-thin arm of LinuxPlatform::info_extra was carrying a string literal for the pool name, while the rest of the codebase reaches for the pool::POOL_NAME constant. Pull the constant in here too so the display row stays in sync if the pool name ever changes. --- crates/ember-linux/src/platform.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/ember-linux/src/platform.rs b/crates/ember-linux/src/platform.rs index 0de455f..f773495 100644 --- a/crates/ember-linux/src/platform.rs +++ b/crates/ember-linux/src/platform.rs @@ -6,6 +6,8 @@ use ember_core::error::Result; use ember_core::image::registry::ImageEntry; use ember_core::state::vm::VmMetadata; +use crate::dm_thin::pool; + pub struct LinuxPlatform; fn linux_install_hint(name: &str) -> String { @@ -80,7 +82,7 @@ impl Platform for LinuxPlatform { ("Dataset", format!("{}/{}", config.pool, config.dataset)), ], StorageKind::DmThin => { - let mut rows = vec![("dm-thin pool", "ember-pool".to_string())]; + let mut rows = vec![("dm-thin pool", pool::POOL_NAME.to_string())]; if let Some(ref path) = config.storage_path { rows.push(("Storage path", path.display().to_string())); } From dfc59a0e76e4e4d31f6e09561832a7104f55bbcf Mon Sep 17 00:00:00 2001 From: Aljoscha Krettek Date: Thu, 30 Apr 2026 13:48:19 +0200 Subject: [PATCH 20/21] tests: 
drop identity `1 *` so clippy --all-targets is clean `tests/resize.rs:214` had `1 * 1024 * 1024 * 1024` to spell out "1 GiB". Clippy's identity_op fires on the leading `1 *` and `cargo clippy --all-targets -- -D warnings` rejects it. Drop the `1 *` to keep the intent obvious without the noise. --- tests/resize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/resize.rs b/tests/resize.rs index 62b5a91..19411a4 100644 --- a/tests/resize.rs +++ b/tests/resize.rs @@ -211,7 +211,7 @@ fn resize_grows_disk() { // Verify initial ZFS volsize. assert_eq!( common::linux::get_zvol_size_bytes(&vm_zvol), - 1 * 1024 * 1024 * 1024, + 1024 * 1024 * 1024, "initial zvol should be 1 GiB" ); From a6f87795545a6b48a20f58c705a4183db6470a3e Mon Sep 17 00:00:00 2001 From: Aljoscha Krettek Date: Thu, 30 Apr 2026 13:53:09 +0200 Subject: [PATCH 21/21] ci: lint test targets too via `cargo clippy --all-targets` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Makefile's `clippy` target — wired into the CI workflow — only covered the default lib/bin targets, so warnings inside `tests/` slipped through (e.g. an identity_op in tests/resize.rs that only showed up when running `cargo clippy --all-targets` locally). Add `--all-targets` so integration tests are linted too. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8371f96..abad980 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ check: cargo check clippy: - cargo clippy -- -D warnings + cargo clippy --all-targets -- -D warnings test: cargo test