diff --git a/CLAUDE.md b/CLAUDE.md index 80783e4..f452866 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,8 +6,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co A lightweight CLI for managing microVMs with copy-on-write storage. CLI-only — no daemon, no REST API. -- **Linux**: Firecracker (KVM) + ZFS zvols. See SPEC.md for the full design, TODO.md for the task list. -- **macOS**: Apple Virtualization Framework + APFS clones. See MACOS-SPEC.md for the design, MACOS-TODO.md for the task list. +- **Linux**: Firecracker (KVM) + one of: + - ZFS zvols (default; see `docs/SPEC.md`). + - dm-thin (in-tree device-mapper thin provisioning; see `docs/DM-THIN-SPEC.md`). + Backend is selected at `ember init --storage <backend>` and persisted on `GlobalConfig`. +- **macOS**: Apple Virtualization Framework + APFS clones. See `docs/MACOS-SPEC.md` for the design. ## Build Commands @@ -37,10 +40,29 @@ cargo clippy # Unit tests cargo test -# Manual testing (requires root, ZFS, and firecracker installed) +# Manual testing (requires root, firecracker, and a backend) + +# ZFS backend sudo ./target/debug/ember init --pool testpool --device /dev/loop0 sudo ./target/debug/ember image pull alpine:latest sudo ./target/debug/ember vm create testvm --image alpine:latest + +# dm-thin backend (in-tree dm-thin-pool target; no out-of-tree kernel module) +sudo ./target/debug/ember init \ + --storage dm-thin \ + --storage-path /var/lib/ember/dm-thin \ + --size 50G +sudo ./target/debug/ember image pull alpine:latest +sudo ./target/debug/ember vm create testvm --image alpine:latest + +# Tear down a backend +sudo ./target/debug/ember deinit --purge + +# Grow the dm-thin data device +sudo ./target/debug/ember storage grow --size 100G + +# Integration tests for dm-thin (root + dm-thin module + thin-provisioning-tools) +sudo cargo test --test dm_thin -- --ignored --test-threads=1 ``` ## Coding Style & Conventions @@ -54,8 +76,9 @@ See specs in the docs/ folder for details, when needed. 
Basic architecture choices: -- Platform-specific code lives behind backend traits (`VmBackend`, `StorageBackend`, `NetworkBackend`) with `#[cfg(target_os)]` compile-time selection. -- Shell out to platform tools: `ember-vz` (Swift helper for AVF), `hdiutil`, `diskutil`, `cp -c`, Homebrew `e2fsprogs`. +- Platform-specific code lives behind backend traits (`VmBackend`, `StorageBackend`, `NetworkBackend`). +- `VmBackend` and `NetworkBackend` are picked at compile time via `#[cfg(target_os)]`. `StorageBackend` is a runtime trait object (`Arc<dyn StorageBackend>`) so the concrete backend can be selected from `GlobalConfig.storage_backend` without a rebuild. +- Shell out to platform tools: `ember-vz` (Swift helper for AVF), `hdiutil`, `diskutil`, `cp -c`, Homebrew `e2fsprogs` on macOS; `zfs`/`zpool`/`iptables`/`dmsetup`/`losetup`/`thin-provisioning-tools` on Linux. ## Version Control diff --git a/Cargo.lock b/Cargo.lock index a7b8776..6b4ea67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -602,6 +602,7 @@ dependencies = [ "hyper-util", "hyperlocal", "nix", + "rand", "serde", "serde_json", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 37338fc..2ee59d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,9 @@ uuid = { version = "1", features = ["v4", "serde"] } # Temporary directories tempfile = "3" +# Randomness (dm-thin volume id allocation) +rand = "0.8" + [package] name = "ember" version = "0.1.0" diff --git a/Makefile b/Makefile index 8371f96..abad980 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ check: cargo check clippy: - cargo clippy -- -D warnings + cargo clippy --all-targets -- -D warnings test: cargo test diff --git a/crates/ember-core/src/backend.rs b/crates/ember-core/src/backend.rs index c50051d..8750973 100644 --- a/crates/ember-core/src/backend.rs +++ b/crates/ember-core/src/backend.rs @@ -10,7 +10,7 @@ use std::path::{Path, PathBuf}; use crate::config::size::ByteSize; -use crate::config::GlobalConfig; +use crate::config::{DmThinMode, GlobalConfig}; use crate::error::Result; use 
crate::image::registry::ImageEntry; use crate::state::vm::{NetworkInfo, VmMetadata}; @@ -32,8 +32,10 @@ pub struct StartedVm { /// Platform-agnostic snapshot information. /// -/// On Linux this is backed by ZFS snapshots (`zfs list -t snapshot`). -/// On macOS this is backed by APFS clone files in the VM's `snapshots/` directory. +/// On Linux/ZFS this is backed by `zfs list -t snapshot`. +/// On macOS/APFS this is backed by APFS clone files in the VM's +/// `snapshots/` directory. On Linux/dm-thin this is backed by entries +/// stored on `VmMetadata::snapshots`. pub struct SnapshotInfo { /// Snapshot name (e.g., "snap1"). Does not include dataset path or directory prefix. pub name: String, @@ -43,15 +45,40 @@ pub struct SnapshotInfo { /// /// - Linux/ZFS: `referenced` property (bytes the snapshot points to). /// - macOS/APFS: logical file size via `stat`. + /// - Linux/dm-thin: virtual volume size at snapshot time. pub size: u64, } +/// A storage volume returned by the [`StorageBackend`] when a fresh +/// volume is created (image base, VM clone, fork, restore). +/// +/// `disk_path` is what gets recorded on `VmMetadata::disk_path` / +/// `ImageEntry::disk_path` and passed to Firecracker as +/// `path_on_host`. `thin_id` is meaningful only for the dm-thin +/// backend; ZFS and macOS impls always return `None`. +pub struct VolumeHandle { + pub disk_path: PathBuf, + pub thin_id: Option, +} + +impl VolumeHandle { + /// Build a handle for backends that have no thin id concept. + pub fn from_path(path: impl Into) -> Self { + Self { + disk_path: path.into(), + thin_id: None, + } + } +} + /// Configuration for storage backend initialization during `ember init`. /// /// Carries the subset of init arguments that the storage backend needs. -/// Platform-specific fields (like ZFS pool/dataset) are ignored on platforms -/// that don't use them. +/// Platform-specific fields are ignored on backends that don't use them. pub struct InitConfig { + /// Selected storage backend. 
Drives the [`StorageBackend::init`] + /// dispatch performed by `init_storage` in each platform crate. + pub storage_backend: crate::config::StorageKind, /// Path to the state directory (e.g., `/var/lib/ember` or `~/Library/Application Support/ember`). pub state_dir: PathBuf, /// ZFS pool name. Used on Linux for `zfs create`; ignored on macOS. @@ -59,8 +86,28 @@ pub struct InitConfig { /// Dataset name within the ZFS pool. Used on Linux; ignored on macOS. pub dataset: String, /// Block device for ZFS pool creation (e.g., `/dev/loop0`). - /// Only used on Linux when creating a new pool. + /// Only used by the ZFS backend when creating a new pool. pub device: Option, + /// Backing path for non-ZFS backends. + /// + /// * btrfs: block device or sparse image file path. + /// * dm-thin: directory for metadata.img/data.img, or a raw block device. + pub storage_path: Option, + /// Size for the file-backed btrfs image (e.g., `"50G"`). When set, the + /// btrfs backend treats `storage_path` as a sparse file to create. + pub btrfs_size: Option, + /// Size of the dm-thin data device. Required for file-backed + /// dm-thin pools, ignored for raw block devices. + pub dm_thin_size: Option, + /// Override metadata device size for dm-thin. `None` lets the + /// backend compute it via `thin_metadata_size`. + pub dm_thin_metadata_size: Option, + /// dm-thin pool block size in 512-byte sectors. `None` uses the backend default. + pub dm_thin_block_size: Option, + /// dm-thin layout (file-backed vs raw-device). Resolved by the CLI + /// from `storage_path` so the backend doesn't have to second-guess + /// what the user supplied. + pub dm_thin_mode: Option, } // --------------------------------------------------------------------------- @@ -106,117 +153,130 @@ pub trait VmBackend { /// Storage backend: manages disk images, clones, and snapshots. /// -/// - **Linux**: ZFS zvols with snapshots and `zfs clone`. -/// - **macOS**: raw `.img` files with APFS CoW clones (`cp -c`). 
+/// - **Linux/ZFS**: ZFS zvols with snapshots and `zfs clone`. +/// - **Linux/dm-thin**: device-mapper thin volumes with kernel snapshots. +/// - **macOS/APFS**: raw `.img` files with APFS CoW clones (`cp -c`). /// -/// Methods use `&self` so the implementation can hold platform-specific config -/// (e.g., ZFS pool/dataset paths on Linux, state directory on macOS). -/// `init` is an associated function since it's called before the backend is constructed. +/// Methods take `&VmMetadata` / `&ImageEntry` rather than bare names +/// for operations that need backend-specific state living on the +/// record (notably `thin_id` for dm-thin). Methods that *create* fresh +/// volumes return [`VolumeHandle`] so the caller can persist the new +/// `thin_id` (if any) on the matching record. +/// +/// `init` is an associated function since it's called before the +/// backend is constructed. pub trait StorageBackend { /// Initialize storage during `ember init`. - /// - /// Linux: creates ZFS pool (if needed) and datasets. - /// macOS: validates the state directory is on an APFS volume. fn init(config: &InitConfig) -> Result<()> where Self: Sized; + /// Tear down the backend infrastructure created by [`StorageBackend::init`]. + /// + /// Inverse of `init`. The backend is responsible for unmounting, + /// detaching, and (when `purge` is set) deleting backing files. + /// Block devices supplied by the user are left intact in either + /// case. The CLI removes `config.json` separately. + fn deinit(&self, purge: bool) -> Result<()>; + + /// Grow the underlying pool capacity. Currently meaningful only for + /// dm-thin file-backed pools; ZFS/btrfs/APFS return an error since + /// they manage capacity differently (or the user resizes individual + /// VM disks via [`StorageBackend::resize`]). + fn grow(&self, new_size: ByteSize) -> Result<()>; + /// Create a base image volume from an ext4 image file. /// /// `name` is the image identifier (e.g., `library-alpine-latest`). 
/// `image_path` is the path to the ext4 image file to import. - /// `size_mib` is the image size in MiB (used for zvol creation on Linux). + /// `size_mib` is the image size in MiB. /// - /// Returns the zvol path (Linux) or .img file path (macOS). - /// - /// Linux: creates a zvol, writes the image via `dd`, creates `@base` snapshot. - /// macOS: copies the `.img` file into `images/data/`. - fn create_image_volume(&self, name: &str, image_path: &Path, size_mib: u64) -> Result; + /// Linux/ZFS: creates a zvol, writes the image via `dd`, creates `@base` snapshot. + /// Linux/dm-thin: allocates a thin volume, writes the image, snaps it as the base id. + /// macOS/APFS: copies the `.img` file into `images/data/`. + fn create_image_volume( + &self, + name: &str, + image_path: &Path, + size_mib: u64, + ) -> Result; - /// Clone a base image for a new VM. Returns the zvol path (Linux) or - /// .img file path (macOS). + /// Clone a base image for a new VM. /// - /// Linux: `zfs clone pool/.../images/name@base pool/.../vms/vm_name`. - /// macOS: `cp -c images/data/name.img vms/vm_name/rootfs.img`. - fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result; + /// Linux/ZFS: `zfs clone @base /.../vms/`. + /// Linux/dm-thin: snapshot the image's base thin id into a fresh thin id. + /// macOS/APFS: `cp -c .img /rootfs.img`. + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result; /// Create a named snapshot of a VM's current disk state. /// - /// Linux: `zfs snapshot pool/.../vms/vm_name@snap_name`. - /// macOS: `cp -c vms/vm_name/rootfs.img vms/vm_name/snapshots/snap_name.img`. - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + /// Returns `Some(SnapshotEntry)` when the backend persists snapshot + /// metadata in user-space state (dm-thin). Returns `None` when the + /// backend tracks snapshots itself (ZFS in the kernel, APFS as + /// files on disk). 
+ fn snapshot( + &self, + vm: &VmMetadata, + snap_name: &str, + ) -> Result>; /// Restore a VM's disk to a previously created snapshot. /// - /// Linux: `zfs rollback pool/.../vms/vm_name@snap_name`. - /// macOS: `cp -c vms/vm_name/snapshots/snap_name.img vms/vm_name/rootfs.img`. - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + /// Returns a fresh `VolumeHandle` because some backends generate a + /// new identifier on restore (dm-thin's `delete` + `create_snap` + /// produces a new `thin_id`). For backends that mutate the volume + /// in place (ZFS rollback) or replace the file atomically (APFS), + /// the handle's `thin_id` is `None`. + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result; /// Delete a snapshot. - /// - /// Linux: `zfs destroy pool/.../vms/vm_name@snap_name`. - /// macOS: `rm vms/vm_name/snapshots/snap_name.img`. - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()>; + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()>; /// List all snapshots for a VM. - fn list_snapshots(&self, vm_name: &str) -> Result>; + fn list_snapshots(&self, vm: &VmMetadata) -> Result>; - /// Resize a VM's disk to `new_size`. - /// - /// Linux: `zfs set volsize=... + resize2fs`. - /// macOS: `truncate -s ... + resize2fs`. - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()>; + /// Resize a VM's disk to `new_size`. Caller is responsible for + /// stopping the VM first. + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()>; /// Destroy all storage for a VM (disk image, snapshots). - /// - /// Linux: `zfs destroy -r pool/.../vms/vm_name`. - /// macOS: `rm -rf vms/vm_name/` (disk files only; state is separate). - fn destroy_vm_storage(&self, vm_name: &str) -> Result<()>; + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()>; /// Destroy storage for a base image. /// - /// With `force: true`, also destroys any dependent storage (e.g. 
VM zvols - /// cloned from this image) that couldn't be cleaned up at the application - /// level — typically orphaned ZFS clones whose state files are already gone. - /// - /// Linux: `zfs destroy -r` (normal) or `zfs destroy -R` (force). - /// macOS: `rm images/data/name.img` (force flag is a no-op). - fn destroy_image_storage(&self, name: &str, force: bool) -> Result<()>; + /// With `force: true`, also destroys any dependent storage (e.g. + /// VM zvols cloned from this image) that couldn't be cleaned up at + /// the application level — typically orphaned ZFS clones whose + /// state files are already gone. + fn destroy_image_storage(&self, image: &ImageEntry, force: bool) -> Result<()>; - /// Get the mountable device path for a VM's root disk. + /// Mountable device path for a VM's root disk. + /// + /// Linux/ZFS: `/dev/zvol/pool/dataset/vms/vm_name`. + /// Linux/dm-thin: `/dev/mapper/ember-vm-`. + /// macOS/APFS: `/vms//rootfs.img`. /// - /// Linux: `/dev/zvol/pool/dataset/vms/vm_name` (block device for the zvol). - /// macOS: `state_dir/vms/vm_name/rootfs.img` (raw disk image file). - fn disk_device_path(&self, vm_name: &str) -> PathBuf; + /// Backends that lazily activate kernel state (notably dm-thin: pool + /// table + per-VM thin device live only in kernel memory and are + /// gone after a host reboot) must ensure the device is live before + /// returning. Callers — `LinuxVm::start`, `vm create`, `vm fork` — + /// rely on this so the path is immediately usable for `mount` / + /// `open`. + fn disk_device_path(&self, vm: &VmMetadata) -> Result; /// Clone a VM's disk storage to create a new VM (used by `vm fork`). - /// - /// Returns the disk path for the new VM. - /// - /// On Linux, this creates a ZFS snapshot on the source VM and clones it. - /// The snapshot naming convention is internal to the backend. - /// On macOS, this does a direct `cp -c` (APFS CoW clone) — no intermediate - /// snapshot, no dependency between source and target. 
- fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result; + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result; /// Clean up fork-related resources on the source VM. /// - /// Called when deleting a forked VM to remove any backend-specific - /// resources (e.g., ZFS snapshot on the source VM). The backend - /// reconstructs the resource name from the parent/forked VM names. - /// - /// No-op on backends where forks are independent (e.g., macOS/APFS). - fn cleanup_fork(&self, parent_vm: &str, forked_vm: &str) -> Result<()>; + /// Used by ZFS to drop the per-fork snapshot it created on the + /// source's dataset. No-op on backends where forks are independent + /// (dm-thin, APFS). + fn cleanup_fork(&self, parent: &VmMetadata, forked: &VmMetadata) -> Result<()>; - /// Check if deleting this VM would break other VMs' storage. - /// - /// Returns the names of VMs whose storage depends on this VM - /// (e.g., ZFS clones that reference snapshots on this VM's dataset). - /// An empty vec means the VM can be safely deleted. - /// - /// On Linux/ZFS, fork snapshots create a real dependency chain. - /// On macOS/APFS, forks are independent — always returns empty. - fn storage_dependents(&self, vm_name: &str) -> Result>; + /// VMs whose storage depends on `vm` and would break if `vm` were + /// destroyed. Empty for backends whose forks are independent. + fn storage_dependents(&self, vm: &VmMetadata) -> Result>; /// Mount a disk image and return the mount point path. /// diff --git a/crates/ember-core/src/config.rs b/crates/ember-core/src/config.rs index a651c2d..dd12bc6 100644 --- a/crates/ember-core/src/config.rs +++ b/crates/ember-core/src/config.rs @@ -5,9 +5,58 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; +/// Which storage backend is active. +/// +/// On Linux, runtime-selected at `ember init` and serialized to `config.json`. 
+/// Older configs without this field default to [`StorageKind::Zfs`] for +/// backwards compatibility. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum StorageKind { + #[default] + Zfs, + Btrfs, + DmThin, +} + +impl std::str::FromStr for StorageKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "zfs" => Ok(Self::Zfs), + "btrfs" => Ok(Self::Btrfs), + "dm-thin" | "dmthin" | "dm_thin" => Ok(Self::DmThin), + other => Err(format!( + "unknown storage backend '{other}' (expected zfs, btrfs, or dm-thin)" + )), + } + } +} + +/// How the dm-thin pool's data device is provided. +/// +/// Resolved at `ember init` from the `--storage-path` argument and +/// persisted on `GlobalConfig` so reactivation does not depend on a +/// runtime filesystem probe — `is_dir()` could disagree with init if +/// the directory was removed, or a raw device replaced a file. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum DmThinMode { + /// `--storage-path` is a directory holding `metadata.img`/`data.img`. + File, + /// `--storage-path` is a raw block device used as the data device. + /// Metadata then lives under `state_dir/dm-thin-metadata.img`. + RawDevice, +} + /// Global configuration written by `ember init`. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct GlobalConfig { + /// Storage backend selected at init time. + /// Defaults to [`StorageKind::Zfs`] for older configs without this field. + #[serde(default)] + pub storage_backend: StorageKind, pub pool: String, pub dataset: String, pub kernel_path: Option, @@ -19,6 +68,28 @@ pub struct GlobalConfig { /// Populated during `ember init`; defaults to empty path for backwards compat. #[serde(default)] pub state_dir: PathBuf, + /// Backing path for non-ZFS backends. + /// + /// * btrfs: block device or sparse image file containing the btrfs filesystem. 
+ /// * dm-thin: directory holding `metadata.img`/`data.img`, or a raw block device. + /// * ZFS: unused. + #[serde(default)] + pub storage_path: Option, + /// dm-thin pool block size in 512-byte sectors. Permanent at pool + /// creation, so `ember init` resolves the user flag (or default) at + /// init time and persists the actual value here. `None` means "use + /// the backend default" — only legacy configs predating this + /// resolution should hit that branch; new configs always pin the + /// value the running pool was created with. + #[serde(default)] + pub dm_thin_block_size: Option, + /// dm-thin pool layout: file-backed (sparse files inside + /// `storage_path`) or raw-device (`storage_path` is a block device). + /// Resolved at `ember init` and persisted so reactivation does not + /// rely on a live `is_dir()` probe. `None` on legacy configs and on + /// non-dm-thin backends. + #[serde(default)] + pub dm_thin_mode: Option, } impl GlobalConfig { diff --git a/crates/ember-core/src/error.rs b/crates/ember-core/src/error.rs index 39944e7..3307a7f 100644 --- a/crates/ember-core/src/error.rs +++ b/crates/ember-core/src/error.rs @@ -49,6 +49,11 @@ pub enum Error { #[error("state: {0}")] State(String), + /// Storage pool error (dm-thin / btrfs / ZFS pool-level state, as + /// distinct from individual volume / dataset errors). + #[error("storage pool: {0}")] + Pool(String), + /// Config parsing or validation error. #[error("config: {0}")] Config(String), diff --git a/crates/ember-core/src/image/registry.rs b/crates/ember-core/src/image/registry.rs index a92c1f7..17447a1 100644 --- a/crates/ember-core/src/image/registry.rs +++ b/crates/ember-core/src/image/registry.rs @@ -27,6 +27,9 @@ pub struct ImageEntry { pub size_mib: u64, /// ISO 8601 timestamp when the image was pulled. pub pulled_at: String, + /// dm-thin base snapshot id. `None` for other backends. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, } /// The local image registry: a list of pulled images. @@ -103,13 +106,19 @@ impl ImageRegistry { } /// Build an [`ImageEntry`] from a pull result. -pub fn new_entry(reference: &ImageReference, disk_path: &str, size_mib: u64) -> ImageEntry { +pub fn new_entry( + reference: &ImageReference, + disk_path: &str, + size_mib: u64, + thin_id: Option, +) -> ImageEntry { ImageEntry { reference: reference.to_string(), local_name: reference.local_name(), disk_path: disk_path.to_string(), size_mib, pulled_at: now_iso8601(), + thin_id, } } @@ -117,13 +126,20 @@ pub fn new_entry(reference: &ImageReference, disk_path: &str, size_mib: u64) -> /// /// The reference is stored as `local:` to distinguish built /// images from pulled ones in `ember image list` output. -pub fn new_build_entry(name: &str, local_name: &str, disk_path: &str, size_mib: u64) -> ImageEntry { +pub fn new_build_entry( + name: &str, + local_name: &str, + disk_path: &str, + size_mib: u64, + thin_id: Option, +) -> ImageEntry { ImageEntry { reference: format!("local:{name}"), local_name: local_name.to_string(), disk_path: disk_path.to_string(), size_mib, pulled_at: now_iso8601(), + thin_id, } } @@ -163,6 +179,7 @@ mod tests { disk_path: format!("tank/ember/images/library-{name}-latest"), size_mib: 64, pulled_at: "2026-01-01T00:00:00Z".to_string(), + thin_id: None, } } @@ -274,13 +291,19 @@ mod tests { #[test] fn new_entry_builds_correctly() { let reference = ImageReference::parse("alpine:3.19").unwrap(); - let entry = new_entry(&reference, "tank/ember/images/library-alpine-3.19", 96); + let entry = new_entry( + &reference, + "tank/ember/images/library-alpine-3.19", + 96, + None, + ); assert_eq!(entry.reference, "docker.io/library/alpine:3.19"); assert_eq!(entry.local_name, "library-alpine-3.19"); assert_eq!(entry.disk_path, "tank/ember/images/library-alpine-3.19"); assert_eq!(entry.size_mib, 96); 
assert!(!entry.pulled_at.is_empty()); + assert_eq!(entry.thin_id, None); } #[test] diff --git a/crates/ember-core/src/state/vm.rs b/crates/ember-core/src/state/vm.rs index c6ae877..ebf6e38 100644 --- a/crates/ember-core/src/state/vm.rs +++ b/crates/ember-core/src/state/vm.rs @@ -63,6 +63,28 @@ pub struct NetworkInfo { pub wan_iface: Option, } +/// A snapshot tracked by a backend that doesn't have a native list +/// query. +/// +/// ZFS records snapshots in the kernel and lists them via `zfs list -t +/// snapshot`, so [`VmMetadata::snapshots`] stays empty for ZFS. dm-thin +/// addresses snapshots by numeric thin id with no name attached at the +/// kernel level, so it persists names + ids in `vm.json`. macOS APFS +/// uses on-disk filenames, so it also doesn't need this list. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SnapshotEntry { + /// User-visible snapshot name. + pub name: String, + /// Backend-specific thin id. Only meaningful for the dm-thin backend. + pub thin_id: u64, + /// Creation time as Unix epoch seconds — same shape as + /// [`crate::backend::SnapshotInfo::created_at`] so the backend's + /// `list_snapshots` can copy this through without reparsing. + pub created_at: u64, + /// Volume size in 512-byte sectors. + pub size_sectors: u64, +} + /// SSH connection configuration for a VM. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SshConfig { @@ -132,6 +154,16 @@ pub struct VmMetadata { /// is purely informational — no cleanup or deletion constraints apply. #[serde(default, alias = "forked_from")] pub parent_vm: Option, + /// dm-thin volume id. `None` for ZFS/APFS backends, which encode + /// volume identity in [`disk_path`](Self::disk_path) instead. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option, + /// Snapshots maintained by the storage backend in user-space state. 
+ /// + /// dm-thin populates this; ZFS and macOS leave it empty and surface + /// snapshots through their native APIs. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub snapshots: Vec, } impl VmMetadata { @@ -162,6 +194,8 @@ impl VmMetadata { key: PathBuf::new(), }, parent_vm: None, + thin_id: None, + snapshots: Vec::new(), } } } @@ -291,16 +325,21 @@ pub fn delete(store: &StateStore, name: &str) -> Result<()> { store.remove_dir(&dir) } -/// Current UTC time as an ISO 8601 string (second precision). -/// -/// Format: `YYYY-MM-DDTHH:MM:SSZ` (always UTC). -pub fn now_iso8601() -> String { +/// Current UTC time as Unix epoch seconds. +pub fn now_epoch_secs() -> u64 { use std::time::{SystemTime, UNIX_EPOCH}; - let secs = SystemTime::now() + SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() - .as_secs(); + .as_secs() +} + +/// Current UTC time as an ISO 8601 string (second precision). +/// +/// Format: `YYYY-MM-DDTHH:MM:SSZ` (always UTC). +pub fn now_iso8601() -> String { + let secs = now_epoch_secs(); // Break epoch seconds into date/time components. let days = secs / 86400; @@ -354,6 +393,8 @@ mod tests { created_at: "2026-01-01T00:00:00Z".to_string(), ssh: SshConfig::default(), parent_vm: None, + thin_id: None, + snapshots: Vec::new(), } } diff --git a/crates/ember-linux/Cargo.toml b/crates/ember-linux/Cargo.toml index 4b71962..ced50f4 100644 --- a/crates/ember-linux/Cargo.toml +++ b/crates/ember-linux/Cargo.toml @@ -20,3 +20,4 @@ nix = { workspace = true, features = ["fs", "ioctl", "net", "signal", "process", anyhow = { workspace = true } uuid = { workspace = true } tempfile = { workspace = true } +rand = { workspace = true } diff --git a/crates/ember-linux/src/dm_thin.rs b/crates/ember-linux/src/dm_thin.rs new file mode 100644 index 0000000..ee9e916 --- /dev/null +++ b/crates/ember-linux/src/dm_thin.rs @@ -0,0 +1,100 @@ +//! Linux device-mapper thin provisioning backend. +//! +//! 
Thin pools provide block-level copy-on-write storage. A single +//! [`pool::POOL_NAME`] pool aggregates two backing devices (metadata and +//! data) and exposes any number of independent thin volumes addressed by +//! 64-bit numeric IDs. Snapshots and clones are the same primitive +//! ([`thin::create_snap`]) — snapshotting a thin volume produces another +//! thin volume that shares blocks until divergence. +//! +//! See `docs/DM-THIN-SPEC.md` for the full design. + +pub mod loop_device; +pub mod pool; +pub mod thin; +pub mod tools; + +/// Sectors are always 512 bytes on Linux block devices. +pub const SECTOR_SIZE: u64 = 512; + +/// Convert bytes to sectors, rounding up. +pub fn bytes_to_sectors(bytes: u64) -> u64 { + bytes.div_ceil(SECTOR_SIZE) +} + +/// Whether a device-mapper device with the given name is currently +/// active. Used to probe pools, thin volumes, and staging devices — +/// `dmsetup info` doesn't care which kind it is. +pub fn dm_device_exists(name: &str) -> ember_core::error::Result { + let output = std::process::Command::new("dmsetup") + .args(["info", "--noheadings", name]) + .output() + .map_err(|e| ember_core::error::Error::CommandExec { + command: "dmsetup info".to_string(), + source: e, + })?; + Ok(output.status.success()) +} + +/// Whether an [`Error`](ember_core::error::Error) reports a kernel `EEXIST` +/// from a `dmsetup message` operation. Used by the `create_thin` / +/// `create_snap` retry loops to detect thin id collisions. +/// +/// `dmsetup` translates the kernel's `-EEXIST` into a stderr line that +/// embeds the libc `strerror` for `EEXIST` — `"File exists"` on glibc +/// and musl. The exact wrapping line has shifted across `lvm2` +/// releases (e.g. `"device-mapper: message ioctl on ember-pool failed: +/// File exists"`), but the trailing strerror is stable. 
Pinned and +/// regression-tested against: +/// +/// * Linux 6.1+ (Debian 12, Ubuntu 24.04) +/// * `lvm2` 2.03.x (Debian / Fedora packaging from 2023+) +/// * glibc and musl (`strerror(EEXIST) == "File exists"`) +/// +/// If a future kernel/lvm2/libc combination changes the wording, +/// retries will turn into hard failures rather than collide silently — +/// the `tests::matches_dmsetup_eexist_message` test below would be +/// the first thing to fail. +pub fn is_already_exists(err: &ember_core::error::Error) -> bool { + matches!( + err, + ember_core::error::Error::Command { stderr, .. } if stderr.contains("File exists") + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use ember_core::error::Error; + + /// Regression: mirror an actual `dmsetup message` failure line so a + /// future glibc/lvm2 wording change is loud, not silent. Captured + /// from a Linux 6.1 / lvm2 2.03 host attempting `create_thin` with + /// a duplicate id. + #[test] + fn matches_dmsetup_eexist_message() { + let err = Error::Command { + command: "dmsetup".to_string(), + exit_code: 1, + stderr: "device-mapper: message ioctl on ember-pool failed: File exists\n".to_string(), + }; + assert!(is_already_exists(&err)); + } + + #[test] + fn rejects_unrelated_errors() { + let err = Error::Command { + command: "dmsetup".to_string(), + exit_code: 1, + stderr: "device-mapper: reload ioctl on ember-pool failed: Invalid argument\n" + .to_string(), + }; + assert!(!is_already_exists(&err)); + } + + #[test] + fn rejects_non_command_errors() { + let err = Error::Vm("File exists somewhere else in the system".to_string()); + assert!(!is_already_exists(&err)); + } +} diff --git a/crates/ember-linux/src/dm_thin/loop_device.rs b/crates/ember-linux/src/dm_thin/loop_device.rs new file mode 100644 index 0000000..b34cf0e --- /dev/null +++ b/crates/ember-linux/src/dm_thin/loop_device.rs @@ -0,0 +1,97 @@ +//! `losetup` wrappers for attaching backing files as loop block devices. +//! +//! 
The dm-thin backend uses loop devices to expose sparse `metadata.img` and +//! `data.img` files as block devices that the kernel can assemble into a +//! thin pool. Attachments live only in kernel memory, so the loop devices +//! must be re-attached after every reboot; [`find_for`] locates an +//! attachment that already exists. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +/// Attach `file` to the next available loop device. +/// +/// Returns the loop device path (e.g., `/dev/loop0`). +pub fn attach(file: &Path) -> Result { + let output = Command::new("losetup") + .args(["-f", "--show"]) + .arg(file) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup".to_string(), + source: e, + })?; + + let output = Error::check_command("losetup -f --show", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let path = stdout.trim(); + if path.is_empty() { + return Err(Error::Command { + command: "losetup -f --show".to_string(), + exit_code: 0, + stderr: format!( + "expected a loop device path on stdout, got empty output for {}", + file.display() + ), + }); + } + Ok(PathBuf::from(path)) +} + +/// Detach a loop device. +/// +/// Idempotent in spirit but not in fact: callers should ignore failures +/// during teardown if the loop device may already be gone. +pub fn detach(loop_dev: &Path) -> Result<()> { + let output = Command::new("losetup") + .arg("-d") + .arg(loop_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup -d".to_string(), + source: e, + })?; + Error::check_command("losetup -d", output)?; + Ok(()) +} + +/// Re-read the backing file's size into the loop device. +/// +/// Required after `truncate`-ing the data backing file when growing the +/// pool: the loop driver caches the size, so the kernel doesn't see the +/// new bytes until we ask it to refresh. 
+pub fn refresh_size(loop_dev: &Path) -> Result<()> { + let output = Command::new("losetup") + .arg("-c") + .arg(loop_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup -c".to_string(), + source: e, + })?; + Error::check_command("losetup -c", output)?; + Ok(()) +} + +/// Look up the loop device currently backing `file`, if any. +pub fn find_for(file: &Path) -> Result> { + let output = Command::new("losetup") + .args(["-j", "-O", "NAME", "--noheadings"]) + .arg(file) + .output() + .map_err(|e| Error::CommandExec { + command: "losetup -j".to_string(), + source: e, + })?; + + // `losetup -j` exits 0 even when the file has no loop attached. + let output = Error::check_command("losetup -j", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let first = stdout + .lines() + .next() + .map(str::trim) + .filter(|s| !s.is_empty()); + Ok(first.map(PathBuf::from)) +} diff --git a/crates/ember-linux/src/dm_thin/pool.rs b/crates/ember-linux/src/dm_thin/pool.rs new file mode 100644 index 0000000..28cda33 --- /dev/null +++ b/crates/ember-linux/src/dm_thin/pool.rs @@ -0,0 +1,396 @@ +//! `dmsetup` wrappers for the `thin-pool` target. +//! +//! A thin pool is the kernel-side container holding metadata + data +//! devices and exposing thin volumes as snapshot-capable block devices. +//! Ember runs a single named pool ([`POOL_NAME`]) per installation. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +/// Device-mapper name of the singleton thin pool used by ember. +pub const POOL_NAME: &str = "ember-pool"; + +/// Default pool block size in 512-byte sectors (= 64 KiB). +/// +/// Permanent at pool creation. Smaller blocks improve sharing across +/// snapshots but inflate metadata; larger blocks reduce metadata at the +/// cost of write amplification when only part of a block is dirtied. +pub const DEFAULT_BLOCK_SIZE_SECTORS: u32 = 128; + +/// Default low-water-mark in pool blocks. 
/// With the default 64 KiB block
/// size this is 2 GiB of free space (32 768 × 64 KiB) — the threshold at
/// which the kernel raises a `dmeventd` notification.
pub const DEFAULT_LOW_WATER_BLOCKS: u64 = 32_768;

/// Operating mode reported by `dmsetup status`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PoolMode {
    /// Normal operation.
    ReadWrite,
    /// Pool entered read-only after a metadata error or admin request.
    ReadOnly,
    /// Pool ran out of data blocks. New writes return EIO until grown.
    OutOfDataSpace,
    /// Pool is unrecoverable; metadata or device-level failure.
    Failed,
}

/// Status snapshot returned by [`status`].
///
/// Sizes are in pool blocks (not sectors): each block is
/// [`DEFAULT_BLOCK_SIZE_SECTORS`] × 512 bytes by default.
///
/// NOTE(review): the metadata counters are copied verbatim from the
/// `used/total` metadata field of `dmsetup status`; the kernel may count
/// those in a different block unit than the data counters — confirm
/// before converting them to bytes.
#[derive(Debug)]
pub struct PoolStatus {
    /// Metadata blocks currently in use.
    pub used_metadata_blocks: u64,
    /// Total metadata blocks available.
    pub total_metadata_blocks: u64,
    /// Data blocks currently in use.
    pub used_data_blocks: u64,
    /// Total data blocks available.
    pub total_data_blocks: u64,
    /// Operating mode parsed from the status line.
    pub mode: PoolMode,
}

/// Ensure the kernel has the `thin-pool` device-mapper target available.
///
/// On most distributions `dm-thin-pool` is a loadable module that
/// `dmsetup` does not auto-load. We `modprobe` it best-effort (built-in
/// kernels report "Module not found" but the target is already
/// registered) and then verify it appears in `dmsetup targets`. Without
/// this check, a missing target produces an opaque `Invalid argument`
/// from `dmsetup create`.
+pub fn ensure_target_loaded() -> Result<()> { + let _ = Command::new("modprobe").arg("dm-thin-pool").output(); + + let output = Command::new("dmsetup") + .arg("targets") + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup targets".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup targets", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let has_thin_pool = stdout + .lines() + .any(|line| line.split_whitespace().next() == Some("thin-pool")); + if has_thin_pool { + return Ok(()); + } + Err(Error::Command { + command: "dmsetup targets".to_string(), + exit_code: 0, + stderr: "kernel does not provide the 'thin-pool' device-mapper target. \ + Install or enable a kernel with CONFIG_DM_THIN_PROVISIONING and \ + load it with 'modprobe dm-thin-pool'." + .to_string(), + }) +} + +/// List active device-mapper device names whose name starts with +/// `prefix`. Useful for finding all `ember-vm-*` and `ember-img-*` +/// volumes during teardown. +pub fn list_with_prefix(prefix: &str) -> Result> { + let output = Command::new("dmsetup") + .arg("ls") + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup ls".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup ls", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout + .lines() + .filter_map(|line| { + let name = line.split_whitespace().next()?; + if name.starts_with(prefix) { + Some(name.to_string()) + } else { + None + } + }) + .collect()) +} + +/// Build a `thin-pool` table line. +/// +/// The format is documented in +/// `Documentation/admin-guide/device-mapper/thin-provisioning.rst`: +/// `0 thin-pool `. 
+fn pool_table( + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> String { + format!( + "0 {data_sectors} thin-pool {} {} {block_size_sectors} {low_water_blocks}", + metadata_dev.display(), + data_dev.display(), + ) +} + +/// Activate a thin pool from existing metadata + data devices. +/// +/// If the metadata superblock is all zero the kernel formats a fresh pool; +/// otherwise it imports the existing metadata. Callers wanting a fresh +/// pool must zero the first 4 KiB of the metadata device beforehand. +pub fn create( + name: &str, + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> Result<()> { + let table = pool_table( + metadata_dev, + data_dev, + data_sectors, + block_size_sectors, + low_water_blocks, + ); + let output = Command::new("dmsetup") + .args(["create", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup create".to_string(), + source: e, + })?; + Error::check_command("dmsetup create thin-pool", output)?; + Ok(()) +} + +/// Tear down the thin pool. Does not destroy the backing devices or +/// metadata — those persist for re-activation later. +/// +/// Returns an error if any thin volume is still active. Callers should +/// deactivate all thin volumes before tearing down the pool. +pub fn remove(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["remove", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup remove".to_string(), + source: e, + })?; + Error::check_command("dmsetup remove", output)?; + Ok(()) +} + +/// Send a control message to the thin pool. +/// +/// Most thin-pool operations (`create_thin`, `create_snap`, `delete`, +/// `set_transaction_id`, …) are delivered this way rather than via +/// dedicated dmsetup subcommands. 
+pub fn message(name: &str, msg: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["message", name, "0", msg]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup message".to_string(), + source: e, + })?; + Error::check_command("dmsetup message", output)?; + Ok(()) +} + +/// Reload the pool table with new parameters (typically a larger +/// `data_sectors` after growing the data device). +/// +/// Suspend → load → resume sequence is required by the kernel for a +/// live table swap. +pub fn reload( + name: &str, + metadata_dev: &Path, + data_dev: &Path, + data_sectors: u64, + block_size_sectors: u32, + low_water_blocks: u64, +) -> Result<()> { + let table = pool_table( + metadata_dev, + data_dev, + data_sectors, + block_size_sectors, + low_water_blocks, + ); + suspend(name)?; + let load = Command::new("dmsetup") + .args(["load", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup load".to_string(), + source: e, + })?; + if let Err(e) = Error::check_command("dmsetup load thin-pool", load) { + // Best-effort resume to leave the pool live before returning. + let _ = resume(name); + return Err(e); + } + resume(name) +} + +/// Path to the activated thin-pool device. Useful for building thin +/// volume tables that reference the pool by `/dev/mapper/...`. +pub fn device_path(name: &str) -> PathBuf { + PathBuf::from(format!("/dev/mapper/{name}")) +} + +/// Suspend a device-mapper device. Required before reloading a table. +pub fn suspend(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["suspend", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup suspend".to_string(), + source: e, + })?; + Error::check_command("dmsetup suspend", output)?; + Ok(()) +} + +/// Resume a previously suspended device-mapper device. 
+pub fn resume(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["resume", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup resume".to_string(), + source: e, + })?; + Error::check_command("dmsetup resume", output)?; + Ok(()) +} + +/// Query thin-pool status via `dmsetup status`. +/// +/// Output format documented in +/// `Documentation/admin-guide/device-mapper/thin-provisioning.rst`: +/// +/// ```text +/// thin-pool / +/// / +/// +/// +/// +/// +/// ``` +pub fn status(name: &str) -> Result { + let output = Command::new("dmsetup") + .args(["status", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup status".to_string(), + source: e, + })?; + let output = Error::check_command("dmsetup status", output)?; + parse_status(&String::from_utf8_lossy(&output.stdout)) +} + +fn parse_status(line: &str) -> Result { + let fields: Vec<&str> = line.split_whitespace().collect(); + // Minimum: start, length, "thin-pool", txn_id, meta, data, held_meta, mode → 8. + if fields.len() < 8 || fields[2] != "thin-pool" { + return Err(Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("unexpected status format: {line}"), + }); + } + let (used_meta, total_meta) = parse_fraction(fields[4])?; + let (used_data, total_data) = parse_fraction(fields[5])?; + let mode = match fields[7] { + "rw" => PoolMode::ReadWrite, + "ro" => PoolMode::ReadOnly, + "out_of_data_space" => PoolMode::OutOfDataSpace, + // The kernel sometimes reports "Fail" or omits trailing fields when + // the pool is unrecoverable. 
+ "Fail" | "failed" => PoolMode::Failed, + other => { + return Err(Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("unknown pool mode: {other}"), + }); + } + }; + Ok(PoolStatus { + used_metadata_blocks: used_meta, + total_metadata_blocks: total_meta, + used_data_blocks: used_data, + total_data_blocks: total_data, + mode, + }) +} + +fn parse_fraction(s: &str) -> Result<(u64, u64)> { + let (used, total) = s.split_once('/').ok_or_else(|| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("expected used/total fraction, got: {s}"), + })?; + let used = used.parse::().map_err(|e| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("invalid used field {used:?}: {e}"), + })?; + let total = total.parse::().map_err(|e| Error::Command { + command: "dmsetup status thin-pool".to_string(), + exit_code: 0, + stderr: format!("invalid total field {total:?}: {e}"), + })?; + Ok((used, total)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_status_rw() { + let s = "0 209715200 thin-pool 12 1234/2048 5678/100000 - rw \ + discard_passdown queue_if_no_space - 1024"; + let st = parse_status(s).unwrap(); + assert_eq!(st.used_metadata_blocks, 1234); + assert_eq!(st.total_metadata_blocks, 2048); + assert_eq!(st.used_data_blocks, 5678); + assert_eq!(st.total_data_blocks, 100_000); + assert_eq!(st.mode, PoolMode::ReadWrite); + } + + #[test] + fn parse_status_out_of_data_space() { + let s = "0 209715200 thin-pool 7 100/2048 100000/100000 - out_of_data_space \ + no_discard_passdown error_if_no_space needs_check 1024"; + let st = parse_status(s).unwrap(); + assert_eq!(st.mode, PoolMode::OutOfDataSpace); + assert_eq!(st.used_data_blocks, st.total_data_blocks); + } + + #[test] + fn parse_status_failed() { + let s = "0 209715200 thin-pool 0 0/0 0/0 - Fail"; + let st = parse_status(s).unwrap(); + assert_eq!(st.mode, 
PoolMode::Failed); + } + + #[test] + fn parse_status_rejects_bad_target() { + let s = "0 100 linear 0 0/0 0/0 - rw"; + assert!(parse_status(s).is_err()); + } + + #[test] + fn pool_table_format() { + let t = pool_table( + Path::new("/dev/loop0"), + Path::new("/dev/loop1"), + 1_048_576, + 128, + 32_768, + ); + assert_eq!(t, "0 1048576 thin-pool /dev/loop0 /dev/loop1 128 32768"); + } +} diff --git a/crates/ember-linux/src/dm_thin/thin.rs b/crates/ember-linux/src/dm_thin/thin.rs new file mode 100644 index 0000000..ff62ee3 --- /dev/null +++ b/crates/ember-linux/src/dm_thin/thin.rs @@ -0,0 +1,254 @@ +//! Thin volume operations. +//! +//! In dm-thin the same primitive serves three roles: a fresh thin volume +//! (no parent), a snapshot of an existing thin volume, and a clone for a +//! VM. Volumes are addressed by 64-bit numeric IDs allocated randomly by +//! [`allocate`] (see [`crate::dm_thin`] module docs and the spec). +//! +//! Volumes are not automatically activated as `/dev/mapper/...` devices — +//! callers must explicitly [`activate`] them when needed. + +use std::path::PathBuf; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +use super::{is_already_exists, pool}; + +/// Device-mapper name prefix for image base volumes. +pub const IMAGE_PREFIX: &str = "ember-img-"; +/// Device-mapper name prefix for VM disks. +pub const VM_PREFIX: &str = "ember-vm-"; + +/// Maximum thin device id accepted by the kernel. +/// +/// `drivers/md/dm-thin.c` enforces `dev_id <= (1 << 24) - 1`: +/// +/// ```text +/// if (*dev_id > MAX_DEV_ID) { +/// DMWARN("Message received with invalid device id: %llu", *dev_id); +/// return -EINVAL; +/// } +/// ``` +/// +/// Wider values were attempted earlier in this branch's history and +/// the kernel rejected them with `EINVAL`, so we generate ids inside +/// this 24-bit range. +pub const MAX_DEV_ID: u64 = (1 << 24) - 1; + +/// Pick a fresh non-zero thin id within the kernel's 24-bit range. 
+/// +/// Birthday collision at 50% hits around 4 K ids — well above any +/// realistic ember workload (hundreds of volumes per pool). The +/// kernel still rejects duplicates atomically and [`allocate`] +/// retries on `EEXIST`, so the rare collision is harmless. +fn fresh_thin_id() -> u64 { + // Avoid id 0 — keeps logs/diagnostics easier to read. + loop { + let raw: u32 = rand::random(); + let id = (raw as u64) & MAX_DEV_ID; + if id != 0 { + return id; + } + } +} + +/// Allocate a fresh thin volume in `pool` and return its id. +/// +/// Picks a random `u64`, calls `create_thin`, and retries on the +/// vanishingly rare `EEXIST` collision. +pub fn allocate(pool_name: &str) -> Result { + loop { + let id = fresh_thin_id(); + match pool::message(pool_name, &format!("create_thin {id}")) { + Ok(()) => return Ok(id), + Err(e) if is_already_exists(&e) => continue, + Err(e) => return Err(e), + } + } +} + +/// Allocate a fresh snapshot of `src_id` and return its new id. +/// +/// Snapshots and thin volumes are the same primitive; the only +/// difference is the `create_snap` message specifies a parent. +pub fn allocate_snap(pool_name: &str, src_id: u64) -> Result { + loop { + let id = fresh_thin_id(); + match pool::message(pool_name, &format!("create_snap {id} {src_id}")) { + Ok(()) => return Ok(id), + Err(e) if is_already_exists(&e) => continue, + Err(e) => return Err(e), + } + } +} + +/// Free a thin volume's id and release its blocks back to the pool. +/// +/// The volume must not be activated as a device — call [`deactivate`] +/// first if necessary. +pub fn delete(pool_name: &str, thin_id: u64) -> Result<()> { + pool::message(pool_name, &format!("delete {thin_id}")) +} + +/// Path of a thin volume's device once activated. +pub fn device_path(name: &str) -> PathBuf { + PathBuf::from(format!("/dev/mapper/{name}")) +} + +/// Whether a thin volume is currently activated as a `/dev/mapper` +/// device. 
+pub fn is_active(name: &str) -> Result { + super::dm_device_exists(name) +} + +/// Activate a thin volume as a `/dev/mapper/` block device. +/// +/// `size_sectors` is the volume's virtual size; the pool only allocates +/// blocks as the volume is written to. +pub fn activate(name: &str, pool_name: &str, thin_id: u64, size_sectors: u64) -> Result { + let table = thin_table(pool_name, thin_id, size_sectors); + let output = Command::new("dmsetup") + .args(["create", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup create".to_string(), + source: e, + })?; + Error::check_command("dmsetup create thin", output)?; + Ok(device_path(name)) +} + +/// Tear down a thin volume's `/dev/mapper` device. The underlying thin +/// id and its blocks remain in the pool until [`delete`] is called. +pub fn deactivate(name: &str) -> Result<()> { + let output = Command::new("dmsetup") + .args(["remove", name]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup remove".to_string(), + source: e, + })?; + Error::check_command("dmsetup remove", output)?; + Ok(()) +} + +/// Suspend a thin volume's I/O. Required before snapshotting or +/// reloading the table. +pub fn suspend(name: &str) -> Result<()> { + pool::suspend(name) +} + +/// Resume a previously suspended thin volume. +pub fn resume(name: &str) -> Result<()> { + pool::resume(name) +} + +/// Reload the thin volume's table to expose a new virtual size. +/// +/// Pool capacity is unaffected — thin volumes are virtually sized at +/// activation time and only consume blocks as they are written. Caller +/// is still responsible for filesystem-level resize (e.g. `resize2fs`). 
+pub fn reload_size(name: &str, pool_name: &str, thin_id: u64, new_size_sectors: u64) -> Result<()> { + let table = thin_table(pool_name, thin_id, new_size_sectors); + suspend(name)?; + let load = Command::new("dmsetup") + .args(["load", name, "--table", &table]) + .output() + .map_err(|e| Error::CommandExec { + command: "dmsetup load".to_string(), + source: e, + })?; + if let Err(e) = Error::check_command("dmsetup load thin", load) { + let _ = resume(name); + return Err(e); + } + resume(name) +} + +fn thin_table(pool_name: &str, thin_id: u64, size_sectors: u64) -> String { + let pool_dev = pool::device_path(pool_name); + format!("0 {size_sectors} thin {} {thin_id}", pool_dev.display()) +} + +/// Sanitize an arbitrary name (image or VM) into a device-mapper-safe +/// component. dmsetup forbids `/`, `:`, and shell metacharacters; the +/// existing image/VM naming policy already enforces the right shape, so +/// this is a defensive guard rather than a real transformation. +pub fn sanitize_dm_name(name: &str) -> String { + name.chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' || c == '_' { + c + } else { + '_' + } + }) + .collect() +} + +/// Device-mapper name for a VM volume. +pub fn vm_dm_name(vm_name: &str) -> String { + format!("{VM_PREFIX}{}", sanitize_dm_name(vm_name)) +} + +/// Device-mapper name for an image base volume. +pub fn image_dm_name(image_name: &str) -> String { + format!("{IMAGE_PREFIX}{}", sanitize_dm_name(image_name)) +} + +/// Device-mapper name for the temporary staging volume used while +/// writing a fresh image into the pool. Held only between +/// `create_thin` and the post-`dd` snapshot. +pub fn image_staging_dm_name(image_name: &str) -> String { + format!("{IMAGE_PREFIX}{}-staging", sanitize_dm_name(image_name)) +} + +/// Path that should be passed to Firecracker as `path_on_host`. 
+pub fn vm_device_path(vm_name: &str) -> PathBuf { + device_path(&vm_dm_name(vm_name)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn fresh_thin_id_is_nonzero_and_in_range() { + for _ in 0..1000 { + let id = fresh_thin_id(); + assert_ne!(id, 0); + assert!(id <= MAX_DEV_ID, "id {id} exceeds kernel max {MAX_DEV_ID}"); + } + } + + #[test] + fn fresh_thin_id_distribution() { + // 100 random ids in a 24-bit space collide with probability + // ≈ 100²/(2·2²⁴) ≈ 3·10⁻⁴, so duplicates here would be a real bug. + let ids: std::collections::HashSet = (0..100).map(|_| fresh_thin_id()).collect(); + assert_eq!(ids.len(), 100); + } + + #[test] + fn thin_table_shape() { + let t = thin_table("ember-pool", 42, 16_777_216); + assert_eq!(t, "0 16777216 thin /dev/mapper/ember-pool 42"); + } + + #[test] + fn dm_names() { + assert_eq!(vm_dm_name("myvm"), "ember-vm-myvm"); + assert_eq!( + image_dm_name("library-alpine-latest"), + "ember-img-library-alpine-latest" + ); + assert_eq!(image_staging_dm_name("foo"), "ember-img-foo-staging"); + } + + #[test] + fn sanitize_keeps_safe_chars() { + assert_eq!(sanitize_dm_name("alpine_3.18-edge"), "alpine_3_18-edge"); + assert_eq!(sanitize_dm_name("my/vm:1"), "my_vm_1"); + } +} diff --git a/crates/ember-linux/src/dm_thin/tools.rs b/crates/ember-linux/src/dm_thin/tools.rs new file mode 100644 index 0000000..46eb23f --- /dev/null +++ b/crates/ember-linux/src/dm_thin/tools.rs @@ -0,0 +1,98 @@ +//! Wrappers around the `thin-provisioning-tools` package: `thin_check`, +//! `thin_repair`, `thin_metadata_size`, `thin_dump`. +//! +//! These are recommended (and in some cases required) for safe pool +//! activation and capacity planning. They live in their own module so +//! the dependency on the `thin-provisioning-tools` package is localized. 
+ +use std::path::Path; +use std::process::Command; + +use ember_core::error::{Error, Result}; + +/// Compute a recommended metadata device size in bytes for a pool with +/// `pool_size_bytes` of data, `block_size_bytes` per pool block, and at +/// most `max_thins` concurrent thin volumes. +/// +/// Wraps `thin_metadata_size --numeric-only --unit b`. The output is a +/// single integer in bytes. +pub fn metadata_size(pool_size_bytes: u64, block_size_bytes: u64, max_thins: u64) -> Result { + let output = Command::new("thin_metadata_size") + .args([ + "--block-size", + &format!("{block_size_bytes}"), + "--pool-size", + &format!("{pool_size_bytes}"), + "--max-thins", + &format!("{max_thins}"), + "--numeric-only", + "--unit", + "b", + ]) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_metadata_size".to_string(), + source: e, + })?; + let output = Error::check_command("thin_metadata_size", output)?; + let stdout = String::from_utf8_lossy(&output.stdout); + let bytes = stdout.trim().parse::().map_err(|e| Error::Command { + command: "thin_metadata_size".to_string(), + exit_code: 0, + stderr: format!("non-numeric output {:?}: {e}", stdout.trim()), + })?; + Ok(bytes) +} + +/// Run `thin_check` against a metadata device. +/// +/// Should be invoked before activating a pool whose metadata may be +/// dirty (e.g., after an unclean shutdown). Returns Ok if the metadata +/// is consistent; otherwise the operator must run [`repair`] manually. +pub fn check(metadata_dev: &Path) -> Result<()> { + let output = Command::new("thin_check") + .arg(metadata_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_check".to_string(), + source: e, + })?; + Error::check_command("thin_check", output)?; + Ok(()) +} + +/// Repair metadata into a fresh device. +/// +/// `thin_repair` reads the (possibly corrupt) input and writes a clean +/// metadata image to `output`. The pool must be offline during repair. 
+pub fn repair(input: &Path, output: &Path) -> Result<()> { + let r = Command::new("thin_repair") + .arg("-i") + .arg(input) + .arg("-o") + .arg(output) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_repair".to_string(), + source: e, + })?; + Error::check_command("thin_repair", r)?; + Ok(()) +} + +/// Dump the metadata device's contents as XML. +/// +/// Useful for recovery (cross-checking ember's recorded thin ids +/// against what the pool actually holds) and for debug tooling. +/// Returns the raw XML as a string. +pub fn dump(metadata_dev: &Path) -> Result { + let output = Command::new("thin_dump") + .arg(metadata_dev) + .output() + .map_err(|e| Error::CommandExec { + command: "thin_dump".to_string(), + source: e, + })?; + let output = Error::check_command("thin_dump", output)?; + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} diff --git a/crates/ember-linux/src/dm_thin_storage.rs b/crates/ember-linux/src/dm_thin_storage.rs new file mode 100644 index 0000000..1c8b326 --- /dev/null +++ b/crates/ember-linux/src/dm_thin_storage.rs @@ -0,0 +1,952 @@ +//! Linux storage backend using device-mapper thin provisioning. +//! +//! Replaces ZFS zvols with thin volumes from a dm-thin pool. The single +//! pool holds backing metadata + data devices (typically loopback files +//! under [`storage_path`](DmThinStorage::storage_path)) and exposes +//! arbitrary numbers of thin volumes as `/dev/mapper/ember-img-` +//! and `/dev/mapper/ember-vm-` block devices. +//! +//! See `docs/DM-THIN-SPEC.md` for the design. 
+ +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command as ProcessCommand; + +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; +use ember_core::config::size::ByteSize; +use ember_core::config::{DmThinMode, GlobalConfig}; +use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; + +use crate::dm_thin::{dm_device_exists, loop_device, pool, thin, tools, SECTOR_SIZE}; +use crate::zvol; + +/// Default file name for the metadata backing file inside the dm-thin +/// data directory. +const METADATA_FILE: &str = "metadata.img"; +/// Default file name for the data backing file inside the dm-thin +/// data directory. +const DATA_FILE: &str = "data.img"; +/// Maximum thin volumes the metadata sizing assumes. dm-thin's +/// `thin_metadata_size` tool requires this; 1024 is a generous floor. +const DEFAULT_MAX_THINS: u64 = 1024; +/// Floor on metadata device size (32 MiB). The kernel rejects very +/// small metadata devices and `thin_metadata_size` may suggest values +/// below this for tiny pools. +const MIN_METADATA_SIZE_BYTES: u64 = 32 * 1024 * 1024; +/// Hard cap on metadata device size (16 GiB). The kernel won't accept +/// metadata devices larger than this. +const MAX_METADATA_SIZE_BYTES: u64 = 16 * 1024 * 1024 * 1024; + +/// dm-thin storage backend. +/// +/// Holds the configured backing path and pool block size; thin id state +/// lives on `VmMetadata`/`ImageEntry`/`SnapshotEntry`. Concurrent +/// invocations are race-free thanks to the kernel's atomic id rejection +/// in `create_thin`/`create_snap`. +#[derive(Clone)] +pub struct DmThinStorage { + /// Backing path. Either a directory holding `metadata.img` and + /// `data.img`, or a raw block device (the metadata file then lives + /// under `/dm-thin-metadata.img`). + storage_path: PathBuf, + /// State directory (e.g. `/var/lib/ember`). 
Used as the persistent + /// home for the metadata sparse file when `storage_path` points at + /// a raw block device — `/dev/` is tmpfs on most distros and would + /// lose the metadata across reboots. + state_dir: PathBuf, + /// Layout resolved at `ember init`. Pinning this rather than + /// re-probing `storage_path.is_dir()` at runtime keeps reactivation + /// deterministic if the filesystem disagrees with init (e.g., the + /// directory was removed, or a raw device replaced a file). + mode: DmThinMode, + /// Pool block size in 512-byte sectors. Permanent at pool creation; + /// the value here must match what the running pool was created with. + block_size_sectors: u32, +} + +impl DmThinStorage { + /// Build the backend handle from a parsed [`GlobalConfig`]. + /// + /// Falls back to [`pool::DEFAULT_BLOCK_SIZE_SECTORS`] when the + /// config does not pin a block size, and to a live `is_dir()` probe + /// when no [`DmThinMode`] is persisted (legacy configs predating + /// the explicit field). + pub fn new(config: &GlobalConfig) -> Self { + let storage_path = config + .storage_path + .clone() + .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")); + let mode = config.dm_thin_mode.unwrap_or_else(|| { + if storage_path.is_dir() || !storage_path.exists() { + DmThinMode::File + } else { + DmThinMode::RawDevice + } + }); + Self { + storage_path, + state_dir: config.state_dir.clone(), + mode, + block_size_sectors: config + .dm_thin_block_size + .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS), + } + } + + /// Resolved metadata device path for the configured backing. + fn metadata_file(&self) -> PathBuf { + match self.mode { + DmThinMode::File => self.storage_path.join(METADATA_FILE), + // Raw block device: store metadata in the state directory + // rather than next to the device. `/dev/` is tmpfs on most + // distros and would vanish on reboot. 
+ DmThinMode::RawDevice => self.state_dir.join("dm-thin-metadata.img"), + } + } + + /// Resolved data device path for the configured backing. + fn data_file(&self) -> PathBuf { + match self.mode { + DmThinMode::File => self.storage_path.join(DATA_FILE), + DmThinMode::RawDevice => self.storage_path.clone(), + } + } + + /// Make sure the thin-pool device is active. Re-attaches loop + /// devices and re-runs `dmsetup create` if the kernel state is gone + /// (e.g., after a reboot). + fn ensure_pool_active(&self) -> Result<()> { + if dm_device_exists(pool::POOL_NAME)? { + return Ok(()); + } + + pool::ensure_target_loaded()?; + + let metadata_path = self.metadata_file(); + let data_path = self.data_file(); + + let metadata_loop = ensure_loop(&metadata_path)?; + let data_loop = ensure_loop_or_block(&data_path)?; + + // Sanity-check metadata before activating; refuse to import a + // dirty pool rather than risk corruption. + if let Err(e) = tools::check(&metadata_loop) { + return Err(Error::Command { + command: "thin_check".to_string(), + exit_code: 1, + stderr: format!( + "metadata device {} failed thin_check; run thin_repair manually: {e}", + metadata_loop.display() + ), + }); + } + + let data_sectors = device_sectors(&data_loop)?; + pool::create( + pool::POOL_NAME, + &metadata_loop, + &data_loop, + data_sectors, + self.block_size_sectors, + pool::DEFAULT_LOW_WATER_BLOCKS, + ) + } + + /// Activate a thin volume if it is not already exposed under + /// `/dev/mapper/`. + fn ensure_thin_active( + &self, + dm_name: &str, + thin_id: u64, + size_sectors: u64, + ) -> Result { + if dm_device_exists(dm_name)? { + return Ok(thin::device_path(dm_name)); + } + thin::activate(dm_name, pool::POOL_NAME, thin_id, size_sectors) + } + + /// Read a VM's required size in sectors from its metadata. 
+ fn vm_size_sectors(vm: &VmMetadata) -> u64 { + let bytes = (vm.disk_size_gib as u64) * 1024 * 1024 * 1024; + bytes / SECTOR_SIZE + } + + /// Read a thin id off [`VmMetadata`] or fail with a clear message. + fn require_vm_thin_id(vm: &VmMetadata) -> Result { + vm.thin_id.ok_or_else(|| { + Error::Vm(format!( + "vm '{}' has no dm-thin id recorded — was the pool re-initialized?", + vm.name + )) + }) + } + + /// Read a thin id off [`ImageEntry`] or fail with a clear message. + fn require_image_thin_id(image: &ImageEntry) -> Result { + image.thin_id.ok_or_else(|| { + Error::Image(format!( + "image '{}' has no dm-thin id recorded — was the pool re-initialized?", + image.local_name + )) + }) + } + + /// Refuse allocating-or-writing operations when the pool has gone + /// read-only, run out of data, or failed entirely. Without this + /// gate, callers see opaque `EIO` mid-`dd` (out of space) or + /// silent thin id leaks on metadata-corrupt pools. + /// + /// `grow` is intentionally not gated because it is the recovery + /// path for [`PoolMode::OutOfDataSpace`]; destroy paths are also + /// not gated since freeing thin ids must work even on a sick pool. 
+    fn assert_pool_healthy(&self) -> Result<()> {
+        let status = pool::status(pool::POOL_NAME)?;
+        match status.mode {
+            pool::PoolMode::ReadWrite => Ok(()),
+            pool::PoolMode::ReadOnly => Err(Error::Pool(format!(
+                "dm-thin pool '{}' is read-only — run `thin_check` and `thin_repair` to recover",
+                pool::POOL_NAME
+            ))),
+            pool::PoolMode::OutOfDataSpace => Err(Error::Pool(format!(
+                "dm-thin pool '{}' is out of data space ({}/{} blocks used) — run `ember storage grow --size <SIZE>` to extend it",
+                pool::POOL_NAME,
+                status.used_data_blocks,
+                status.total_data_blocks,
+            ))),
+            pool::PoolMode::Failed => Err(Error::Pool(format!(
+                "dm-thin pool '{}' has failed — inspect dmesg and `thin_check` the metadata device",
+                pool::POOL_NAME
+            ))),
+        }
+    }
+}
+
+impl StorageBackend for DmThinStorage {
+    /// Create a fresh dm-thin pool from the resolved [`InitConfig`].
+    ///
+    /// Creates any missing sparse backing files, zeroes the metadata
+    /// superblock, attaches loop devices and assembles the pool table.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `--storage-path` or the layout mode is missing, when a
+    /// file-backed pool has no `--size`, when the pool is already
+    /// active, or when any backing-device or `dmsetup` step fails.
+    fn init(config: &InitConfig) -> Result<()> {
+        let storage_path = config.storage_path.clone().ok_or_else(|| {
+            Error::Config("dm-thin requires --storage-path (directory or block device)".to_string())
+        })?;
+
+        pool::ensure_target_loaded()?;
+
+        // Re-initializing over a live pool would zero the superblock of
+        // the metadata device the kernel is still using. Refuse before
+        // touching any backing file.
+        if dm_device_exists(pool::POOL_NAME)? {
+            return Err(Error::Config(format!(
+                "dm-thin pool '{}' is already active — run `ember deinit` before re-initializing",
+                pool::POOL_NAME
+            )));
+        }
+
+        let block_size_sectors = config
+            .dm_thin_block_size
+            .unwrap_or(pool::DEFAULT_BLOCK_SIZE_SECTORS);
+
+        // Layout (file vs raw device) is resolved by the CLI — the
+        // backend trusts what it was handed instead of re-probing the
+        // filesystem.
+        let mode = config.dm_thin_mode.ok_or_else(|| {
+            Error::Config("dm-thin requires a resolved layout mode in InitConfig".to_string())
+        })?;
+
+        // Resolve metadata + data file paths and create them as sparse
+        // files when missing. A raw block device is kept as-is for the
+        // data side.
+        let (metadata_path, data_path) = resolve_init_paths(&storage_path, &config.state_dir, mode);
+
+        let pool_size_bytes = match config.dm_thin_size {
+            Some(size) => size.bytes(),
+            None => match mode {
+                DmThinMode::RawDevice => device_size_bytes(&data_path)?,
+                DmThinMode::File => {
+                    return Err(Error::Config(
+                        "dm-thin --size is required when using a file-backed pool".to_string(),
+                    ));
+                }
+            },
+        };
+
+        // Compute metadata size (or use an explicit override).
+        let metadata_size_bytes = match config.dm_thin_metadata_size {
+            Some(size) => size.bytes(),
+            None => {
+                let block_size_bytes = (block_size_sectors as u64) * SECTOR_SIZE;
+                let recommended =
+                    tools::metadata_size(pool_size_bytes, block_size_bytes, DEFAULT_MAX_THINS)?;
+                recommended.clamp(MIN_METADATA_SIZE_BYTES, MAX_METADATA_SIZE_BYTES)
+            }
+        };
+
+        // Create sparse files when the user supplied paths that don't
+        // yet exist. A raw block device is left alone here.
+        if metadata_path.extension().is_some() && !metadata_path.exists() {
+            ensure_parent_dir(&metadata_path)?;
+            create_sparse_file(&metadata_path, metadata_size_bytes)?;
+        }
+        if !data_path.exists() {
+            ensure_parent_dir(&data_path)?;
+            create_sparse_file(&data_path, pool_size_bytes)?;
+        }
+
+        // Zero the first 4 KiB of the metadata device — the kernel uses
+        // an all-zero superblock as the signal to format a fresh pool.
+        zero_head(&metadata_path)?;
+
+        // Attach loops, then assemble the pool. If anything past this
+        // point fails, detach the loops we attached so we don't leak
+        // them pointing at backing files that may get cleaned up.
+        let metadata_loop = ensure_loop(&metadata_path)?;
+        let data_loop = match ensure_loop_or_block(&data_path) {
+            Ok(p) => p,
+            Err(e) => {
+                let _ = loop_device::detach(&metadata_loop);
+                return Err(e);
+            }
+        };
+
+        let data_sectors = match device_sectors(&data_loop) {
+            Ok(s) => s,
+            Err(e) => {
+                let _ = loop_device::detach(&metadata_loop);
+                // Only a file-backed data side sits on a loop device.
+                if data_path.is_file() {
+                    let _ = loop_device::detach(&data_loop);
+                }
+                return Err(e);
+            }
+        };
+        if let Err(e) = pool::create(
+            pool::POOL_NAME,
+            &metadata_loop,
+            &data_loop,
+            data_sectors,
+            block_size_sectors,
+            pool::DEFAULT_LOW_WATER_BLOCKS,
+        ) {
+            let _ = loop_device::detach(&metadata_loop);
+            if data_path.is_file() {
+                let _ = loop_device::detach(&data_loop);
+            }
+            return Err(e);
+        }
+
+        println!(
+            "dm-thin pool '{}' active ({} data, {} block size).",
+            pool::POOL_NAME,
+            format_bytes(pool_size_bytes),
+            format_bytes((block_size_sectors as u64) * SECTOR_SIZE),
+        );
+
+        Ok(())
+    }
+
+    /// Write an ext4 image into a staging thin volume and snapshot it
+    /// as the image's immutable base.
+    ///
+    /// The returned handle carries the base thin id; the base device
+    /// itself is left inactive.
+    fn create_image_volume(
+        &self,
+        name: &str,
+        image_path: &Path,
+        size_mib: u64,
+    ) -> Result<VolumeHandle> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+
+        let staging_dm = thin::image_staging_dm_name(name);
+        let final_dm = thin::image_dm_name(name);
+        let size_sectors = (size_mib * 1024 * 1024) / SECTOR_SIZE;
+
+        // A previous failed run may have left the staging device
+        // active. Tear it down so the fresh `thin::activate` below
+        // doesn't trip over `EEXIST`. The matching staging thin id is
+        // not persisted anywhere, so it leaks into pool metadata; that
+        // is a bounded one-off cost and only `thin_dump` can find it.
+        if let Ok(true) = dm_device_exists(&staging_dm) {
+            let _ = thin::deactivate(&staging_dm);
+        }
+
+        // 1. Allocate a fresh staging thin and write the ext4 image.
+        let staging_id = thin::allocate(pool::POOL_NAME)?;
+        let staging_dev =
+            match thin::activate(&staging_dm, pool::POOL_NAME, staging_id, size_sectors) {
+                Ok(p) => p,
+                Err(e) => {
+                    let _ = thin::delete(pool::POOL_NAME, staging_id);
+                    return Err(e);
+                }
+            };
+
+        // 2. dd the ext4 image onto the staging device.
+        if let Err(e) = dd_image(image_path, &staging_dev) {
+            let _ = thin::deactivate(&staging_dm);
+            let _ = thin::delete(pool::POOL_NAME, staging_id);
+            return Err(e);
+        }
+
+        // 3. Snapshot the staging volume as the immutable base. Suspend
+        //    the staging device first so the snapshot sees a coherent
+        //    metadata commit; resume it on the way out either way.
+        let base_id_result = thin::suspend(&staging_dm).and_then(|()| {
+            let id = thin::allocate_snap(pool::POOL_NAME, staging_id);
+            let _ = thin::resume(&staging_dm);
+            id
+        });
+        let base_id = match base_id_result {
+            Ok(id) => id,
+            Err(e) => {
+                let _ = thin::deactivate(&staging_dm);
+                let _ = thin::delete(pool::POOL_NAME, staging_id);
+                return Err(e);
+            }
+        };
+
+        // 4. Drop the staging device + thin id; the base id retains all
+        //    of its blocks.
+        let _ = thin::deactivate(&staging_dm);
+        let _ = thin::delete(pool::POOL_NAME, staging_id);
+
+        // The base thin is left inactive. Lazy activation creates the
+        // device on first use. Record the would-be path so it can be
+        // displayed and so callers see a stable identifier.
+        Ok(VolumeHandle {
+            disk_path: thin::device_path(&final_dm),
+            thin_id: Some(base_id),
+        })
+    }
+
+    /// Snapshot the image's base thin into a new thin volume for
+    /// `vm_name` and activate it.
+    fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result<VolumeHandle> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+        let base_id = Self::require_image_thin_id(image)?;
+
+        let dm_name = thin::vm_dm_name(vm_name);
+        // The VM's virtual size matches the image's size at clone time;
+        // resize to a larger disk happens in a subsequent `resize` call.
+        let size_sectors = (image.size_mib * 1024 * 1024) / SECTOR_SIZE;
+
+        let vm_id = thin::allocate_snap(pool::POOL_NAME, base_id)?;
+        match thin::activate(&dm_name, pool::POOL_NAME, vm_id, size_sectors) {
+            Ok(disk_path) => Ok(VolumeHandle {
+                disk_path,
+                thin_id: Some(vm_id),
+            }),
+            Err(e) => {
+                // Don't leave an unreachable thin id behind.
+                let _ = thin::delete(pool::POOL_NAME, vm_id);
+                Err(e)
+            }
+        }
+    }
+
+    /// Take a snapshot of the VM's thin volume.
+    ///
+    /// Returns the [`SnapshotEntry`] the caller must persist: the pool
+    /// only knows the numeric thin id, not `snap_name`.
+    fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<Option<SnapshotEntry>> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+        let vm_id = Self::require_vm_thin_id(vm)?;
+        let dm_name = thin::vm_dm_name(&vm.name);
+        let size_sectors = Self::vm_size_sectors(vm);
+
+        // Suspend so create_snap sees a metadata-coherent volume.
+        // Some operations (e.g. snapshotting a never-activated volume)
+        // can run without an active device, but suspending an inactive
+        // device errors. Activate first if needed.
+        self.ensure_thin_active(&dm_name, vm_id, size_sectors)?;
+
+        thin::suspend(&dm_name)?;
+        let snap_result = thin::allocate_snap(pool::POOL_NAME, vm_id);
+        // Resume before propagating any error so the VM disk is never
+        // left suspended.
+        let _ = thin::resume(&dm_name);
+        let snap_id = snap_result?;
+
+        Ok(Some(SnapshotEntry {
+            name: snap_name.to_string(),
+            thin_id: snap_id,
+            created_at: ember_core::state::vm::now_epoch_secs(),
+            size_sectors,
+        }))
+    }
+
+    /// Replace the VM's thin volume with a fresh copy of the named
+    /// snapshot.
+    ///
+    /// Returns a handle carrying the NEW thin id, which the caller must
+    /// persist in place of `vm.thin_id`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the snapshot is unknown or the swap cannot be
+    /// completed. On failure the VM's existing thin id is left intact.
+    fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<VolumeHandle> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+        let vm_id = Self::require_vm_thin_id(vm)?;
+        let snap = vm
+            .snapshots
+            .iter()
+            .find(|s| s.name == snap_name)
+            .ok_or_else(|| {
+                Error::Vm(format!(
+                    "snapshot '{snap_name}' not found on vm '{}'",
+                    vm.name
+                ))
+            })?;
+        let snap_id = snap.thin_id;
+
+        let dm_name = thin::vm_dm_name(&vm.name);
+        let size_sectors = Self::vm_size_sectors(vm);
+
+        // Allocate the replacement thin id from the snapshot up-front so
+        // a failure here leaves `vm.thin_id` and the kernel pool
+        // unchanged.
+        let new_id = thin::allocate_snap(pool::POOL_NAME, snap_id)?;
+
+        // Point the dm-mapper slot at new_id BEFORE freeing the old
+        // thin. Deleting vm_id first would leave the VM with no volume
+        // at all if the activation below failed.
+        let activation = (|| -> Result<PathBuf> {
+            if dm_device_exists(&dm_name)? {
+                thin::deactivate(&dm_name)?;
+            }
+            thin::activate(&dm_name, pool::POOL_NAME, new_id, size_sectors)
+        })();
+
+        let disk_path = match activation {
+            Ok(path) => path,
+            Err(e) => {
+                // vm_id is untouched and gets re-activated lazily on
+                // next use; only the replacement has to be released.
+                let _ = thin::delete(pool::POOL_NAME, new_id);
+                return Err(e);
+            }
+        };
+
+        // The replacement is live, so the old thin can go. A failure
+        // here only leaks one thin id; the VM itself is consistent, so
+        // report it without failing the restore.
+        if let Err(e) = thin::delete(pool::POOL_NAME, vm_id) {
+            eprintln!(
+                "warning: restored vm '{}' but could not free old thin id {vm_id}: {e}",
+                vm.name
+            );
+        }
+
+        Ok(VolumeHandle {
+            disk_path,
+            thin_id: Some(new_id),
+        })
+    }
+
+    /// Free the thin id backing the named snapshot.
+    ///
+    /// Not gated on pool health: freeing thin ids must work even on a
+    /// sick pool.
+    fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> {
+        self.ensure_pool_active()?;
+        let snap = vm
+            .snapshots
+            .iter()
+            .find(|s| s.name == snap_name)
+            .ok_or_else(|| {
+                Error::Vm(format!(
+                    "snapshot '{snap_name}' not found on vm '{}'",
+                    vm.name
+                ))
+            })?;
+        thin::delete(pool::POOL_NAME, snap.thin_id)
+    }
+
+    /// List the VM's snapshots from its persisted metadata.
+    fn list_snapshots(&self, vm: &VmMetadata) -> Result<Vec<SnapshotInfo>> {
+        // dm-thin tracks snapshots via the persisted `vm.snapshots`
+        // list; the kernel knows nothing about names.
+        Ok(vm
+            .snapshots
+            .iter()
+            .map(|s| SnapshotInfo {
+                name: s.name.clone(),
+                created_at: s.created_at,
+                size: s.size_sectors * SECTOR_SIZE,
+            })
+            .collect())
+    }
+
+    /// Reload the VM's thin device with a new size and grow the ext4
+    /// filesystem to fill it.
+    fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+        let vm_id = Self::require_vm_thin_id(vm)?;
+        let dm_name = thin::vm_dm_name(&vm.name);
+        let new_sectors = new_size.bytes() / SECTOR_SIZE;
+
+        // Activate (lazy) so we have a device to reload.
+        let current_sectors = Self::vm_size_sectors(vm);
+        let dev_path = self.ensure_thin_active(&dm_name, vm_id, current_sectors)?;
+
+        thin::reload_size(&dm_name, pool::POOL_NAME, vm_id, new_sectors)?;
+        zvol::wait_for_device(&dev_path)?;
+        // resize2fs wants a freshly checked filesystem.
+        e2fsck(&dev_path)?;
+        resize2fs(&dev_path)?;
+        Ok(())
+    }
+
+    /// Best-effort teardown of a VM's device, snapshots and thin id.
+    /// Always returns `Ok`.
+    fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> {
+        // Best-effort: deactivate first, then free the thin id. Either
+        // step may already be done by an earlier failure path.
+        let _ = self.ensure_pool_active();
+        let dm_name = thin::vm_dm_name(&vm.name);
+        if let Ok(true) = dm_device_exists(&dm_name) {
+            let _ = thin::deactivate(&dm_name);
+        }
+        // Snapshots only live in the kernel pool; the user-level
+        // handle is `vm.json`, which is about to disappear. Free their
+        // thin ids before the VM's own id, otherwise they'd remain
+        // pinned in pool metadata with no way for ember to reach them.
+        for snap in &vm.snapshots {
+            let _ = thin::delete(pool::POOL_NAME, snap.thin_id);
+        }
+        if let Some(id) = vm.thin_id {
+            let _ = thin::delete(pool::POOL_NAME, id);
+        }
+        Ok(())
+    }
+
+    /// Best-effort teardown of an image's base device and thin id.
+    /// Always returns `Ok`.
+    fn destroy_image_storage(&self, image: &ImageEntry, _force: bool) -> Result<()> {
+        // dm-thin reference-counts blocks; deleting the base thin is
+        // safe even when VMs still have clones — they keep their own
+        // thin ids and stay readable. `force` doesn't change behavior.
+        let _ = self.ensure_pool_active();
+        let dm_name = thin::image_dm_name(&image.local_name);
+        if let Ok(true) = dm_device_exists(&dm_name) {
+            let _ = thin::deactivate(&dm_name);
+        }
+        if let Some(id) = image.thin_id {
+            let _ = thin::delete(pool::POOL_NAME, id);
+        }
+        Ok(())
+    }
+
+    /// Device path of the VM's root disk, activating the pool and the
+    /// thin device first if needed.
+    fn disk_device_path(&self, vm: &VmMetadata) -> Result<PathBuf> {
+        // Ensure the pool table and the per-VM thin device are live in
+        // the kernel. After a host reboot both are gone; without this,
+        // `vm start` would hand Firecracker a stale `/dev/mapper/...`
+        // path that resolves to ENOENT.
+        self.ensure_pool_active()?;
+        let thin_id = Self::require_vm_thin_id(vm)?;
+        let dm_name = thin::vm_dm_name(&vm.name);
+        let size_sectors = Self::vm_size_sectors(vm);
+        self.ensure_thin_active(&dm_name, thin_id, size_sectors)
+    }
+
+    /// Fork `source`'s disk into a new thin volume for `target_vm`.
+    fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result<VolumeHandle> {
+        self.ensure_pool_active()?;
+        self.assert_pool_healthy()?;
+        let source_id = Self::require_vm_thin_id(source)?;
+        let dm_name = thin::vm_dm_name(target_vm);
+        let size_sectors = Self::vm_size_sectors(source);
+
+        let fork_id = thin::allocate_snap(pool::POOL_NAME, source_id)?;
+        match thin::activate(&dm_name, pool::POOL_NAME, fork_id, size_sectors) {
+            Ok(disk_path) => Ok(VolumeHandle {
+                disk_path,
+                thin_id: Some(fork_id),
+            }),
+            Err(e) => {
+                // Don't leave an unreachable thin id behind.
+                let _ = thin::delete(pool::POOL_NAME, fork_id);
+                Err(e)
+            }
+        }
+    }
+
+    /// No-op for dm-thin; see the comment in the body.
+    fn cleanup_fork(&self, _parent: &VmMetadata, _forked: &VmMetadata) -> Result<()> {
+        // dm-thin forks are independent — the snapshot id used to
+        // create the fork is the fork's own thin id, not a marker on
+        // the parent. Nothing to clean up on the parent.
+        Ok(())
+    }
+
+    /// Always empty for dm-thin.
+    fn storage_dependents(&self, _vm: &VmMetadata) -> Result<Vec<String>> {
+        Ok(Vec::new())
+    }
+
+    /// Tear the pool down: deactivate thin devices, remove the pool
+    /// table, detach loop devices and, with `purge`, delete the backing
+    /// files.
+    fn deinit(&self, purge: bool) -> Result<()> {
+        // 1. Deactivate every ember-managed thin volume so the pool
+        //    can be removed cleanly.
+        for prefix in [thin::IMAGE_PREFIX, thin::VM_PREFIX] {
+            for name in pool::list_with_prefix(prefix)? {
+                let _ = thin::deactivate(&name);
+            }
+        }
+        // 2. Drop the pool itself (if active).
+        if dm_device_exists(pool::POOL_NAME)? {
+            pool::remove(pool::POOL_NAME)?;
+        }
+        // 3. Detach the loop devices, if any.
+        let metadata_path = self.metadata_file();
+        let data_path = self.data_file();
+        if let Some(loop_dev) = loop_device::find_for(&metadata_path)? {
+            let _ = loop_device::detach(&loop_dev);
+        }
+        if let Some(loop_dev) = loop_device::find_for(&data_path)? {
+            let _ = loop_device::detach(&loop_dev);
+        }
+        // 4. Optionally delete the backing files. A raw block device
+        //    supplied by the user is always left alone.
+        if purge {
+            for path in [&metadata_path, &data_path] {
+                if path.is_file() {
+                    let _ = fs::remove_file(path);
+                }
+            }
+            // Remove the dm-thin directory itself if empty.
+            if self.storage_path.is_dir() {
+                let _ = fs::remove_dir(&self.storage_path);
+            }
+        }
+        println!("dm-thin pool '{}' torn down.", pool::POOL_NAME);
+        Ok(())
+    }
+
+    /// Extend the pool's data device and reload the pool table.
+    ///
+    /// A file-backed data device is extended to `new_size`. A raw block
+    /// device must already have been grown externally to at least
+    /// `new_size`; the pool is then reloaded over its current length.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `new_size` is smaller than the current data file,
+    /// when a raw device is still smaller than `new_size`, or when a
+    /// loop device or the pool reload fails.
+    fn grow(&self, new_size: ByteSize) -> Result<()> {
+        self.ensure_pool_active()?;
+
+        let data_path = self.data_file();
+        let new_bytes = new_size.bytes();
+        let data_is_file = data_path.is_file();
+
+        if data_is_file {
+            // `truncate -s` also shrinks. Cutting the tail off a live
+            // data device would destroy allocated blocks, so only ever
+            // move the end of the file forward.
+            let current_bytes = fs::metadata(&data_path)
+                .map_err(|e| Error::Io {
+                    path: data_path.clone(),
+                    source: e,
+                })?
+                .len();
+            if new_bytes < current_bytes {
+                return Err(Error::Config(format!(
+                    "refusing to shrink dm-thin data file {} from {} to {}",
+                    data_path.display(),
+                    format_bytes(current_bytes),
+                    format_bytes(new_bytes),
+                )));
+            }
+            create_sparse_file(&data_path, new_bytes)?;
+        } else if device_size_bytes(&data_path)? < new_bytes {
+            // ember cannot resize someone else's block device. Once it
+            // has been grown externally the re-run falls through to the
+            // pool reload below.
+            return Err(Error::Config(format!(
+                "data device {} is a raw block device — grow it externally first \
+                 (e.g. lvextend, cloud-volume resize) and then re-run `ember storage grow`",
+                data_path.display()
+            )));
+        }
+
+        // Make the loop driver pick up the new file size, then reload
+        // the pool table with the larger sector count.
+        let metadata_path = self.metadata_file();
+        let metadata_loop = loop_device::find_for(&metadata_path)?.ok_or_else(|| {
+            Error::Config(format!(
+                "metadata device {} is not attached to a loop device",
+                metadata_path.display()
+            ))
+        })?;
+        let data_loop = if data_is_file {
+            let dev = loop_device::find_for(&data_path)?.ok_or_else(|| {
+                Error::Config(format!(
+                    "data device {} is not attached to a loop device",
+                    data_path.display()
+                ))
+            })?;
+            loop_device::refresh_size(&dev)?;
+            dev
+        } else {
+            data_path.clone()
+        };
+
+        let data_sectors = device_sectors(&data_loop)?;
+        pool::reload(
+            pool::POOL_NAME,
+            &metadata_loop,
+            &data_loop,
+            data_sectors,
+            self.block_size_sectors,
+            pool::DEFAULT_LOW_WATER_BLOCKS,
+        )?;
+        // Report the size the pool was actually reloaded with; a raw
+        // device may be larger than the size that was asked for.
+        println!(
+            "Grew dm-thin pool data device to {}.",
+            format_bytes(data_sectors * SECTOR_SIZE)
+        );
+        Ok(())
+    }
+
+    /// Mount the device at `path` on a fresh temporary directory and
+    /// return that directory.
+    fn mount(&self, path: &Path) -> Result<PathBuf> {
+        zvol::wait_for_device(path)?;
+
+        // `keep()` turns the TempDir into a plain path so the directory
+        // is not removed when the guard drops; `unmount` removes it.
+        let mount_dir = tempfile::tempdir()
+            .map_err(|e| Error::Io {
+                path: std::env::temp_dir(),
+                source: e,
+            })?
+            .keep();
+
+        let output = ProcessCommand::new("mount")
+            .arg(path)
+            .arg(&mount_dir)
+            .output()
+            .map_err(|e| Error::CommandExec {
+                command: "mount".to_string(),
+                source: e,
+            })?;
+
+        if let Err(e) = Error::check_command("mount", output) {
+            // Nothing got mounted; don't leave the empty directory.
+            let _ = fs::remove_dir(&mount_dir);
+            return Err(e);
+        }
+        Ok(mount_dir)
+    }
+
+    /// Unmount `mount_point` and remove the (now empty) directory.
+    fn unmount(&self, mount_point: &Path) -> Result<()> {
+        crate::image::umount(mount_point)?;
+        let _ = fs::remove_dir(mount_point);
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// Decide where the metadata + data backing live based on the
+/// caller-resolved [`DmThinMode`].
+///
+/// * [`DmThinMode::File`]: `metadata.img`/`data.img` inside `storage_path`.
+/// * [`DmThinMode::RawDevice`]: `storage_path` is the data device, with
+///   metadata as a sparse file under `state_dir` (a raw device's parent
+///   is `/dev/`, which is tmpfs and would lose the metadata on reboot).
+///
+/// Returns `(metadata_path, data_path)`.
+fn resolve_init_paths(
+    storage_path: &Path,
+    state_dir: &Path,
+    mode: DmThinMode,
+) -> (PathBuf, PathBuf) {
+    match mode {
+        DmThinMode::File => (
+            storage_path.join(METADATA_FILE),
+            storage_path.join(DATA_FILE),
+        ),
+        DmThinMode::RawDevice => (
+            state_dir.join("dm-thin-metadata.img"),
+            storage_path.to_path_buf(),
+        ),
+    }
+}
+
+/// Create the parent directory of `path` (and any missing ancestors).
+/// A path without a parent is left alone.
+fn ensure_parent_dir(path: &Path) -> Result<()> {
+    if let Some(parent) = path.parent() {
+        fs::create_dir_all(parent).map_err(|e| Error::Io {
+            path: parent.to_path_buf(),
+            source: e,
+        })?;
+    }
+    Ok(())
+}
+
+/// Create a sparse file of the given byte size using `truncate`.
+fn create_sparse_file(path: &Path, size_bytes: u64) -> Result<()> {
+    // `truncate -s` sets the length without writing data. It also
+    // shrinks an existing larger file, so callers must make sure
+    // `size_bytes` is not below the current size.
+    let output = ProcessCommand::new("truncate")
+        .args(["-s", &size_bytes.to_string()])
+        .arg(path)
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "truncate".to_string(),
+            source: e,
+        })?;
+    Error::check_command("truncate", output)?;
+    Ok(())
+}
+
+/// Zero the first 4 KiB of a file or block device. dm-thin uses an
+/// all-zero superblock as its "format me" sentinel.
+fn zero_head(path: &Path) -> Result<()> {
+    let output = ProcessCommand::new("dd")
+        .arg("if=/dev/zero")
+        .arg(format!("of={}", path.display()))
+        // conv=notrunc: overwrite in place, never cut the target down
+        // to the 4 KiB that was written.
+        .args(["bs=4K", "count=1", "conv=notrunc", "status=none"])
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "dd zero metadata".to_string(),
+            source: e,
+        })?;
+    Error::check_command("dd zero metadata", output)?;
+    Ok(())
+}
+
+/// Find an existing loop device for `file`, or attach a new one.
+///
+/// Returns the loop device path.
+fn ensure_loop(file: &Path) -> Result<PathBuf> {
+    if let Some(existing) = loop_device::find_for(file)? {
+        return Ok(existing);
+    }
+    loop_device::attach(file)
+}
+
+/// Same as [`ensure_loop`] but transparent for raw block devices: if
+/// the path is a block device (not a regular file) it's used as-is.
+///
+/// # Errors
+///
+/// Returns [`Error::Io`] when `path` cannot be stat'ed, or whatever
+/// [`ensure_loop`] returns for regular files.
+fn ensure_loop_or_block(path: &Path) -> Result<PathBuf> {
+    let metadata = fs::metadata(path).map_err(|e| Error::Io {
+        path: path.to_path_buf(),
+        source: e,
+    })?;
+    if metadata.file_type().is_file() {
+        ensure_loop(path)
+    } else {
+        // Anything that is not a regular file is handed back untouched.
+        Ok(path.to_path_buf())
+    }
+}
+
+/// Number of 512-byte sectors on a block device.
+fn device_sectors(path: &Path) -> Result<u64> {
+    Ok(device_size_bytes(path)? / SECTOR_SIZE)
+}
+
+/// Total byte size of a block device (or regular file). Wraps
+/// `blockdev --getsize64` for block devices and falls back to file
+/// metadata otherwise.
+fn device_size_bytes(path: &Path) -> Result<u64> {
+    // Regular files report their length directly; everything else
+    // (including a failed stat) goes through `blockdev`.
+    if let Ok(meta) = fs::metadata(path) {
+        if meta.file_type().is_file() {
+            return Ok(meta.len());
+        }
+    }
+    let output = ProcessCommand::new("blockdev")
+        .arg("--getsize64")
+        .arg(path)
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "blockdev --getsize64".to_string(),
+            source: e,
+        })?;
+    let output = Error::check_command("blockdev --getsize64", output)?;
+    let s = String::from_utf8_lossy(&output.stdout);
+    s.trim().parse::<u64>().map_err(|e| Error::Command {
+        command: "blockdev --getsize64".to_string(),
+        // The command itself succeeded; only its output was unusable.
+        exit_code: 0,
+        stderr: format!("non-numeric size {:?}: {e}", s.trim()),
+    })
+}
+
+/// Format a byte count for log lines.
+///
+/// Uses binary units (KiB through TiB) with one decimal place; values
+/// below 1 KiB are printed as whole bytes.
+fn format_bytes(bytes: u64) -> String {
+    const KIB: u64 = 1024;
+    const MIB: u64 = 1024 * KIB;
+    const GIB: u64 = 1024 * MIB;
+    const TIB: u64 = 1024 * GIB;
+    if bytes >= TIB {
+        format!("{:.1} TiB", bytes as f64 / TIB as f64)
+    } else if bytes >= GIB {
+        format!("{:.1} GiB", bytes as f64 / GIB as f64)
+    } else if bytes >= MIB {
+        format!("{:.1} MiB", bytes as f64 / MIB as f64)
+    } else if bytes >= KIB {
+        // Pool block sizes are typically tens of KiB; without this tier
+        // they were reported as raw byte counts.
+        format!("{:.1} KiB", bytes as f64 / KIB as f64)
+    } else {
+        format!("{bytes} B")
+    }
+}
+
+/// Run `dd` to copy an image file onto a block device.
+fn dd_image(image_path: &Path, device: &Path) -> Result<()> {
+    let output = ProcessCommand::new("dd")
+        .arg(format!("if={}", image_path.display()))
+        .arg(format!("of={}", device.display()))
+        // conv=fsync: flush to the device before dd reports success.
+        .args(["bs=1M", "conv=fsync", "status=none"])
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "dd image to thin".to_string(),
+            source: e,
+        })?;
+    Error::check_command("dd image to thin", output)?;
+    Ok(())
+}
+
+/// `e2fsck -f -p` — used before resize2fs.
+fn e2fsck(device: &Path) -> Result<()> {
+    let output = ProcessCommand::new("e2fsck")
+        .args(["-f", "-p"])
+        .arg(device)
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "e2fsck".to_string(),
+            source: e,
+        })?;
+    // Exit codes 0 (clean) and 1 (errors corrected) are success.
+    // Everything else is a failure, including termination by a signal,
+    // where `code()` is `None`.
+    match output.status.code() {
+        Some(0) | Some(1) => Ok(()),
+        code => Err(Error::Command {
+            command: "e2fsck".to_string(),
+            exit_code: code.unwrap_or(-1),
+            stderr: String::from_utf8_lossy(&output.stderr).trim().to_string(),
+        }),
+    }
+}
+
+/// `resize2fs` — expand the ext4 filesystem to fill the device.
+fn resize2fs(device: &Path) -> Result<()> {
+    let output = ProcessCommand::new("resize2fs")
+        .arg(device)
+        .output()
+        .map_err(|e| Error::CommandExec {
+            command: "resize2fs".to_string(),
+            source: e,
+        })?;
+    Error::check_command("resize2fs", output)?;
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn format_bytes_units() {
+        assert_eq!(format_bytes(0), "0 B");
+        assert_eq!(format_bytes(1023), "1023 B");
+        assert_eq!(format_bytes(2 * 1024 * 1024), "2.0 MiB");
+        assert_eq!(format_bytes(3u64 * 1024 * 1024 * 1024), "3.0 GiB");
+        assert_eq!(format_bytes(5u64 * 1024 * 1024 * 1024 * 1024), "5.0 TiB");
+    }
+}
diff --git a/crates/ember-linux/src/lib.rs b/crates/ember-linux/src/lib.rs
index c5b6c7a..7286f31 100644
--- a/crates/ember-linux/src/lib.rs
+++ b/crates/ember-linux/src/lib.rs
@@ -1,3 +1,5 @@
+pub mod dm_thin;
+pub mod dm_thin_storage;
 pub mod firecracker;
 pub mod image;
 pub mod network;
@@ -9,7 +11,48 @@ pub mod vm;
 pub mod zfs;
 pub mod zvol;
 
+pub use dm_thin_storage::DmThinStorage;
 pub use network_backend::LinuxNetwork;
 pub use platform::LinuxPlatform;
 pub use storage::LinuxStorage;
 pub use vm::LinuxVm;
+
+use std::sync::Arc;
+
+use ember_core::backend::{InitConfig, StorageBackend};
+use ember_core::config::{GlobalConfig, StorageKind};
+use ember_core::error::{Error, Result};
+
+/// Construct the active storage backend.
+///
+/// Returns the implementation indicated by [`GlobalConfig::storage_backend`].
+/// btrfs is not yet implemented; rather than silently routing through
+/// the ZFS path with garbage inputs, the call panics so a hand-edited
+/// `config.json` fails loudly. `init_storage` returns the same shape
+/// of error from the init side.
+///
+/// # Panics
+///
+/// Panics when `config.storage_backend` is [`StorageKind::Btrfs`].
+pub fn create_storage(config: &GlobalConfig) -> Arc<dyn StorageBackend> {
+    match config.storage_backend {
+        StorageKind::Zfs => Arc::new(LinuxStorage::new(config)),
+        StorageKind::DmThin => Arc::new(DmThinStorage::new(config)),
+        StorageKind::Btrfs => panic!(
+            "btrfs storage backend is not yet implemented; \
+             config.json has storage_backend = btrfs but no \
+             implementation exists yet"
+        ),
+    }
+}
+
+/// Initialize storage during `ember init`.
+///
+/// Dispatches to the concrete backend's `init` associated function. The
+/// trait object is unavailable here because the backend hasn't been
+/// constructed yet.
+///
+/// # Errors
+///
+/// Returns [`Error::Config`] for [`StorageKind::Btrfs`]; otherwise
+/// whatever the selected backend's `init` returns.
+pub fn init_storage(config: &InitConfig) -> Result<()> {
+    match config.storage_backend {
+        StorageKind::Zfs => LinuxStorage::init(config),
+        StorageKind::DmThin => DmThinStorage::init(config),
+        StorageKind::Btrfs => Err(Error::Config(
+            "btrfs storage backend is not yet implemented".to_string(),
+        )),
+    }
+}
diff --git a/crates/ember-linux/src/platform.rs b/crates/ember-linux/src/platform.rs
index f7bb65a..f773495 100644
--- a/crates/ember-linux/src/platform.rs
+++ b/crates/ember-linux/src/platform.rs
@@ -1,11 +1,13 @@
 use std::path::{Path, PathBuf};
 
 use ember_core::backend::{ImageToolConfig, Platform, ResolvConfMode};
-use ember_core::config::GlobalConfig;
+use ember_core::config::{GlobalConfig, StorageKind};
 use ember_core::error::Result;
 use ember_core::image::registry::ImageEntry;
 use ember_core::state::vm::VmMetadata;
 
+use crate::dm_thin::pool;
+
 pub struct LinuxPlatform;
 
 fn linux_install_hint(name: &str) -> String {
@@ -45,10 +47,18 @@ impl Platform for LinuxPlatform {
     }
 
     fn inspect_vm_extra(metadata: &VmMetadata) -> Vec<(&'static str, String)> {
-        let mut extra = vec![
-            ("ZFS zvol", metadata.disk_path.clone()),
-            ("API
socket", metadata.api_socket.display().to_string()), - ]; + // dm-thin records a numeric `thin_id` on the VM metadata; ZFS + // does not. Branch on its presence rather than threading a + // `GlobalConfig` reference through the trait — the metadata + // already carries enough to label the disk row correctly. + let mut extra = match metadata.thin_id { + Some(thin_id) => vec![ + ("Thin device", metadata.disk_path.clone()), + ("Thin id", thin_id.to_string()), + ], + None => vec![("ZFS zvol", metadata.disk_path.clone())], + }; + extra.push(("API socket", metadata.api_socket.display().to_string())); if let Some(ref net) = metadata.network { extra.push(("TAP device", net.tap_device.clone())); } @@ -56,14 +66,45 @@ impl Platform for LinuxPlatform { } fn inspect_image_extra(entry: &ImageEntry) -> Vec<(&'static str, String)> { - vec![("ZFS zvol", entry.disk_path.clone())] + match entry.thin_id { + Some(thin_id) => vec![ + ("Thin device", entry.disk_path.clone()), + ("Thin id", thin_id.to_string()), + ], + None => vec![("ZFS zvol", entry.disk_path.clone())], + } } fn info_extra(config: &GlobalConfig) -> Vec<(&'static str, String)> { - let mut extra = vec![ - ("ZFS pool", config.pool.clone()), - ("Dataset", format!("{}/{}", config.pool, config.dataset)), - ]; + let mut extra = match config.storage_backend { + StorageKind::Zfs => vec![ + ("ZFS pool", config.pool.clone()), + ("Dataset", format!("{}/{}", config.pool, config.dataset)), + ], + StorageKind::DmThin => { + let mut rows = vec![("dm-thin pool", pool::POOL_NAME.to_string())]; + if let Some(ref path) = config.storage_path { + rows.push(("Storage path", path.display().to_string())); + } + if let Some(block_size) = config.dm_thin_block_size { + rows.push(( + "Block size", + format!("{} sectors ({} KiB)", block_size, (block_size * 512) / 1024), + )); + } + if let Some(mode) = config.dm_thin_mode { + rows.push(( + "Layout", + match mode { + ember_core::config::DmThinMode::File => "file-backed".to_string(), + 
ember_core::config::DmThinMode::RawDevice => "raw device".to_string(), + }, + )); + } + rows + } + StorageKind::Btrfs => vec![("btrfs", "(unimplemented)".to_string())], + }; if let Some(ref wan_iface) = config.wan_iface { extra.push(("WAN iface", wan_iface.clone())); } diff --git a/crates/ember-linux/src/storage.rs b/crates/ember-linux/src/storage.rs index 5aafafc..391fae9 100644 --- a/crates/ember-linux/src/storage.rs +++ b/crates/ember-linux/src/storage.rs @@ -11,14 +11,19 @@ use std::path::{Path, PathBuf}; use std::process::Command as ProcessCommand; use crate::zfs; -use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend}; +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; use ember_core::config::size::ByteSize; use ember_core::config::GlobalConfig; use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; /// Linux storage backend using ZFS zvols. #[derive(Clone)] pub struct LinuxStorage { + /// ZFS pool name (e.g., "tank"). Cached so `deinit` can call + /// `zpool destroy` without re-reading the config. + pool: String, /// ZFS images dataset path (e.g., "tank/ember/images"). images_dataset: String, /// ZFS VMs dataset path (e.g., "tank/ember/vms"). @@ -31,6 +36,7 @@ impl LinuxStorage { /// Extracts the ZFS pool/dataset paths that all storage operations need. pub fn new(config: &GlobalConfig) -> Self { Self { + pool: config.pool.clone(), images_dataset: config.images_dataset(), vms_dataset: config.vms_dataset(), } @@ -88,9 +94,12 @@ impl StorageBackend for LinuxStorage { } /// Create a ZFS zvol from an ext4 image, write it via `dd`, and snapshot `@base`. - /// - /// Returns the zvol path (e.g., "tank/ember/images/library-alpine-latest"). 
- fn create_image_volume(&self, name: &str, image_path: &Path, size_mib: u64) -> Result { + fn create_image_volume( + &self, + name: &str, + image_path: &Path, + size_mib: u64, + ) -> Result { let zvol = self.image_zvol(name); // Create the zvol. @@ -103,14 +112,12 @@ impl StorageBackend for LinuxStorage { return Err(e); } - Ok(PathBuf::from(zvol)) + Ok(VolumeHandle::from_path(zvol)) } /// Clone the image's `@base` snapshot to create a VM zvol. - /// - /// Returns the zvol path (e.g., "tank/ember/vms/myvm"). - fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result { - let image_zvol = self.image_zvol(image_name); + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { + let image_zvol = self.image_zvol(&image.local_name); let snapshot = format!("{image_zvol}@{}", zfs::BASE_SNAPSHOT_NAME); let vm_zvol = self.vm_zvol(vm_name); @@ -123,27 +130,31 @@ impl StorageBackend for LinuxStorage { } zfs::volume::clone(&snapshot, &vm_zvol)?; - Ok(PathBuf::from(vm_zvol)) + Ok(VolumeHandle::from_path(vm_zvol)) } - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); - zfs::snapshot::create(&zvol, snap_name) + fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { + let zvol = self.vm_zvol(&vm.name); + zfs::snapshot::create(&zvol, snap_name)?; + // ZFS records snapshots in the kernel; nothing to add to vm.json. + Ok(None) } - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); - zfs::snapshot::rollback(&zvol, snap_name) + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { + let zvol = self.vm_zvol(&vm.name); + zfs::snapshot::rollback(&zvol, snap_name)?; + // Rollback mutates the volume in place; identity unchanged. 
+ Ok(VolumeHandle::from_path(zvol)) } - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); zfs::snapshot::destroy(&zvol, snap_name) } /// List snapshots, filtering out the reserved `@base` snapshot. - fn list_snapshots(&self, vm_name: &str) -> Result> { - let zvol = self.vm_zvol(vm_name); + fn list_snapshots(&self, vm: &VmMetadata) -> Result> { + let zvol = self.vm_zvol(&vm.name); let zfs_snaps = zfs::snapshot::list(&zvol)?; Ok(zfs_snaps @@ -158,8 +169,8 @@ impl StorageBackend for LinuxStorage { } /// Grow the zvol and expand the ext4 filesystem. - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); let new_gib = new_size .to_gib() .map_err(|e| Error::Zfs(format!("invalid resize target: {e}")))?; @@ -176,19 +187,44 @@ impl StorageBackend for LinuxStorage { } /// Destroy the VM's zvol and all its snapshots. - fn destroy_vm_storage(&self, vm_name: &str) -> Result<()> { - let zvol = self.vm_zvol(vm_name); + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> { + let zvol = self.vm_zvol(&vm.name); // Ignore errors — the zvol may already be gone. let _ = zfs::volume::destroy(&zvol, true); Ok(()) } + fn deinit(&self, _purge: bool) -> Result<()> { + // `zpool destroy` is destructive — there is no equivalent of + // "purge: keep the data". The flag is accepted for trait + // uniformity but ignored here: ZFS pools always go. + if !zfs::pool::exists(&self.pool)? 
{ + return Ok(()); + } + let output = ProcessCommand::new("zpool") + .args(["destroy", "-f", &self.pool]) + .output() + .map_err(|e| Error::CommandExec { + command: "zpool destroy".to_string(), + source: e, + })?; + Error::check_command("zpool destroy", output)?; + println!("Destroyed ZFS pool '{}'.", self.pool); + Ok(()) + } + + fn grow(&self, _new_size: ByteSize) -> Result<()> { + Err(Error::Zfs( + "ZFS pools auto-expand by default; use `zpool online -e` if needed".to_string(), + )) + } + /// Destroy the image zvol (includes its @base snapshot). /// /// With `force: true`, uses `zfs destroy -R` to also destroy any orphaned /// dependent clones (VM zvols) that the application layer couldn't clean up. - fn destroy_image_storage(&self, name: &str, force: bool) -> Result<()> { - let zvol = self.image_zvol(name); + fn destroy_image_storage(&self, image: &ImageEntry, force: bool) -> Result<()> { + let zvol = self.image_zvol(&image.local_name); if force { zfs::destroy_with_dependents(&zvol) } else { @@ -197,21 +233,14 @@ impl StorageBackend for LinuxStorage { } /// Device path for a VM's root disk zvol. - /// - /// Returns the `/dev/zvol/...` path that can be used for mounting - /// or passing to Firecracker as a block device. - fn disk_device_path(&self, vm_name: &str) -> PathBuf { - let zvol = self.vm_zvol(vm_name); - zfs::volume::device_path(&zvol) + fn disk_device_path(&self, vm: &VmMetadata) -> Result { + let zvol = self.vm_zvol(&vm.name); + Ok(zfs::volume::device_path(&zvol)) } /// Fork a VM's disk by snapshotting the source and cloning into a new VM. - /// - /// Internally creates a ZFS snapshot named `fork-{target_vm}` on the source, - /// then clones it into the target VM's zvol. The snapshot naming convention - /// is entirely internal — the caller only sees the resulting disk path. 
- fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result { - let source_zvol = self.vm_zvol(source_vm); + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { + let source_zvol = self.vm_zvol(&source.name); let target_zvol = self.vm_zvol(target_vm); let snap_name = format!("fork-{target_vm}"); @@ -227,16 +256,16 @@ impl StorageBackend for LinuxStorage { return Err(e); } - Ok(PathBuf::from(target_zvol)) + Ok(VolumeHandle::from_path(target_zvol)) } /// Clean up the fork snapshot on the parent VM. /// /// Reconstructs the snapshot name from the naming convention: /// `{pool}/vms/{parent_vm}@fork-{forked_vm}`. - fn cleanup_fork(&self, parent_vm: &str, forked_vm: &str) -> Result<()> { - let parent_zvol = self.vm_zvol(parent_vm); - let snap_name = format!("fork-{forked_vm}"); + fn cleanup_fork(&self, parent: &VmMetadata, forked: &VmMetadata) -> Result<()> { + let parent_zvol = self.vm_zvol(&parent.name); + let snap_name = format!("fork-{}", forked.name); match zfs::snapshot::destroy(&parent_zvol, &snap_name) { Ok(()) => {} Err(e) => { @@ -249,12 +278,8 @@ impl StorageBackend for LinuxStorage { } /// Check for fork snapshots on this VM's ZFS dataset. - /// - /// Lists all snapshots matching `fork-*` on the VM's zvol and returns - /// the implied dependent VM names. These represent ZFS clones that - /// would break if this VM's dataset were destroyed. 
- fn storage_dependents(&self, vm_name: &str) -> Result> { - let zvol = self.vm_zvol(vm_name); + fn storage_dependents(&self, vm: &VmMetadata) -> Result> { + let zvol = self.vm_zvol(&vm.name); let snapshots = zfs::snapshot::list(&zvol)?; Ok(snapshots diff --git a/crates/ember-linux/src/vm.rs b/crates/ember-linux/src/vm.rs index fc337c9..a0c859f 100644 --- a/crates/ember-linux/src/vm.rs +++ b/crates/ember-linux/src/vm.rs @@ -16,7 +16,6 @@ use std::time::Duration; use crate::firecracker; use crate::network; -use crate::zfs; use ember_core::backend::{StartedVm, VmBackend}; use ember_core::config::GlobalConfig; use ember_core::error::{Error, Result}; @@ -38,7 +37,7 @@ impl VmBackend for LinuxVm { /// Expects `vm.network` to be already populated (by `NetworkBackend::setup`). /// Spawns the Firecracker process, configures it via the API, and boots. /// Returns the hypervisor PID and the network info from the metadata. - fn start(vm: &VmMetadata, _config: &GlobalConfig) -> Result { + fn start(vm: &VmMetadata, config: &GlobalConfig) -> Result { let socket_path = &vm.api_socket; let log_path = socket_path.with_file_name("firecracker.log"); @@ -49,6 +48,13 @@ impl VmBackend for LinuxVm { )) })?; + // Resolve the rootfs through the active storage backend so the + // backend (ZFS, dm-thin, …) controls how `vm.disk_path` becomes + // the actual device path Firecracker sees. dm-thin lazily + // re-activates pool + thin devices here (pool tables are + // kernel-only state that vanishes on host reboot). + let rootfs_path = crate::create_storage(config).disk_device_path(vm)?; + // Clean up stale socket from a previous run. if socket_path.exists() { std::fs::remove_file(socket_path).map_err(|e| Error::Io { @@ -64,7 +70,7 @@ impl VmBackend for LinuxVm { // Configure and boot via the Firecracker API. // Kill the process on failure to avoid an orphaned Firecracker. 
- match configure_and_boot(vm, socket_path, net_info) { + match configure_and_boot(vm, &rootfs_path, socket_path, net_info) { Ok(()) => {} Err(e) => { let _ = firecracker::process::kill(pid); @@ -164,9 +170,11 @@ impl VmBackend for LinuxVm { /// /// Waits for the API socket, builds the VM configuration from metadata /// and network info, then issues the API calls to configure and start -/// the instance. +/// the instance. `rootfs_path` is the activated disk device path +/// resolved by the storage backend. fn configure_and_boot( vm: &VmMetadata, + rootfs_path: &std::path::Path, socket_path: &std::path::Path, net_info: &NetworkInfo, ) -> Result<()> { @@ -179,9 +187,8 @@ fn configure_and_boot( let dns_servers = network::dns::detect_nameservers(wan_iface); // Build VM configuration. - let rootfs_path = zfs::volume::device_path(&vm.disk_path); let mut vm_config = - firecracker::config::VmConfig::new(vm.cpus, vm.memory_mib, &vm.kernel_path, &rootfs_path); + firecracker::config::VmConfig::new(vm.cpus, vm.memory_mib, &vm.kernel_path, rootfs_path); if let Some(ref boot_args) = vm.boot_args { vm_config = vm_config.with_boot_args(boot_args); } diff --git a/crates/ember-macos/src/lib.rs b/crates/ember-macos/src/lib.rs index d8ea045..6e3f0a1 100644 --- a/crates/ember-macos/src/lib.rs +++ b/crates/ember-macos/src/lib.rs @@ -9,3 +9,19 @@ pub use network::MacosNetwork; pub use platform::MacosPlatform; pub use storage::MacosStorage; pub use vm::MacosVm; + +use std::sync::Arc; + +use ember_core::backend::{InitConfig, StorageBackend}; +use ember_core::config::GlobalConfig; +use ember_core::error::Result; + +/// Construct the active storage backend. +pub fn create_storage(config: &GlobalConfig) -> Arc { + Arc::new(MacosStorage::new(config)) +} + +/// Initialize storage during `ember init`. 
+pub fn init_storage(config: &InitConfig) -> Result<()> { + MacosStorage::init(config) +} diff --git a/crates/ember-macos/src/storage.rs b/crates/ember-macos/src/storage.rs index af2dbfa..7cf2d39 100644 --- a/crates/ember-macos/src/storage.rs +++ b/crates/ember-macos/src/storage.rs @@ -18,9 +18,11 @@ use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Instant; -use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend}; +use ember_core::backend::{InitConfig, SnapshotInfo, StorageBackend, VolumeHandle}; use ember_core::config::size::ByteSize; use ember_core::error::{Error, Result}; +use ember_core::image::registry::ImageEntry; +use ember_core::state::vm::{SnapshotEntry, VmMetadata}; /// macOS storage backend using APFS copy-on-write clones. /// @@ -116,7 +118,7 @@ impl StorageBackend for MacosStorage { name: &str, image_path: &Path, _size_mib: u64, - ) -> Result { + ) -> Result { let dest = self.image_path(name); // Ensure the images directory exists. @@ -137,7 +139,7 @@ impl StorageBackend for MacosStorage { let _ = fs::remove_file(image_path); } - Ok(dest) + Ok(VolumeHandle::from_path(dest)) } /// Clone a base image for a new VM using APFS copy-on-write. @@ -145,8 +147,8 @@ impl StorageBackend for MacosStorage { /// `cp -c` creates an instant CoW clone — the VM's rootfs shares blocks /// with the base image until written to. This is the macOS equivalent of /// `zfs clone pool/.../images/name@base pool/.../vms/vm_name`. 
- fn clone_for_vm(&self, image_name: &str, vm_name: &str) -> Result { - let src = self.image_path(image_name); + fn clone_for_vm(&self, image: &ImageEntry, vm_name: &str) -> Result { + let src = self.image_path(&image.local_name); if !src.exists() { return Err(Error::Image(format!( "base image not found: {}", @@ -170,7 +172,7 @@ impl StorageBackend for MacosStorage { let dest = self.vm_rootfs(vm_name); apfs_clone(&src, &dest)?; - Ok(dest) + Ok(VolumeHandle::from_path(dest)) } /// Create a snapshot by APFS-cloning the VM's current rootfs. @@ -178,7 +180,8 @@ impl StorageBackend for MacosStorage { /// `cp -c vms//rootfs.img → vms//snapshots/.img` /// This is instant (CoW) and costs no additional disk space until /// the VM's rootfs diverges from the snapshot. - fn snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result> { + let vm_name = vm.name.as_str(); let src = self.vm_rootfs(vm_name); if !src.exists() { return Err(Error::Image(format!( @@ -201,7 +204,8 @@ impl StorageBackend for MacosStorage { } apfs_clone(&src, &dest)?; - Ok(()) + // APFS tracks snapshots as files on disk; nothing to add to vm.json. + Ok(None) } /// Restore a snapshot by replacing the VM's rootfs with an APFS clone @@ -210,7 +214,8 @@ impl StorageBackend for MacosStorage { /// `cp -c vms//snapshots/.img → vms//rootfs.img` /// The old rootfs is removed first, then replaced with a fresh CoW clone /// of the snapshot. - fn restore_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn restore_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result { + let vm_name = vm.name.as_str(); let snap_path = self .vm_snapshots_dir(vm_name) .join(format!("{snap_name}.img")); @@ -237,10 +242,10 @@ impl StorageBackend for MacosStorage { // Atomic rename replaces the old rootfs in one operation. 
fs::rename(&tmp_rootfs, &rootfs).map_err(|e| Error::Io { - path: rootfs, + path: rootfs.clone(), source: e, })?; - Ok(()) + Ok(VolumeHandle::from_path(rootfs)) } /// Delete a snapshot by removing its image file. @@ -248,7 +253,8 @@ impl StorageBackend for MacosStorage { /// APFS reference-counts the underlying blocks — deleting a snapshot only /// frees blocks that are not shared with other clones (rootfs or other /// snapshots). - fn delete_snapshot(&self, vm_name: &str, snap_name: &str) -> Result<()> { + fn delete_snapshot(&self, vm: &VmMetadata, snap_name: &str) -> Result<()> { + let vm_name = vm.name.as_str(); let snap_path = self .vm_snapshots_dir(vm_name) .join(format!("{snap_name}.img")); @@ -269,8 +275,8 @@ impl StorageBackend for MacosStorage { /// /// Each `.img` file in the directory is a snapshot. Metadata (creation /// time, size) comes from `fs::metadata` on each file. - fn list_snapshots(&self, vm_name: &str) -> Result> { - let snap_dir = self.vm_snapshots_dir(vm_name); + fn list_snapshots(&self, vm: &VmMetadata) -> Result> { + let snap_dir = self.vm_snapshots_dir(&vm.name); if !snap_dir.exists() { return Ok(vec![]); } @@ -335,8 +341,8 @@ impl StorageBackend for MacosStorage { /// /// Only growing is supported — the CLI layer prevents shrink attempts. /// Requires `e2fsprogs` from Homebrew (`brew install e2fsprogs`). - fn resize(&self, vm_name: &str, new_size: ByteSize) -> Result<()> { - let rootfs = self.vm_rootfs(vm_name); + fn resize(&self, vm: &VmMetadata, new_size: ByteSize) -> Result<()> { + let rootfs = self.vm_rootfs(&vm.name); if !rootfs.exists() { return Err(Error::Image(format!( "VM rootfs not found: {}", @@ -408,8 +414,8 @@ impl StorageBackend for MacosStorage { /// Destroy all storage for a VM: rootfs image, snapshots, and VM directory. /// /// Silently succeeds if the directory doesn't exist (idempotent delete). 
- fn destroy_vm_storage(&self, vm_name: &str) -> Result<()> { - let vm_dir = self.vm_dir(vm_name); + fn destroy_vm_storage(&self, vm: &VmMetadata) -> Result<()> { + let vm_dir = self.vm_dir(&vm.name); if vm_dir.exists() { fs::remove_dir_all(&vm_dir).map_err(|e| Error::Io { path: vm_dir, @@ -421,8 +427,8 @@ impl StorageBackend for MacosStorage { /// Destroy storage for a base image (the raw `.img` file). /// The `force` flag is a no-op on macOS (APFS clones are independent). - fn destroy_image_storage(&self, name: &str, _force: bool) -> Result<()> { - let img = self.image_path(name); + fn destroy_image_storage(&self, image: &ImageEntry, _force: bool) -> Result<()> { + let img = self.image_path(&image.local_name); if img.exists() { fs::remove_file(&img).map_err(|e| Error::Io { path: img, @@ -436,8 +442,8 @@ impl StorageBackend for MacosStorage { /// /// On macOS the raw `.img` file is passed directly to AVF — no /// block device indirection like ZFS zvols. - fn disk_device_path(&self, vm_name: &str) -> PathBuf { - self.vm_rootfs(vm_name) + fn disk_device_path(&self, vm: &VmMetadata) -> Result { + Ok(self.vm_rootfs(&vm.name)) } /// Clone a source VM's disk for forking via APFS copy-on-write. @@ -445,8 +451,8 @@ impl StorageBackend for MacosStorage { /// Directly clones the source VM's rootfs into the target VM's rootfs /// using `cp -c`. No intermediate snapshot is created — APFS clones /// are fully independent, so no cleanup or dependency tracking is needed. 
- fn clone_vm_storage(&self, source_vm: &str, target_vm: &str) -> Result { - let source_rootfs = self.vm_rootfs(source_vm); + fn clone_vm_storage(&self, source: &VmMetadata, target_vm: &str) -> Result { + let source_rootfs = self.vm_rootfs(&source.name); if !source_rootfs.exists() { return Err(Error::Image(format!( "source VM rootfs not found: {}", @@ -469,19 +475,50 @@ impl StorageBackend for MacosStorage { let target_rootfs = self.vm_rootfs(target_vm); apfs_clone(&source_rootfs, &target_rootfs)?; - Ok(target_rootfs) + Ok(VolumeHandle::from_path(target_rootfs)) } /// No-op on macOS — APFS clones are independent, nothing to clean up. - fn cleanup_fork(&self, _parent_vm: &str, _forked_vm: &str) -> Result<()> { + fn cleanup_fork(&self, _parent: &VmMetadata, _forked: &VmMetadata) -> Result<()> { Ok(()) } /// Always returns empty on macOS — APFS clones are independent. - fn storage_dependents(&self, _vm_name: &str) -> Result> { + fn storage_dependents(&self, _vm: &VmMetadata) -> Result> { Ok(vec![]) } + fn deinit(&self, purge: bool) -> Result<()> { + // The state directory layout (`images/`, `vms/`, `kernels/`, + // `network/`) is owned by ember; on `--purge` we drop the disk + // images so a future `ember init` starts clean. + if purge { + let images = self.images_dir(); + if images.exists() { + fs::remove_dir_all(&images).map_err(|e| Error::Io { + path: images, + source: e, + })?; + } + let vms = self.vms_dir(); + if vms.exists() { + fs::remove_dir_all(&vms).map_err(|e| Error::Io { + path: vms, + source: e, + })?; + } + } + Ok(()) + } + + fn grow(&self, _new_size: ByteSize) -> Result<()> { + Err(Error::Image( + "macOS/APFS has no pool concept — resize individual VMs with \ + `ember vm resize` instead" + .to_string(), + )) + } + /// Not supported for ext4 on macOS. /// /// macOS has no native ext4 mount support. 
Use [`inject_ssh_key`] for diff --git a/docs/DM-THIN-SPEC.md b/docs/DM-THIN-SPEC.md new file mode 100644 index 0000000..6b75449 --- /dev/null +++ b/docs/DM-THIN-SPEC.md @@ -0,0 +1,693 @@ +# Ember — dm-thin Storage Backend + +This document specifies how ember will support Linux device-mapper thin provisioning (`dm-thin`) as an alternative to ZFS for copy-on-write VM storage on Linux. +The dm-thin backend is **not yet implemented** — this is a design spec. +It mirrors the structure of `BTRFS-SPEC.md` and reuses the trait-object dispatch model introduced there. + +The goal is the same as the btrfs spec: drop the ZFS kernel module dependency and the requirement for a dedicated pool device, while preserving block-level copy-on-write semantics that are already a tight fit with Firecracker (raw block drive, instant clones, real snapshots). + +## Design principles + +* **Same CLI, different storage**: All `ember` commands work identically regardless of which backend is active. Backend choice is invisible to users after `ember init`. +* **Block-level CoW**: dm-thin provides instant copy-on-write thin volumes and snapshots at the block layer, analogous to ZFS zvols + clones. No filesystem-level reflinks. +* **Block device drives**: VM root disks are exposed as `/dev/mapper/` block devices and passed directly to Firecracker as `path_on_host`. Same drive shape as the existing ZFS path. +* **Sparse-file backing by default**: `ember init` creates two sparse files (metadata + data) on the existing filesystem and assembles them into a thin pool via `losetup` + `dmsetup`. A raw block device may be used instead, but is not required. +* **Kernel-builtin**: dm-thin is in-tree (`CONFIG_DM_THIN_PROVISIONING`), shipped by every mainstream distribution since ~2012. No DKMS, no out-of-tree module, no licensing friction with the kernel. +* **No filesystem on the pool**: The pool itself is a block-device factory. 
Each thin volume is independently formatted with ext4 (the same ext4 image pipeline used today). The pool does not see file-level structure. +* **Thin volumes and snapshots are the same primitive**: In dm-thin, a snapshot is just another thin volume that shares blocks with its source. Image base, VM disk, user snapshot, and fork all use the same `create_snap` call. +* **Random 24-bit thin ids**: Unlike ZFS where datasets are addressed by name, dm-thin volumes are addressed by numeric ids. Ember picks a random non-zero 24-bit id per volume (the kernel caps `dev_id` at `(1 << 24) - 1`; the value is serialized as `u64`) and retries on the rare collision. The id is stored on the existing `VmMetadata`/`ImageEntry` records; no separate allocator state. +* **Root required**: Same as ZFS — `dmsetup`, `losetup`, `mount`, and Firecracker all need root. + +## Component mapping + +| ZFS | dm-thin | Notes | +|-----|---------|-------| +| `zpool create pool /dev/sda` | `truncate` + `losetup` + `dmsetup create ember-pool ... thin-pool ...` | Thin pool replaces ZFS pool | +| `zfs create pool/images` | (none) | No dataset hierarchy; the pool is flat | +| `zfs create -V 10G pool/images/x` (zvol) | `dmsetup message ember-pool 0 "create_thin <id>"` + `dmsetup create ember-img-x` | Thin volume replaces zvol | +| `zfs snapshot pool/images/x@base` | `dmsetup message ember-pool 0 "create_snap <base_id> <id>"` | Snapshot is just another thin id | +| `zfs clone pool/images/x@base pool/vms/y` | `create_snap <vm_id> <base_id>` + `dmsetup create ember-vm-y` | Same `create_snap`; activate as device | +| `zfs snapshot pool/vms/y@snap` | suspend vm + `create_snap <snap_id> <vm_id>` + resume | Suspend ensures consistent on-disk state | +| `zfs rollback pool/vms/y@snap` | remove vm device + delete vm thin + `create_snap <new_vm_id> <snap_id>` + recreate vm device | Restore replaces the live volume | +| `zfs destroy pool/vms/y@snap` | `dmsetup message ember-pool 0 "delete <snap_id>"` | Releases blocks back to the pool | +| `zfs set volsize=20G pool/vms/y` | `dmsetup suspend` + `dmsetup load` (new size) + `dmsetup resume` + `resize2fs` | Resize is a table reload | +| `zfs 
destroy -r pool/vms/y` | `dmsetup remove ember-vm-y` + `delete ` | Two-step: deactivate then free | +| `/dev/zvol/pool/vms/y` | `/dev/mapper/ember-vm-y` | Different path, same shape | + +## Backend selection + +### `ember init` + +The `--storage` flag introduced in `BTRFS-SPEC.md` gains a third value: `dm-thin`. + +```bash +# ZFS (existing) +ember init --pool tank --device /dev/sda + +# btrfs (per BTRFS-SPEC.md) +ember init --storage btrfs --storage-path /var/lib/ember/btrfs.img --size 50G + +# dm-thin with sparse files (default) +ember init --storage dm-thin --size 50G + +# dm-thin with explicit data file location +ember init --storage dm-thin --storage-path /var/lib/ember/dm-thin --size 50G + +# dm-thin on a raw block device +ember init --storage dm-thin --storage-path /dev/sdb +``` + +When `--storage dm-thin` is specified: + +* `--storage-path` selects the directory holding `metadata.img` and `data.img` (file-backed mode), or the raw block device to use (device mode). Defaults to `/var/lib/ember/dm-thin` for file-backed mode. +* `--size` is required for file-backed mode and disambiguates from device mode. When present, two sparse files are created. When absent, `--storage-path` must be an existing block device. +* `--metadata-size` is optional and defaults to a value computed from `thin_metadata_size` (see "Pool sizing" below). +* `--block-size` is optional and defaults to `64K`. **Permanent** — cannot be changed after pool creation. +* `--pool`, `--dataset`, and `--device` are ZFS-only and ignored. + +If a `config.json` already exists, `ember init` checks `storage_backend` and refuses to re-initialize with a different backend. +Switching backends requires `ember deinit` first. 
+ +### Backend dispatch + +Same as in `BTRFS-SPEC.md`: `Storage` becomes `Arc` on Linux, dispatched at construction time by a `create_storage()` factory: + +```rust +// crates/ember-linux/src/lib.rs +pub fn create_storage(config: &GlobalConfig) -> Arc { + match config.storage_backend { + StorageKind::Zfs => Arc::new(ZfsStorage::new(config)), + StorageKind::Btrfs => Arc::new(BtrfsStorage::new(config)), + StorageKind::DmThin => Arc::new(DmThinStorage::new(config)), + } +} +``` + +`StorageKind` gains a `DmThin` variant. +The `--storage dm-thin` CLI flag accepts `dm-thin` and serializes as `dm-thin` (lowercase, hyphen) to match common usage. + +### Init dispatch + +`StorageBackend::init` remains an associated function. The `ember init` handler matches on the requested backend: + +```rust +match storage_backend { + StorageKind::Zfs => ZfsStorage::init(&init_config)?, + StorageKind::Btrfs => BtrfsStorage::init(&init_config)?, + StorageKind::DmThin => DmThinStorage::init(&init_config)?, +} +``` + +### Config changes + +`GlobalConfig` extensions, building on the btrfs spec: + +```rust +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum StorageKind { + #[default] + Zfs, + Btrfs, + DmThin, +} + +pub struct GlobalConfig { + #[serde(default)] + pub storage_backend: StorageKind, + pub pool: String, // ZFS only + pub dataset: String, // ZFS only + pub kernel_path: Option, + pub wan_iface: Option, + pub state_dir: PathBuf, + /// Block device or image file path. Used by btrfs and dm-thin. + /// For dm-thin: directory containing metadata.img/data.img, or a raw device. + #[serde(default)] + pub storage_path: Option, + /// dm-thin pool block size in 512-byte sectors (default: 128 = 64KiB). + /// Permanent at pool creation; resolved to `Some(actual)` at init + /// time so the value the running pool was created with stays stable + /// across ember upgrades. 
+ #[serde(default)] + pub dm_thin_block_size: Option, + /// dm-thin layout: `File` (sparse files inside `storage_path`) or + /// `RawDevice` (`storage_path` is a block device, metadata sits on + /// `state_dir/dm-thin-metadata.img`). Resolved at init from + /// `storage_path` and persisted so reactivation does not depend on + /// a live `is_dir()` probe. + #[serde(default)] + pub dm_thin_mode: Option, +} +``` + +The pool name (`ember-pool`) and device-mapper prefixes (`ember-img-`, `ember-vm-`) are constants — not user-configurable. +This keeps the config small and prevents collisions between concurrent ember installations on the same host. Multi-instance support is out of scope for this spec. + +`InitConfig` extensions: + +```rust +pub struct InitConfig { + pub state_dir: PathBuf, + pub pool: String, // ZFS only + pub dataset: String, // ZFS only + pub device: Option, // ZFS only + pub storage_path: Option, // btrfs + dm-thin + pub btrfs_size: Option, // btrfs only + /// Size of the dm-thin data device. + /// Required for file-backed mode, ignored for device mode. + pub dm_thin_size: Option, + /// Override metadata device size. Defaults to `thin_metadata_size` output. + pub dm_thin_metadata_size: Option, + /// Pool block size in sectors. Defaults to 128 (64KiB). + pub dm_thin_block_size: Option, + /// File-backed vs raw-device layout. The CLI resolves this from + /// `storage_path` so the backend trusts what it was handed. + pub dm_thin_mode: Option, +} +``` + +### Deinit trait method + +The `deinit()` method introduced in the btrfs spec applies here too. For dm-thin: + +1. Deactivate every active thin volume: `dmsetup remove ember-vm-*`, `ember-img-*`. +2. Remove the pool: `dmsetup remove ember-pool`. +3. Detach loop devices: `losetup -d /dev/loopN /dev/loopM`. +4. If file-backed: optionally delete `metadata.img` and `data.img` (gated behind `--purge`, default keep). +5. Remove the directory if empty. +6. Delete `config.json`. 
+ +Block devices are left intact, same as ZFS `zpool destroy`. + +## Thin id allocation + +dm-thin addresses each volume by a numeric `dev_id` and the kernel +enforces `dev_id <= (1 << 24) - 1` in `drivers/md/dm-thin.c`: + +```c +#define MAX_DEV_ID ((1ULL << 24) - 1) + +if (*dev_id > MAX_DEV_ID) { + DMWARN("Message received with invalid device id: %llu", *dev_id); + return -EINVAL; +} +``` + +So the usable space is 24 bits. +Ember picks a random non-zero id within that range: + +```rust +const MAX_DEV_ID: u64 = (1 << 24) - 1; + +fn fresh_thin_id() -> u64 { + loop { + let id = (rand::random::<u32>() as u64) & MAX_DEV_ID; + if id != 0 { + return id; + } + } +} + +fn allocate(pool: &str) -> Result<u64> { + loop { + let id = fresh_thin_id(); + match dmsetup_message(pool, &format!("create_thin {id}")) { + Ok(()) => return Ok(id), + Err(e) if is_already_exists(&e) => continue, + Err(e) => return Err(e), + } + } +} +``` + +Why this is safe: + +* Birthday collision in a 24-bit space first crosses 1% probability around 580 active ids (and roughly 10% around 1,900). Realistic ember pools hold dozens to a few hundred volumes, so a collision is rare, and the kernel still rejects duplicates atomically (`EEXIST`) so the retry loop is the entire concurrency story. +* Two ember processes racing on `create_thin` cannot both succeed for the same id; whoever lost retries. +* No persistent counter, no allocator file, no flock around id generation. + +`create_snap` follows the same pattern (allocate id, retry on `EEXIST`). +The `id` is recorded on the relevant `VmMetadata`/`ImageEntry`/`SnapshotEntry` under whichever lock already protects that record; the kernel pool itself remains the source of truth for liveness, queryable via `thin_dump` for recovery. + +The serialized type on those records stays `u64` so the on-disk format does not need to change if the kernel ever lifts the 24-bit cap. +For now only the low 24 bits are populated. 
+ +## Pool sizing + +The metadata device must be sized to cover the maximum number of blocks the pool can ever reference: + +* Recommended formula: `metadata_size = max(48 * data_size / block_size, 2 MiB)` (kernel docs). +* Practical cap: 16 GiB. The kernel rejects metadata devices larger than this. +* Standard tool: `thin_metadata_size --block-size=64k --pool-size=50G --max-thins=1000 --numeric-only --unit=b`. + +Defaults used by `ember init`: + +* `block_size`: 64 KiB (128 sectors). Smaller block sizes give better sharing across snapshots at the cost of more metadata; 64 KiB is the documented kernel default. +* `metadata_size`: computed via `thin_metadata_size` for the requested data size, capped at 16 GiB, floor of 32 MiB. +* `low_water_mark`: `data_size / block_size / 16` blocks (≈6.25% of pool). When free blocks fall below this, the kernel notifies userspace via `dmeventd`. Ember does not register a userspace handler in this initial spec — the value is informational. A future enhancement could surface low-space warnings via `dmsetup status`. + +## Storage layout + +``` +/var/lib/ember/dm-thin/ # Default --storage-path +├── metadata.img # Sparse file, ~32 MiB to 16 GiB +└── data.img # Sparse file, sized to --size + +/var/lib/ember/ # State directory (unchanged location) +├── config.json +├── kernels/ +├── images/ +│ └── registry.json # ImageEntry records, now include thin_id +├── vms/ +│ └── / +│ └── vm.json # VmMetadata, includes thin_id +└── network/ +``` + +No separate allocator state file is needed. +Thin ids live exclusively on `ImageEntry.thin_id`, `VmMetadata.thin_id`, and `SnapshotEntry.thin_id`. +Fresh ids are picked at random; the pool itself is the authority for which ids are live (queryable via `thin_dump /dev/loopMETA` for recovery). + +## Initialization + +### File-backed (default) + +```bash +ember init --storage dm-thin --size 50G +``` + +1. Create directory: `mkdir -p /var/lib/ember/dm-thin`. +2. 
Compute metadata size: `thin_metadata_size --block-size=64k --pool-size=50G --max-thins=1000 --numeric-only --unit=b` → e.g. `838860800` (≈800 MiB). +3. Create sparse data: `truncate -s 50G /var/lib/ember/dm-thin/data.img`. +4. Create sparse metadata: `truncate -s 800M /var/lib/ember/dm-thin/metadata.img`. +5. Zero metadata header: `dd if=/dev/zero of=/var/lib/ember/dm-thin/metadata.img bs=4K count=1 conv=notrunc`. The kernel uses the all-zero superblock as the signal to format a fresh pool. +6. Attach loops: `losetup -f --show /var/lib/ember/dm-thin/metadata.img` → `/dev/loopN`; same for `data.img` → `/dev/loopM`. +7. Assemble pool: `dmsetup create ember-pool --table "0 thin-pool /dev/loopN /dev/loopM 128 32768"` where `data_sectors = data_size / 512` and `32768` is the low-water mark in blocks. +8. Write `config.json` with `storage_backend = "dm-thin"`, `storage_path = /var/lib/ember/dm-thin`. + +### Device-backed + +```bash +ember init --storage dm-thin --storage-path /dev/sdb +``` + +1. Allocate metadata partition: requires either a separate metadata device or a partition layout. To avoid forcing partitioning on the user, ember uses **embedded metadata mode**: it places `metadata.img` as a sparse file on the state directory's filesystem and uses `--storage-path` only as the data device. (Splitting metadata onto a tiny separate device is a future enhancement.) +2. Wipe the device's first 4 KiB so the pool initializes fresh: `dd if=/dev/zero of=/dev/sdb bs=4K count=1`. +3. `losetup` only the metadata file. The data device is used directly. +4. Assemble pool: `dmsetup create ember-pool --table "0 thin-pool /dev/loopN /dev/sdb 128 32768"`. + +The init flow is otherwise identical. + +### Activation on subsequent runs + +dm-thin tables live only in kernel memory. +After a reboot or `dmsetup remove`, the pool and all thin volumes are gone from `/dev/mapper/` even though the underlying metadata is intact. +Ember therefore reactivates on demand. 
+ +The first command after a reboot triggers `ensure_pool_active`: + +1. Read `config.json` → `storage_path`. +2. Check `/dev/mapper/ember-pool` exists. If yes, done. +3. If no: + a. `losetup -f --show metadata.img` → `/dev/loopN` (skip if device-backed). + b. `losetup -f --show data.img` → `/dev/loopM` (skip if device-backed). + c. Run `thin_check /dev/loopN` (or the metadata loop). Fail loudly on metadata corruption — operator must run `thin_repair` manually. + d. `dmsetup create ember-pool --table "0 thin-pool ... 128 "` using the values from `config.json`. + +Step (c) walks the entire metadata B-tree, so the *first* command after a reboot pays a one-time cost proportional to pool occupancy. +For pools with millions of mapped blocks this can take several seconds; subsequent commands hit the cached `pool::exists` early-return and are free. +This is intentional — silently activating a corrupt pool would damage every snapshot derived from it. +Operators who prefer to skip the check (e.g. on read-only inspection of a known-good pool) can `dmsetup create` the pool manually before invoking ember. + +Per-VM and per-image volumes are activated **lazily** by methods that need them (e.g. `disk_device_path`, `mount`, `start`). +Each method calls `ensure_thin_active(name, thin_id, size_sectors)`: + +1. If `/dev/mapper/` exists, done. +2. Else: `dmsetup create --table "0 thin /dev/mapper/ember-pool "`. + +Sizes come from existing `ImageEntry.size_mib` and `VmMetadata.disk_size_gib`. + +### Filesystem validation + +Before any storage operation, ember verifies `/dev/mapper/ember-pool` exists. +If not, it attempts the activation sequence above. +This is the dm-thin equivalent of the btrfs `/proc/mounts` check. + +`dmsetup status ember-pool` is parsed to detect: + +* `out_of_data_space`: pool is full. New writes will fail with EIO. Ember refuses VM create/start and prints an actionable error suggesting `ember storage grow`. +* `metadata_low_watermark`: metadata pressure. 
Logged as a warning. +* `read_only`: kernel switched the pool to read-only after a metadata error. Refuse all write operations. + +### Teardown (`ember deinit`) + +1. Stop all running VMs (precondition; ember refuses if any VM is running). +2. Remove all activated thin volumes: enumerate `dmsetup ls --target thin` filtered by the `ember-img-` / `ember-vm-` prefix, then `dmsetup remove` each. +3. Free thin ids: not strictly required (the next step destroys metadata) but done for symmetry: `dmsetup message ember-pool 0 "delete <thin_id>"` for each. +4. Remove pool: `dmsetup remove ember-pool`. +5. Detach loops: `losetup -d /dev/loopN /dev/loopM`. +6. If `--purge`: delete `metadata.img`, `data.img`. +7. Remove `config.json`. + +For device-backed pools, the data device is left intact — same as ZFS. + +## Image pull workflow + +Reuses the existing pipeline up to the ext4 image: + +``` +OCI registry → unpacked rootfs → mkfs.ext4 + populate → ext4 image file + │ + ▼ + create_thin → activate → dd → snapshot +``` + +Per-image steps: + +1. Allocate thin id: `id_a = fresh_thin_id()` (random 24-bit value held in a `u64`, retry on collision — see "Thin id allocation" below). +2. Create thin: `dmsetup message ember-pool 0 "create_thin <id_a>"`. +3. Activate as a temporary device: `dmsetup create ember-img-<name>-staging --table "0 <size_sectors> thin /dev/mapper/ember-pool <id_a>"`. +4. Write image: `dd if=/tmp/ember-image-XXXX/image.ext4 of=/dev/mapper/ember-img-<name>-staging bs=1M`. Existing `zvol::dd_image` logic is reused once the device path is supplied. +5. Suspend: `dmsetup suspend ember-img-<name>-staging`. This forces a metadata commit so the snapshot below sees a consistent state. +6. Allocate base id: `id_base = fresh_thin_id()`. +7. Snapshot: `dmsetup message ember-pool 0 "create_snap <id_base> <id_a>"`. +8. Resume: `dmsetup resume ember-img-<name>-staging`. +9. Discard the staging device: `dmsetup remove ember-img-<name>-staging`. Free `id_a`: `dmsetup message ember-pool 0 "delete <id_a>"`. The `id_base` snapshot retains all of its blocks. +10.
Persist: `ImageEntry.thin_id = id_base`, `disk_path = "/dev/mapper/ember-img-<name>"` (the activated path; lazy activation will create it on first use). + +Why two ids? `create_snap` requires a source thin volume. +We need a snapshot of the freshly-written image so that VM clones can branch from a stable origin without our staging device hanging around as a dependency. +The pattern matches how ZFS uses `@base`: write to a primary, snapshot it, then never touch the primary again. + +The base thin is not activated as a device by default; only VMs cloned from it appear in `/dev/mapper/`. +This keeps `/dev/mapper/` clutter-free and avoids races where a stale activation locks a volume. + +## VM create + +```bash +ember vm create myvm --image alpine --disk-size 4G +``` + +1. Look up `ImageEntry.thin_id` for `alpine` (the base id). +2. Allocate fresh id: `id_vm = fresh_thin_id()`. +3. Snapshot: `dmsetup message ember-pool 0 "create_snap <id_vm> <id_base>"`. Instant — no data is copied. +4. Activate: `dmsetup create ember-vm-myvm --table "0 <disk_sectors> thin /dev/mapper/ember-pool <id_vm>"`. +5. The activated device path `/dev/mapper/ember-vm-myvm` is recorded in `VmMetadata.disk_path`, and `VmMetadata.thin_id` is set to `id_vm`. +6. Mount via `mount /dev/mapper/ember-vm-myvm /tmp/...` to inject SSH key and hostname (the existing flow on the ZFS path; no `-o loop` needed because dm-thin volumes are real block devices). +7. Pass `/dev/mapper/ember-vm-myvm` to Firecracker as `path_on_host`. + +If `disk_sectors > image size_sectors`, the activation table size already declares the larger virtual size. Ember then runs `e2fsck -f -p` and `resize2fs` against the device to grow the ext4 filesystem into the new space (no `truncate` needed — thin volumes are virtually sized at activation time). + +### Sanity check + +A `create_snap` completes in milliseconds. +Mirror the macOS/btrfs timing check: warn if the operation takes more than 1 second, since that suggests metadata pressure or pool-level issues.
+ +## VM resize + +```bash +ember vm resize myvm --disk-size 8G +``` + +1. VM must be stopped (existing precondition). +2. Suspend: `dmsetup suspend ember-vm-myvm`. +3. Reload table with new virtual size: `dmsetup load ember-vm-myvm --table "0 <new_size_sectors> thin /dev/mapper/ember-pool <id_vm>"`. +4. Resume: `dmsetup resume ember-vm-myvm`. +5. `e2fsck -f -p /dev/mapper/ember-vm-myvm`. +6. `resize2fs /dev/mapper/ember-vm-myvm`. + +No new blocks are allocated until the guest writes into the new space. +Pool capacity bounds the data actually written; the combined virtual size of the thin volumes may over-commit it. + +Shrinking is not supported (matches every other backend). + +## Pool resize + +A new admin command: + +```bash +ember storage grow --size 100G +``` + +1. For file-backed: `truncate -s 100G data.img`. For device-backed: assumes the user has already grown the device (e.g. cloud volume expansion). +2. `losetup -c /dev/loopM`: instruct the loop driver to re-read the backing file size. (No-op for device mode.) +3. Suspend: `dmsetup suspend ember-pool`. +4. Reload table: `dmsetup load ember-pool --table "0 <new_data_sectors> thin-pool /dev/loopN /dev/loopM 128 <low_water_mark>"`. +5. Resume: `dmsetup resume ember-pool`. + +Metadata cannot be resized in place. +If `thin_metadata_size` for the new pool size exceeds the existing metadata device, ember refuses the grow and prints instructions for an offline metadata move using `pdata_tools` (out of scope for the initial implementation; doc only).
+ +## User snapshots + +```bash +# Create +ember snapshot create myvm s1 +→ id_s1 = fresh_thin_id() +→ dmsetup suspend ember-vm-myvm +→ dmsetup message ember-pool 0 "create_snap <id_s1> <id_vm>" +→ dmsetup resume ember-vm-myvm + (id_s1 stays inactive — no /dev/mapper entry until restore) + +# Restore (VM must be stopped) +ember snapshot restore myvm s1 +→ dmsetup remove ember-vm-myvm +→ dmsetup message ember-pool 0 "delete <id_vm>" +→ id_vm_new = fresh_thin_id() +→ dmsetup message ember-pool 0 "create_snap <id_vm_new> <id_s1>" +→ dmsetup create ember-vm-myvm --table "0 <size_sectors> thin /dev/mapper/ember-pool <id_vm_new>" + VmMetadata.thin_id = id_vm_new + +# List +ember snapshot list myvm +→ read snapshot records from VmMetadata (or a sidecar; see below) + +# Delete +ember snapshot delete myvm s1 +→ dmsetup message ember-pool 0 "delete <id_s1>" +``` + +### Snapshot consistency + +Suspending the VM volume during `create_snap` flushes outstanding I/O and forces a metadata commit before the snapshot is taken. +The kernel performs the equivalent of an fsync at the block layer. +A guest that has not fsynced its in-flight writes may still see a crash-consistent but dirty filesystem on the snapshot, exactly as with ZFS zvol snapshots. +This matches existing behavior; no additional guarantees are introduced. + +### Snapshot metadata + +Snapshot records are stored alongside `VmMetadata`, because dm-thin has no equivalent of the live listing the existing ZFS backend gets from `zfs::snapshot::list`. +For dm-thin, ember maintains a `snapshots: Vec<SnapshotEntry>` list in `vm.json`: + +```rust +pub struct SnapshotEntry { + pub name: String, + pub thin_id: u64, + pub created_at: String, + pub size_sectors: u64, +} +``` + +`list_snapshots` reads this list. +Ideally the reported size would reflect unique block usage, which can be queried via `dmsetup status` or `thin_ls --metadata-snap` for accurate accounting; for the initial implementation, ember reports the volume's virtual size (`size_sectors`) and defers exclusive-block accounting to a future enhancement.
+ +## VM fork + +```bash +ember vm fork source newvm +``` + +`fork` and `clone-for-vm` are the same primitive on dm-thin: + +1. Allocate `id_fork = fresh_thin_id()`. +2. Suspend source (if running, this is required for consistency). +3. `dmsetup message ember-pool 0 "create_snap <id_fork> <id_source>"`. +4. Resume source. +5. Activate: `dmsetup create ember-vm-newvm --table "0 <size_sectors> thin /dev/mapper/ember-pool <id_fork>"`. + +Forks are independent of the source after creation — the dm-thin metadata reference-counts blocks, so deleting the source's thin id does not affect the fork. +This mirrors APFS/btrfs behavior, not ZFS: + +* `cleanup_fork` is a no-op. +* `storage_dependents` always returns an empty vec. + +The `parent_vm` field in `VmMetadata` records the fork origin for informational purposes. + +This is a notable simplification compared to the ZFS backend's fork-snapshot dependency tracking. + +## Firecracker integration + +The drive path is a block device, identical in shape to the ZFS path: + +| Backend | `path_on_host` | +|---------|----------------| +| ZFS | `/dev/zvol/tank/ember/vms/myvm` (block device) | +| btrfs | `/var/lib/ember/btrfs/vms/myvm/rootfs.img` (regular file) | +| dm-thin | `/dev/mapper/ember-vm-myvm` (block device) | + +`LinuxVm::start` already handles block-device drive paths. +The dispatch logic introduced by the btrfs spec (file path vs ZFS dataset name) extends naturally — dm-thin paths start with `/dev/mapper/`, so they take the file-path branch (passed through unchanged). + +The conversion helper that maps a `disk_path` to the actual device path becomes: + +```rust +let rootfs_path = if vm.disk_path.starts_with('/') { + PathBuf::from(&vm.disk_path) // btrfs file or dm-thin /dev/mapper path +} else { + zfs::volume::device_path(&vm.disk_path) // ZFS dataset name +}; +``` + +No further VM-side changes are required. + +## VM and image metadata + +`VmMetadata` and `ImageEntry` each gain an optional `thin_id` field; `VmMetadata` additionally gains the `snapshots` list: + +```rust +pub struct VmMetadata { + // ...
+ pub disk_path: String, + pub parent_vm: Option<String>, + /// dm-thin volume id. None for ZFS/btrfs/APFS backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option<u64>, + pub snapshots: Vec<SnapshotEntry>, // NEW: dm-thin owns this list + // ... +} + +pub struct ImageEntry { + pub reference: String, + pub local_name: String, + pub disk_path: String, + pub size_mib: u64, + pub pulled_at: String, + /// dm-thin base snapshot id. None for ZFS/btrfs/APFS backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub thin_id: Option<u64>, +} +``` + +The `#[serde(skip_serializing_if = "Option::is_none")]` keeps ZFS configs unchanged on disk. +Existing `vm.json` and `registry.json` files are read without modification — the ZFS backend simply ignores `thin_id`. + +For ZFS, the `snapshots` list remains empty in `vm.json` and `list_snapshots` continues to read live state from `zfs::snapshot::list`. +The dm-thin backend writes to it. +This split is acceptable but slightly asymmetric; an alternative is for ZFS to mirror its snapshots into `vm.json` too, which is out of scope here. + +## Image dependency tracking + +With dm-thin, the base thin id can technically be deleted while VMs cloned from it exist — block reference counting at the pool level prevents data loss. +However, ember preserves the user-visible invariant of the existing image registry: `ember image delete` checks for VMs that reference the image and refuses to delete by default, consistent with both ZFS and btrfs. + +`destroy_image_storage(name, force)`: + +* Without `--force`: refuse if `ImageEntry.thin_id` is referenced by any `VmMetadata.thin_id`'s ancestor chain. Ancestor lookup uses `thin_dump` to walk the snapshot graph. +* With `--force`: delete the thin id directly. Cloned VMs retain their own thin ids and continue to function — block sharing is invisible at the volume level.
+ +## Crate structure + +Building on the layout proposed by the btrfs spec: + +``` +crates/ember-linux/src/ +├── storage.rs # create_storage() factory, returns Arc<dyn StorageBackend> +├── zfs_storage.rs # ZFS backend (renamed from current storage.rs) +├── btrfs_storage.rs # btrfs backend +├── dm_thin_storage.rs # NEW: dm-thin backend +├── zfs/ # ZFS CLI wrappers (unchanged) +├── btrfs/ # btrfs CLI wrappers +├── dm_thin/ # NEW: dm-thin CLI wrappers +│ ├── mod.rs # mod declarations +│ ├── pool.rs # ember-pool create/activate/teardown, status parsing +│ ├── thin.rs # create_thin, create_snap, delete, suspend/resume, table reload, fresh_thin_id +│ ├── activation.rs # ensure_pool_active, ensure_thin_active, deactivate +│ └── tools.rs # thin_check, thin_repair, thin_metadata_size, thin_dump wrappers +├── zvol.rs # Existing ext4 → block device pipeline (reused for dm-thin) +└── vm.rs # LinuxVm — handles file paths and block device paths +``` + +`DmThinStorage` mirrors `ZfsStorage` but addresses volumes by id: + +```rust +pub struct DmThinStorage { + /// Backing path (directory for files, raw block dev otherwise). + storage_path: PathBuf, + /// Pool block size in sectors. From config. + block_size: u32, +} +``` + +The struct holds no allocator state. +`fresh_thin_id()` generates a random 24-bit id (returned as a `u64`; the kernel limits thin device ids to 24 bits); collisions are handled by the kernel (`create_thin` returns `EEXIST`) and the caller retries. +The authoritative record of which ids are live is kept in `ImageEntry`/`VmMetadata`/`SnapshotEntry`, which are already updated under the existing per-VM and registry locks — no new locking primitive is introduced. + +### Display and platform adaptations + +`LinuxPlatform` (at `crates/ember-linux/src/platform.rs`) needs the same kind of branching the btrfs spec describes: + +* **`inspect_vm_extra`**: "Disk device" / `/dev/mapper/ember-vm-<name>` and "Thin id" / `<thin_id>`. +* **`inspect_image_extra`**: "Disk device" / `/dev/mapper/ember-img-<name>` and "Thin id" / `<thin_id>`.
+* **`info_extra`**: "Storage" / "dm-thin", "Pool" / `ember-pool`, "Storage path" / the configured `storage_path`, plus a "Pool usage" line populated from `dmsetup status ember-pool`. +* **`init_hint`**: include the dm-thin variant alongside the ZFS and btrfs hints. + +## Comparison: ZFS vs btrfs vs dm-thin vs APFS + +| Operation | ZFS (Linux) | btrfs (Linux) | dm-thin (Linux) | APFS (macOS) | +|-----------|-------------|---------------|-----------------|--------------| +| Init | `zpool create` + `zfs create` | `mkfs.btrfs` + `mount` + `mkdir` | `truncate` + `losetup` + `dmsetup create thin-pool` | `mkdir` | +| Base image | zvol + `@base` snapshot | Raw `.img` file | Thin volume + snapshot id | Raw `.img` file | +| VM clone | `zfs clone x@base y` | `cp --reflink=always x.img y.img` | `dmsetup message create_snap` + `dmsetup create` | `cp -c x.img y.img` | +| Snapshot | `zfs snapshot y@snap` | `cp --reflink=always` | suspend + `create_snap` + resume | `cp -c` | +| Restore | `zfs rollback y@snap` | `cp --reflink=always` + `mv` | remove + delete + `create_snap` + create | `cp -c` + `mv` | +| Delete snap | `zfs destroy y@snap` | `rm snap.img` | `dmsetup message delete` | `rm snap.img` | +| Resize | `zfs set volsize` + `resize2fs` | `truncate` + `resize2fs` | `dmsetup load` + `resize2fs` | `truncate` + `resize2fs` | +| Fork | `zfs clone` (creates dependency) | `cp --reflink=always` (independent) | `create_snap` (independent) | `cp -c` (independent) | +| Drive path | `/dev/zvol/...` | `.../rootfs.img` (file) | `/dev/mapper/...` | `.../rootfs.img` (file) | +| Root required | Yes | Yes | Yes | No | +| Filesystem validation | `zpool list` | `/proc/mounts` | `dmsetup status ember-pool` | APFS volume check at init | +| Reactivation after reboot | Auto (zpool import) | Auto-mount | Explicit `ensure_pool_active` | Not applicable | +| Identifier | Dataset path | File path | Random 24-bit thin id | File path | +| State on disk | ZFS metadata | Filesystem metadata | Pool metadata 
(ids embedded in existing vm/image records) | Filesystem metadata | +| Kernel module | Out-of-tree (DKMS) | In-tree | In-tree | N/A | +| Checksums | Yes (ZFS) | Yes (data + metadata) | Metadata only | No | + +dm-thin sits between ZFS and btrfs: +it offers ZFS-like block-level CoW with no out-of-tree kernel module, at the cost of a more involved activation lifecycle (numeric ids, explicit `dmsetup` operations, no auto-import) and weaker data-integrity guarantees (no data checksums, harsher pool-exhaustion failure mode). + +## Storage efficiency diagnostics + +`ember debug storage-efficiency` for dm-thin reports both per-volume and pool-level metrics: + +* Per-volume virtual size: from the activated device's table. +* Per-volume exclusive blocks: from `thin_ls --metadata-snap=- /dev/loopMETA`. Computing this requires a metadata snapshot — taken under suspend or via `dmsetup message ember-pool 0 "reserve_metadata_snap"` — which has measurable overhead. The command surfaces it on demand only. +* Pool capacity, allocated, and free: from `dmsetup status ember-pool`. Output format: `<used_metadata_blocks>/<total_metadata_blocks> <used_data_blocks>/<total_data_blocks>`. + +The `st_blocks` approach used by the btrfs and APFS backends does not apply — dm-thin volumes are block devices, not files, and `stat` on `/dev/mapper/...` reports no allocation. + +## Risks and limitations + +* **Pool exhaustion**: Sparse-file backing lets the pool over-commit. If the host filesystem fills up, the pool transitions to read-only and all thin volumes return EIO until space is recovered. Ember should pre-check available space on the host filesystem before allowing image pulls or VM creates that would push the pool toward its data limit. The initial implementation adds a refuse-on-pool-full check via `dmsetup status` before each write-heavy operation; richer monitoring is a follow-up. +* **Metadata exhaustion**: Less recoverable than data exhaustion. The metadata device must be sized generously at init. `ember storage info` should warn when metadata usage exceeds 80%.
+* **Block size is permanent**: Chosen at `dmsetup create`; cannot be changed without rebuilding the pool. The 64 KiB default is a balance; users with very large VM disks (~hundreds of GiB) may want 128–256 KiB blocks for lower metadata overhead. +* **Loop device limits**: The default `max_loop=8` per kernel module load can be a constraint on systems with many loop-using services. Ember uses two loop devices total (metadata and data); the limit only matters when other software is competing. Documented as a troubleshooting hint, not a hard requirement. +* **Numeric id lifecycle**: Thin ids live on `VmMetadata`/`ImageEntry`/`SnapshotEntry`. Loss of the state directory therefore loses the name→id map even though the pool metadata is intact. Recovery is possible via `thin_dump` (lists all live thin ids) but requires manual reconstruction. No worse than the equivalent loss for ZFS or btrfs configs. +* **Concurrent invocations**: Race-free by construction. The kernel rejects duplicate ids atomically; the random-pick-and-retry loop tolerates concurrent creators without coordination. Per-record state mutation (writing `thin_id` into `vm.json` etc.) is already serialized by the existing per-VM and registry locks. +* **No data checksums**: Bit rot on the underlying block device goes undetected. Users who need this should layer dm-thin on top of LVM mirrors or hardware RAID, or stay on ZFS. +* **No `send`/`receive` equivalent**: Backup and migration require `dd` of the activated device, or `thin_dump` + `thin_delta` for incremental sync. Out of scope for the initial implementation. + +## External dependencies + +* **`dmsetup`**: From the `lvm2` package on Debian/Ubuntu/RHEL/Fedora/Arch. Installed by default on most server distributions. +* **`losetup`**: From `util-linux`. Always present. +* **`thin-provisioning-tools`**: Provides `thin_check`, `thin_repair`, `thin_dump`, `thin_metadata_size`, `thin_ls`. Packaged separately on most distributions. 
Required by `ember init` and `ember storage info`. Pre-flight check at `ember init` time. +* **`e2fsprogs`**: `mkfs.ext4`, `e2fsck`, `resize2fs`. Already required by the ZFS backend. +* **GNU coreutils**: `truncate`, `dd`. Already required. +* **Kernel config**: `CONFIG_DM_THIN_PROVISIONING=y` or `=m`, `CONFIG_BLK_DEV_LOOP=y` or `=m`. Both are part of every mainstream distribution kernel. + +## Open questions + +* **Multi-instance support**: The current spec hardcodes the pool name `ember-pool` and the device-mapper prefixes. Running multiple independent ember installations on the same host requires per-instance prefixes. Deferred until a real use case appears. +* **Metadata on a separate device**: `ember init --metadata-device /dev/sdc1` could place metadata on faster storage (NVMe) while data lives on bulk storage (HDD). Easy to add later — the pool table already supports two distinct devices. +* **Discard/TRIM**: dm-thin supports passdown of discards from guest to pool, which can return blocks to the pool when guests TRIM. Requires Firecracker virtio-blk to advertise discard support and the guest filesystem to issue it. Worth investigating as a follow-up; not required for correctness. +* **`dmeventd` integration**: Userspace handler for low-water-mark events would let ember warn proactively. The initial implementation polls `dmsetup status` on demand instead. diff --git a/src/backend.rs b/src/backend.rs index 664a0e6..9e2d353 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -4,6 +4,8 @@ //! This module re-exports them and provides the type aliases that //! select the active platform backend at compile time. +use std::sync::Arc; + // Re-export all traits and shared types from ember-core. pub use ember_core::backend::*; @@ -15,21 +17,27 @@ pub use ember_linux as linux; pub use ember_macos as macos; // Type aliases for the active platform backend. -// Selected at compile time based on target OS. 
+// `Vm`, `Network`, and `CurrentPlatform` are selected at compile time +// via `#[cfg(target_os)]`. `Storage` is a runtime trait object so the +// concrete implementation can be picked from `GlobalConfig` (e.g., ZFS +// vs btrfs vs dm-thin on Linux). #[cfg(target_os = "linux")] pub type Vm = ember_linux::LinuxVm; #[cfg(target_os = "linux")] -pub type Storage = ember_linux::LinuxStorage; -#[cfg(target_os = "linux")] pub type Network = ember_linux::LinuxNetwork; #[cfg(target_os = "macos")] pub type Vm = ember_macos::MacosVm; #[cfg(target_os = "macos")] -pub type Storage = ember_macos::MacosStorage; -#[cfg(target_os = "macos")] pub type Network = ember_macos::MacosNetwork; +pub type Storage = Arc; + +#[cfg(target_os = "linux")] +pub use ember_linux::{create_storage, init_storage}; +#[cfg(target_os = "macos")] +pub use ember_macos::{create_storage, init_storage}; + #[cfg(target_os = "linux")] pub type CurrentPlatform = ember_linux::LinuxPlatform; #[cfg(target_os = "macos")] diff --git a/src/cli.rs b/src/cli.rs index 4ec4734..b7c4741 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,5 +1,6 @@ pub mod cp; pub mod debug; +pub mod deinit; pub mod exec; pub(crate) mod fmt; pub mod image; @@ -8,6 +9,7 @@ pub mod init; pub mod kernel; pub mod snapshot; pub mod ssh; +pub mod storage; pub mod vm; use clap::{Parser, Subcommand}; @@ -80,6 +82,13 @@ pub enum Command { /// Reconcile internal state with actual VM process state Reconcile, + /// Tear down ember (inverse of `ember init`) + Deinit(deinit::DeinitArgs), + + /// Storage pool administration + #[command(subcommand)] + Storage(storage::StorageCommand), + /// Print version information Version, } diff --git a/src/cli/deinit.rs b/src/cli/deinit.rs new file mode 100644 index 0000000..3342157 --- /dev/null +++ b/src/cli/deinit.rs @@ -0,0 +1,64 @@ +//! `ember deinit` — tear down the storage backend. +//! +//! The inverse of `ember init`. Refuses to run while VMs are alive +//! to avoid leaving the user with a half-destroyed pool. 
+ +use std::fs; +use std::path::Path; + +use clap::Args; + +use crate::backend::create_storage; +use ember_core::config::GlobalConfig; +use ember_core::state::store::StateStore; +use ember_core::state::vm; + +#[derive(Args)] +pub struct DeinitArgs { + /// Also delete backing files (dm-thin metadata.img/data.img) so + /// a future `ember init` starts from scratch. Block devices + /// supplied via `--storage-path` are always left intact. + #[arg(long)] + pub purge: bool, +} + +pub fn run(args: &DeinitArgs, state_dir: &Path) -> anyhow::Result<()> { + let store = StateStore::new(state_dir.to_path_buf()); + let config: GlobalConfig = match store.read_optional(&store.config_path())? { + Some(c) => c, + None => { + println!("ember is not initialized — nothing to tear down."); + return Ok(()); + } + }; + + // Refuse to deinit if any VM is recorded. Forces the user to + // `ember vm delete` (or `--force`) first so that backend cleanup + // doesn't leave dangling per-VM resources. + let vms = vm::list(&store).unwrap_or_default(); + if !vms.is_empty() { + let names: Vec = vms.into_iter().map(|v| v.name).collect(); + anyhow::bail!( + "refusing to deinit while {} VM(s) are registered: {}\n\ + Hint: delete them first with 'ember vm delete '.", + names.len(), + names.join(", "), + ); + } + + let storage = create_storage(&config); + storage.deinit(args.purge)?; + + // Remove the persisted config last — the backend may have needed + // it to find backing paths. 
+ let config_path = store.config_path(); + if config_path.exists() { + fs::remove_file(&config_path).map_err(|e| ember_core::error::Error::Io { + path: config_path, + source: e, + })?; + } + + println!("ember deinitialized."); + Ok(()) +} diff --git a/src/cli/image.rs b/src/cli/image.rs index d72b6aa..ca0752b 100644 --- a/src/cli/image.rs +++ b/src/cli/image.rs @@ -4,11 +4,11 @@ use clap::{Args, Subcommand}; use super::fmt::{format_bytes_binary, MIB}; use super::vm::OutputFormat; -use crate::backend::{CurrentPlatform, Platform, Storage, StorageBackend}; +use crate::backend::{create_storage, CurrentPlatform, Platform, Storage, VolumeHandle}; use crate::image; use ember_core::config::GlobalConfig; use ember_core::image::pull::ImageReference; -use ember_core::image::registry::{new_build_entry, new_entry, ImageRegistry}; +use ember_core::image::registry::{new_build_entry, new_entry, ImageEntry, ImageRegistry}; use ember_core::state::store::StateStore; use ember_core::state::vm::{self, VmMetadata}; @@ -93,7 +93,7 @@ pub fn run(cmd: &ImageCommand, state_dir: &Path) -> anyhow::Result<()> { fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); // Parse and validate the image reference. let reference = ImageReference::parse(&args.reference)?; @@ -126,12 +126,12 @@ fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { inject_image_config(&rootfs_dir, true)?; // Steps 3-4: Create ext4 image → import into storage backend. - let (size_mib, disk_path, rollback) = + let (size_mib, handle, rollback) = create_image_from_rootfs(&rootfs_dir, work_dir.path(), &local_name, &storage)?; // Step 5: Register in local image registry. 
- let disk = disk_path.to_string_lossy().to_string(); - let entry = new_entry(&reference, &disk, size_mib); + let disk = handle.disk_path.to_string_lossy().to_string(); + let entry = new_entry(&reference, &disk, size_mib, handle.thin_id); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; @@ -149,7 +149,7 @@ fn pull(args: &PullArgs, state_dir: &Path) -> anyhow::Result<()> { fn build(args: &BuildArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); // Sanitize the name for storage use. let local_name = image::build::sanitize_name(&args.name)?; @@ -202,12 +202,12 @@ fn build(args: &BuildArgs, state_dir: &Path) -> anyhow::Result<()> { inject_image_config(&rootfs_dir, false)?; // Steps 3-4: Create ext4 image → import into storage backend. - let (size_mib, disk_path, rollback) = + let (size_mib, handle, rollback) = create_image_from_rootfs(&rootfs_dir, work_dir.path(), &local_name, &storage)?; // Step 5: Register in local image registry. - let disk = disk_path.to_string_lossy().to_string(); - let entry = new_build_entry(&args.name, &local_name, &disk, size_mib); + let disk = handle.disk_path.to_string_lossy().to_string(); + let entry = new_build_entry(&args.name, &local_name, &disk, size_mib, handle.thin_id); let mut registry = ImageRegistry::load(&store)?; registry.add(entry); registry.save(&store)?; @@ -302,9 +302,9 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { // Destroy the image's storage (zvol on Linux, .img file on macOS). 
let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!("Destroying storage for image '{}'...", local_name); - storage.destroy_image_storage(&local_name, args.force)?; + storage.destroy_image_storage(&entry, args.force)?; // Remove from registry last, after the storage is gone. image::registry::remove_image(&store, &local_name)?; @@ -380,14 +380,15 @@ fn inject_image_config(rootfs_dir: &Path, inject_inittab: bool) -> anyhow::Resul /// Create an ext4 image from a rootfs directory and import it into storage. /// -/// Returns `(size_mib, disk_path, rollback)` — the caller must register -/// the image in the registry and then call `rollback.commit()` to finalize. +/// Returns `(size_mib, handle, rollback)` — the caller pulls +/// `handle.disk_path` and `handle.thin_id` to build an [`ImageEntry`] +/// for the registry, then calls `rollback.commit()` to finalize. fn create_image_from_rootfs( rootfs_dir: &Path, work_dir: &Path, name: &str, storage: &Storage, -) -> anyhow::Result<(u64, PathBuf, ember_core::cleanup::Rollback)> { +) -> anyhow::Result<(u64, VolumeHandle, ember_core::cleanup::Rollback)> { let size_mib = CurrentPlatform::estimate_ext4_size_mib(rootfs_dir)?; let ext4_path = work_dir.join("rootfs.ext4"); println!( @@ -402,18 +403,33 @@ fn create_image_from_rootfs( .unwrap_or(size_mib); println!(" Importing image into storage..."); - let disk_path = storage.create_image_volume(name, &ext4_path, size_mib)?; + let handle = storage.create_image_volume(name, &ext4_path, size_mib)?; let mut rollback = ember_core::cleanup::Rollback::new(); { let storage = storage.clone(); - let n = name.to_string(); + let stub = stub_image_entry(name, &handle); rollback.push("image storage", move || { - let _ = storage.destroy_image_storage(&n, false); + let _ = storage.destroy_image_storage(&stub, false); }); } - Ok((size_mib, disk_path, rollback)) + Ok((size_mib, handle, rollback)) +} + +/// 
Build a minimal [`ImageEntry`] for use in cleanup paths where the +/// real entry hasn't been (or no longer is) registered. The ZFS, btrfs, +/// and dm-thin backends only inspect `local_name` and `thin_id`, so the +/// remaining fields can be placeholders. +fn stub_image_entry(local_name: &str, handle: &VolumeHandle) -> ImageEntry { + ImageEntry { + reference: String::new(), + local_name: local_name.to_string(), + disk_path: handle.disk_path.to_string_lossy().into_owned(), + size_mib: 0, + pulled_at: String::new(), + thin_id: handle.thin_id, + } } /// Resolve a user-provided image name to its registry local_name. diff --git a/src/cli/init.rs b/src/cli/init.rs index c1fbcda..a3039d1 100644 --- a/src/cli/init.rs +++ b/src/cli/init.rs @@ -1,28 +1,87 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use clap::Args; -use crate::backend::{CurrentPlatform, InitConfig, Platform, Storage, StorageBackend}; -use ember_core::config::GlobalConfig; +use crate::backend::{init_storage, CurrentPlatform, InitConfig, Platform}; +use ember_core::config::size::ByteSize; +use ember_core::config::{DmThinMode, GlobalConfig, StorageKind}; use ember_core::state::store::StateStore; +/// dm-thin pool block size (in 512-byte sectors) used when the user does +/// not pass `--block-size`. Resolved here at init time and persisted on +/// `GlobalConfig` so the value the running pool was created with stays +/// stable across ember upgrades — block size is permanent at pool +/// creation, and silently switching defaults later would orphan +/// existing pools. +#[cfg(target_os = "linux")] +const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = + ember_linux::dm_thin::pool::DEFAULT_BLOCK_SIZE_SECTORS; +#[cfg(not(target_os = "linux"))] +const DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS: u32 = 128; + +/// Convert a CLI `--block-size` byte value into the 512-byte sector +/// count the kernel expects, validating dm-thin's constraints: the +/// block size must be a multiple of 64 KiB and fit in `u32` sectors. 
+fn resolve_dm_thin_block_size_sectors(user: Option) -> anyhow::Result { + let Some(size) = user else { + return Ok(DM_THIN_DEFAULT_BLOCK_SIZE_SECTORS); + }; + let bytes = size.bytes(); + const MIN_BYTES: u64 = 64 * 1024; + if bytes < MIN_BYTES || bytes % MIN_BYTES != 0 { + anyhow::bail!( + "--block-size must be at least 64K and a multiple of 64K (got {bytes} bytes)" + ); + } + let sectors = bytes / 512; + u32::try_from(sectors) + .map_err(|_| anyhow::anyhow!("--block-size {bytes} bytes overflows u32 sectors")) +} + #[derive(Args)] pub struct InitArgs { - /// ZFS pool name (Linux only) + /// Storage backend: zfs (default) or dm-thin (Linux only) + #[cfg_attr(target_os = "macos", arg(long, default_value = "zfs", hide = true))] + #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "zfs"))] + pub storage: StorageKind, + + /// ZFS pool name (--storage zfs only) #[cfg_attr(target_os = "macos", arg(long, default_value = "ember", hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "ember"))] pub pool: String, - /// Block device for pool creation (Linux only) + /// Block device for ZFS pool creation (--storage zfs only) #[cfg_attr(target_os = "macos", arg(long, hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long))] pub device: Option, - /// Dataset name within the pool (Linux only) + /// Dataset name within the pool (--storage zfs only) #[cfg_attr(target_os = "macos", arg(long, default_value = "ember", hide = true))] #[cfg_attr(not(target_os = "macos"), arg(long, default_value = "ember"))] pub dataset: String, + /// Backing path for non-ZFS backends (directory or block device). + /// + /// dm-thin: directory holding metadata.img/data.img, or a raw block + /// device. Defaults to /var/lib/ember/dm-thin when omitted. + #[arg(long)] + pub storage_path: Option, + + /// Pool size for file-backed dm-thin (e.g. `50G`). Required when + /// `--storage-path` is a file path; ignored for raw block devices. 
+ #[arg(long)] + pub size: Option, + + /// Override metadata device size for dm-thin (e.g. `800M`). + /// `thin_metadata_size` computes a recommended value when omitted. + #[arg(long)] + pub metadata_size: Option, + + /// dm-thin pool block size (e.g. `64K`, `1M`). Must be a multiple + /// of 64 KiB; permanent at pool creation. Defaults to 64 KiB. + #[arg(long)] + pub block_size: Option, + /// Kernel preset or file path [presets: stock] #[arg(long)] pub kernel: Option, @@ -33,21 +92,77 @@ pub struct InitArgs { } pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { - // 1-2. Create or verify ZFS pool and datasets via the storage backend. + // Refuse to switch backends silently. Existing configs win unless + // the user runs `ember deinit` first. + let store = StateStore::new(state_dir.to_path_buf()); + if let Ok(Some(existing)) = store.read_optional::(&store.config_path()) { + if existing.storage_backend != args.storage { + anyhow::bail!( + "ember is already initialized with the {:?} backend; \ + run 'ember deinit' first to switch to {:?}", + existing.storage_backend, + args.storage, + ); + } + } + + // Resolve the dm-thin defaults so both InitConfig and GlobalConfig + // see the same values. + let storage_path = match args.storage { + StorageKind::DmThin => Some( + args.storage_path + .clone() + .unwrap_or_else(|| PathBuf::from("/var/lib/ember/dm-thin")), + ), + StorageKind::Btrfs => args.storage_path.clone(), + StorageKind::Zfs => None, + }; + + // Resolve block size up-front for dm-thin so the persisted config + // pins the value the pool was actually created with, even when the + // user omits `--block-size`. Internally the kernel addresses pool + // blocks in 512-byte sectors; the CLI accepts a `ByteSize` so the + // UX matches `--size` / `--metadata-size`. 
+ let resolved_block_size = match args.storage { + StorageKind::DmThin => Some(resolve_dm_thin_block_size_sectors(args.block_size)?), + _ => None, + }; + + // Resolve file-vs-raw-device layout once and persist it. Doing this + // here rather than in the backend keeps the contract explicit: + // reactivation should not depend on a live `is_dir()` probe of + // `storage_path` agreeing with what init saw. + let resolved_dm_thin_mode = match (args.storage, storage_path.as_ref()) { + (StorageKind::DmThin, Some(path)) => { + if path.is_dir() || !path.exists() { + Some(DmThinMode::File) + } else { + Some(DmThinMode::RawDevice) + } + } + _ => None, + }; + let init_config = InitConfig { + storage_backend: args.storage, state_dir: state_dir.to_path_buf(), pool: args.pool.clone(), dataset: args.dataset.clone(), device: args.device.clone(), + storage_path: storage_path.clone(), + btrfs_size: None, + dm_thin_size: args.size, + dm_thin_metadata_size: args.metadata_size, + dm_thin_block_size: resolved_block_size, + dm_thin_mode: resolved_dm_thin_mode, }; - Storage::init(&init_config)?; + init_storage(&init_config)?; - // 3. Initialize state directory structure. - let store = StateStore::new(state_dir.to_path_buf()); + // Initialize state directory structure. store.init()?; println!("State directory initialized at {}", state_dir.display()); - // 4. Download kernel if preset or path provided. + // Download kernel if preset or path provided. let kernel_path = if let Some(spec) = &args.kernel { Some(spec.resolve(&store)?) } else { @@ -55,19 +170,23 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { None }; - // 5. Detect or use provided WAN interface. + // Detect or use provided WAN interface. let (wan_iface, messages) = CurrentPlatform::detect_wan_iface(args.wan_iface.as_deref()); for msg in &messages { println!("{msg}"); } - // 6. Write config. + // Write config. 
let config = GlobalConfig { + storage_backend: args.storage, pool: args.pool.clone(), dataset: args.dataset.clone(), kernel_path, wan_iface, state_dir: state_dir.to_path_buf(), + storage_path, + dm_thin_block_size: resolved_block_size, + dm_thin_mode: resolved_dm_thin_mode, }; store.write(&store.config_path(), &config)?; println!("Configuration written to {}", store.config_path().display()); @@ -79,16 +198,30 @@ pub fn run(args: &InitArgs, state_dir: &Path) -> anyhow::Result<()> { #[cfg(test)] mod tests { use super::*; + use ember_core::config::StorageKind; use std::path::PathBuf; + fn zfs_config(pool: &str, dataset: &str) -> GlobalConfig { + GlobalConfig { + storage_backend: StorageKind::Zfs, + pool: pool.to_string(), + dataset: dataset.to_string(), + kernel_path: None, + wan_iface: None, + state_dir: PathBuf::default(), + storage_path: None, + dm_thin_block_size: None, + dm_thin_mode: None, + } + } + #[test] fn global_config_round_trip_with_kernel() { let config = GlobalConfig { - pool: "testpool".to_string(), - dataset: "ember".to_string(), kernel_path: Some(PathBuf::from("/var/lib/ember/kernels/vmlinux")), wan_iface: Some("eth0".to_string()), state_dir: PathBuf::from("/var/lib/ember"), + ..zfs_config("testpool", "ember") }; let json = serde_json::to_string(&config).unwrap(); @@ -98,14 +231,7 @@ mod tests { #[test] fn global_config_round_trip_without_kernel() { - let config = GlobalConfig { - pool: "mypool".to_string(), - dataset: "mydata".to_string(), - kernel_path: None, - wan_iface: None, - state_dir: PathBuf::default(), - }; - + let config = zfs_config("mypool", "mydata"); let json = serde_json::to_string(&config).unwrap(); let loaded: GlobalConfig = serde_json::from_str(&json).unwrap(); assert_eq!(loaded, config); @@ -114,11 +240,9 @@ mod tests { #[test] fn global_config_json_format() { let config = GlobalConfig { - pool: "tank".to_string(), - dataset: "ember".to_string(), kernel_path: Some(PathBuf::from("/kernels/vmlinux")), wan_iface: 
Some("wlp2s0".to_string()), - state_dir: PathBuf::default(), + ..zfs_config("tank", "ember") }; let json: serde_json::Value = serde_json::to_value(&config).unwrap(); @@ -126,18 +250,12 @@ mod tests { assert_eq!(json["dataset"], "ember"); assert_eq!(json["kernel_path"], "/kernels/vmlinux"); assert_eq!(json["wan_iface"], "wlp2s0"); + assert_eq!(json["storage_backend"], "zfs"); } #[test] fn global_config_null_kernel_in_json() { - let config = GlobalConfig { - pool: "tank".to_string(), - dataset: "ember".to_string(), - kernel_path: None, - wan_iface: None, - state_dir: PathBuf::default(), - }; - + let config = zfs_config("tank", "ember"); let json: serde_json::Value = serde_json::to_value(&config).unwrap(); assert!(json["kernel_path"].is_null()); } @@ -149,11 +267,9 @@ mod tests { store.init().unwrap(); let config = GlobalConfig { - pool: "testpool".to_string(), - dataset: "ember".to_string(), - kernel_path: None, wan_iface: Some("eth0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("testpool", "ember") }; store.write(&store.config_path(), &config).unwrap(); @@ -170,23 +286,18 @@ mod tests { let store = StateStore::new(dir.path().to_path_buf()); store.init().unwrap(); - // First write. let config1 = GlobalConfig { - pool: "pool1".to_string(), - dataset: "ds1".to_string(), - kernel_path: None, wan_iface: Some("eth0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("pool1", "ds1") }; store.write(&store.config_path(), &config1).unwrap(); - // Second write (simulates re-running init). 
let config2 = GlobalConfig { - pool: "pool2".to_string(), - dataset: "ds2".to_string(), kernel_path: Some(PathBuf::from("/kernels/vmlinux")), wan_iface: Some("wlp2s0".to_string()), state_dir: dir.path().to_path_buf(), + ..zfs_config("pool2", "ds2") }; store.write(&store.config_path(), &config2).unwrap(); @@ -196,10 +307,12 @@ mod tests { #[test] fn global_config_backwards_compatible_without_wan_iface() { - // Older config.json files won't have wan_iface — serde(default) handles this. + // Older config.json files won't have wan_iface or storage_backend + // — serde(default) handles both. let json = r#"{"pool":"tank","dataset":"ember","kernel_path":null}"#; let loaded: GlobalConfig = serde_json::from_str(json).unwrap(); assert_eq!(loaded.pool, "tank"); assert_eq!(loaded.wan_iface, None); + assert_eq!(loaded.storage_backend, StorageKind::Zfs); } } diff --git a/src/cli/snapshot.rs b/src/cli/snapshot.rs index a7146dd..e1e2638 100644 --- a/src/cli/snapshot.rs +++ b/src/cli/snapshot.rs @@ -2,7 +2,7 @@ use std::path::Path; use clap::{Args, Subcommand}; -use crate::backend::{Storage, StorageBackend}; +use crate::backend::create_storage; use ember_core::config::GlobalConfig; use ember_core::state::store::StateStore; use ember_core::state::vm; @@ -80,8 +80,8 @@ pub fn run(cmd: &SnapshotCommand, state_dir: &Path) -> anyhow::Result<()> { fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let storage = create_storage(&config); + let mut metadata = vm::load(&store, &args.vm_name)?; // Disallow the reserved snapshot name. if args.snapshot_name == RESERVED_SNAPSHOT_NAME { @@ -89,7 +89,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { } // Check the snapshot doesn't already exist. 
- let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' already exists on vm '{}'", @@ -98,7 +98,10 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.snapshot(&args.vm_name, &args.snapshot_name)?; + if let Some(entry) = storage.snapshot(&metadata, &args.snapshot_name)? { + metadata.snapshots.push(entry); + vm::save(&store, &metadata)?; + } println!( "Created snapshot '{}' of vm '{}'", @@ -114,10 +117,10 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { fn list(args: &ListArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let storage = create_storage(&config); + let metadata = vm::load(&store, &args.vm_name)?; - let snapshots = storage.list_snapshots(&args.vm_name)?; + let snapshots = storage.list_snapshots(&metadata)?; match args.format { OutputFormat::Json => { @@ -190,8 +193,8 @@ use super::fmt::format_bytes_binary as format_bytes; fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); - let _metadata = vm::load(&store, &args.vm_name)?; + let storage = create_storage(&config); + let mut metadata = vm::load(&store, &args.vm_name)?; // Disallow deleting the reserved snapshot. if args.snapshot_name == RESERVED_SNAPSHOT_NAME { @@ -199,7 +202,7 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { } // Verify the snapshot exists. 
- let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if !existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' does not exist on vm '{}'\n\ @@ -210,7 +213,15 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.delete_snapshot(&args.vm_name, &args.snapshot_name)?; + storage.delete_snapshot(&metadata, &args.snapshot_name)?; + + // For backends that track snapshots in vm.json (dm-thin), drop the + // entry. ZFS/APFS leave vm.snapshots empty; this is a no-op there. + let before = metadata.snapshots.len(); + metadata.snapshots.retain(|s| s.name != args.snapshot_name); + if metadata.snapshots.len() != before { + vm::save(&store, &metadata)?; + } println!( "Deleted snapshot '{}' from vm '{}'", @@ -226,11 +237,11 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { fn restore(args: &RestoreArgs, state_dir: &Path) -> anyhow::Result<()> { let store = StateStore::new(state_dir.to_path_buf()); let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); - let _metadata = vm::require_stopped(&store, &args.vm_name, "restoring a snapshot")?; + let storage = create_storage(&config); + let mut metadata = vm::require_stopped(&store, &args.vm_name, "restoring a snapshot")?; // Verify the snapshot exists. - let existing = storage.list_snapshots(&args.vm_name)?; + let existing = storage.list_snapshots(&metadata)?; if !existing.iter().any(|s| s.name == args.snapshot_name) { anyhow::bail!( "snapshot '{}' does not exist on vm '{}'\n\ @@ -241,7 +252,15 @@ fn restore(args: &RestoreArgs, state_dir: &Path) -> anyhow::Result<()> { ); } - storage.restore_snapshot(&args.vm_name, &args.snapshot_name)?; + let handle = storage.restore_snapshot(&metadata, &args.snapshot_name)?; + // Persist any backend-specific identity change (dm-thin replaces the + // thin_id on restore; ZFS/APFS keep the same identity). 
+ let new_disk_path = handle.disk_path.to_string_lossy().to_string(); + if metadata.thin_id != handle.thin_id || metadata.disk_path != new_disk_path { + metadata.thin_id = handle.thin_id; + metadata.disk_path = new_disk_path; + vm::save(&store, &metadata)?; + } println!( "Restored vm '{}' to snapshot '{}'", diff --git a/src/cli/storage.rs b/src/cli/storage.rs new file mode 100644 index 0000000..785a052 --- /dev/null +++ b/src/cli/storage.rs @@ -0,0 +1,38 @@ +//! `ember storage` subcommands: pool-level administration. + +use std::path::Path; + +use clap::{Args, Subcommand}; + +use crate::backend::create_storage; +use ember_core::config::size::ByteSize; +use ember_core::config::GlobalConfig; +use ember_core::state::store::StateStore; + +#[derive(Subcommand)] +pub enum StorageCommand { + /// Grow the underlying pool capacity (dm-thin only). + Grow(GrowArgs), +} + +#[derive(Args)] +pub struct GrowArgs { + /// New total size for the data device, e.g. `100G`. Must be larger + /// than the current size. 
+ #[arg(long)] + pub size: ByteSize, +} + +pub fn run(cmd: &StorageCommand, state_dir: &Path) -> anyhow::Result<()> { + match cmd { + StorageCommand::Grow(args) => grow(args, state_dir), + } +} + +fn grow(args: &GrowArgs, state_dir: &Path) -> anyhow::Result<()> { + let store = StateStore::new(state_dir.to_path_buf()); + let config: GlobalConfig = store.read(&store.config_path())?; + let storage = create_storage(&config); + storage.grow(args.size)?; + Ok(()) +} diff --git a/src/cli/vm.rs b/src/cli/vm.rs index 83ea157..d2484e3 100644 --- a/src/cli/vm.rs +++ b/src/cli/vm.rs @@ -5,7 +5,8 @@ use uuid::Uuid; use super::fmt::{format_bytes_binary, GIB, MIB}; use crate::backend::{ - CurrentPlatform, Network, NetworkBackend, Platform, Storage, StorageBackend, Vm, VmBackend, + create_storage, CurrentPlatform, Network, NetworkBackend, Platform, Storage, Vm, VmBackend, + VolumeHandle, }; use crate::image; use ember_core::config; @@ -16,6 +17,34 @@ use ember_core::image::registry::ImageRegistry; use ember_core::state::store::StateStore; use ember_core::state::vm::{self, NetworkInfo, SshConfig, VmMetadata, VmStatus}; +/// Build a placeholder [`VmMetadata`] from a freshly returned +/// [`VolumeHandle`]. +/// +/// Used between `clone_for_vm`/`clone_vm_storage` and the moment the +/// fully populated metadata is constructed: the storage backend reads +/// `name`, `disk_path`, and `thin_id` from this stub for resize, mount, +/// and SSH-key injection. All other fields are placeholders inherited +/// from [`VmMetadata::default_for_teardown`]. +fn pending_metadata(name: &str, handle: &VolumeHandle, disk_size_gib: u32) -> VmMetadata { + let mut m = VmMetadata::default_for_teardown(); + m.name = name.to_string(); + m.disk_path = handle.disk_path.to_string_lossy().into_owned(); + m.thin_id = handle.thin_id; + // dm-thin needs the size to (re)activate the thin device. 
Stash the + // requested disk size so `disk_device_path(pending)` can re-attach + // post-resize even if the kernel state was somehow torn down. + m.disk_size_gib = disk_size_gib; + m +} + +/// Build a placeholder [`VmMetadata`] when only the name is available +/// (e.g., recovery paths where the real record can no longer be loaded). +fn name_only_metadata(name: &str) -> VmMetadata { + let mut m = VmMetadata::default_for_teardown(); + m.name = name.to_string(); + m +} + /// Load a running VM with network info, checking that the guest IP is resolved. /// /// Wraps `vm::load_running_with_network` and returns an error if the guest IP @@ -426,25 +455,24 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { ) })?; - let image_name = image_entry.local_name.clone(); let image_ref = image_entry.reference.clone(); let image_size_mib = image_entry.size_mib; - let storage = Storage::new(&global_config); + let storage = create_storage(&global_config); let mut rollback = Rollback::new(); // Clone base image → per-VM disk (instant, copy-on-write). 
println!("Cloning image for VM '{}'...", resolved.name); - let vm_disk_path = storage.clone_for_vm(&image_name, &resolved.name)?; - let vm_disk = vm_disk_path.to_string_lossy().to_string(); + let handle = storage.clone_for_vm(image_entry, &resolved.name)?; + let pending = pending_metadata(&resolved.name, &handle, resolved.disk_size); { let storage = storage.clone(); let sd = state_dir.to_path_buf(); - let name = resolved.name.clone(); + let pending = pending.clone(); rollback.push("VM storage clone", move || { - let _ = storage.destroy_vm_storage(&name); - let _ = vm::delete(&StateStore::new(sd), &name); + let _ = storage.destroy_vm_storage(&pending); + let _ = vm::delete(&StateStore::new(sd), &pending.name); }); } @@ -453,7 +481,7 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { &store, &mut global_config, &storage, - &vm_disk, + &pending, image_size_mib, &image_ref, )?; @@ -475,12 +503,15 @@ fn create(args: &CreateArgs, state_dir: &Path) -> anyhow::Result<()> { /// Post-clone steps: grow disk, inject SSH key, save metadata. /// /// Separated from [`create`] so the caller can clean up storage on failure. +/// `pending` is the in-progress VM metadata (built from the [`VolumeHandle`] +/// returned by `clone_for_vm`); the storage backend reads `name`, `disk_path`, +/// and `thin_id` from it for the resize/inject calls below. fn create_post_clone( resolved: &ResolvedVmCreate, store: &StateStore, global_config: &mut GlobalConfig, storage: &Storage, - vm_disk: &str, + pending: &VmMetadata, image_size_mib: u64, image_ref: &str, ) -> anyhow::Result<()> { @@ -492,16 +523,13 @@ fn create_post_clone( "Growing disk to {}...", format_bytes_binary(resolved.disk_size as u64 * GIB) ); - storage.resize( - &resolved.name, - ByteSize::from_gib(resolved.disk_size as u64), - )?; + storage.resize(pending, ByteSize::from_gib(resolved.disk_size as u64))?; } // Inject per-VM SSH key into the rootfs image. 
// Linux: mounts the block device, writes the key, unmounts. // macOS: uses debugfs to write directly into the ext4 image. - let dev_path = storage.disk_device_path(&resolved.name); + let dev_path = storage.disk_device_path(pending)?; let pubkey_path = image::inject::default_ssh_pubkey_path().ok_or_else(|| { anyhow::anyhow!( "no SSH public key found at ~/.ssh/id_ed25519.pub or ~/.ssh/id_rsa.pub\n\ @@ -525,7 +553,8 @@ fn create_post_clone( .unwrap_or_else(|| PathBuf::from("/root/.ssh/id_ed25519")) }); - // Build and save VM metadata. + // Build and save VM metadata. The disk path and thin_id come from + // the pending stub built right after `clone_for_vm` returned. let metadata = VmMetadata { name: resolved.name.clone(), id: Uuid::new_v4(), @@ -535,7 +564,7 @@ fn create_post_clone( memory_mib: resolved.memory, disk_size_gib: resolved.disk_size, kernel_path, - disk_path: vm_disk.to_string(), + disk_path: pending.disk_path.clone(), boot_args: resolved.boot_args.clone(), subnet: resolved.network.clone(), network: None, @@ -547,6 +576,8 @@ fn create_post_clone( key: ssh_key, }, parent_vm: None, + thin_id: pending.thin_id, + snapshots: Vec::new(), }; vm::save(store, &metadata)?; @@ -605,23 +636,23 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { let subnet = args.network.clone().or(source.subnet.clone()); - let storage = Storage::new(&global_config); + let storage = create_storage(&global_config); // Clone source VM's storage into the new VM via the storage backend. 
println!("Forking '{}' → '{}'...", args.source, args.name); - let vm_disk_path = storage.clone_vm_storage(&args.source, &args.name)?; - let vm_disk = vm_disk_path.to_string_lossy().to_string(); + let handle = storage.clone_vm_storage(&source, &args.name)?; + let pending = pending_metadata(&args.name, &handle, disk_size_gib); let mut rollback = Rollback::new(); { let storage = storage.clone(); - let parent = args.source.clone(); + let parent = source.clone(); + let pending = pending.clone(); let sd = state_dir.to_path_buf(); - let name = args.name.clone(); rollback.push("fork clone + snapshot", move || { - let _ = storage.destroy_vm_storage(&name); - let _ = storage.cleanup_fork(&parent, &name); - let _ = vm::delete(&StateStore::new(sd), &name); + let _ = storage.destroy_vm_storage(&pending); + let _ = storage.cleanup_fork(&parent, &pending); + let _ = vm::delete(&StateStore::new(sd), &pending.name); }); } @@ -632,12 +663,12 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { "Growing disk to {}...", format_bytes_binary(disk_size_gib as u64 * GIB) ); - storage.resize(&args.name, ByteSize::from_gib(disk_size_gib as u64))?; + storage.resize(&pending, ByteSize::from_gib(disk_size_gib as u64))?; } // Inject /etc/hosts with the new VM's hostname (the cloned disk // still has the source VM's hostname from its creation). - let dev_path = storage.disk_device_path(&args.name); + let dev_path = storage.disk_device_path(&pending)?; storage.inject_hostname(&dev_path, &args.name)?; // Resolve kernel: CLI override or inherit from source. 
@@ -657,7 +688,7 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { memory_mib, disk_size_gib, kernel_path, - disk_path: vm_disk, + disk_path: pending.disk_path.clone(), boot_args: source.boot_args.clone(), subnet, network: None, @@ -666,6 +697,8 @@ fn fork(args: &ForkArgs, state_dir: &Path) -> anyhow::Result<()> { created_at: vm::now_iso8601(), ssh: source.ssh.clone(), parent_vm: Some(args.source.clone()), + thin_id: pending.thin_id, + snapshots: Vec::new(), }; vm::save(&store, &metadata)?; @@ -922,12 +955,12 @@ fn resize(args: &ResizeArgs, state_dir: &Path) -> anyhow::Result<()> { // Grow the disk via the storage backend (handles resize + ext4 expand). let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!( "Resizing disk to {}...", format_bytes_binary(new_gib as u64 * GIB) ); - storage.resize(&args.name, args.disk_size)?; + storage.resize(&metadata, args.disk_size)?; // Update metadata. metadata.disk_size_gib = new_gib; @@ -1042,8 +1075,8 @@ fn delete(args: &DeleteArgs, state_dir: &Path) -> anyhow::Result<()> { // Check for storage-level dependents (e.g. ZFS fork snapshots with clones). // On macOS/APFS this always returns empty — forks are independent. let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); - let dependents = storage.storage_dependents(&args.name)?; + let storage = create_storage(&config); + let dependents = storage.storage_dependents(&metadata)?; if !dependents.is_empty() { if !args.force { anyhow::bail!( @@ -1106,15 +1139,20 @@ pub fn force_delete_vm(store: &StateStore, metadata: &VmMetadata) -> anyhow::Res // Destroy storage via the backend. 
let config: GlobalConfig = store.read(&store.config_path())?; - let storage = Storage::new(&config); + let storage = create_storage(&config); println!("Destroying storage for VM '{}'...", metadata.name); - let _ = storage.destroy_vm_storage(&metadata.name); + let _ = storage.destroy_vm_storage(metadata); // Clean up fork-related resources on the parent VM (e.g. ZFS snapshot). // No-op on macOS/APFS where forks are independent. - if let Some(ref parent) = metadata.parent_vm { - let _ = storage.cleanup_fork(parent, &metadata.name); + if let Some(ref parent_name) = metadata.parent_vm { + // Use the parent's stored metadata if available; fall back to a + // name-only stub when the parent record is gone (e.g. cascade + // cleanup running in the wrong order). + let parent_md = + vm::load(store, parent_name).unwrap_or_else(|_| name_only_metadata(parent_name)); + let _ = storage.cleanup_fork(&parent_md, metadata); } // Remove the VM state directory. diff --git a/src/main.rs b/src/main.rs index e73cc41..b6b0eb0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -88,6 +88,8 @@ fn main() -> anyhow::Result<()> { CurrentPlatform::reconcile(&cli.state_dir); Ok(()) } + Command::Deinit(args) => cli::deinit::run(args, &cli.state_dir), + Command::Storage(cmd) => cli::storage::run(cmd, &cli.state_dir), Command::Version => { println!("ember {}", env!("CARGO_PKG_VERSION")); Ok(()) diff --git a/tests/common/linux.rs b/tests/common/linux.rs index 7867348..540f424 100644 --- a/tests/common/linux.rs +++ b/tests/common/linux.rs @@ -81,6 +81,24 @@ impl Drop for PoolCleanup { } } +/// RAII guard: runs `ember deinit --purge` on drop so dm-thin tests +/// always tear down the pool, loop devices, and backing files even when +/// an assertion panics partway through. 
+pub struct DmThinCleanup { + pub state_dir: PathBuf, +} + +impl Drop for DmThinCleanup { + fn drop(&mut self) { + let _ = super::ember(&[ + "--state-dir", + self.state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); + } +} + // --------------------------------------------------------------------------- // ZFS assertions // --------------------------------------------------------------------------- diff --git a/tests/dm_thin.rs b/tests/dm_thin.rs new file mode 100644 index 0000000..a3c9476 --- /dev/null +++ b/tests/dm_thin.rs @@ -0,0 +1,186 @@ +//! Integration tests for the dm-thin storage backend. +//! +//! These tests exercise the real CLI binary against real device-mapper +//! state. They are gated `#[ignore]` and only run on Linux because: +//! +//! * dm-thin requires the `dm-thin-pool` kernel module + root +//! privileges for `dmsetup`, `losetup`, and friends. +//! * The host must have `dmsetup` (lvm2), `thin-provisioning-tools`, +//! and `e2fsprogs` available. +//! +//! Run them explicitly with: +//! +//! ```text +//! sudo cargo test --test dm_thin -- --ignored --test-threads=1 +//! ``` + +#![cfg(target_os = "linux")] + +// Each integration-test crate compiles `tests/common/` as its own +// top-level module; only `common::ember` is used here, so without this +// attribute clippy reports every other shared helper as dead code. +#[allow(dead_code)] +mod common; + +use std::path::Path; + +/// Run `ember init --storage dm-thin` against a tempdir, then verify +/// `ember deinit --purge` cleans up. Smoke test for the new init + +/// deinit paths added in Phase 5/7. +#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_init_and_deinit_round_trip() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + // Always tear down on the way out, even if assertions below panic. 
+ let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + + // Init. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "init failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + assert!( + Path::new("/dev/mapper/ember-pool").exists(), + "ember-pool should be active after init" + ); + assert!(storage_path.join("metadata.img").exists()); + assert!(storage_path.join("data.img").exists()); + + // Deinit with purge — pool, loops, and backing files all gone. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "deinit", + "--purge", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "deinit failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + assert!( + !Path::new("/dev/mapper/ember-pool").exists(), + "ember-pool should be torn down after deinit" + ); + assert!(!storage_path.join("metadata.img").exists()); + assert!(!storage_path.join("data.img").exists()); +} + +/// `ember init` should refuse to switch backends silently. After init +/// with one backend, attempting to init with a different backend +/// surfaces a clear error rather than corrupting state. +#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_init_refuses_backend_switch() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + + // First init with dm-thin. 
+ let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + assert!(output.status.success()); + + // Second init with zfs should refuse. + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "zfs", + "--pool", + "embertest", + ]); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + !output.status.success(), + "second init should have failed; stderr: {stderr}" + ); + assert!( + stderr.contains("already initialized"), + "expected 'already initialized' message: {stderr}" + ); +} + +/// `ember storage grow --size <SIZE>` should grow the data device. +#[test] +#[ignore = "requires root + dm-thin kernel module"] +fn dm_thin_storage_grow() { + let tmp = tempfile::tempdir().unwrap(); + let storage_path = tmp.path().join("dm-thin"); + let state_dir = tmp.path().join("state"); + + let _cleanup = common::linux::DmThinCleanup { + state_dir: state_dir.clone(), + }; + + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "init", + "--storage", + "dm-thin", + "--storage-path", + storage_path.to_str().unwrap(), + "--size", + "200M", + ]); + assert!(output.status.success()); + + let initial = std::fs::metadata(storage_path.join("data.img")) + .unwrap() + .len(); + assert_eq!(initial, 200 * 1024 * 1024); + + let output = common::ember(&[ + "--state-dir", + state_dir.to_str().unwrap(), + "storage", + "grow", + "--size", + "400M", + ]); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + output.status.success(), + "grow failed.\nstdout: {stdout}\nstderr: {stderr}" + ); + + let grown = std::fs::metadata(storage_path.join("data.img")) + .unwrap() + .len(); + assert_eq!(grown, 400 * 1024 * 1024); +} diff --git a/tests/resize.rs b/tests/resize.rs index 62b5a91..19411a4 100644 ---
a/tests/resize.rs +++ b/tests/resize.rs @@ -211,7 +211,7 @@ fn resize_grows_disk() { // Verify initial ZFS volsize. assert_eq!( common::linux::get_zvol_size_bytes(&vm_zvol), - 1 * 1024 * 1024 * 1024, + 1024 * 1024 * 1024, "initial zvol should be 1 GiB" );