Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 75 additions & 9 deletions crates/facelock-cli/src/commands/daemon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -811,11 +811,31 @@ pub fn run(config_path: Option<String>) -> anyhow::Result<()> {
rt.block_on(run_dbus_server(handler, idle_timeout_secs, config_mtime))
}

/// Drop all Linux capabilities and set PR_SET_NO_NEW_PRIVS.
/// Capabilities the daemon keeps after startup, expressed as a bitmask over
/// the low 32-bit capability word (caps 0-31): `CAP_SETGID` (bit 6) and
/// `CAP_SETUID` (bit 7).
///
/// Both are needed by the desktop-notification privilege-drop. The daemon runs
/// as root, and `notifications.rs::send_as_user` execs a helper (`setpriv`,
/// falling back to `runuser`) that switches uid, gid, and supplementary groups
/// before talking to the user's session bus. With `NoNewPrivileges` set, that
/// exec cannot regain privilege, so the two caps have to be retained — and
/// kept in the inheritable set so that systemd `AmbientCapabilities` survives
/// the exec into the non-setuid helper. All other capabilities are dropped.
///
/// Kept as a pure `const fn` so the mask can be unit-tested without calling
/// `capset` (which needs privilege and may fail in CI).
const fn retained_capability_mask() -> u32 {
    // Bit positions as numbered in <linux/capability.h>.
    const CAP_SETGID: u32 = 6;
    const CAP_SETUID: u32 = 7;

    (1 << CAP_SETGID) | (1 << CAP_SETUID)
}

/// Drop all Linux capabilities except CAP_SETUID + CAP_SETGID, and set
/// PR_SET_NO_NEW_PRIVS.
///
/// After initialization the daemon has already opened the camera fd, loaded
/// models, connected to D-Bus, and opened the database. It no longer needs
/// any elevated capabilities, so we clear them all.
/// models, connected to D-Bus, and opened the database. It no longer needs any
/// elevated capabilities EXCEPT the two required to drop privilege for desktop
/// notifications (`runuser` → `setgroups`/`setuid`); those are retained via
/// [`retained_capability_mask`] in the effective, permitted, AND inheritable
/// sets, and everything else is cleared.
///
/// Returns `Ok(())` on success. Errors are non-fatal — the caller should
/// warn and continue.
Expand Down Expand Up @@ -848,17 +868,24 @@ fn drop_capabilities() -> std::result::Result<(), String> {
));
}

// Clear all capability sets (effective, permitted, inheritable).
// V3 uses two CapData structs (for caps 0-31 and 32-63).
// Retain exactly CAP_SETUID + CAP_SETGID (needed for the runuser/su
// notification privilege-drop); clear every other capability. The
// retained bits go in effective, permitted, AND inheritable — the
// inheritable set is what lets systemd AmbientCapabilities keep these
// caps across the exec into the non-setuid `runuser` under
// NoNewPrivileges. V3 uses two CapData structs (caps 0-31 and 32-63);
// the retained caps (6, 7) live in the low word, so the high word
// stays fully zeroed.
let keep = retained_capability_mask();
let mut header = CapHeader {
version: LINUX_CAP_V3,
pid: 0,
};
let mut data = [
CapData {
effective: 0,
permitted: 0,
inheritable: 0,
effective: keep,
permitted: keep,
inheritable: keep,
},
CapData {
effective: 0,
Expand Down Expand Up @@ -905,7 +932,9 @@ async fn run_dbus_server(
// Drop capabilities now that initialization is complete — camera fd is
// open, models are loaded, D-Bus is connected, database is open.
match drop_capabilities() {
Ok(()) => info!("dropped all capabilities and set no-new-privs"),
Ok(()) => info!(
"retained CAP_SETUID+CAP_SETGID for notification privilege-drop; dropped all others and set no-new-privs"
),
Err(e) => warn!("failed to drop capabilities (continuing): {e}"),
}

Expand Down Expand Up @@ -1107,4 +1136,41 @@ mod tests {
.unwrap_err();
assert!(matches!(err, fdo::Error::AccessDenied(_)));
}

#[test]
fn retained_capability_mask_is_exactly_setuid_and_setgid() {
    // Capability numbers as defined in <linux/capability.h>.
    const CAP_DAC_OVERRIDE: u32 = 1;
    const CAP_SETGID: u32 = 6;
    const CAP_SETUID: u32 = 7;
    const CAP_NET_RAW: u32 = 13;
    const CAP_SYS_ADMIN: u32 = 21;

    // Single-bit masks for every capability this test inspects.
    let setuid_bit: u32 = 1 << CAP_SETUID;
    let setgid_bit: u32 = 1 << CAP_SETGID;
    let sys_admin_bit: u32 = 1 << CAP_SYS_ADMIN;
    let net_raw_bit: u32 = 1 << CAP_NET_RAW;
    let dac_override_bit: u32 = 1 << CAP_DAC_OVERRIDE;

    let retained = retained_capability_mask();

    // The mask is precisely the pair of caps needed for the notification
    // privilege-drop, spelled both symbolically and as a literal.
    assert_eq!(retained, setuid_bit | setgid_bit);
    assert_eq!(retained, 0b1100_0000);

    // Each required capability is individually present.
    assert_ne!(retained & setuid_bit, 0, "CAP_SETUID must be retained");
    assert_ne!(retained & setgid_bit, 0, "CAP_SETGID must be retained");

    // A sample of dangerous capabilities is absent.
    assert_eq!(retained & sys_admin_bit, 0, "CAP_SYS_ADMIN must be dropped");
    assert_eq!(retained & net_raw_bit, 0, "CAP_NET_RAW must be dropped");
    assert_eq!(
        retained & dac_override_bit,
        0,
        "CAP_DAC_OVERRIDE must be dropped"
    );

    // Exactly two bits are set in total. The mask is a `u32`, i.e. it only
    // describes the low capability word (caps 0-31).
    assert_eq!(retained.count_ones(), 2);
}
}
48 changes: 38 additions & 10 deletions crates/facelock-cli/src/notifications.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,19 @@ pub fn send_notification(event: &NotifyEvent) {

/// Send notification as a specific user.
///
/// Uses `runuser` to run `notify-send` as the target user with a proper
/// login environment. This works even from systemd services where the
/// daemon's mount namespace may not include `/run/user/<uid>/`.
/// Uses `setpriv` to become the target user (real+effective uid/gid plus
/// supplementary groups) and run `notify-send` in that context. `setpriv`
/// does NOT open a PAM session, unlike `runuser`/`su`. That matters because
/// the daemon runs under a hardened systemd sandbox: `runuser` triggers a
/// full PAM login stack (pam_systemd trying to register a logind session,
/// pam_limits trying to adjust rlimits, etc.), and those modules fail under
/// the sandbox's restrictions, silently killing the notification. `setpriv`
/// only needs the CAP_SETUID/CAP_SETGID capabilities the daemon already
/// retains (see commit e006edc) and performs the uid/gid switch directly via
/// syscalls, with no session machinery to fail.
///
/// Falls back to `su -c` if `runuser` is not available.
/// Falls back to `runuser` if `setpriv` is not available (e.g. non-systemd
/// environments without the hardened sandbox, where a PAM session is fine).
fn send_as_user(user: &str, event: &NotifyEvent) {
use std::process::Command;

Expand All @@ -131,6 +139,7 @@ fn send_as_user(user: &str, event: &NotifyEvent) {
}
};
let uid = user_info.uid.as_raw();
let gid = user_info.gid.as_raw();
let bus_addr = format!("unix:path=/run/user/{uid}/bus");
let timeout = event.timeout_ms().to_string();

Expand All @@ -143,17 +152,36 @@ fn send_as_user(user: &str, event: &NotifyEvent) {
event.body().replace('\'', "'\\''"),
);

// Try runuser first (available on most systems, works from systemd services),
// fall back to su -c
let result = Command::new("runuser")
.args(["-u", user, "--", "sh", "-c", &notify_cmd])
// Try setpriv first: it switches real+effective uid/gid and supplementary
// groups without opening a PAM session, so it works under the hardened
// systemd sandbox. `--ambient-caps -all` defensively strips the daemon's
// ambient CAP_SETUID/CAP_SETGID from the child (the kernel also clears
// them on the uid change, but we drop them explicitly too).
let setpriv_args: Vec<String> = vec![
"--reuid".into(),
uid.to_string(),
"--regid".into(),
gid.to_string(),
"--init-groups".into(),
"--ambient-caps".into(),
"-all".into(),
"--".into(),
"sh".into(),
"-c".into(),
notify_cmd.clone(),
];

let result = Command::new("setpriv")
.args(&setpriv_args)
.stdin(std::process::Stdio::null())
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::piped())
.output()
.or_else(|_| {
Command::new("su")
.args(["-", user, "-c", &notify_cmd])
// setpriv missing entirely (unlikely on util-linux systems): fall
// back to runuser, which is fine outside the hardened sandbox.
Command::new("runuser")
.args(["-u", user, "--", "sh", "-c", &notify_cmd])
.stdin(std::process::Stdio::null())
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::piped())
Expand Down
64 changes: 61 additions & 3 deletions docs/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,71 @@ The systemd unit (`systemd/facelock-daemon.service`) includes layered hardening:

**Phase 2 (shipped):** `ProtectKernelTunables/Modules/ControlGroups=yes`, `RestrictNamespaces=yes`, `LockPersonality=yes`, `RestrictRealtime=yes`, `RestrictSUIDSGID=yes`

**Deferred device/seccomp phase:** `DevicePolicy`/`DeviceAllow` is intentionally omitted because cgroup device ACLs interfered with camera auto-detection, and seccomp filtering is deferred to future work. Standard Unix permissions still restrict `/dev/video*` and `/dev/tpmrm0`.

**GPU compatibility note:** `MemoryDenyWriteExecute=yes` is still intentionally omitted because it breaks ONNX Runtime JIT paths such as CUDA and TensorRT. Verify hardening score with:
**Phase 3 (shipped — capabilities, seccomp, network):**

- `CapabilityBoundingSet=CAP_SETUID CAP_SETGID` / `AmbientCapabilities=CAP_SETUID CAP_SETGID` —
the daemon retains **exactly** these two capabilities and no others. Device access needs no
caps (`/dev/video*` and `/dev/tpmrm0` are root-owned and opened via standard file
permissions), but the desktop-notification path execs a privilege-dropping helper to run
`notify-send` on the user's session bus: `setpriv --reuid/--regid/--init-groups` first, falling
back to `runuser -u <user>` when `setpriv` is unavailable. Either helper must change uid, gid,
and supplementary groups, which requires `CAP_SETUID` + `CAP_SETGID`. They are declared
**Ambient** (not merely in the bounding set) so the caps survive the exec into the non-setuid
helper under `NoNewPrivileges=yes`. The daemon also narrows its in-process capability set to
exactly these two after initialization (`drop_capabilities()` in `facelock-cli`, holding them
in effective/permitted/inheritable); everything else is dropped.
- **This was empirically required.** An earlier revision set both directives **empty** on the
theory that the daemon needs no capabilities. That was wrong: on real hardware it broke
notifications with `runuser: cannot set groups: Operation not permitted`.
- **Direct-D-Bus-as-root is NOT a viable alternative.** Having root connect straight to the
user's session bus (skipping setuid entirely) does not work under `dbus-broker`, which rejects
UID 0 on a user session bus — `sudo DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus
notify-send test` fails with `Error sending data: Broken pipe`. The setuid-via-`runuser`
path — and therefore these two capabilities — is required for notification delivery.
- Notifications remain best-effort/fire-and-forget: they never block or fail the auth path, so
even if delivery fails the biometric result and PAM fall-through are unaffected.
- **End-to-end delivery is validated only on the maintainer's real hardware under systemd.**
The unit tests assert the retained capability mask and `systemctl show` asserts the directive
set; neither proves a notification actually pops.
- `RestrictAddressFamilies=AF_UNIX AF_NETLINK` + `IPAddressDeny=any` — the daemon only talks
local sockets (system D-Bus, per-user session bus for notifications, kernel netlink). All
inference is local; a compromised daemon cannot open TCP/IP sockets or exfiltrate over the
network.
- `SystemCallFilter=@system-service` + `SystemCallErrorNumber=EPERM` +
`SystemCallArchitectures=native` — allowlist seccomp. `@system-service` includes `ioctl`
(V4L2), `capget`/`capset` (in-process drop), and the memory-management syscalls ONNX Runtime
needs. Blocked syscalls return `EPERM` instead of killing the process, so an unexpected
syscall degrades to a normal auth error (PAM falls through to password) rather than a crash
loop — never a lockout.
- `ProtectProc=invisible` + `ProcSubset=pid` — other processes and non-PID `/proc` contents are
hidden from the daemon.
- `ProtectHostname=yes`.

**Intentionally omitted directives (and why):**

- `ProtectClock=yes` — implies `DeviceAllow=char-rtc`, which switches the unit to a
device-cgroup allowlist and breaks `/dev/video*` camera access (see below). `clock_settime`
and related syscalls are already denied with `EPERM` by `SystemCallFilter=@system-service`.
- `DevicePolicy`/`DeviceAllow` — cgroup device ACLs interfered with camera auto-detection.
Standard Unix permissions still restrict `/dev/video*` and `/dev/tpmrm0`.
- `MemoryDenyWriteExecute=yes` — breaks ONNX Runtime JIT paths such as CUDA and TensorRT.
- `User=` — the daemon must open the camera/TPM as root; non-root operation has not been
validated on real hardware.

**Exposure score:** `systemd-analyze security --offline=true` reports **2.6 (OK)** for the
Phase 1–3 unit, down from 7.1 (MEDIUM) with Phase 1–2 only. (The score rose from 2.2 to 2.6
when the empty capability sets were corrected to `CAP_SETUID CAP_SETGID` — the two caps the
notification privilege-drop genuinely needs; the small increase is the honest cost of a working
notification path.) Verify with:
```bash
systemd-analyze security facelock-daemon.service
```

**Regression coverage:** `just test-deb-pkg` / `just test-rpm-pkg` boot the package container
with systemd as PID 1 (`test/run-pkg-validate-systemd.sh`) and assert via `systemctl show`
that the installed unit carries the Phase 3 directives, that the daemon starts and answers on
D-Bus inside the sandbox, and that an `AF_INET` socket cannot be created under the same
directive set (outbound TCP blocked).

## Security Configuration Reference

```toml
Expand Down
8 changes: 4 additions & 4 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -415,12 +415,12 @@ test-deb: build-release
podman build -t facelock-deb-test -f test/Containerfile.ubuntu .
podman run --rm facelock-deb-test

# Package test — build real .deb, install via dpkg, run automated validation
# Package test — build real .deb, install via dpkg, validate under booted systemd
test-deb-pkg: build-release
#!/usr/bin/env bash
set -euo pipefail
podman build --build-arg ORT_VERSION={{_ort-version}} -t facelock-deb-pkg -f test/Containerfile.deb-e2e .
podman run --rm facelock-deb-pkg
test/run-pkg-validate-systemd.sh facelock-deb-pkg

# Package test — build real TPM .deb (trixie), install via dpkg, run automated validation
test-deb-tpm-pkg: build-release
Expand All @@ -429,12 +429,12 @@ test-deb-tpm-pkg: build-release
podman build --build-arg ORT_VERSION={{_ort-version}} -t facelock-deb-tpm-pkg -f test/Containerfile.deb-tpm-e2e .
podman run --rm facelock-deb-tpm-pkg

# Package test — build real .rpm, install via dnf, run automated validation
# Package test — build real .rpm, install via dnf, validate under booted systemd
test-rpm-pkg: build-release
#!/usr/bin/env bash
set -euo pipefail
podman build --build-arg ORT_VERSION={{_ort-version}} -t facelock-rpm-pkg -f test/Containerfile.rpm-e2e .
podman run --rm facelock-rpm-pkg
test/run-pkg-validate-systemd.sh facelock-rpm-pkg

# COPR-equivalent build — Packit SRPM + mock from-source rebuild on a Fedora chroot (slow, opt-in)
test-copr:
Expand Down
59 changes: 56 additions & 3 deletions systemd/facelock-daemon.service
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,63 @@ RestrictSUIDSGID=yes
# DevicePolicy=closed/auto both use cgroup device ACLs which hide /dev/video*
# from stat(), breaking camera auto-detection. Omitted — the daemon only needs
# /dev/video* and /dev/tpmrm0, both protected by standard Unix permissions.
# ProtectSystem=strict already prevents writing to /dev/.
# Note: ProtectSystem=strict does NOT cover /dev/ — it mounts the entire file
# system hierarchy read-only EXCEPT the API file systems /dev/, /proc/ and
# /sys/. Device access here is restricted solely by standard Unix file
# permissions (root:facelock ownership/mode) on /dev/video* and /dev/tpmrm0.

# Deferred: MemoryDenyWriteExecute=yes breaks ONNX Runtime JIT.
# Phase 3 (seccomp, capabilities, network) deferred to future work.
# Phase 3: Capabilities, seccomp, and network lockdown
# The daemon needs exactly two Linux capabilities: CAP_SETUID and CAP_SETGID.
# /dev/video* and /dev/tpmrm0 are root-owned and opened via standard file
# permissions, so device access needs no caps — but the desktop-notification
# path execs `runuser -u <user>` to drop into the user's session bus, and
# runuser calls setgroups()/setuid(), which require CAP_SETGID + CAP_SETUID.
# These are declared as Ambient (not just in the bounding set) so the caps
# survive the exec into the non-setuid `runuser` under NoNewPrivileges=yes.
# The daemon further narrows its in-process capability set to exactly these
# two after initialization (see daemon.rs::drop_capabilities); everything else
# stays denied.
#
# NOTE (empirically required): an earlier revision set both of these empty on
# the theory that "the daemon needs no capabilities." That was wrong — it broke
# notifications on real hardware with `runuser: cannot set groups: Operation not
# permitted`. Connecting to the user session bus directly as root (no setuid)
# is NOT a viable alternative: dbus-broker rejects UID 0 on a user session bus
# ("Broken pipe"), so the setuid-via-runuser path — and thus these two caps —
# is required.
CapabilityBoundingSet=CAP_SETUID CAP_SETGID
AmbientCapabilities=CAP_SETUID CAP_SETGID
# Only local sockets: D-Bus (system bus + per-user session bus for
# notifications) is AF_UNIX. AF_NETLINK is retained conservatively: no
# workspace code opens a netlink socket directly (device enumeration uses
# Path::exists, not netlink), but glibc NSS / name-resolution paths
# (getaddrinfo and friends) may use NETLINK_ROUTE under the hood. Whether
# AF_NETLINK can be safely dropped here is unverified — treat this as a
# deferred, test-first hardening step rather than drop it blind.
# No AF_INET/AF_INET6 — all inference is local, nothing talks TCP/IP.
RestrictAddressFamilies=AF_UNIX AF_NETLINK
IPAddressDeny=any
# Allowlist seccomp filter. @system-service includes ioctl (V4L2 camera),
# capget/capset (in-process capability drop), and the memory-management
# syscalls ONNX Runtime needs. Blocked syscalls return EPERM instead of
# killing the process, so an unexpected syscall degrades to a normal auth
# error (PAM falls through to password) rather than a crash loop.
SystemCallFilter=@system-service
SystemCallErrorNumber=EPERM
SystemCallArchitectures=native
# Hide other processes' /proc entries and non-PID /proc contents.
ProtectProc=invisible
ProcSubset=pid
ProtectHostname=yes
# ProtectClock=yes is intentionally OMITTED: it implies DeviceAllow=char-rtc,
# which switches the unit to a device-cgroup allowlist and would break
# /dev/video* access (see Phase 2.5 note above). clock_settime and friends
# are already denied (EPERM) by SystemCallFilter=@system-service.

# Still deferred: MemoryDenyWriteExecute=yes breaks ONNX Runtime JIT
# (CUDA/TensorRT execution providers), and User= is not set because the
# daemon must open the camera/TPM as root and no non-root operation has
# been validated on real hardware.

[Install]
WantedBy=multi-user.target
Loading
Loading