From 07d82e4a4f4755b2a45d2c567e9b778dbb471f37 Mon Sep 17 00:00:00 2001
From: OpenSauce
Date: Sun, 31 May 2026 16:51:14 +0100
Subject: [PATCH] fix(standalone): enable flush-to-zero on the JACK RT thread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Denormal (subnormal) float arithmetic is extremely slow, especially on
ARM (Raspberry Pi). As signals decay toward silence, the IR convolver
and filter tails can drive intermediate values into the denormal range,
causing erratic CPU spikes that don't track IR length — some IRs run
fine, others struggle, with no relation to how heavily they're trimmed.

There was no global flush-to-zero anywhere; only a few amp stages
manually flush their state at 1e-20 (itself a normal f32, and not
covering the convolver). The VST3/CLAP plugin already gets FTZ from
nih-plug's process wrapper, but the standalone JACK thread set nothing.

Set the CPU flush-to-zero flag on the JACK process thread (MXCSR FTZ
bit 15 on x86 SSE, plus DAZ bit 6 on x86_64 so denormal inputs are
flushed as well; FPCR FZ bit 24 on AArch64), following nih-plug's
approach via inline asm since Rust 1.75 deprecated the _mm_setcsr
intrinsics. Idempotent and cheap, so it runs each process callback.

The per-stage manual flushes stay as belt-and-suspenders.

Refs #251.
---
 rustortion-standalone/src/audio/denormals.rs | 59 ++++++++++++++++++++
 rustortion-standalone/src/audio/jack.rs      |  5 ++
 rustortion-standalone/src/audio/mod.rs       |  1 +
 3 files changed, 65 insertions(+)
 create mode 100644 rustortion-standalone/src/audio/denormals.rs

diff --git a/rustortion-standalone/src/audio/denormals.rs b/rustortion-standalone/src/audio/denormals.rs
new file mode 100644
index 0000000..744995e
--- /dev/null
+++ b/rustortion-standalone/src/audio/denormals.rs
@@ -0,0 +1,59 @@
+//! Flush-to-zero (FTZ) setup for the real-time audio thread.
+//!
+//! Denormal (subnormal) floating-point arithmetic is catastrophically slow — up to
+//! ~10–100× on some CPUs, and especially bad on ARM (Raspberry Pi). As signals decay
+//! toward silence, the IR convolver and filter tails can drive intermediate values into
+//! the denormal range, causing erratic CPU spikes that don't track IR length. Enabling
+//! the CPU's flush-to-zero flag makes denormal results flush to zero, keeping cost
+//! consistent.
+//!
+//! The VST3/CLAP plugin already gets this from nih-plug's process wrapper; the standalone
+//! JACK process thread must set it itself. The flag is per-thread, so this is called from
+//! inside the JACK process callback.
+//!
+//! The implementation follows nih-plug's `ScopedFtz` — Rust 1.75 deprecated the
+//! `_mm_setcsr` intrinsics, so this uses inline assembly: MXCSR bit 15 (FTZ) on x86 SSE,
+//! plus MXCSR bit 6 (DAZ) on x86_64, and FPCR bit 24 on AArch64. On other targets it is
+//! a no-op.
+
+/// Enable flush-to-zero for denormals on the current thread. Idempotent and cheap (a
+/// register read plus a conditional write), so it is safe to call every process callback.
+///
+/// On x86_64 this also sets denormals-are-zero, so denormal *inputs* (for example IR
+/// samples loaded from disk) are read as zero too, matching what AArch64's FZ bit does.
+#[inline]
+pub fn enable_flush_to_zero() {
+    #[cfg(target_feature = "sse")]
+    {
+        // MXCSR bit 15 = Flush-To-Zero (denormal results become zero).
+        const SSE_FTZ_BIT: u32 = 1 << 15;
+        // MXCSR bit 6 = Denormals-Are-Zero (denormal operands are read as zero). The bit is
+        // reserved on some pre-x86_64 SSE CPUs, where setting it faults, so it is only
+        // requested on x86_64.
+        const SSE_DAZ_BIT: u32 = if cfg!(target_arch = "x86_64") { 1 << 6 } else { 0 };
+        const SSE_FLUSH_BITS: u32 = SSE_FTZ_BIT | SSE_DAZ_BIT;
+        let mut mxcsr: u32 = 0;
+        // SAFETY: stmxcsr/ldmxcsr only read/write the current thread's MXCSR register.
+        unsafe {
+            std::arch::asm!("stmxcsr [{}]", in(reg) std::ptr::addr_of_mut!(mxcsr));
+            if mxcsr & SSE_FLUSH_BITS != SSE_FLUSH_BITS {
+                let updated = mxcsr | SSE_FLUSH_BITS;
+                std::arch::asm!("ldmxcsr [{}]", in(reg) std::ptr::addr_of!(updated));
+            }
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        // FPCR bit 24 = Flush-to-zero mode.
+        const AARCH64_FTZ_BIT: u64 = 1 << 24;
+        let fpcr: u64;
+        // SAFETY: FPCR is EL0-accessible; this reads then conditionally sets the FZ bit.
+        unsafe {
+            std::arch::asm!("mrs {}, fpcr", out(reg) fpcr);
+            if fpcr & AARCH64_FTZ_BIT == 0 {
+                std::arch::asm!("msr fpcr, {}", in(reg) fpcr | AARCH64_FTZ_BIT);
+            }
+        }
+    }
+}
diff --git a/rustortion-standalone/src/audio/jack.rs b/rustortion-standalone/src/audio/jack.rs
index d84aff4..4ccb2bc 100644
--- a/rustortion-standalone/src/audio/jack.rs
+++ b/rustortion-standalone/src/audio/jack.rs
@@ -64,6 +64,11 @@ impl ProcessHandler {
 
 impl jack::ProcessHandler for ProcessHandler {
     fn process(&mut self, _client: &jack::Client, ps: &jack::ProcessScope) -> jack::Control {
+        // Denormals are extremely slow (esp. on ARM/Pi) and the IR convolver + filter
+        // tails can produce them as signals decay. The plugin gets FTZ from nih-plug;
+        // the standalone must set it on its own RT thread. Idempotent and cheap.
+        crate::audio::denormals::enable_flush_to_zero();
+
         let input = self.ports.get_input(ps);
 
         if let Err(e) = self.audio_engine.process(input, self.buffer.as_mut_slice()) {
diff --git a/rustortion-standalone/src/audio/mod.rs b/rustortion-standalone/src/audio/mod.rs
index a16c62f..e2f9889 100644
--- a/rustortion-standalone/src/audio/mod.rs
+++ b/rustortion-standalone/src/audio/mod.rs
@@ -1,3 +1,4 @@
+pub mod denormals;
 pub mod jack;
 pub mod manager;
 pub mod ports;