diff --git a/README.md b/README.md index 81fc92e..717beca 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,12 @@ for await detection in listener.detections() { } ``` +Pass `echoCancellation: true` to route the microphone through the platform's voice-processing I/O unit, which subtracts what the device is playing out (e.g. your app's own TTS) from the captured signal so the listener only reacts to the user's voice. It defaults to `false` (raw capture). If voice processing cannot be enabled, starting the listener throws `WakeWordError.echoCancellationUnavailable`; on iOS, enabling it also switches the audio session mode from `.measurement` to `.voiceChat`. + +```swift +let listener = WakeWordListener(model: model, threshold: 0.5, debounce: 2.0, echoCancellation: true) +``` + The mel spectrogram and speech embedding `.onnx` models ship inside the Swift package; only the classifier ships with your app. Audio at any sample rate is resampled to 16 kHz internally via `AVAudioConverter` (matches the Rust crate's 22050–384000 Hz input range); the listener handles mic-hardware resampling automatically. ONNX Runtime with the CoreML Execution Provider dispatches to ANE / GPU / CPU by default (override via `executionProvider:`). Add `NSMicrophoneUsageDescription` to Info.plist (and `com.apple.security.device.audio-input` on sandboxed macOS apps) for listener use. A runnable SwiftUI demo (iOS + macOS) lives in [examples/ios_wakeword/](examples/ios_wakeword/). diff --git a/swift/Sources/LiveKitWakeWord/WakeWordError.swift b/swift/Sources/LiveKitWakeWord/WakeWordError.swift index 2f9ec4d..4ec957b 100644 --- a/swift/Sources/LiveKitWakeWord/WakeWordError.swift +++ b/swift/Sources/LiveKitWakeWord/WakeWordError.swift @@ -32,6 +32,10 @@ public enum WakeWordError: Error, LocalizedError, Sendable { /// The ONNX Runtime raised an error during session creation or /// inference. case runtimeFailure(underlying: Error) + /// Acoustic echo cancellation was requested (``WakeWordListener`` created + /// with `echoCancellation: true`) but the platform's voice-processing I/O + /// unit could not be enabled. 
+ case echoCancellationUnavailable(underlying: Error) public var errorDescription: String? { switch self { @@ -54,6 +58,8 @@ public enum WakeWordError: Error, LocalizedError, Sendable { return "LiveKitWakeWord: resampling failed." case .runtimeFailure(let underlying): return "LiveKitWakeWord: ONNX Runtime error (\(underlying))." + case .echoCancellationUnavailable(let underlying): + return "LiveKitWakeWord: could not enable echo cancellation (\(underlying))." } } } diff --git a/swift/Sources/LiveKitWakeWord/WakeWordListener.swift b/swift/Sources/LiveKitWakeWord/WakeWordListener.swift index ec5243e..b97391b 100644 --- a/swift/Sources/LiveKitWakeWord/WakeWordListener.swift +++ b/swift/Sources/LiveKitWakeWord/WakeWordListener.swift @@ -43,6 +43,7 @@ public struct Detection: Sendable { public actor WakeWordListener { public let threshold: Float public let debounce: TimeInterval + public let echoCancellation: Bool private let model: WakeWordModel private var engine: AVAudioEngine? @@ -71,16 +72,22 @@ public actor WakeWordListener { /// same utterance. /// - windowSeconds: Length of the rolling audio window fed to the /// model. 2 s matches the Rust crate's recommendation. + /// - echoCancellation: When `true`, the microphone is routed through the + /// platform's voice-processing I/O unit so audio the device is playing + /// out (e.g. an assistant's own TTS) is removed from the captured + /// signal. Defaults to `false` (raw capture). public init( model: WakeWordModel, threshold: Float = 0.5, debounce: TimeInterval = 2.0, - windowSeconds: Double = 2.0 + windowSeconds: Double = 2.0, + echoCancellation: Bool = false ) { self.model = model self.threshold = threshold self.debounce = debounce self.windowSeconds = windowSeconds + self.echoCancellation = echoCancellation } /// Start capturing audio and running inference. 
Must be called after @@ -90,12 +97,22 @@ public actor WakeWordListener { #if os(iOS) let session = AVAudioSession.sharedInstance() - try session.setCategory(.playAndRecord, mode: .measurement, options: [.defaultToSpeaker]) + let mode: AVAudioSession.Mode = echoCancellation ? .voiceChat : .measurement + try session.setCategory(.playAndRecord, mode: mode, options: [.defaultToSpeaker]) try session.setActive(true, options: []) #endif let engine = AVAudioEngine() let input = engine.inputNode + + if echoCancellation { + do { + try input.setVoiceProcessingEnabled(true) + } catch { + throw WakeWordError.echoCancellationUnavailable(underlying: error) + } + } + let hwFormat = input.inputFormat(forBus: 0) guard hwFormat.sampleRate > 0 else { throw WakeWordError.unsupportedSampleRate(rate: 0)