diff --git a/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md new file mode 100644 index 0000000..f9192c6 --- /dev/null +++ b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md @@ -0,0 +1,109 @@ +# Native Overlay, Retry, and Recording Asset Design + +- Date: 2026-06-21 +- Status: Approved +- Branch: `codex/native-cpal-capture` + +## Goal + +Make the macOS recording main path independent of WebView control logic, then make ASR failure and late-result behavior recoverable by using saved WAV recordings as retryable transcription assets. + +The user-visible behavior outside recording, overlay feedback, retry, and history playback should remain unchanged. + +## Execution Order + +1. Native overlay and native cue playback. +2. Fix late ASR result handling. +3. Add recording asset, history playback, retry, and retention policy. + +## Phase 1: Native Main Path + +On macOS, the recording main path should no longer depend on WebView for recording lifecycle control or cue playback. + +The existing native overlay remains the visual surface. It should handle actionable failure states directly, including a retry icon button. The retry control must be visually subtle and fit the current glass pill style. Its maximum visual footprint must not exceed the current recording waveform element, so the control remains refined rather than dominant. + +Failure overlay behavior: + +- Show the existing failure text style. +- Show a refresh-style icon button only, without text. +- Display a 5-second countdown affordance around the retry button. +- If the user does not click within 5 seconds, hide the overlay. +- The failed transcription attempt remains available in input history when a WAV exists. + +Cue playback should move to native playback on macOS so start/end cues do not depend on the overlay WebView. 
Windows can keep the current WebView path unless the native implementation is naturally cross-platform. + +## Phase 2: Late ASR Result Handling + +The current Doubao flow can return a partial result when `commit_and_await_final` times out after 5 seconds, while the server may continue sending a more complete result afterward. As a result, incomplete text gets pasted prematurely. + +The fix should prefer correctness over pasting early: + +- Do not paste a partial result merely because the 5-second commit wait elapsed. +- If the session has not produced a reliable final result by the deadline, mark the attempt as failed or retryable instead of pasting known-incomplete text. +- If a definite final result or terminal close arrives within the accepted completion window, paste normally. +- The saved WAV should make manual retry cheap, so retryable failure is better than silently pasting partial text. + +## Phase 3: Recording Assets and Retry + +Each transcription attempt should have a durable record that can represent success or failure. + +History entries should support: + +- `status`: success or failed. +- `text`: successful final text, or a short failure description. +- `audioPath`: saved WAV path when available. +- `error`: failure reason when applicable. +- `retryOf`: optional original entry timestamp or ID. + +Successful entries continue to count toward usage statistics. Failed entries should appear in input history but should not increase total session or character counts. + +Retry behavior: + +- Retry uses the saved WAV, not the microphone. +- Retry can be triggered from the native failure overlay within 5 seconds. +- Retry can also be triggered from the input history in Settings home. +- A successful retry creates or updates a successful history record and follows the normal paste/clipboard/statistics path. +- If recording retention is disabled, the failed WAV is deleted after a retry succeeds. 
+ +History UI behavior: + +- Successful rows show play, copy, and delete icon buttons. +- Failed rows show play, retry, and delete icon buttons. +- Buttons must match the current input-record action style: orange solid rounded-square icon buttons with white line icons. + +## Recording Retention Setting + +Add an app setting that controls whether recordings are retained. + +Default: disabled. + +When enabled: + +- Keep successful and failed recordings from the most recent month. +- Prune older recordings and the references to them. + +When disabled: + +- Keep only recordings needed for failed retryable entries. +- Delete recordings after successful transcription or successful retry. + +## Testing + +Backend: + +- Unit tests for history serialization/backward compatibility. +- Unit tests for retention pruning decisions. +- Tests for retrying a WAV through the same ASR path where practical. +- Tests for Doubao commit timeout behavior so partial text is not treated as successful final output. + +Frontend/settings: + +- Tests for history rows with success and failure states. +- Tests for play/retry button bridge calls. + +Manual: + +- A network timeout creates a failed history entry with a WAV. +- Native overlay retry starts a transcription attempt from the saved WAV. +- Settings history retry works after the overlay disappears. +- A successful retry removes the failed-only WAV when retention is disabled. 
diff --git a/package.json b/package.json index 96d3160..681bcb2 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "scripts": { "tauri": "tauri", "dev": "tauri dev", + "dev:no-watch": "tauri dev --no-watch", "dev:web": "vite", "build:web": "vite build", "pack": "tsx --env-file=.env scripts/pack.ts", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index fb6741a..cdf92d4 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -59,3 +59,6 @@ wiremock = "0.6" objc2 = "0.6" objc2-app-kit = "0.3" objc2-foundation = "0.3" +# Native microphone capture is macOS-only (native_audio is cfg(macos)); keeping +# cpal off other targets avoids pulling ALSA (libasound2-dev) into the Linux CI. +cpal = "0.15" diff --git a/src-tauri/src/app_state.rs b/src-tauri/src/app_state.rs index 4f18af6..408f02b 100644 --- a/src-tauri/src/app_state.rs +++ b/src-tauri/src/app_state.rs @@ -32,6 +32,9 @@ pub struct AppInner { /// once it attaches. Always accessed while holding `asr_session` to stay /// ordered against the drain. pub pending_audio: Mutex>>, + /// Full-session 16k mono PCM captured from the same stream sent to ASR. + /// Saved as a WAV when a recording is finalized, for diagnostics and review. + pub recording_audio: Mutex>, /// Resolves when the background ASR connect finishes (Ok) or fails (Err). /// `stop_recording` awaits this when the user stops before the session is ready. pub connect_rx: Mutex>>>, @@ -44,6 +47,10 @@ pub struct AppInner { /// audio, so already-recognized text is accumulated here and prepended to the /// new session's output. Reset at the start of every recording. pub accumulated_text: Mutex, + /// Native microphone capture used on macOS to avoid WebView/WebRTC input + /// processing. Other platforms keep the renderer getUserMedia path. 
+ #[cfg(target_os = "macos")] + pub native_audio: Mutex>, } pub type AppHandle = Arc; @@ -67,8 +74,11 @@ pub fn create_app_state( pending_audio_warmup: Mutex::new(None), latest_transcript: Mutex::new((String::new(), String::new())), pending_audio: Mutex::new(Vec::new()), + recording_audio: Mutex::new(Vec::new()), connect_rx: Mutex::new(None), session_epoch: std::sync::atomic::AtomicU64::new(0), accumulated_text: Mutex::new(String::new()), + #[cfg(target_os = "macos")] + native_audio: Mutex::new(None), }) } diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index e61d4c7..ab46bae 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -289,49 +289,38 @@ fn compute_audio_level(samples: &[f32]) -> Option { Some((rms * 13.0 + peak * 2.8).powf(0.82).min(1.0)) } -/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), -/// decode to f32 samples and forward to the active ASR session. -#[tauri::command] -pub async fn send_audio_chunk( - _app: AppHandle, - state: State<'_, AppState>, - base64_chunk: String, -) -> Result { - use base64::Engine as _; +pub(crate) async fn append_audio_samples( + app: &AppHandle, + state: &AppState, + samples: Vec, +) -> bool { use std::sync::atomic::{AtomicU64, Ordering}; static CHUNK_COUNT: AtomicU64 = AtomicU64::new(0); let n = CHUNK_COUNT.fetch_add(1, Ordering::Relaxed); if n == 0 || n.is_multiple_of(50) { log_audio!( debug, - "Received chunk #{} ({} bytes base64)", + "Received audio chunk #{} ({} samples)", n, - base64_chunk.len() + samples.len() ); } - // Decode base64 → i16 PCM bytes → f32 samples - let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { - Ok(data) => data, - Err(_) => { - log_audio!(warn, "Chunk #{} base64 decode failed", n); - return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); - } - }; - let samples: Vec = bytes - .chunks_exact(2) - .map(|chunk| { - let sample = i16::from_le_bytes([chunk[0], chunk[1]]); - sample as f32 / 
32768.0 - }) - .collect(); + state + .recording_audio + .lock() + .await + .extend_from_slice(&samples); // Drive the native waveform (macOS only) from the same PCM the ASR receives, // whether the chunk is sent immediately or buffered. #[cfg(target_os = "macos")] if let Some(level) = compute_audio_level(&samples) { - crate::overlay::set_audio_level(&_app, level); + crate::overlay::set_audio_level(app, level); } + // `app` only drives the macOS native waveform above; unused on other platforms. + #[cfg(not(target_os = "macos"))] + let _ = app; // Hold the `asr_session` lock across the decision so buffering stays ordered // against the background connect task's drain (same lock), guaranteeing no @@ -340,7 +329,7 @@ pub async fn send_audio_chunk( if let Some(ref session) = *session { if session.is_ready() { session.append_audio(&samples); - return Ok(serde_json::json!({ "ok": true })); + return false; } } @@ -350,6 +339,7 @@ pub async fn send_audio_chunk( let mut pending = state.pending_audio.lock().await; if pending.len() < MAX_PENDING_CHUNKS { pending.push(samples); + return true; } else if n.is_multiple_of(50) { log_audio!( warn, @@ -358,7 +348,37 @@ pub async fn send_audio_chunk( n ); } - Ok(serde_json::json!({ "ok": true, "buffered": true })) + true +} + +/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), +/// decode to f32 samples and forward to the active ASR session. 
+#[tauri::command] +pub async fn send_audio_chunk( + app: AppHandle, + state: State<'_, AppState>, + base64_chunk: String, +) -> Result { + use base64::Engine as _; + + // Decode base64 → i16 PCM bytes → f32 samples + let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { + Ok(data) => data, + Err(_) => { + log_audio!(warn, "Audio chunk base64 decode failed"); + return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); + } + }; + let samples: Vec = bytes + .chunks_exact(2) + .map(|chunk| { + let sample = i16::from_le_bytes([chunk[0], chunk[1]]); + sample as f32 / 32768.0 + }) + .collect(); + + let buffered = append_audio_samples(&app, &state, samples).await; + Ok(serde_json::json!({ "ok": true, "buffered": buffered })) } /// Notify that audio has stopped in the renderer. diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 0d31398..f7f7565 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -9,6 +9,8 @@ mod hotword; mod llm; mod migration; mod model; +#[cfg(target_os = "macos")] +mod native_audio; mod overlay; mod paste; mod stats; @@ -26,8 +28,12 @@ use tauri::{ }; /// Delay after the mic stream is ready, before entering Recording / playing the -/// start cue. Gives the browser AEC/AGC time to converge so the first words are -/// not attenuated. Trade-off: added latency between key press and "go". +/// start cue. The renderer (getUserMedia) path needs it so the browser's AEC/AGC +/// converge before the first words. Native cpal capture has no such DSP warmup, +/// so macOS uses 0 — testing whether dropped leading words / cue glitches return. +#[cfg(target_os = "macos")] +const AUDIO_SETTLE_MS: u64 = 0; +#[cfg(not(target_os = "macos"))] const AUDIO_SETTLE_MS: u64 = 350; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -371,14 +377,19 @@ fn resolve_configured_sound_path( /// dedicated, kept-warm AudioContext, so the cue is full-volume and never /// truncated. 
Falls back to `afplay` only if the file cannot be read. fn emit_cue(app: &AppHandle, app_inner: &Arc, name: &str) { - use base64::Engine as _; - let Some(file_path) = resolve_configured_sound_path(app, app_inner, name) else { return; }; + #[cfg(target_os = "macos")] + { + crate::paste::play_sound(&file_path); + } + + #[cfg(not(target_os = "macos"))] match std::fs::read(&file_path) { Ok(bytes) => { + use base64::Engine as _; let data = base64::engine::general_purpose::STANDARD.encode(&bytes); let _ = app.emit( "overlay:event", @@ -424,6 +435,92 @@ async fn stop_renderer_audio( } } +async fn stop_audio_capture( + app: &AppHandle, + app_inner: &Arc, + timeout_ms: u64, +) { + #[cfg(target_os = "macos")] + native_audio::stop_capture(app_inner).await; + + stop_renderer_audio(app, app_inner, timeout_ms).await; +} + +async fn save_recording_wav(app: &AppHandle, app_inner: &Arc) { + let samples = { + let mut audio = app_inner.recording_audio.lock().await; + if audio.is_empty() { + return; + } + std::mem::take(&mut *audio) + }; + + let data_dir = match app.path().app_data_dir() { + Ok(dir) => dir, + Err(error) => { + log_audio!( + warn, + "Resolve app data dir for recording WAV failed: {}", + error + ); + return; + } + }; + let output_dir = data_dir.join("recordings"); + if let Err(error) = std::fs::create_dir_all(&output_dir) { + log_audio!( + warn, + "Create recording WAV directory failed ({}): {}", + output_dir.display(), + error + ); + return; + } + + let ts = chrono::Local::now().format("%Y%m%d-%H%M%S%.3f"); + let path = output_dir.join(format!("voicepaste-{ts}.wav")); + match write_wav_16k_mono(&path, &samples) { + Ok(()) => log_audio!(info, "Recording WAV saved: {}", path.display()), + Err(error) => log_audio!( + warn, + "Write recording WAV failed ({}): {}", + path.display(), + error + ), + } +} + +fn write_wav_16k_mono(path: &std::path::Path, samples: &[f32]) -> Result<(), String> { + const SAMPLE_RATE: u32 = 16_000; + const CHANNELS: u16 = 1; + const 
BYTES_PER_SAMPLE: u16 = 2; + + let data_bytes = samples.len() * BYTES_PER_SAMPLE as usize; + let riff_size = 36usize + .checked_add(data_bytes) + .ok_or_else(|| "WAV too large".to_string())?; + let mut wav = Vec::with_capacity(44 + data_bytes); + wav.extend_from_slice(b"RIFF"); + wav.extend_from_slice(&(riff_size as u32).to_le_bytes()); + wav.extend_from_slice(b"WAVE"); + wav.extend_from_slice(b"fmt "); + wav.extend_from_slice(&16u32.to_le_bytes()); + wav.extend_from_slice(&1u16.to_le_bytes()); + wav.extend_from_slice(&CHANNELS.to_le_bytes()); + wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes()); + wav.extend_from_slice(&(SAMPLE_RATE * CHANNELS as u32 * BYTES_PER_SAMPLE as u32).to_le_bytes()); + wav.extend_from_slice(&(CHANNELS * BYTES_PER_SAMPLE).to_le_bytes()); + wav.extend_from_slice(&(BYTES_PER_SAMPLE * 8).to_le_bytes()); + wav.extend_from_slice(b"data"); + wav.extend_from_slice(&(data_bytes as u32).to_le_bytes()); + for &sample in samples { + let pcm = (sample.clamp(-1.0, 1.0) * i16::MAX as f32) as i16; + wav.extend_from_slice(&pcm.to_le_bytes()); + } + + std::fs::write(path, wav).map_err(|e| e.to_string()) +} + async fn wait_for_audio_warmup( app_inner: &Arc, timeout_ms: u64, @@ -653,6 +750,7 @@ async fn start_recording(app_handle: AppHandle) { } *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + app_inner.recording_audio.lock().await.clear(); let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); // Re-position before showing so the overlay follows the current display layout // (e.g. after an external monitor was connected/disconnected). @@ -663,6 +761,24 @@ async fn start_recording(app_handle: AppHandle) { // 2. 
Warm up microphone capture set_app_state(&app_handle, &app_inner, app_state::AppState::Connecting).await; + #[cfg(target_os = "macos")] + if let Err(e) = native_audio::start_capture(app_handle.clone(), Arc::clone(&app_inner)).await { + *recording_state.0.lock().unwrap() = false; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + log_rec!(warn, "Native audio warmup failed: {}", e); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": e, "level": "error", "variant": "text" } + }), + ); + return; + } + let _ = app_handle.emit( "overlay:event", serde_json::json!({ @@ -671,7 +787,7 @@ async fn start_recording(app_handle: AppHandle) { ); if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { *recording_state.0.lock().unwrap() = false; - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -690,7 +806,7 @@ async fn start_recording(app_handle: AppHandle) { // Check if recording was cancelled during warmup (hold mode: quick press-release) if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during warmup, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -698,19 +814,16 @@ async fn start_recording(app_handle: AppHandle) { return; } - // Settle delay before the cue: getUserMedia resolving only means the stream - // exists, not that its AEC/AGC have converged. 
The mic is live and DSP converges - // on real input during this wait (capture stays gated off until Recording), while - // the renderer's cue keep-alive (set up during warmup) holds the output device - // warm so the cue still plays smoothly afterwards. The cue is the user's "go" - // signal, so it must land AFTER this delay — never before, or the user would - // speak into the unconverged window and lose the first words. + // Settle delay before the cue: the selected capture backend is live during + // this wait (audio stays gated off until Recording), while the renderer's cue + // keep-alive holds the output device warm so the start cue plays smoothly. + // The cue is the user's "go" signal, so it lands after capture warmup. tokio::time::sleep(std::time::Duration::from_millis(AUDIO_SETTLE_MS)).await; // Re-check cancellation: the user may have released during the settle delay. if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during settle, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -928,7 +1041,8 @@ async fn connect_and_attach( } *app_handle.state::().0.lock().unwrap() = false; app_inner.pending_audio.lock().await.clear(); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + save_recording_wav(&app_handle, &app_inner).await; // Emit error hint BEFORE setting idle so the overlay shows it: the // frontend's idle handler only clears "info"-level hints. let _ = app_handle.emit("overlay:event", serde_json::json!({ @@ -967,7 +1081,8 @@ async fn stop_recording(app_handle: AppHandle) { set_app_state(&app_handle, &app_inner, app_state::AppState::Finishing).await; // 2. 
Stop renderer audio first so the final buffered chunk is flushed. - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + save_recording_wav(&app_handle, &app_inner).await; // 3. Acquire the ready ASR session. If the background connect hasn't finished // (user stopped before it was ready), wait for it to resolve so the buffered @@ -1108,13 +1223,14 @@ async fn cancel_recording(app_handle: AppHandle) { .session_epoch .fetch_add(1, std::sync::atomic::Ordering::SeqCst); app_inner.pending_audio.lock().await.clear(); + app_inner.recording_audio.lock().await.clear(); // Clear the active prompt ID since the session was cancelled if let Some(active) = app_handle.try_state::() { *active.0.lock().unwrap() = None; } - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; if let Some(session) = app_inner.asr_session.lock().await.take() { session.close(); @@ -1393,7 +1509,8 @@ async fn finalize_on_failure(app: &AppHandle, app_inner: &Arc, + input_thread: Option>, + forward_task: tauri::async_runtime::JoinHandle<()>, +} + +impl NativeAudioCapture { + async fn stop(mut self) { + let _ = self.stop_tx.send(()); + if let Some(input_thread) = self.input_thread.take() { + let _ = tokio::task::spawn_blocking(move || input_thread.join()).await; + } + let _ = self.forward_task.await; + } +} + +pub async fn start_capture( + app: AppHandle, + app_inner: Arc, +) -> Result<(), String> { + let mut slot = app_inner.native_audio.lock().await; + if slot.is_some() { + return Ok(()); + } + + let (audio_tx, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); + let (stop_tx, stop_rx) = std::sync::mpsc::channel::<()>(); + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::>(); + + let input_thread = thread::Builder::new() + .name("voicepaste-native-audio".to_string()) + .spawn(move || { + if let Err(error) = run_input_thread(audio_tx, stop_rx, ready_tx) { + 
log_audio!(error, "Native audio thread exited with error: {}", error); + } + }) + .map_err(|e| format!("启动原生录音线程失败: {e}"))?; + + // Await the ready signal asynchronously: the worker sends it the moment the + // input stream is built, so we never block a tokio worker thread. + match ready_rx.await { + Ok(Ok(())) => {} + Ok(Err(error)) => { + let _ = stop_tx.send(()); + let _ = input_thread.join(); + return Err(error); + } + Err(_) => { + let _ = input_thread.join(); + return Err("原生录音线程提前退出".to_string()); + } + } + + let forward_app = app.clone(); + let forward_inner = Arc::clone(&app_inner); + let forward_task = tauri::async_runtime::spawn(async move { + while let Some(samples) = rx.recv().await { + let state = forward_inner.state.lock().await.clone(); + if matches!( + state, + app_state::AppState::Recording | app_state::AppState::Finishing + ) { + crate::commands::append_audio_samples(&forward_app, &forward_inner, samples).await; + } + } + }); + + *slot = Some(NativeAudioCapture { + stop_tx, + input_thread: Some(input_thread), + forward_task, + }); + log_audio!(info, "Native cpal microphone capture started"); + Ok(()) +} + +pub async fn stop_capture(app_inner: &Arc) { + let capture = app_inner.native_audio.lock().await.take(); + if let Some(capture) = capture { + capture.stop().await; + log_audio!(info, "Native cpal microphone capture stopped"); + } +} + +fn run_input_thread( + tx: tokio::sync::mpsc::UnboundedSender>, + stop_rx: std::sync::mpsc::Receiver<()>, + ready_tx: tokio::sync::oneshot::Sender>, +) -> Result<(), String> { + let final_chunk = Arc::new(Mutex::new(Vec::::with_capacity(TARGET_CHUNK_SAMPLES))); + let stream = match build_input_stream(tx.clone(), Arc::clone(&final_chunk)) { + Ok(stream) => stream, + Err(error) => { + let _ = ready_tx.send(Err(error.clone())); + return Err(error); + } + }; + if let Err(error) = stream.play() { + let message = format!("启动麦克风输入流失败: {error}"); + let _ = ready_tx.send(Err(message.clone())); + return Err(message); + } + 
let _ = ready_tx.send(Ok(())); + stop_rx + .recv() + .map_err(|e| format!("等待停止原生录音失败: {e}"))?; + drop(stream); + if let Ok(mut chunk) = final_chunk.lock() { + if !chunk.is_empty() { + let tail = std::mem::take(&mut *chunk); + let _ = tx.send(tail); + } + } + Ok(()) +} + +fn build_input_stream( + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, +) -> Result { + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| "未找到默认麦克风输入设备".to_string())?; + let config = device + .default_input_config() + .map_err(|e| format!("读取默认麦克风配置失败: {e}"))?; + let sample_rate = config.sample_rate().0; + let channels = usize::from(config.channels()); + let stream_config = config.config(); + + log_audio!( + info, + "Native input device: sample_rate={}, channels={}, format={:?}", + sample_rate, + channels, + config.sample_format() + ); + + let err_fn = |err| { + log_audio!(error, "Native microphone stream error: {}", err); + }; + + match config.sample_format() { + cpal::SampleFormat::F32 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::I16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::U16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + other => Err(format!("不支持的采样格式: {other:?}")), + } +} + +fn build_stream( + device: &cpal::Device, + config: &cpal::StreamConfig, + channels: usize, + sample_rate: u32, + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, + err_fn: impl FnMut(cpal::StreamError) + Send + 'static, +) -> Result +where + T: cpal::Sample + cpal::SizedSample + Send + 'static, + f32: FromNativeSample, +{ + let mut resampler = StreamingResampler::new(sample_rate, TARGET_SAMPLE_RATE); + + device + .build_input_stream( + config, + move |data: &[T], _| { + let mono = downmix_to_mono(data, 
channels); + let samples = resampler.push(&mono); + let Ok(mut chunk) = final_chunk.lock() else { + return; + }; + for sample in samples { + chunk.push(sample); + if chunk.len() >= TARGET_CHUNK_SAMPLES { + let full = std::mem::take(&mut *chunk); + if tx.send(full).is_err() { + return; + } + } + } + }, + err_fn, + None, + ) + .map_err(|e| format!("创建麦克风输入流失败: {e}")) +} + +fn downmix_to_mono(data: &[T], channels: usize) -> Vec +where + T: Copy, + f32: FromNativeSample, +{ + data.chunks(channels) + .map(|frame| { + let sum = frame + .iter() + .map(|&sample| f32::from_native_sample(sample)) + .sum::(); + sum / channels as f32 + }) + .collect() +} + +trait FromNativeSample { + fn from_native_sample(sample: T) -> f32; +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: f32) -> f32 { + sample.clamp(-1.0, 1.0) + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: i16) -> f32 { + sample as f32 / i16::MAX as f32 + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: u16) -> f32 { + (sample as f32 - 32768.0) / 32768.0 + } +} + +struct StreamingResampler { + from_rate: u32, + to_rate: u32, + ratio: f64, + position: f64, + input: Vec, +} + +impl StreamingResampler { + fn new(from_rate: u32, to_rate: u32) -> Self { + Self { + from_rate, + to_rate, + ratio: from_rate as f64 / to_rate as f64, + position: 0.0, + input: Vec::new(), + } + } + + fn push(&mut self, samples: &[f32]) -> Vec { + if samples.is_empty() { + return Vec::new(); + } + if self.from_rate == self.to_rate { + return samples.to_vec(); + } + + self.input.extend_from_slice(samples); + let mut output = Vec::new(); + while self.position + 1.0 < self.input.len() as f64 { + let idx = self.position.floor() as usize; + let frac = (self.position - idx as f64) as f32; + let a = self.input[idx]; + let b = self.input[idx + 1]; + output.push(a + (b - a) * frac); + self.position += self.ratio; + } + + // Keep at least the last input sample as the interpolation anchor for + 
// the next callback. With ratios such as 48k -> 16k, `position` can step + // past the current buffer length after the final emitted sample; never + // drain beyond the slice or CoreAudio's no-unwind callback will abort. + let consumed = (self.position.floor() as usize).min(self.input.len().saturating_sub(1)); + if consumed > 0 { + self.input.drain(..consumed); + self.position -= consumed as f64; + } + output + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn resampler_handles_48k_coreaudio_512_frame_callbacks() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + for _ in 0..20 { + let input = vec![0.25; 512]; + let output = resampler.push(&input); + assert!(!output.is_empty()); + } + } + + #[test] + fn resampler_keeps_last_sample_for_next_interpolation_window() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + let _ = resampler.push(&vec![0.0; 512]); + assert!(!resampler.input.is_empty()); + assert!(resampler.position >= 0.0); + assert!(resampler.position < resampler.input.len() as f64 + resampler.ratio); + } +} diff --git a/web/src/ui/main-overlay.ts b/web/src/ui/main-overlay.ts index bdea157..6cc04a5 100644 --- a/web/src/ui/main-overlay.ts +++ b/web/src/ui/main-overlay.ts @@ -444,6 +444,10 @@ function stopCueKeepAlive(): void { } } +function usesNativeAudioCapture(): boolean { + return currentAppearance.platform === "macos"; +} + // Create the cue context if needed, resume it, and start the keep-alive so the // output device is warm and settled by the time a cue plays. Idempotent; called // during warmup. 
@@ -674,7 +678,12 @@ onOverlayEvent(async (event: OverlayEvent) => { case "audio:warmup": try { state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + ensureCueContextWarm(); + state.audioReady = true; + } else { + await startAudioCapture(); + } sendAudioWarmupReady(); } catch (error) { const msg = (error as Error).message || String(error); @@ -689,7 +698,11 @@ onOverlayEvent(async (event: OverlayEvent) => { try { state.appState = "recording"; state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + state.audioReady = true; + } else { + await startAudioCapture(); + } startWaveformAnimation(); state.hintText = ""; state.hintLevel = "info"; @@ -704,7 +717,12 @@ onOverlayEvent(async (event: OverlayEvent) => { updateView(); break; case "recording:stop": - await stopAudioCapture(); + if (usesNativeAudioCapture()) { + stopWaveformAnimation(); + state.pendingSamples = []; + } else { + await stopAudioCapture(); + } notifyAudioStopped(); break; case "transcript": {