From 452fdd99f67a41fd21a68110c81bb9a491c42d13 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Sun, 21 Jun 2026 11:26:53 +0800 Subject: [PATCH 1/8] =?UTF-8?q?feat(audio):=20=E4=BD=BF=E7=94=A8=20cpal=20?= =?UTF-8?q?=E5=8E=9F=E7=94=9F=E9=87=87=E9=9B=86=E5=BD=95=E9=9F=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为避免 WebView/WebRTC 音频处理导致录音音量前轻后响,macOS 改用 Rust/cpal 采集并复用现有 ASR 缓冲管线。 同时保存 ASR 实际收到的 16k mono PCM 为 WAV,便于继续排查音频质量。 --- src-tauri/Cargo.toml | 1 + src-tauri/src/app_state.rs | 10 + src-tauri/src/commands.rs | 75 +++++--- src-tauri/src/lib.rs | 141 ++++++++++++-- src-tauri/src/native_audio.rs | 334 ++++++++++++++++++++++++++++++++++ web/src/ui/main-overlay.ts | 24 ++- 6 files changed, 537 insertions(+), 48 deletions(-) create mode 100644 src-tauri/src/native_audio.rs diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index fb6741a..553a5c8 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -45,6 +45,7 @@ async-trait = "0.1" sherpa-onnx = { version = "1.13", default-features = false } tar = "0.4" bzip2 = "0.4" +cpal = "0.15" [features] default = [] diff --git a/src-tauri/src/app_state.rs b/src-tauri/src/app_state.rs index 4f18af6..408f02b 100644 --- a/src-tauri/src/app_state.rs +++ b/src-tauri/src/app_state.rs @@ -32,6 +32,9 @@ pub struct AppInner { /// once it attaches. Always accessed while holding `asr_session` to stay /// ordered against the drain. pub pending_audio: Mutex>>, + /// Full-session 16k mono PCM captured from the same stream sent to ASR. + /// Saved as a WAV when a recording is finalized, for diagnostics and review. + pub recording_audio: Mutex>, /// Resolves when the background ASR connect finishes (Ok) or fails (Err). /// `stop_recording` awaits this when the user stops before the session is ready. pub connect_rx: Mutex>>>, @@ -44,6 +47,10 @@ pub struct AppInner { /// audio, so already-recognized text is accumulated here and prepended to the /// new session's output. Reset at the start of every recording. pub accumulated_text: Mutex, + /// Native microphone capture used on macOS to avoid WebView/WebRTC input + /// processing. Other platforms keep the renderer getUserMedia path. + #[cfg(target_os = "macos")] + pub native_audio: Mutex>, } pub type AppHandle = Arc; @@ -67,8 +74,11 @@ pub fn create_app_state( pending_audio_warmup: Mutex::new(None), latest_transcript: Mutex::new((String::new(), String::new())), pending_audio: Mutex::new(Vec::new()), + recording_audio: Mutex::new(Vec::new()), connect_rx: Mutex::new(None), session_epoch: std::sync::atomic::AtomicU64::new(0), accumulated_text: Mutex::new(String::new()), + #[cfg(target_os = "macos")] + native_audio: Mutex::new(None), }) } diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index e61d4c7..3559437 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -289,48 +289,34 @@ fn compute_audio_level(samples: &[f32]) -> Option { Some((rms * 13.0 + peak * 2.8).powf(0.82).min(1.0)) } -/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), -/// decode to f32 samples and forward to the active ASR session. -#[tauri::command] -pub async fn send_audio_chunk( - _app: AppHandle, - state: State<'_, AppState>, - base64_chunk: String, -) -> Result { - use base64::Engine as _; +pub(crate) async fn append_audio_samples( + app: &AppHandle, + state: &AppState, + samples: Vec, +) -> bool { use std::sync::atomic::{AtomicU64, Ordering}; static CHUNK_COUNT: AtomicU64 = AtomicU64::new(0); let n = CHUNK_COUNT.fetch_add(1, Ordering::Relaxed); if n == 0 || n.is_multiple_of(50) { log_audio!( debug, - "Received chunk #{} ({} bytes base64)", + "Received audio chunk #{} ({} samples)", n, - base64_chunk.len() + samples.len() ); } - // Decode base64 → i16 PCM bytes → f32 samples - let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { - Ok(data) => data, - Err(_) => { - log_audio!(warn, "Chunk #{} base64 decode failed", n); - return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); - } - }; - let samples: Vec = bytes - .chunks_exact(2) - .map(|chunk| { - let sample = i16::from_le_bytes([chunk[0], chunk[1]]); - sample as f32 / 32768.0 - }) - .collect(); + state + .recording_audio + .lock() + .await + .extend_from_slice(&samples); // Drive the native waveform (macOS only) from the same PCM the ASR receives, // whether the chunk is sent immediately or buffered. #[cfg(target_os = "macos")] if let Some(level) = compute_audio_level(&samples) { - crate::overlay::set_audio_level(&_app, level); + crate::overlay::set_audio_level(app, level); } // Hold the `asr_session` lock across the decision so buffering stays ordered @@ -340,7 +326,7 @@ pub async fn send_audio_chunk( if let Some(ref session) = *session { if session.is_ready() { session.append_audio(&samples); - return Ok(serde_json::json!({ "ok": true })); + return false; } } @@ -350,6 +336,7 @@ pub async fn send_audio_chunk( let mut pending = state.pending_audio.lock().await; if pending.len() < MAX_PENDING_CHUNKS { pending.push(samples); + return true; } else if n.is_multiple_of(50) { log_audio!( warn, @@ -358,7 +345,37 @@ pub async fn send_audio_chunk( n ); } - Ok(serde_json::json!({ "ok": true, "buffered": true })) + true +} + +/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), +/// decode to f32 samples and forward to the active ASR session. +#[tauri::command] +pub async fn send_audio_chunk( + app: AppHandle, + state: State<'_, AppState>, + base64_chunk: String, +) -> Result { + use base64::Engine as _; + + // Decode base64 → i16 PCM bytes → f32 samples + let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { + Ok(data) => data, + Err(_) => { + log_audio!(warn, "Audio chunk base64 decode failed"); + return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); + } + }; + let samples: Vec = bytes + .chunks_exact(2) + .map(|chunk| { + let sample = i16::from_le_bytes([chunk[0], chunk[1]]); + sample as f32 / 32768.0 + }) + .collect(); + + let buffered = append_audio_samples(&app, &state, samples).await; + Ok(serde_json::json!({ "ok": true, "buffered": buffered })) } /// Notify that audio has stopped in the renderer. diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 0d31398..fb3441e 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -9,6 +9,8 @@ mod hotword; mod llm; mod migration; mod model; +#[cfg(target_os = "macos")] +mod native_audio; mod overlay; mod paste; mod stats; @@ -26,8 +28,9 @@ use tauri::{ }; /// Delay after the mic stream is ready, before entering Recording / playing the -/// start cue. Gives the browser AEC/AGC time to converge so the first words are -/// not attenuated. Trade-off: added latency between key press and "go". +/// start cue. The renderer capture path uses this to let browser audio processing +/// settle; native capture keeps the same short guard so the user-facing timing +/// remains predictable across platforms. const AUDIO_SETTLE_MS: u64 = 350; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -424,6 +427,92 @@ async fn stop_renderer_audio( } } +async fn stop_audio_capture( + app: &AppHandle, + app_inner: &Arc, + timeout_ms: u64, +) { + #[cfg(target_os = "macos")] + native_audio::stop_capture(app_inner).await; + + stop_renderer_audio(app, app_inner, timeout_ms).await; +} + +async fn save_recording_wav(app: &AppHandle, app_inner: &Arc) { + let samples = { + let mut audio = app_inner.recording_audio.lock().await; + if audio.is_empty() { + return; + } + std::mem::take(&mut *audio) + }; + + let data_dir = match app.path().app_data_dir() { + Ok(dir) => dir, + Err(error) => { + log_audio!( + warn, + "Resolve app data dir for recording WAV failed: {}", + error + ); + return; + } + }; + let output_dir = data_dir.join("recordings"); + if let Err(error) = std::fs::create_dir_all(&output_dir) { + log_audio!( + warn, + "Create recording WAV directory failed ({}): {}", + output_dir.display(), + error + ); + return; + } + + let ts = chrono::Local::now().format("%Y%m%d-%H%M%S%.3f"); + let path = output_dir.join(format!("voicepaste-{ts}.wav")); + match write_wav_16k_mono(&path, &samples) { + Ok(()) => log_audio!(info, "Recording WAV saved: {}", path.display()), + Err(error) => log_audio!( + warn, + "Write recording WAV failed ({}): {}", + path.display(), + error + ), + } +} + +fn write_wav_16k_mono(path: &std::path::Path, samples: &[f32]) -> Result<(), String> { + const SAMPLE_RATE: u32 = 16_000; + const CHANNELS: u16 = 1; + const BYTES_PER_SAMPLE: u16 = 2; + + let data_bytes = samples.len() * BYTES_PER_SAMPLE as usize; + let riff_size = 36usize + .checked_add(data_bytes) + .ok_or_else(|| "WAV too large".to_string())?; + let mut wav = Vec::with_capacity(44 + data_bytes); + wav.extend_from_slice(b"RIFF"); + wav.extend_from_slice(&(riff_size as u32).to_le_bytes()); + wav.extend_from_slice(b"WAVE"); + wav.extend_from_slice(b"fmt "); + wav.extend_from_slice(&16u32.to_le_bytes()); + wav.extend_from_slice(&1u16.to_le_bytes()); + wav.extend_from_slice(&CHANNELS.to_le_bytes()); + wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes()); + wav.extend_from_slice(&(SAMPLE_RATE * CHANNELS as u32 * BYTES_PER_SAMPLE as u32).to_le_bytes()); + wav.extend_from_slice(&(CHANNELS * BYTES_PER_SAMPLE).to_le_bytes()); + wav.extend_from_slice(&(BYTES_PER_SAMPLE * 8).to_le_bytes()); + wav.extend_from_slice(b"data"); + wav.extend_from_slice(&(data_bytes as u32).to_le_bytes()); + for &sample in samples { + let pcm = (sample.clamp(-1.0, 1.0) * i16::MAX as f32) as i16; + wav.extend_from_slice(&pcm.to_le_bytes()); + } + + std::fs::write(path, wav).map_err(|e| e.to_string()) +} + async fn wait_for_audio_warmup( app_inner: &Arc, timeout_ms: u64, @@ -653,6 +742,7 @@ async fn start_recording(app_handle: AppHandle) { } *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + app_inner.recording_audio.lock().await.clear(); let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); // Re-position before showing so the overlay follows the current display layout // (e.g. after an external monitor was connected/disconnected). @@ -663,6 +753,24 @@ async fn start_recording(app_handle: AppHandle) { // 2. Warm up microphone capture set_app_state(&app_handle, &app_inner, app_state::AppState::Connecting).await; + #[cfg(target_os = "macos")] + if let Err(e) = native_audio::start_capture(app_handle.clone(), Arc::clone(&app_inner)).await { + *recording_state.0.lock().unwrap() = false; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + log_rec!(warn, "Native audio warmup failed: {}", e); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": e, "level": "error", "variant": "text" } + }), + ); + return; + } + let _ = app_handle.emit( "overlay:event", serde_json::json!({ @@ -671,7 +779,7 @@ async fn start_recording(app_handle: AppHandle) { ); if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { *recording_state.0.lock().unwrap() = false; - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -690,7 +798,7 @@ async fn start_recording(app_handle: AppHandle) { // Check if recording was cancelled during warmup (hold mode: quick press-release) if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during warmup, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -698,19 +806,16 @@ async fn start_recording(app_handle: AppHandle) { return; } - // Settle delay before the cue: getUserMedia resolving only means the stream - // exists, not that its AEC/AGC have converged. The mic is live and DSP converges - // on real input during this wait (capture stays gated off until Recording), while - // the renderer's cue keep-alive (set up during warmup) holds the output device - // warm so the cue still plays smoothly afterwards. The cue is the user's "go" - // signal, so it must land AFTER this delay — never before, or the user would - // speak into the unconverged window and lose the first words. + // Settle delay before the cue: the selected capture backend is live during + // this wait (audio stays gated off until Recording), while the renderer's cue + // keep-alive holds the output device warm so the start cue plays smoothly. + // The cue is the user's "go" signal, so it lands after capture warmup. tokio::time::sleep(std::time::Duration::from_millis(AUDIO_SETTLE_MS)).await; // Re-check cancellation: the user may have released during the settle delay. if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during settle, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -928,7 +1033,8 @@ async fn connect_and_attach( } *app_handle.state::().0.lock().unwrap() = false; app_inner.pending_audio.lock().await.clear(); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + save_recording_wav(&app_handle, &app_inner).await; // Emit error hint BEFORE setting idle so the overlay shows it: the // frontend's idle handler only clears "info"-level hints. let _ = app_handle.emit("overlay:event", serde_json::json!({ @@ -967,7 +1073,8 @@ async fn stop_recording(app_handle: AppHandle) { set_app_state(&app_handle, &app_inner, app_state::AppState::Finishing).await; // 2. Stop renderer audio first so the final buffered chunk is flushed. - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + save_recording_wav(&app_handle, &app_inner).await; // 3. Acquire the ready ASR session. If the background connect hasn't finished // (user stopped before it was ready), wait for it to resolve so the buffered @@ -1108,13 +1215,14 @@ async fn cancel_recording(app_handle: AppHandle) { .session_epoch .fetch_add(1, std::sync::atomic::Ordering::SeqCst); app_inner.pending_audio.lock().await.clear(); + app_inner.recording_audio.lock().await.clear(); // Clear the active prompt ID since the session was cancelled if let Some(active) = app_handle.try_state::() { *active.0.lock().unwrap() = None; } - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; if let Some(session) = app_inner.asr_session.lock().await.take() { session.close(); @@ -1393,7 +1501,8 @@ async fn finalize_on_failure(app: &AppHandle, app_inner: &Arc, + input_thread: Option>, + forward_task: tauri::async_runtime::JoinHandle<()>, +} + +impl NativeAudioCapture { + async fn stop(mut self) { + let _ = self.stop_tx.send(()); + if let Some(input_thread) = self.input_thread.take() { + let _ = tokio::task::spawn_blocking(move || input_thread.join()).await; + } + let _ = self.forward_task.await; + } +} + +pub async fn start_capture( + app: AppHandle, + app_inner: Arc, +) -> Result<(), String> { + let mut slot = app_inner.native_audio.lock().await; + if slot.is_some() { + return Ok(()); + } + + let (audio_tx, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); + let (stop_tx, stop_rx) = std::sync::mpsc::channel::<()>(); + let (ready_tx, ready_rx) = std::sync::mpsc::channel::>(); + + let input_thread = thread::Builder::new() + .name("voicepaste-native-audio".to_string()) + .spawn(move || { + if let Err(error) = run_input_thread(audio_tx, stop_rx, ready_tx) { + log_audio!(error, "Native audio thread exited with error: {}", error); + } + }) + .map_err(|e| format!("启动原生录音线程失败: {e}"))?; + + match ready_rx.recv() { + Ok(Ok(())) => {} + Ok(Err(error)) => { + let _ = stop_tx.send(()); + let _ = input_thread.join(); + return Err(error); + } + Err(_) => { + let _ = input_thread.join(); + return Err("原生录音线程提前退出".to_string()); + } + } + + let forward_app = app.clone(); + let forward_inner = Arc::clone(&app_inner); + let forward_task = tauri::async_runtime::spawn(async move { + while let Some(samples) = rx.recv().await { + let state = forward_inner.state.lock().await.clone(); + if matches!( + state, + app_state::AppState::Recording | app_state::AppState::Finishing + ) { + crate::commands::append_audio_samples(&forward_app, &forward_inner, samples).await; + } + } + }); + + *slot = Some(NativeAudioCapture { + stop_tx, + input_thread: Some(input_thread), + forward_task, + }); + log_audio!(info, "Native cpal microphone capture started"); + Ok(()) +} + +pub async fn stop_capture(app_inner: &Arc) { + let capture = app_inner.native_audio.lock().await.take(); + if let Some(capture) = capture { + capture.stop().await; + log_audio!(info, "Native cpal microphone capture stopped"); + } +} + +fn run_input_thread( + tx: tokio::sync::mpsc::UnboundedSender>, + stop_rx: std::sync::mpsc::Receiver<()>, + ready_tx: std::sync::mpsc::Sender>, +) -> Result<(), String> { + let final_chunk = Arc::new(Mutex::new(Vec::::with_capacity(TARGET_CHUNK_SAMPLES))); + let stream = match build_input_stream(tx.clone(), Arc::clone(&final_chunk)) { + Ok(stream) => stream, + Err(error) => { + let _ = ready_tx.send(Err(error.clone())); + return Err(error); + } + }; + if let Err(error) = stream.play() { + let message = format!("启动麦克风输入流失败: {error}"); + let _ = ready_tx.send(Err(message.clone())); + return Err(message); + } + let _ = ready_tx.send(Ok(())); + stop_rx + .recv() + .map_err(|e| format!("等待停止原生录音失败: {e}"))?; + drop(stream); + if let Ok(mut chunk) = final_chunk.lock() { + if !chunk.is_empty() { + let tail = std::mem::take(&mut *chunk); + let _ = tx.send(tail); + } + } + Ok(()) +} + +fn build_input_stream( + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, +) -> Result { + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| "未找到默认麦克风输入设备".to_string())?; + let config = device + .default_input_config() + .map_err(|e| format!("读取默认麦克风配置失败: {e}"))?; + let sample_rate = config.sample_rate().0; + let channels = usize::from(config.channels()); + let stream_config = config.config(); + + log_audio!( + info, + "Native input device: sample_rate={}, channels={}, format={:?}", + sample_rate, + channels, + config.sample_format() + ); + + let err_fn = |err| { + log_audio!(error, "Native microphone stream error: {}", err); + }; + + match config.sample_format() { + cpal::SampleFormat::F32 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::I16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::U16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + other => Err(format!("不支持的采样格式: {other:?}")), + } +} + +fn build_stream( + device: &cpal::Device, + config: &cpal::StreamConfig, + channels: usize, + sample_rate: u32, + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, + err_fn: impl FnMut(cpal::StreamError) + Send + 'static, +) -> Result +where + T: cpal::Sample + cpal::SizedSample + Send + 'static, + f32: FromNativeSample, +{ + let mut resampler = StreamingResampler::new(sample_rate, TARGET_SAMPLE_RATE); + + device + .build_input_stream( + config, + move |data: &[T], _| { + let mono = downmix_to_mono(data, channels); + let samples = resampler.push(&mono); + let Ok(mut chunk) = final_chunk.lock() else { + return; + }; + for sample in samples { + chunk.push(sample); + if chunk.len() >= TARGET_CHUNK_SAMPLES { + let full = std::mem::take(&mut *chunk); + if tx.send(full).is_err() { + return; + } + } + } + }, + err_fn, + None, + ) + .map_err(|e| format!("创建麦克风输入流失败: {e}")) +} + +fn downmix_to_mono(data: &[T], channels: usize) -> Vec +where + T: Copy, + f32: FromNativeSample, +{ + data.chunks(channels) + .map(|frame| { + let sum = frame + .iter() + .map(|&sample| f32::from_native_sample(sample)) + .sum::(); + sum / channels as f32 + }) + .collect() +} + +trait FromNativeSample { + fn from_native_sample(sample: T) -> f32; +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: f32) -> f32 { + sample.clamp(-1.0, 1.0) + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: i16) -> f32 { + sample as f32 / i16::MAX as f32 + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: u16) -> f32 { + (sample as f32 - 32768.0) / 32768.0 + } +} + +struct StreamingResampler { + from_rate: u32, + to_rate: u32, + ratio: f64, + position: f64, + input: Vec, +} + +impl StreamingResampler { + fn new(from_rate: u32, to_rate: u32) -> Self { + Self { + from_rate, + to_rate, + ratio: from_rate as f64 / to_rate as f64, + position: 0.0, + input: Vec::new(), + } + } + + fn push(&mut self, samples: &[f32]) -> Vec { + if samples.is_empty() { + return Vec::new(); + } + if self.from_rate == self.to_rate { + return samples.to_vec(); + } + + self.input.extend_from_slice(samples); + let mut output = Vec::new(); + while self.position + 1.0 < self.input.len() as f64 { + let idx = self.position.floor() as usize; + let frac = (self.position - idx as f64) as f32; + let a = self.input[idx]; + let b = self.input[idx + 1]; + output.push(a + (b - a) * frac); + self.position += self.ratio; + } + + // Keep at least the last input sample as the interpolation anchor for + // the next callback. With ratios such as 48k -> 16k, `position` can step + // past the current buffer length after the final emitted sample; never + // drain beyond the slice or CoreAudio's no-unwind callback will abort. + let consumed = (self.position.floor() as usize).min(self.input.len().saturating_sub(1)); + if consumed > 0 { + self.input.drain(..consumed); + self.position -= consumed as f64; + } + output + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn resampler_handles_48k_coreaudio_512_frame_callbacks() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + for _ in 0..20 { + let input = vec![0.25; 512]; + let output = resampler.push(&input); + assert!(!output.is_empty()); + } + } + + #[test] + fn resampler_keeps_last_sample_for_next_interpolation_window() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + let _ = resampler.push(&vec![0.0; 512]); + assert!(!resampler.input.is_empty()); + assert!(resampler.position >= 0.0); + assert!(resampler.position < resampler.input.len() as f64 + resampler.ratio); + } +} diff --git a/web/src/ui/main-overlay.ts b/web/src/ui/main-overlay.ts index bdea157..6cc04a5 100644 --- a/web/src/ui/main-overlay.ts +++ b/web/src/ui/main-overlay.ts @@ -444,6 +444,10 @@ function stopCueKeepAlive(): void { } } +function usesNativeAudioCapture(): boolean { + return currentAppearance.platform === "macos"; +} + // Create the cue context if needed, resume it, and start the keep-alive so the // output device is warm and settled by the time a cue plays. Idempotent; called // during warmup. @@ -674,7 +678,12 @@ onOverlayEvent(async (event: OverlayEvent) => { case "audio:warmup": try { state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + ensureCueContextWarm(); + state.audioReady = true; + } else { + await startAudioCapture(); + } sendAudioWarmupReady(); } catch (error) { const msg = (error as Error).message || String(error); @@ -689,7 +698,11 @@ onOverlayEvent(async (event: OverlayEvent) => { try { state.appState = "recording"; state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + state.audioReady = true; + } else { + await startAudioCapture(); + } startWaveformAnimation(); state.hintText = ""; state.hintLevel = "info"; @@ -704,7 +717,12 @@ onOverlayEvent(async (event: OverlayEvent) => { updateView(); break; case "recording:stop": - await stopAudioCapture(); + if (usesNativeAudioCapture()) { + stopWaveformAnimation(); + state.pendingSamples = []; + } else { + await stopAudioCapture(); + } notifyAudioStopped(); break; case "transcript": { From d168f605b5230d47bfd7715d1f67c652efc34b8f Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Tue, 23 Jun 2026 23:59:58 +0800 Subject: [PATCH 2/8] =?UTF-8?q?refactor(audio):=20=E5=8E=9F=E7=94=9F?= =?UTF-8?q?=E5=BD=95=E9=9F=B3=20ready=20=E4=BF=A1=E5=8F=B7=E6=94=B9?= =?UTF-8?q?=E7=94=A8=20tokio=20oneshot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原先用 std::sync::mpsc 的阻塞 recv 在 async 上下文里等待采集 线程就绪,会占住一个 tokio worker 线程。改用 oneshot 异步等待, 采集流建好的瞬间即返回,不再阻塞执行器。 Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/native_audio.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src-tauri/src/native_audio.rs b/src-tauri/src/native_audio.rs index 837d051..f8268ba 100644 --- a/src-tauri/src/native_audio.rs +++ b/src-tauri/src/native_audio.rs @@ -34,7 +34,7 @@ pub async fn start_capture( let (audio_tx, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); let (stop_tx, stop_rx) = std::sync::mpsc::channel::<()>(); - let (ready_tx, ready_rx) = std::sync::mpsc::channel::>(); + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::>(); let input_thread = thread::Builder::new() .name("voicepaste-native-audio".to_string()) @@ -45,7 +45,9 @@ pub async fn start_capture( }) .map_err(|e| format!("启动原生录音线程失败: {e}"))?; - match ready_rx.recv() { + // Await the ready signal asynchronously: the worker sends it the moment the + // input stream is built, so we never block a tokio worker thread. + match ready_rx.await { Ok(Ok(())) => {} Ok(Err(error)) => { let _ = stop_tx.send(()); @@ -92,7 +94,7 @@ pub async fn stop_capture(app_inner: &Arc) { fn run_input_thread( tx: tokio::sync::mpsc::UnboundedSender>, stop_rx: std::sync::mpsc::Receiver<()>, - ready_tx: std::sync::mpsc::Sender>, + ready_tx: tokio::sync::oneshot::Sender>, ) -> Result<(), String> { let final_chunk = Arc::new(Mutex::new(Vec::::with_capacity(TARGET_CHUNK_SAMPLES))); let stream = match build_input_stream(tx.clone(), Arc::clone(&final_chunk)) { From bf31d50baa8944f5bfd297dec7dc1ca2643b7cd7 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Sun, 21 Jun 2026 11:56:53 +0800 Subject: [PATCH 3/8] =?UTF-8?q?docs(audio):=20=E8=AE=B0=E5=BD=95=E5=8E=9F?= =?UTF-8?q?=E7=94=9F=E6=82=AC=E6=B5=AE=E7=AA=97=E4=B8=8E=E9=87=8D=E8=AF=95?= =?UTF-8?q?=E8=AE=BE=E8=AE=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 先固化 WebView-free 主路径、late result 修复、录音资产和重试策略,确保后续实现按已确认的阶段推进。 --- ...06-21-native-overlay-retry-audio-design.md | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md diff --git a/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md new file mode 100644 index 0000000..f9192c6 --- /dev/null +++ b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md @@ -0,0 +1,109 @@ +# Native Overlay, Retry, and Recording Asset Design + +- Date: 2026-06-21 +- Status: Approved +- Branch: `codex/native-cpal-capture` + +## Goal + +Make the macOS recording main path independent of WebView control logic, then make ASR failure and late-result behavior recoverable by using saved WAV recordings as retryable transcription assets. + +The user-visible behavior outside recording, overlay feedback, retry, and history playback should remain unchanged. + +## Execution Order + +1. Native overlay and native cue playback. +2. Fix late ASR result handling. +3. Add recording asset, history playback, retry, and retention policy. + +## Phase 1: Native Main Path + +On macOS, the recording main path should no longer depend on WebView for recording lifecycle control or cue playback. + +The existing native overlay remains the visual surface. It should handle actionable failure states directly, including a retry icon button. The retry control must be visually subtle and fit the current glass pill style. Its maximum visual footprint must not exceed the current recording waveform element, so the control remains refined rather than dominant. + +Failure overlay behavior: + +- Show the existing failure text style. +- Show a refresh-style icon button only, without text. +- Display a 5-second countdown affordance around the retry button. +- If the user does not click within 5 seconds, hide the overlay. +- The failed transcription attempt remains available in input history when a WAV exists. + +Cue playback should move to native playback on macOS so start/end cues do not depend on the overlay WebView. Windows can keep the current WebView path unless the native implementation is naturally cross-platform. + +## Phase 2: Late ASR Result Handling + +The current Doubao flow can return a partial result when `commit_and_await_final` times out after 5 seconds, while the server may continue sending a more complete result afterward. This causes premature paste of incomplete text. + +The fix should prefer correctness over premature paste: + +- Do not paste a partial result merely because the 5-second commit wait elapsed. +- If the session has not produced a reliable final result by the deadline, mark the attempt as failed or retryable instead of pasting known-incomplete text. +- If a definite final result or terminal close arrives within the accepted completion window, paste normally. +- The saved WAV should make manual retry cheap, so retryable failure is better than silently pasting partial text. + +## Phase 3: Recording Assets and Retry + +Each transcription attempt should have a durable record that can represent success or failure. + +History entries should support: + +- `status`: success or failed. +- `text`: successful final text, or a short failure description. +- `audioPath`: saved WAV path when available. +- `error`: failure reason when applicable. +- `retryOf`: optional original entry timestamp or ID. + +Successful entries continue to count toward usage statistics. Failed entries should appear in input history but should not increase total session or character counts. + +Retry behavior: + +- Retry uses the saved WAV, not the microphone. +- Retry can be triggered from the native failure overlay within 5 seconds. +- Retry can also be triggered from Settings home input history. +- A successful retry creates or updates a successful history record and follows the normal paste/clipboard/statistics path. +- If recording retention is disabled, the failed WAV is deleted after a retry succeeds. + +History UI behavior: + +- Successful rows show play, copy, and delete icon buttons. +- Failed rows show play, retry, and delete icon buttons. +- Buttons must match the current input-record action style: orange solid rounded-square icon buttons with white line icons. + +## Recording Retention Setting + +Add an app setting for whether to retain recordings. + +Default: disabled. + +When enabled: + +- Keep successful and failed recordings for the most recent 1 month. +- Prune older recordings and references. + +When disabled: + +- Keep only recordings needed for failed retryable entries. +- Delete recordings after successful transcription or successful retry. + +## Testing + +Backend: + +- Unit tests for history serialization/backward compatibility. +- Unit tests for retention pruning decisions. +- Tests for retrying a WAV through the same ASR path where practical. +- Tests for Doubao commit timeout behavior so partial text is not treated as successful final output. + +Frontend/settings: + +- Tests for history rows with success and failure states. +- Tests for play/retry button bridge calls. + +Manual: + +- Network timeout creates a failed history entry with WAV. +- Native overlay retry starts a transcription attempt from WAV. +- Settings history retry works after overlay disappears. +- Successful retry removes failed-only WAV when retention is disabled. From 924145028a997077ac6a9dde6f21c0819584c7fa Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Tue, 23 Jun 2026 23:59:58 +0800 Subject: [PATCH 4/8] =?UTF-8?q?chore:=20=E6=B7=BB=E5=8A=A0=20dev:no-watch?= =?UTF-8?q?=20=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 调试原生悬浮窗时常需要不触发热重载的运行方式。 Co-Authored-By: Claude Opus 4.8 --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 96d3160..681bcb2 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "scripts": { "tauri": "tauri", "dev": "tauri dev", + "dev:no-watch": "tauri dev --no-watch", "dev:web": "vite", "build:web": "vite build", "pack": "tsx --env-file=.env scripts/pack.ts", From f24a146139ff4c089affdc448e7ddb06103ee345 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Wed, 24 Jun 2026 00:28:06 +0800 Subject: [PATCH 5/8] =?UTF-8?q?feat(audio):=20=E5=BC=80=E5=A7=8B/=E7=BB=93?= =?UTF-8?q?=E6=9D=9F=E6=8F=90=E7=A4=BA=E9=9F=B3=E6=94=B9=E7=94=A8=E5=8E=9F?= =?UTF-8?q?=E7=94=9F=E6=92=AD=E6=94=BE=EF=BC=8CmacOS=20=E5=8E=BB=E6=8E=89?= =?UTF-8?q?=20settle=20=E5=BB=B6=E8=BF=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原 WebView getUserMedia 路径靠 base64 事件让前端 AudioContext 播 提示音;原生采集下直接用 paste::play_sound 播放,更稳更准时。 原生采集没有浏览器 AEC/AGC 需要收敛,macOS 的 settle 延迟置 0, 按键到「开始」更跟手。 Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/lib.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index fb3441e..f7f7565 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -28,9 +28,12 @@ use tauri::{ }; /// Delay after the mic stream is ready, before entering Recording / playing the -/// start cue. The renderer capture path uses this to let browser audio processing -/// settle; native capture keeps the same short guard so the user-facing timing -/// remains predictable across platforms. +/// start cue. The renderer (getUserMedia) path needs it so the browser's AEC/AGC +/// converge before the first words. Native cpal capture has no such DSP warmup, +/// so macOS uses 0 — testing whether dropped leading words / cue glitches return. +#[cfg(target_os = "macos")] +const AUDIO_SETTLE_MS: u64 = 0; +#[cfg(not(target_os = "macos"))] const AUDIO_SETTLE_MS: u64 = 350; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -374,14 +377,19 @@ fn resolve_configured_sound_path( /// dedicated, kept-warm AudioContext, so the cue is full-volume and never /// truncated. Falls back to `afplay` only if the file cannot be read. fn emit_cue(app: &AppHandle, app_inner: &Arc, name: &str) { - use base64::Engine as _; - let Some(file_path) = resolve_configured_sound_path(app, app_inner, name) else { return; }; + #[cfg(target_os = "macos")] + { + crate::paste::play_sound(&file_path); + } + + #[cfg(not(target_os = "macos"))] match std::fs::read(&file_path) { Ok(bytes) => { + use base64::Engine as _; let data = base64::engine::general_purpose::STANDARD.encode(&bytes); let _ = app.emit( "overlay:event", From fa7d4eba2142fb3e2923fe8b541a0505d3844643 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Wed, 24 Jun 2026 09:30:17 +0800 Subject: [PATCH 6/8] =?UTF-8?q?build:=20=E4=BF=AE=E5=A4=8D=20Linux=20CI=20?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=EF=BC=88cpal=20=E9=99=90=E5=AE=9A=20macOS=20?= =?UTF-8?q?+=20=E9=9D=9E=20macOS=20=E6=9C=AA=E7=94=A8=E5=8F=82=E6=95=B0?= =?UTF-8?q?=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - native_audio 仅在 macOS 编译,cpal 也只在 macOS 用到;放在通用 依赖里会让 Linux CI 拉取 cpal→alsa-sys,需要 libasound2-dev (alsa.pc) 而构建失败。改到 macOS target 依赖即可避开。 - append_audio_samples 的 app 仅用于 macOS 原生波形,非 macOS 下 以 let _ = app 消除 unused 警告(clippy -D warnings)。 Co-Authored-By: Claude Opus 4.8 --- src-tauri/Cargo.toml | 4 +++- src-tauri/src/commands.rs | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 553a5c8..cdf92d4 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -45,7 +45,6 @@ async-trait = "0.1" sherpa-onnx = { version = "1.13", default-features = false } tar = "0.4" bzip2 = "0.4" -cpal = "0.15" [features] default = [] @@ -60,3 +59,6 @@ wiremock = "0.6" objc2 = "0.6" objc2-app-kit = "0.3" objc2-foundation = "0.3" +# Native microphone capture is macOS-only (native_audio is cfg(macos)); keeping +# cpal off other targets avoids pulling ALSA (libasound2-dev) into the Linux CI. +cpal = "0.15" diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index 3559437..ab46bae 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -318,6 +318,9 @@ pub(crate) async fn append_audio_samples( if let Some(level) = compute_audio_level(&samples) { crate::overlay::set_audio_level(app, level); } + // `app` only drives the macOS native waveform above; unused on other platforms. + #[cfg(not(target_os = "macos"))] + let _ = app; // Hold the `asr_session` lock across the decision so buffering stays ordered // against the background connect task's drain (same lock), guaranteeing no From d56fd4fd73ccadfcdd3cf2de79607a6f7be2bf41 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Tue, 23 Jun 2026 23:59:43 +0800 Subject: [PATCH 7/8] =?UTF-8?q?fix(asr):=20doubao=20commit=20=E8=B6=85?= =?UTF-8?q?=E6=97=B6=E8=BF=94=E5=9B=9E=E9=94=99=E8=AF=AF=E8=80=8C=E9=9D=9E?= =?UTF-8?q?=E5=9B=9E=E9=80=80=E7=A9=BA=E6=96=87=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 超时回退到 partial/final 文本会把未完成的识别当成成功结果, 掩盖网络问题。改为返回错误并把超时放宽到 15s,让上层据此走 失败兜底(提示 + 重试),结果更可控。 Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/asr/doubao.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src-tauri/src/asr/doubao.rs b/src-tauri/src/asr/doubao.rs index 1c0b727..60b3a3b 100644 --- a/src-tauri/src/asr/doubao.rs +++ b/src-tauri/src/asr/doubao.rs @@ -629,8 +629,6 @@ impl AsrEngine for DoubaoEngine { let session = DoubaoSession { is_ready: is_ready.clone(), is_committed: is_committed.clone(), - final_text: final_text.clone(), - latest_result_text: latest_result_text.clone(), writer_tx, commit_tx: commit_tx.clone(), }; @@ -930,8 +928,6 @@ impl AsrEngine for DoubaoEngine { struct DoubaoSession { is_ready: Arc, is_committed: Arc, - final_text: Arc>, - latest_result_text: Arc>, /// Sends frames to the dedicated writer task. A single FIFO consumer keeps /// frames ordered and guarantees the last packet is written after all audio. writer_tx: mpsc::UnboundedSender, @@ -979,14 +975,9 @@ impl AsrSession for DoubaoSession { let (tx, rx) = tokio::sync::oneshot::channel(); *self.commit_tx.lock().await = Some(tx); - match tokio::time::timeout(std::time::Duration::from_secs(5), rx).await { + match tokio::time::timeout(std::time::Duration::from_secs(15), rx).await { Ok(Ok(text)) => Ok(text), - _ => { - // Timeout: use whatever we have - let latest = self.latest_result_text.lock().await.clone(); - let final_t = self.final_text.lock().await.clone(); - Ok(if latest.is_empty() { final_t } else { latest }) - } + _ => Err("ASR 最终结果超时,请检查网络连接".to_string()), } } From 2154c6ed0f3b6aac877ee8d2064678017da62653 Mon Sep 17 00:00:00 2001 From: chen fengchao Date: Wed, 24 Jun 2026 00:00:45 +0800 Subject: [PATCH 8/8] =?UTF-8?q?feat(overlay):=20=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?=E8=BD=AC=E5=86=99=E5=BD=95=E9=9F=B3=E7=95=99=E5=AD=98=E4=B8=8E?= =?UTF-8?q?=E4=B8=80=E9=94=AE=E9=87=8D=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为转写失败提供完整的恢复路径,避免一次网络抖动就丢失整段语音。 录音留存:与发送给 ASR 相同的 16k 单声道 PCM 整段缓存,停止时 写为 WAV。成功后按 keep_recordings 设置保留或删除,失败录音保留 以供重试,未重试的录音 31 天后随留存清理一并回收。新增「保留录音」 设置项与历史记录里的播放/重试入口。 一键重试:失败在悬浮窗给出可重试提示与按钮,重放保留的 WAV 重新 转写,结果以流式方式回填,焦点在粘贴前交还给原窗口(点击按钮会 激活悬浮窗,故记录并恢复前台 App)。 键盘操作:失败提示展示期间再次按主热键即触发重试;错误态与重试中 均可按 ESC 终止。重试按钮显示触发热键(如「重试 (R ⌥)」),符号与 设置页一致。 其它:没说话就停止时立即结束、不进入重试(跳过开头提示音窗口的 能量判断);macOS 原生采集无 AEC/AGC,settle 延迟置 0。 Co-Authored-By: Claude Opus 4.8 --- config.yaml.example | 1 + src-tauri/src/app_state.rs | 9 + src-tauri/src/commands.rs | 17 + src-tauri/src/config.rs | 3 + src-tauri/src/hotkey.rs | 2 +- src-tauri/src/lib.rs | 782 +++++++++++++++++++++++---- src-tauri/src/overlay.rs | 398 +++++++++++++- src-tauri/src/stats.rs | 184 ++++++- web/index.html | 7 + web/src/bridge/overlay.ts | 5 + web/src/bridge/settings.ts | 10 +- web/src/ui/main-overlay.ts | 99 +++- web/src/ui/pages/AppSettingsPage.tsx | 12 +- web/src/ui/pages/HomePage.tsx | 117 +++- web/styles.css | 58 ++ 15 files changed, 1560 insertions(+), 144 deletions(-) diff --git a/config.yaml.example b/config.yaml.example index 7ae25a3..50707e4 100644 --- a/config.yaml.example +++ b/config.yaml.example @@ -9,6 +9,7 @@ app: hotkey_mode: toggle # toggle=按一次开始再按一次结束,hold=按住说话松开结束 remove_trailing_period: true # 自动删除识别结果末尾的句号 keep_clipboard: true # 保留识别结果在剪贴板 + keep_recordings: false # 是否保留成功录音;关闭时仅保留失败录音用于重试 theme: system # dark / light / system overlay_style: liquid # macOS 悬浮窗外观(仅 macOS 生效) sound: diff --git a/src-tauri/src/app_state.rs b/src-tauri/src/app_state.rs index 408f02b..308c8ef 100644 --- a/src-tauri/src/app_state.rs +++ b/src-tauri/src/app_state.rs @@ -35,6 +35,12 @@ pub struct AppInner { /// Full-session 16k mono PCM captured from the same stream sent to ASR. /// Saved as a WAV when a recording is finalized, for diagnostics and review. pub recording_audio: Mutex>, + /// WAV path for the current recording once saved. + pub current_recording_wav: Mutex>, + /// History timestamp of the failed entry currently being retried. + pub current_retry_of: Mutex>, + /// Latest failed history entry that has a WAV and can be retried from the overlay. + pub current_failure_ts: Mutex>, /// Resolves when the background ASR connect finishes (Ok) or fails (Err). /// `stop_recording` awaits this when the user stops before the session is ready. pub connect_rx: Mutex>>>, @@ -75,6 +81,9 @@ pub fn create_app_state( latest_transcript: Mutex::new((String::new(), String::new())), pending_audio: Mutex::new(Vec::new()), recording_audio: Mutex::new(Vec::new()), + current_recording_wav: Mutex::new(None), + current_retry_of: Mutex::new(None), + current_failure_ts: Mutex::new(None), connect_rx: Mutex::new(None), session_epoch: std::sync::atomic::AtomicU64::new(0), accumulated_text: Mutex::new(String::new()), diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index ab46bae..a84444e 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -271,6 +271,23 @@ pub async fn delete_history( Ok(serde_json::json!({ "ok": true })) } +/// Retry a failed history entry by replaying its saved WAV through ASR. +#[tauri::command] +pub async fn retry_history_transcription( + app: AppHandle, + ts: String, +) -> Result { + crate::retry_history_transcription(app, ts).await +} + +/// Retry the latest failed recording from the overlay retry button. +#[tauri::command] +pub async fn retry_latest_failed_transcription( + app: AppHandle, +) -> Result { + crate::retry_latest_failed_transcription(app).await +} + /// Compute a 0..1 loudness level from f32 PCM samples for the overlay waveform. /// Mirrors the web AnalyserNode mapping (RMS + peak, mild compression). #[cfg(target_os = "macos")] diff --git a/src-tauri/src/config.rs b/src-tauri/src/config.rs index f1b4d54..3f54d47 100644 --- a/src-tauri/src/config.rs +++ b/src-tauri/src/config.rs @@ -140,6 +140,8 @@ pub struct AppSettings { pub remove_trailing_period: bool, #[serde(default = "default_true")] pub keep_clipboard: bool, + #[serde(default)] + pub keep_recordings: bool, #[serde(default = "default_theme")] pub theme: String, #[serde(default = "default_overlay_style")] @@ -812,6 +814,7 @@ impl Default for AppConfig { hotkey_mode: default_hotkey_mode(), remove_trailing_period: true, keep_clipboard: true, + keep_recordings: false, theme: default_theme(), overlay_style: default_overlay_style(), sound: None, diff --git a/src-tauri/src/hotkey.rs b/src-tauri/src/hotkey.rs index 209a7ec..3052d83 100644 --- a/src-tauri/src/hotkey.rs +++ b/src-tauri/src/hotkey.rs @@ -505,7 +505,7 @@ fn run_listener_loop(tap: &Tap, config: &HotkeyConfig, app_handle: &tauri::AppHa escape_was_pressed = true; let handle = app_handle.clone(); tauri::async_runtime::spawn(async move { - crate::cancel_recording(handle).await; + crate::on_escape(handle).await; }); } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index f7f7565..24657b1 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -162,6 +162,8 @@ pub fn run() { commands::get_stats, commands::get_history, commands::delete_history, + commands::retry_history_transcription, + commands::retry_latest_failed_transcription, commands::send_audio_chunk, commands::audio_stopped, commands::audio_warmup_ready, @@ -268,6 +270,107 @@ fn position_overlay(app_handle: &AppHandle) { } } +fn set_overlay_retry_interaction(app_handle: &AppHandle, enabled: bool) { + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.set_ignore_cursor_events(!enabled); + } + if enabled { + // The user's app is still frontmost here; remember it so a successful retry + // can return focus before pasting (clicking the retry button activates the + // overlay otherwise). Self-capture is filtered out inside the helper. + overlay::capture_foreground_app(app_handle); + } +} + +/// Map one accelerator token to the symbol the settings UI shows. Mirrors the +/// frontend `normalizeHotkeyLabel` so the overlay label matches system settings. +fn normalize_hotkey_key(key: &str) -> &str { + match key { + "CmdOrCtrl" | "CommandOrControl" | "Command" | "Cmd" | "Meta" => "⌘", + "Control" | "Ctrl" => "⌃", + "Shift" => "⇧", + "Alt" | "Option" => "⌥", + "Space" => "␣", + "ControlLeft" => "L ⌃", + "ControlRight" => "R ⌃", + "ShiftLeft" => "L ⇧", + "ShiftRight" => "R ⇧", + "AltLeft" => "L ⌥", + "AltRight" => "R ⌥", + "MetaLeft" => "L ⌘", + "MetaRight" => "R ⌘", + other => other, + } +} + +/// Format an accelerator string ("AltRight", "Control+Space") into the symbol +/// label shown in settings ("R ⌥", "⌃ ␣"). +fn format_hotkey_label(hotkey: &str) -> String { + hotkey + .split('+') + .map(|k| normalize_hotkey_key(k.trim())) + .collect::>() + .join(" ") +} + +/// The configured main hotkey, formatted for display. Empty for recorded keycode +/// sequences (which have no stable accelerator string). +async fn current_hotkey_label(app_inner: &Arc) -> String { + let Ok(config) = app_inner.config_manager.load_config() else { + return String::new(); + }; + match &config.app.hotkey { + serde_norway::Value::String(s) => format_hotkey_label(s), + _ => String::new(), + } +} + +/// Emit a retryable error hint, tagged with the main hotkey label so the overlay +/// can show which key (also) triggers the retry. Centralizes every failure path. +async fn emit_retryable_error_hint( + app: &AppHandle, + app_inner: &Arc, + text: &str, +) { + let hotkey = current_hotkey_label(app_inner).await; + let _ = app.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { + "text": text, + "level": "error", + "variant": "text", + "retryable": true, + "hotkey": hotkey + } + }), + ); +} + +fn schedule_retry_overlay_hide(app_handle: AppHandle, app_inner: Arc) { + // While the retry is shown (idle), keep ESC live so it can dismiss the failure. + // set_app_state(Idle) just disabled it, so re-enable it here. + set_escape_enabled_now(&app_handle, true); + tauri::async_runtime::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + let still_idle = { + let s = app_inner.state.lock().await; + matches!(*s, app_state::AppState::Idle) + }; + if still_idle { + // The retry affordance is gone once the overlay hides, so drop the + // pending failure: the hotkey reverts to starting a new recording. + *app_inner.current_failure_ts.lock().await = None; + set_escape_enabled_now(&app_handle, false); + set_overlay_retry_interaction(&app_handle, false); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + } + }); +} + fn app_state_name(state: &app_state::AppState) -> &'static str { match state { app_state::AppState::Idle => "idle", @@ -412,6 +515,7 @@ fn emit_cue(app: &AppHandle, app_inner: &Arc, name: &str) { } } +#[cfg(not(target_os = "macos"))] async fn stop_renderer_audio( app: &AppHandle, app_inner: &Arc, @@ -441,16 +545,24 @@ async fn stop_audio_capture( timeout_ms: u64, ) { #[cfg(target_os = "macos")] - native_audio::stop_capture(app_inner).await; + { + native_audio::stop_capture(app_inner).await; + let _ = timeout_ms; + let _ = app; + } + #[cfg(not(target_os = "macos"))] stop_renderer_audio(app, app_inner, timeout_ms).await; } -async fn save_recording_wav(app: &AppHandle, app_inner: &Arc) { +async fn save_recording_wav( + app: &AppHandle, + app_inner: &Arc, +) -> Option { let samples = { let mut audio = app_inner.recording_audio.lock().await; if audio.is_empty() { - return; + return app_inner.current_recording_wav.lock().await.clone(); } std::mem::take(&mut *audio) }; @@ -463,7 +575,7 @@ async fn save_recording_wav(app: &AppHandle, app_inner: &Arc log_audio!(info, "Recording WAV saved: {}", path.display()), - Err(error) => log_audio!( - warn, - "Write recording WAV failed ({}): {}", - path.display(), - error - ), + Ok(()) => { + log_audio!(info, "Recording WAV saved: {}", path.display()); + *app_inner.current_recording_wav.lock().await = Some(path.clone()); + Some(path) + } + Err(error) => { + log_audio!( + warn, + "Write recording WAV failed ({}): {}", + path.display(), + error + ); + None + } } } @@ -521,6 +640,129 @@ fn write_wav_16k_mono(path: &std::path::Path, samples: &[f32]) -> Result<(), Str std::fs::write(path, wav).map_err(|e| e.to_string()) } +async fn current_recording_wav_string(app_inner: &Arc) -> Option { + app_inner + .current_recording_wav + .lock() + .await + .as_ref() + .map(|path| path.to_string_lossy().to_string()) +} + +async fn record_transcription_failure( + app: &AppHandle, + app_inner: &Arc, + message: &str, +) -> String { + let audio_path = current_recording_wav_string(app_inner).await; + let retry_of = app_inner.current_retry_of.lock().await.clone(); + let ts = app_inner + .stats + .lock() + .await + .record_failure(message, audio_path, retry_of); + *app_inner.current_failure_ts.lock().await = Some(ts.clone()); + set_overlay_retry_interaction(app, true); + ts +} + +fn prune_old_recordings(app: &AppHandle) { + let Ok(data_dir) = app.path().app_data_dir() else { + return; + }; + let recordings_dir = data_dir.join("recordings"); + let Ok(entries) = std::fs::read_dir(recordings_dir) else { + return; + }; + let cutoff = std::time::SystemTime::now() + .checked_sub(std::time::Duration::from_secs(31 * 24 * 60 * 60)) + .unwrap_or(std::time::UNIX_EPOCH); + + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("wav") { + continue; + } + let Ok(metadata) = entry.metadata() else { + continue; + }; + let Ok(modified) = metadata.modified() else { + continue; + }; + if modified < cutoff { + let _ = std::fs::remove_file(path); + } + } +} + +/// Heuristic: did this recording capture actual sound (speech) rather than +/// silence? Used to tell a genuine no-speech stop (end immediately) apart from +/// speech whose transcript was lost to a slow/failed network (keep commit + +/// retry). Biased toward "has sound" so real speech is never silently dropped. +fn recording_has_audio_signal(samples: &[f32]) -> bool { + // 16k mono. Native capture has no AEC, so the start cue bleeds into the mic + // at the very beginning; skip that leading window so the cue is never mistaken + // for speech. Anything the user actually says runs past it (and if they spoke + // inside it, a transcript would have arrived, short-circuiting this check). + const CUE_SKIP: usize = 11_200; // ~0.7s at 16k covers the start cue + echo tail + const MIN_VOICE: usize = 1_600; // need ~100ms of real audio after the cue + if samples.len() < CUE_SKIP + MIN_VOICE { + return false; + } + let tail = &samples[CUE_SKIP..]; + let peak = tail.iter().fold(0.0f32, |m, &s| m.max(s.abs())); + let rms = (tail.iter().map(|&s| s * s).sum::() / tail.len() as f32).sqrt(); + // A quiet mic noise floor sits well below these; speech clears both easily. + peak >= 0.02 && rms >= 0.004 +} + +/// Drop the WAV and recording bookkeeping for a recording that produced nothing +/// worth keeping (e.g. the user stopped without speaking). Nothing to retry. +async fn discard_recording_artifacts(app_inner: &Arc) { + if let Some(path) = app_inner.current_recording_wav.lock().await.take() { + let _ = std::fs::remove_file(path); + } + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + app_inner.recording_audio.lock().await.clear(); +} + +async fn record_success_and_apply_retention( + app: &AppHandle, + app_inner: &Arc, + text: &str, + keep_recordings: bool, +) { + let wav_path = app_inner.current_recording_wav.lock().await.take(); + let retry_of = app_inner.current_retry_of.lock().await.take(); + let audio_path = if keep_recordings { + wav_path + .as_ref() + .map(|path| path.to_string_lossy().to_string()) + } else { + if let Some(path) = wav_path { + let _ = std::fs::remove_file(path); + } + None + }; + let mut stats = app_inner.stats.lock().await; + if let Some(retry_ts) = retry_of.as_ref() { + if stats.replace_history_with_success(retry_ts, text, audio_path.clone()) { + drop(stats); + // Always prune: a never-retried failure recording is only deleted on a + // later success, otherwise it is reclaimed by the 31-day retention sweep, + // even when keep_recordings is off (only failure WAVs persist then). + prune_old_recordings(app); + return; + } + } + stats.record_session_with_audio(text, audio_path, retry_of); + drop(stats); + + prune_old_recordings(app); +} + +#[cfg(not(target_os = "macos"))] async fn wait_for_audio_warmup( app_inner: &Arc, timeout_ms: u64, @@ -652,6 +894,19 @@ struct ActivePromptId(std::sync::Mutex>); /// Handle hotkey press event. In toggle mode, toggles recording. In hold mode, starts recording. /// `prompt_id` is `Some(id)` when a prompt-template hotkey was triggered, `None` for the main hotkey. async fn on_hotkey_pressed(app_handle: AppHandle, mode: &str, prompt_id: Option) { + // Keyboard-driven retry: while a retryable failure is shown (idle, retry button + // visible), the main hotkey triggers the retry instead of a new recording, so + // the user can retry without reaching for the mouse. + if prompt_id.is_none() { + let app_inner = app_handle.state::>(); + let can_retry = matches!(*app_inner.state.lock().await, app_state::AppState::Idle) + && app_inner.current_failure_ts.lock().await.is_some(); + if can_retry { + let _ = retry_latest_failed_transcription(app_handle.clone()).await; + return; + } + } + // Store the active prompt ID for the recording session if let Some(active) = app_handle.try_state::() { *active.0.lock().unwrap() = prompt_id; @@ -751,6 +1006,10 @@ async fn start_recording(app_handle: AppHandle) { *app_inner.latest_transcript.lock().await = (String::new(), String::new()); app_inner.recording_audio.lock().await.clear(); + *app_inner.current_recording_wav.lock().await = None; + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + set_overlay_retry_interaction(&app_handle, false); let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); // Re-position before showing so the overlay follows the current display layout // (e.g. after an external monitor was connected/disconnected). @@ -779,28 +1038,31 @@ async fn start_recording(app_handle: AppHandle) { return; } - let _ = app_handle.emit( - "overlay:event", - serde_json::json!({ - "type": "audio:warmup", - }), - ); - if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { - *recording_state.0.lock().unwrap() = false; - stop_audio_capture(&app_handle, &app_inner, 1200).await; - set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; - if let Some(overlay) = app_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - log_rec!(warn, "Audio warmup failed: {}", e); + #[cfg(not(target_os = "macos"))] + { let _ = app_handle.emit( "overlay:event", serde_json::json!({ - "type": "hint", - "payload": { "text": e, "level": "error", "variant": "text" } + "type": "audio:warmup", }), ); - return; + if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { + *recording_state.0.lock().unwrap() = false; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + log_rec!(warn, "Audio warmup failed: {}", e); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": e, "level": "error", "variant": "text" } + }), + ); + return; + } } // Check if recording was cancelled during warmup (hold mode: quick press-release) @@ -860,6 +1122,7 @@ async fn start_recording(app_handle: AppHandle) { emit_cue(&app_handle, &app_inner, "start"); set_app_state(&app_handle, &app_inner, app_state::AppState::Recording).await; + #[cfg(not(target_os = "macos"))] let _ = app_handle.emit( "overlay:event", serde_json::json!({ "type": "recording:start" }), @@ -950,6 +1213,208 @@ async fn create_active_session( result.map(|(session, event_rx)| (session, event_rx, show_recording_hint)) } +pub(crate) async fn retry_history_transcription( + app_handle: AppHandle, + ts: String, +) -> Result { + let app_inner = app_handle.state::>(); + let retry_epoch = app_inner + .session_epoch + .fetch_add(1, std::sync::atomic::Ordering::SeqCst) + + 1; + // No outer wall-clock timeout: the connection phase is already bounded by the + // ASR connect timeout (5s, surfaced as a failure + retry), and the final wait + // by commit_and_await_final's own timeout. An outer cap would only risk + // cutting off a valid streaming transcription mid-flight. + retry_history_transcription_inner(app_handle, ts, retry_epoch).await +} + +/// Record a retry attempt as a failure, surface the error hint, and arm the +/// overlay retry affordance + auto-hide. Shared by every failure path of +/// `retry_history_transcription_inner`. +async fn fail_retry( + app_handle: &AppHandle, + app_inner: &Arc, + ts: &str, + message: &str, +) { + let failure_ts = app_inner.stats.lock().await.record_failure( + message, + current_recording_wav_string(app_inner).await, + Some(ts.to_string()), + ); + *app_inner.current_failure_ts.lock().await = Some(failure_ts); + emit_retryable_error_hint(app_handle, app_inner, message).await; + set_overlay_retry_interaction(app_handle, true); + set_app_state(app_handle, app_inner, app_state::AppState::Idle).await; + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(app_inner)); +} + +async fn retry_history_transcription_inner( + app_handle: AppHandle, + ts: String, + retry_epoch: u64, +) -> Result { + let app_inner = app_handle.state::>(); + let entry = { + let stats = app_inner.stats.lock().await; + stats + .find_history(&ts) + .ok_or_else(|| "未找到输入记录".to_string())? + }; + let audio_path = entry + .audio_path + .clone() + .ok_or_else(|| "这条记录没有可重试的录音".to_string())?; + let path = PathBuf::from(&audio_path); + let samples = read_wav_16k_mono(&path)?; + if samples.is_empty() { + return Err("录音文件为空,无法重试".to_string()); + } + + set_overlay_retry_interaction(&app_handle, false); + set_app_state(&app_handle, &app_inner, app_state::AppState::Finishing).await; + // Clear the stale failure hint + old transcript, then show a "retrying" + // placeholder while the connection is established. The overlay yields this + // placeholder to the live transcript the moment the replayed recognition + // starts streaming in (see visible_hint / getVisibleHintText), so the user + // sees "重试中" → streaming text, like a normal recording. + let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": "", "level": "info", "variant": "retry" } + }), + ); + *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + *app_inner.current_recording_wav.lock().await = Some(path); + *app_inner.current_retry_of.lock().await = Some(ts.clone()); + + let config = app_inner.config_manager.load_config()?; + let hotwords = app_inner.hotword_manager.active_words(); + let (session, event_rx, _) = match create_active_session(&app_handle, &config, &hotwords).await + { + Ok(result) => result, + Err(error) => { + if !is_current_epoch(&app_inner, retry_epoch) { + return Err("重试已取消".to_string()); + } + let message = format!("{error},请检查网络连接"); + fail_retry(&app_handle, &app_inner, &ts, &message).await; + return Err(message); + } + }; + let session: Arc = Arc::from(session); + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + let events_app = app_handle.clone(); + tauri::async_runtime::spawn(async move { + manage_asr_session(events_app, event_rx, retry_epoch).await; + }); + + for chunk in samples.chunks(1600) { + session.append_audio(chunk); + } + + let text = match session.commit_and_await_final().await { + Ok(text) if !text.trim().is_empty() => text, + Ok(_) => { + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + session.close(); + let message = "重试转写没有得到文本,请检查网络连接"; + fail_retry(&app_handle, &app_inner, &ts, message).await; + return Err(message.to_string()); + } + Err(error) => { + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + session.close(); + let error = format!("{error},请检查网络连接"); + fail_retry(&app_handle, &app_inner, &ts, &error).await; + return Err(error); + } + }; + + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + // Hand focus back to the app the user was in before clicking retry, then give + // the OS a moment to switch, so the paste keystroke lands in the right window. + overlay::restore_foreground_app(&app_handle); + tokio::time::sleep(Duration::from_millis(150)).await; + finalize_and_paste(&app_handle, &app_inner, text.clone()).await; + session.close(); + *app_inner.current_failure_ts.lock().await = None; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + Ok(serde_json::json!({ "ok": true, "text": text })) +} + +pub(crate) async fn retry_latest_failed_transcription( + app_handle: AppHandle, +) -> Result { + let app_inner = app_handle.state::>(); + let ts = app_inner + .current_failure_ts + .lock() + .await + .clone() + .ok_or_else(|| "没有可重试的失败录音".to_string())?; + retry_history_transcription(app_handle, ts).await +} + +fn read_wav_16k_mono(path: &std::path::Path) -> Result, String> { + let data = std::fs::read(path).map_err(|e| format!("读取录音文件失败: {e}"))?; + if data.len() < 44 || &data[0..4] != b"RIFF" || &data[8..12] != b"WAVE" { + return Err("录音文件不是有效 WAV".to_string()); + } + let mut pos = 12usize; + let mut channels = 0u16; + let mut sample_rate = 0u32; + let mut bits = 0u16; + let mut data_range = None; + while pos + 8 <= data.len() { + let id = &data[pos..pos + 4]; + let size = u32::from_le_bytes([data[pos + 4], data[pos + 5], data[pos + 6], data[pos + 7]]) + as usize; + let start = pos + 8; + let end = start.saturating_add(size).min(data.len()); + if id == b"fmt " && size >= 16 && end <= data.len() { + channels = u16::from_le_bytes([data[start + 2], data[start + 3]]); + sample_rate = u32::from_le_bytes([ + data[start + 4], + data[start + 5], + data[start + 6], + data[start + 7], + ]); + bits = u16::from_le_bytes([data[start + 14], data[start + 15]]); + } else if id == b"data" { + data_range = Some(start..end); + break; + } + pos = start + size + (size % 2); + } + if channels != 1 || sample_rate != 16_000 || bits != 16 { + return Err("仅支持 16kHz mono 16-bit WAV 重试".to_string()); + } + let range = data_range.ok_or_else(|| "WAV 缺少 data chunk".to_string())?; + Ok(data[range] + .chunks_exact(2) + .map(|chunk| i16::from_le_bytes([chunk[0], chunk[1]]) as f32 / 32768.0) + .collect()) +} + /// Connect the ASR session in the background (one retry), then attach it: flush /// any audio buffered during the connect and publish the ready session. Signals /// completion through `connect_tx` so `stop_recording` can wait when the user @@ -1043,28 +1508,14 @@ async fn connect_and_attach( app_inner.pending_audio.lock().await.clear(); stop_audio_capture(&app_handle, &app_inner, 1200).await; save_recording_wav(&app_handle, &app_inner).await; + let message = format!("ASR 连接失败: {},请检查网络连接", e); + record_transcription_failure(&app_handle, &app_inner, &message).await; // Emit error hint BEFORE setting idle so the overlay shows it: the // frontend's idle handler only clears "info"-level hints. - let _ = app_handle.emit("overlay:event", serde_json::json!({ - "type": "hint", - "payload": { "text": format!("ASR 连接失败: {}", e), "level": "error", "variant": "text" } - })); + emit_retryable_error_hint(&app_handle, &app_inner, &message).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; // Auto-hide after a delay so the user can read it; guard: still idle. - let delayed_handle = app_handle.clone(); - let delayed_inner: Arc = Arc::clone(&app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); } } } @@ -1082,6 +1533,13 @@ async fn stop_recording(app_handle: AppHandle) { // 2. Stop renderer audio first so the final buffered chunk is flushed. stop_audio_capture(&app_handle, &app_inner, 1200).await; + // Snapshot whether real sound was captured before save_recording_wav drains + // the buffer: a silent stop ends immediately, but speech whose transcript was + // lost (slow/failed network) must keep the retry path even with no result yet. + let captured_audio_signal = { + let audio = app_inner.recording_audio.lock().await; + recording_has_audio_signal(&audio) + }; save_recording_wav(&app_handle, &app_inner).await; // 3. Acquire the ready ASR session. If the background connect hasn't finished @@ -1103,16 +1561,47 @@ async fn stop_recording(app_handle: AppHandle) { *app_inner.asr_events.lock().await = None; if let Some(session) = session { - // 4. Commit and get this session's final text. + // 4. No speech case: the session connected but produced no transcript + // (no partial/final this session, nothing accumulated across reconnects) + // AND the captured audio was silent. The user stopped without speaking; + // Doubao won't emit a final for silence, so committing would block until + // the timeout and then wrongly offer a retry. End immediately, ESC-like. + // If audio WAS captured but no transcript arrived (slow/failed network), + // fall through to commit so the result — or a retry — is still possible. + let recognized_anything = { + let (final_t, partial_t) = app_inner.latest_transcript.lock().await.clone(); + let accumulated = app_inner.accumulated_text.lock().await.clone(); + !final_t.trim().is_empty() + || !partial_t.trim().is_empty() + || !accumulated.trim().is_empty() + }; + if !recognized_anything && !captured_audio_signal { + log_rec!(info, "Stop with no recognized speech; ending immediately"); + session.close(); + app_inner.pending_audio.lock().await.clear(); + *app_inner.accumulated_text.lock().await = String::new(); + discard_recording_artifacts(&app_inner).await; + set_overlay_retry_interaction(&app_handle, false); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + return; + } + + // 5. Commit and get this session's final text. let session_text = match session.commit_and_await_final().await { Ok(t) => t, - Err(_) => { - let (final_t, partial_t) = app_inner.latest_transcript.lock().await.clone(); - if !final_t.is_empty() { - final_t - } else { - partial_t - } + Err(e) => { + log_rec!(warn, "ASR commit failed: {}", e); + session.close(); + app_inner.pending_audio.lock().await.clear(); + *app_inner.accumulated_text.lock().await = String::new(); + record_transcription_failure(&app_handle, &app_inner, &e).await; + emit_retryable_error_hint(&app_handle, &app_inner, &e).await; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); + return; } }; log_rec!( @@ -1148,26 +1637,12 @@ async fn stop_recording(app_handle: AppHandle) { warn, "Stop with no ready ASR session; discarding buffered audio" ); - let _ = app_handle.emit("overlay:event", serde_json::json!({ - "type": "hint", - "payload": { "text": "语音服务连接失败,请重试", "level": "error", "variant": "text" } - })); + let message = "语音服务连接失败,请检查网络连接"; + record_transcription_failure(&app_handle, &app_inner, message).await; + emit_retryable_error_hint(&app_handle, &app_inner, message).await; *app_inner.accumulated_text.lock().await = String::new(); set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; - let delayed_handle = app_handle.clone(); - let delayed_inner: Arc = Arc::clone(&app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); return; } } @@ -1176,6 +1651,7 @@ async fn stop_recording(app_handle: AppHandle) { *app_inner.accumulated_text.lock().await = String::new(); // 12. Hide overlay + set_overlay_retry_interaction(&app_handle, false); if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); } @@ -1202,6 +1678,55 @@ pub async fn toggle_recording(app_handle: AppHandle) { } /// Cancel the active recording without committing or pasting text. +/// Directly toggle the ESC-cancel shortcut, independent of the recording state +/// machine. Used to keep ESC live while a retryable failure is shown (idle). +fn set_escape_enabled_now(app: &AppHandle, enabled: bool) { + if let Some(hc) = app.try_state::() { + hotkey::set_escape_enabled(&hc, enabled); + } +} + +/// ESC handler. Routes to the right teardown for whatever is on screen: +/// an active recording, an in-flight retry, or a shown retryable failure. +pub(crate) async fn on_escape(app_handle: AppHandle) { + if is_recording(&app_handle) { + cancel_recording(app_handle).await; + return; + } + let app_inner = app_handle.state::>(); + let state = app_inner.state.lock().await.clone(); + match state { + // Retry in progress (a normal commit has no retry marker): abort it. + app_state::AppState::Finishing if app_inner.current_retry_of.lock().await.is_some() => { + abort_retry_or_failure(&app_handle, &app_inner).await; + } + // Retryable failure currently shown: dismiss it. + app_state::AppState::Idle if app_inner.current_failure_ts.lock().await.is_some() => { + abort_retry_or_failure(&app_handle, &app_inner).await; + } + _ => {} + } +} + +/// Tear down an in-flight retry or a shown retryable failure: discard any +/// in-flight result via the epoch bump, clear retry state, and hide the overlay. +async fn abort_retry_or_failure(app_handle: &AppHandle, app_inner: &Arc) { + app_inner + .session_epoch + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + *app_inner.accumulated_text.lock().await = String::new(); + set_overlay_retry_interaction(app_handle, false); + let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + // set_app_state(Idle) also re-syncs (disables) the ESC shortcut. + set_app_state(app_handle, app_inner, app_state::AppState::Idle).await; +} + async fn cancel_recording(app_handle: AppHandle) { let app_inner = app_handle.state::>(); let recording_state = app_handle.state::(); @@ -1224,6 +1749,8 @@ async fn cancel_recording(app_handle: AppHandle) { .fetch_add(1, std::sync::atomic::Ordering::SeqCst); app_inner.pending_audio.lock().await.clear(); app_inner.recording_audio.lock().await.clear(); + *app_inner.current_recording_wav.lock().await = None; + *app_inner.current_retry_of.lock().await = None; // Clear the active prompt ID since the session was cancelled if let Some(active) = app_handle.try_state::() { @@ -1302,6 +1829,11 @@ async fn manage_asr_session( final_text, partial_text, } => { + // Stop feeding the overlay once this session is superseded + // (e.g. the user pressed ESC to abort an in-flight retry). + if !is_current_epoch(&app_inner, my_epoch) { + break 'outer; + } // A real transcript means the (possibly reconnected) session is // healthy again: reset the failure counter. reconnect_attempts = 0; @@ -1513,33 +2045,15 @@ async fn finalize_on_failure(app: &AppHandle, app_inner: &Arc() { *active.0.lock().unwrap() = None; } set_app_state(app, app_inner, app_state::AppState::Idle).await; - let delayed_handle = app.clone(); - let delayed_inner: Arc = Arc::clone(app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app.clone(), Arc::clone(app_inner)); return; } @@ -1743,8 +2257,12 @@ async fn finalize_and_paste( } } - // Record usage stats - app_inner.stats.lock().await.record_session(&final_text); + // Record usage stats and retain/delete the WAV according to user settings. + let keep_recordings = config + .as_ref() + .map(|c| c.app.keep_recordings) + .unwrap_or(false); + record_success_and_apply_retention(app_handle, app_inner, &final_text, keep_recordings).await; emit_cue(app_handle, app_inner, "end"); } @@ -1778,3 +2296,75 @@ pub fn reload_hotkey_bindings(app: &AppHandle) { let prompts = app_inner.config_manager.load_prompts(); hotkey::reload_bindings(&hc, &hotkey_str, &mode, &prompts); } + +#[cfg(test)] +mod audio_signal_tests { + use super::recording_has_audio_signal; + + #[test] + fn silence_is_not_treated_as_speech() { + let silence = vec![0.0f32; 16_000]; + assert!(!recording_has_audio_signal(&silence)); + } + + #[test] + fn quiet_noise_floor_is_not_treated_as_speech() { + // ~ -54 dBFS hum: below both gates, must not look like speech. + let noise: Vec = (0..16_000) + .map(|i| if i % 2 == 0 { 0.002 } else { -0.002 }) + .collect(); + assert!(!recording_has_audio_signal(&noise)); + } + + #[test] + fn very_short_clip_is_not_treated_as_speech() { + // Under 100ms even at full amplitude is an accidental tap, not speech. + let blip = vec![0.5f32; 800]; + assert!(!recording_has_audio_signal(&blip)); + } + + #[test] + fn loud_sustained_signal_is_treated_as_speech() { + // A 0.3-amplitude tone clears both the peak and RMS gates. + let tone: Vec = (0..16_000).map(|i| 0.3 * (i as f32 * 0.2).sin()).collect(); + assert!(recording_has_audio_signal(&tone)); + } + + #[test] + fn start_cue_bleed_then_silence_is_not_treated_as_speech() { + // Loud cue in the first ~0.5s, silence afterward: must be skipped, not + // mistaken for the user speaking (no AEC in native capture). + let mut samples = vec![0.0f32; 16_000]; + for (i, s) in samples.iter_mut().enumerate().take(8_000) { + *s = 0.4 * (i as f32 * 0.3).sin(); + } + assert!(!recording_has_audio_signal(&samples)); + } +} + +#[cfg(test)] +mod hotkey_label_tests { + use super::format_hotkey_label; + + #[test] + fn function_key_passes_through() { + assert_eq!(format_hotkey_label("F13"), "F13"); + } + + #[test] + fn sided_modifier_matches_settings_symbol() { + // Mirrors the frontend normalizeHotkeyLabel ("AltRight" -> "R ⌥"). + assert_eq!(format_hotkey_label("AltRight"), "R ⌥"); + } + + #[test] + fn combo_is_symbolized_and_joined() { + assert_eq!(format_hotkey_label("Control+Space"), "⌃ ␣"); + assert_eq!(format_hotkey_label("CmdOrCtrl+Shift+A"), "⌘ ⇧ A"); + } + + #[test] + fn empty_stays_empty() { + assert_eq!(format_hotkey_label(""), ""); + } +} diff --git a/src-tauri/src/overlay.rs b/src-tauri/src/overlay.rs index ee521ca..3e305c0 100644 --- a/src-tauri/src/overlay.rs +++ b/src-tauri/src/overlay.rs @@ -24,27 +24,96 @@ pub fn set_audio_level(app: &AppHandle, level: f64) { macos::set_audio_level(app, level); } +/// Remember the app the user is currently working in, so we can hand keyboard +/// focus back to it after a retry (clicking the native retry button activates the +/// overlay, which would otherwise swallow the paste). No-op off macOS. +#[allow(unused_variables)] +pub fn capture_foreground_app(app: &AppHandle) { + #[cfg(target_os = "macos")] + macos::capture_foreground_app(app); +} + +/// Reactivate the app captured by [`capture_foreground_app`] so the subsequent +/// paste lands in the window the user was in. No-op off macOS. +#[allow(unused_variables)] +pub fn restore_foreground_app(app: &AppHandle) { + #[cfg(target_os = "macos")] + macos::restore_foreground_app(app); +} + /// Native macOS overlay renderer. Builds and updates an AppKit pill /// (`NSGlassEffectView` → container → indicator + transcript label) living inside /// the overlay window's content view, above the transparent WebView. #[cfg(target_os = "macos")] mod macos { use objc2::rc::Retained; - use objc2::runtime::AnyObject; - use objc2::{msg_send, MainThreadMarker}; + use objc2::runtime::{AnyClass, AnyObject, ClassBuilder, Sel}; + use objc2::{class, msg_send, sel, MainThreadMarker}; use objc2_app_kit::{ - NSAppearance, NSAppearanceNameAqua, NSAppearanceNameDarkAqua, NSColor, NSFont, - NSGlassEffectView, NSGlassEffectViewStyle, NSLineBreakMode, NSProgressIndicator, - NSProgressIndicatorStyle, NSTextField, NSView, NSVisualEffectBlendingMode, - NSVisualEffectMaterial, NSVisualEffectState, NSVisualEffectView, NSWindow, + NSAppearance, NSAppearanceNameAqua, NSAppearanceNameDarkAqua, + NSApplicationActivationOptions, NSAttributedStringNSStringDrawing, NSBezelStyle, NSButton, + NSColor, NSFont, NSGlassEffectView, NSGlassEffectViewStyle, NSLineBreakMode, + NSProgressIndicator, NSProgressIndicatorStyle, NSRunningApplication, NSTextField, NSView, + NSVisualEffectBlendingMode, NSVisualEffectMaterial, NSVisualEffectState, + NSVisualEffectView, NSWindow, NSWorkspace, }; use objc2_foundation::{ - NSArray, NSAttributedString, NSMutableAttributedString, NSNumber, NSPoint, NSRange, NSRect, - NSSize, NSString, + NSArray, NSAttributedString, NSBundle, NSMutableAttributedString, NSNumber, NSPoint, + NSRange, NSRect, NSSize, NSString, }; use std::cell::RefCell; + use std::sync::{Mutex as StdMutex, OnceLock}; use tauri::{AppHandle, Manager}; + static RETRY_APP: OnceLock>> = OnceLock::new(); + /// Bundle id of the app the user was in before the overlay stole focus, so a + /// successful retry can hand keyboard focus back for the paste. + static PREV_FOREGROUND_BUNDLE: OnceLock>> = OnceLock::new(); + + fn our_bundle_id() -> Option { + NSBundle::mainBundle() + .bundleIdentifier() + .map(|s| s.to_string()) + } + + pub fn capture_foreground_app(app: &AppHandle) { + let _ = app.run_on_main_thread(|| { + let ws = NSWorkspace::sharedWorkspace(); + let Some(front) = ws.frontmostApplication() else { + return; + }; + let Some(bid) = front.bundleIdentifier() else { + return; + }; + let bid = bid.to_string(); + // If we are already frontmost (e.g. re-arming retry after a failed + // attempt), keep the previously captured app instead of ourselves. + if our_bundle_id().as_deref() == Some(bid.as_str()) { + return; + } + let slot = PREV_FOREGROUND_BUNDLE.get_or_init(|| StdMutex::new(None)); + if let Ok(mut guard) = slot.lock() { + *guard = Some(bid); + } + }); + } + + pub fn restore_foreground_app(app: &AppHandle) { + let bid = PREV_FOREGROUND_BUNDLE + .get() + .and_then(|slot| slot.lock().ok().and_then(|g| g.clone())); + let Some(bid) = bid else { + return; + }; + let _ = app.run_on_main_thread(move || { + let ns_bid = NSString::from_str(&bid); + let apps = NSRunningApplication::runningApplicationsWithBundleIdentifier(&ns_bid); + if let Some(target) = apps.firstObject() { + target.activateWithOptions(NSApplicationActivationOptions::ActivateAllWindows); + } + }); + } + // --- Layout constants (mirror web/styles.css + app.js scheduleResize) --- const FONT_SIZE: f64 = 14.0; const PAD_LEFT: f64 = 14.0; @@ -67,6 +136,11 @@ mod macos { const WAVE_GAP_LEFT: f64 = 12.0; // gap between text and waveform const WAVE_MAX_H: f64 = 22.0; const WAVE_MIN_H: f64 = 3.0; + const RETRY_SIZE: f64 = 22.0; + const RETRY_MIN_W: f64 = 38.0; // floor; the button grows to fit "重试 (R ⌥)" + const RETRY_TEXT_PAD: f64 = 24.0; // horizontal padding around the button title + const RETRY_GAP_LEFT: f64 = 8.0; + const RETRY_RIGHT_INSET: f64 = 26.0; /// Logical overlay model, mirrored from overlay events (parallels app.js `state`). #[derive(Default)] @@ -75,7 +149,9 @@ mod macos { partial_text: String, hint_text: String, hint_level: String, // "info" | "warn" | "error" - hint_variant: String, // "text" | "progress" + hint_variant: String, // "text" | "progress" | "retry" + hint_retryable: bool, + retry_hotkey: String, // formatted main hotkey label, e.g. "R ⌥" app_state: String, // "idle" | "connecting" | "recording" | "finishing" // Sticky layout (prevents width jitter while recording/finishing). layout_width: f64, @@ -117,8 +193,13 @@ mod macos { spinner: Retained, label: Retained, bars: [Retained; WAVE_N], + retry_view: Retained, + retry_button: Retained, + _retry_target: Retained, dot_layer: Retained, // CALayer for the indicator dot ripple_layer: Retained, // CALayer halo behind the dot (recording ripple) + retry_track_layer: Retained, // CALayer for the retry ring track + retry_progress_layer: Retained, // CALayer for the retry countdown ring fade_mask: Retained, // CAGradientLayer for the multi-line top fade applied_variant: String, // "" (auto/inherit) | "light" | "dark" } @@ -128,6 +209,36 @@ mod macos { static VIEWS: RefCell> = const { RefCell::new(None) }; } + extern "C" fn retry_clicked(_this: *mut AnyObject, _sel: Sel, _sender: *mut AnyObject) { + let Some(lock) = RETRY_APP.get() else { + return; + }; + let Ok(guard) = lock.lock() else { + return; + }; + let Some(app) = guard.as_ref().cloned() else { + return; + }; + tauri::async_runtime::spawn(async move { + let _ = crate::retry_latest_failed_transcription(app).await; + }); + } + + fn retry_target_class() -> &'static AnyClass { + if let Some(cls) = AnyClass::get(c"VoicePasteRetryTarget") { + return cls; + } + let mut builder = + ClassBuilder::new(c"VoicePasteRetryTarget", class!(NSObject)).expect("class builder"); + unsafe { + builder.add_method( + sel!(retryClicked:), + retry_clicked as extern "C" fn(*mut AnyObject, Sel, *mut AnyObject), + ); + } + builder.register() + } + fn liquid_glass_available() -> bool { objc2::runtime::AnyClass::get(c"NSGlassEffectView").is_some() } @@ -151,6 +262,11 @@ mod macos { /// Parse an incoming overlay event and update the native pill on the main thread. pub fn dispatch(app: &AppHandle, event: &serde_json::Value) { + let slot = RETRY_APP.get_or_init(|| StdMutex::new(None)); + if let Ok(mut guard) = slot.lock() { + *guard = Some(app.clone()); + } + let kind = event.get("type").and_then(|v| v.as_str()).unwrap_or(""); // Only visual events drive the native pill. Audio lifecycle events // (audio:warmup / recording:start / recording:stop) belong to the WebView worker. @@ -183,6 +299,8 @@ mod macos { model.hint_text.clear(); model.hint_level = "info".into(); model.hint_variant = "text".into(); + model.hint_retryable = false; + model.retry_hotkey.clear(); model.layout_width = 0.0; model.layout_wrap = false; model.smoothed_level = 0.0; @@ -200,6 +318,8 @@ mod macos { { model.hint_text.clear(); model.hint_variant = "text".into(); + model.hint_retryable = false; + model.retry_hotkey.clear(); } // Collapse the waveform when not actively recording. if s != "recording" { @@ -236,6 +356,15 @@ mod macos { .and_then(|v| v.as_str()) .unwrap_or("text") .into(); + model.hint_retryable = payload + .and_then(|p| p.get("retryable")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + model.retry_hotkey = payload + .and_then(|p| p.get("hotkey")) + .and_then(|v| v.as_str()) + .unwrap_or("") + .into(); } "appearance" => { if let Some(s) = payload @@ -306,6 +435,18 @@ mod macos { "Preparing…".into() }; } + if visual_state == "finishing" && model.hint_variant == "retry" { + // Placeholder shown only until the replayed transcript starts + // streaming in; then yield to the live text below. + if model.final_text.is_empty() && model.partial_text.is_empty() { + return if zh { + "重试中…".into() + } else { + "Retrying…".into() + }; + } + return String::new(); + } if visual_state == "finishing" && model.hint_variant == "progress" { return if zh { "思考中…".into() @@ -313,6 +454,8 @@ mod macos { "Thinking…".into() }; } + // The retry label + hotkey live inside the retry button, not in the + // message text, so the hint is just the error message. model.hint_text.clone() } @@ -373,6 +516,52 @@ mod macos { container.addSubview(&spinner); container.addSubview(&label); + let retry_view = NSView::new(mtm); + retry_view.setWantsLayer(true); + set_layer_color(&retry_view, &NSColor::clearColor(), RETRY_SIZE / 2.0); + let retry_track_layer = make_retry_ring_layer(); + let retry_progress_layer = make_retry_ring_layer(); + unsafe { + let retry_layer: *mut AnyObject = msg_send![&*retry_view, layer]; + if !retry_layer.is_null() { + let _: () = msg_send![retry_layer, addSublayer: &*retry_track_layer]; + let _: () = msg_send![retry_layer, addSublayer: &*retry_progress_layer]; + } + } + let retry_target = unsafe { + let cls = retry_target_class(); + let obj: *mut AnyObject = msg_send![cls, new]; + Retained::from_raw(obj).expect("retry target init") + }; + let retry_button = unsafe { + NSButton::buttonWithTitle_target_action( + &NSString::from_str("重试"), + Some(&retry_target), + Some(sel!(retryClicked:)), + mtm, + ) + }; + retry_button.setBordered(true); + retry_button.setTransparent(false); + retry_button.setShowsBorderOnlyWhileMouseInside(true); + retry_button.setBezelStyle(NSBezelStyle::AccessoryBarAction); + retry_button.setFont(Some(&NSFont::systemFontOfSize_weight(12.0, 600.0))); + retry_button.setContentTintColor(Some(&NSColor::systemRedColor())); + retry_button.setBezelColor(Some(&NSColor::colorWithSRGBRed_green_blue_alpha( + 1.0, 0.231, 0.231, 0.10, + ))); + retry_button.setAttributedTitle(&retry_title_attr("")); + unsafe { + let cell: *mut AnyObject = msg_send![&*retry_button, cell]; + if !cell.is_null() { + let _: () = msg_send![cell, setHighlightsBy: 1usize]; + let _: () = msg_send![cell, setShowsStateBy: 0usize]; + } + } + retry_view.addSubview(&retry_button); + retry_view.setHidden(true); + container.addSubview(&retry_view); + // Waveform bars (right side), green and rounded; positioned per render. let bars: [Retained; WAVE_N] = std::array::from_fn(|_| { let b = NSView::new(mtm); @@ -413,8 +602,13 @@ mod macos { spinner, label, bars, + retry_view, + retry_button, + _retry_target: retry_target, dot_layer, ripple_layer, + retry_track_layer, + retry_progress_layer, fade_mask: make_fade_mask(), applied_variant: "".into(), }); @@ -466,6 +660,100 @@ mod macos { } } + fn make_retry_ring_layer() -> Retained { + unsafe { + let cls = objc2::runtime::AnyClass::get(c"CAShapeLayer").expect("CAShapeLayer"); + let obj: *mut AnyObject = msg_send![cls, alloc]; + let obj: *mut AnyObject = msg_send![obj, init]; + let layer = Retained::from_raw(obj).expect("CAShapeLayer init"); + let clear = NSColor::clearColor(); + let clear_cg: *mut AnyObject = msg_send![&*clear, CGColor]; + let _: () = msg_send![&*layer, setFillColor: clear_cg]; + let _: () = msg_send![&*layer, setLineWidth: 1.6f64]; + let cap = NSString::from_str("round"); + let _: () = msg_send![&*layer, setLineCap: &*cap]; + let _: () = msg_send![&*layer, setStrokeEnd: 1.0f64]; + layer + } + } + + fn set_retry_ring_path(layer: &AnyObject, width: f64) { + unsafe { + let cls = objc2::runtime::AnyClass::get(c"NSBezierPath").expect("NSBezierPath"); + let inset = 1.9; + let left = inset; + let right = width - inset; + let top = RETRY_SIZE - inset; + let bottom = inset; + let radius = (top - bottom) / 2.0; + let center_y = RETRY_SIZE / 2.0; + let left_center = NSPoint { + x: left + radius, + y: center_y, + }; + let right_center = NSPoint { + x: right - radius, + y: center_y, + }; + let path: *mut AnyObject = msg_send![cls, bezierPath]; + let _: () = msg_send![path, moveToPoint: NSPoint { x: width / 2.0, y: top }]; + let _: () = msg_send![path, lineToPoint: NSPoint { x: left + radius, y: top }]; + let _: () = msg_send![ + path, + appendBezierPathWithArcWithCenter: left_center, + radius: radius, + startAngle: 90.0f64, + endAngle: 270.0f64, + clockwise: false + ]; + let _: () = msg_send![path, lineToPoint: NSPoint { x: right - radius, y: bottom }]; + let _: () = msg_send![ + path, + appendBezierPathWithArcWithCenter: right_center, + radius: radius, + startAngle: 270.0f64, + endAngle: 90.0f64, + clockwise: false + ]; + let _: () = msg_send![path, closePath]; + let cg_path: *mut AnyObject = msg_send![path, CGPath]; + let _: () = msg_send![layer, setPath: cg_path]; + } + } + + fn set_retry_countdown(layer: &AnyObject, on: bool) { + unsafe { + let key = NSString::from_str("retry-countdown"); + if on { + let existing: *mut AnyObject = msg_send![layer, animationForKey: &*key]; + if !existing.is_null() { + return; + } + let anim_cls = + objc2::runtime::AnyClass::get(c"CABasicAnimation").expect("CABasicAnimation"); + let path = NSString::from_str("strokeStart"); + let anim: *mut AnyObject = msg_send![anim_cls, animationWithKeyPath: &*path]; + let _: () = msg_send![anim, setFromValue: &*NSNumber::numberWithDouble(0.0)]; + let _: () = msg_send![anim, setToValue: &*NSNumber::numberWithDouble(1.0)]; + let _: () = msg_send![anim, setDuration: 5.0f64]; + let _: () = msg_send![anim, setRemovedOnCompletion: false]; + let fill = NSString::from_str("forwards"); + let _: () = msg_send![anim, setFillMode: &*fill]; + let tcls = objc2::runtime::AnyClass::get(c"CAMediaTimingFunction") + .expect("CAMediaTimingFunction"); + let tname = NSString::from_str("linear"); + let tf: *mut AnyObject = msg_send![tcls, functionWithName: &*tname]; + let _: () = msg_send![anim, setTimingFunction: tf]; + let _: () = msg_send![layer, addAnimation: anim, forKey: &*key]; + } else { + let _: () = msg_send![layer, removeAnimationForKey: &*key]; + let _: () = msg_send![layer, setOpacity: 1.0f32]; + let _: () = msg_send![layer, setStrokeStart: 0.0f64]; + let _: () = msg_send![layer, setStrokeEnd: 1.0f64]; + } + } + } + /// Add or remove the recording dot's expanding-ring "ripple" on a halo layer, /// faithfully matching the web `vp-ring` keyframes: the dot itself stays fixed /// while a ring scales out from it and fades, looping every 1.6s (ease-out). @@ -672,6 +960,28 @@ mod macos { Retained::into_super(attr) } + fn retry_title_attr(hotkey: &str) -> Retained { + // "重试 (R ⌥)" — label + the configured hotkey, matching settings symbols. + let text = if hotkey.is_empty() { + "重试".to_string() + } else { + format!("重试 ({hotkey})") + }; + let attr = NSMutableAttributedString::from_nsstring(&NSString::from_str(&text)); + let font = NSFont::systemFontOfSize_weight(12.0, 600.0); + let range = NSRange::new(0, text.encode_utf16().count()); + let color = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 1.0); + unsafe { + attr.addAttribute_value_range(objc2_app_kit::NSFontAttributeName, &font, range); + attr.addAttribute_value_range( + objc2_app_kit::NSForegroundColorAttributeName, + &color, + range, + ); + } + Retained::into_super(attr) + } + fn hint_color(level: &str) -> Retained { match level { "error" => NSColor::systemRedColor(), @@ -703,6 +1013,15 @@ mod macos { let hint = visible_hint(model); let has_hint = !hint.is_empty(); let has_text = !model.final_text.is_empty() || !model.partial_text.is_empty(); + let show_retry = model.hint_retryable && model.hint_level == "error" && has_hint; + // Build the retry button title ("重试 (R ⌥)") and size the button to fit it, + // so the label + hotkey live inside the button rather than in the message. + let retry_title = retry_title_attr(&model.retry_hotkey); + let retry_w = if show_retry { + (retry_title.size().width.ceil() + RETRY_TEXT_PAD).max(RETRY_MIN_W) + } else { + 0.0 + }; VIEWS.with(|v| { let mut slot = v.borrow_mut(); @@ -749,13 +1068,18 @@ mod macos { // its width so the text never overlaps the bars. let show_wave = model.app_state == "recording"; - // Chrome around the text (left pad + indicator + gap + right pad + waveform). + // Chrome around the text (left pad + indicator + gap + right action area). let wave_reserve = if show_wave { WAVE_GAP_LEFT + WAVE_AREA_W } else { 0.0 }; - let chrome = PAD_LEFT + INDICATOR_W + GAP + PAD_RIGHT + wave_reserve; + let retry_reserve = if show_retry { + RETRY_GAP_LEFT + retry_w + (RETRY_RIGHT_INSET - PAD_RIGHT) + } else { + 0.0 + }; + let chrome = PAD_LEFT + INDICATOR_W + GAP + PAD_RIGHT + wave_reserve + retry_reserve; let text_w = measured_w + TEXT_SLACK; let next_width = if want_wrap { MULTI_LINE_WIDTH + chrome @@ -930,6 +1254,58 @@ mod macos { // Waveform bars (right), shown only while recording. layout_bars(views, pill_w, pill_h, model, show_wave); + views.retry_view.setHidden(!show_retry); + if show_retry { + let retry_x = pill_w - RETRY_RIGHT_INSET - retry_w; + let retry_y = ((pill_h - RETRY_SIZE) / 2.0).round(); + views.retry_view.setFrame(NSRect { + origin: NSPoint { + x: retry_x, + y: retry_y, + }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }); + views.retry_button.setFrame(NSRect { + origin: NSPoint { x: 0.0, y: 0.0 }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }); + views.retry_button.setAttributedTitle(&retry_title); + unsafe { + let bg = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.08); + let border = + NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.84); + let view_layer: *mut AnyObject = msg_send![&*views.retry_view, layer]; + if !view_layer.is_null() { + let bg_cg: *mut AnyObject = msg_send![&*bg, CGColor]; + let _: () = msg_send![view_layer, setBackgroundColor: bg_cg]; + let _: () = msg_send![view_layer, setCornerRadius: RETRY_SIZE / 2.0]; + } + let ring_frame = NSRect { + origin: NSPoint { x: 0.0, y: 0.0 }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }; + let _: () = msg_send![&*views.retry_track_layer, setFrame: ring_frame]; + let _: () = msg_send![&*views.retry_progress_layer, setFrame: ring_frame]; + set_retry_ring_path(&views.retry_track_layer, retry_w); + set_retry_ring_path(&views.retry_progress_layer, retry_w); + let track = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.18); + let track_cg: *mut AnyObject = msg_send![&*track, CGColor]; + let _: () = msg_send![&*views.retry_track_layer, setStrokeColor: track_cg]; + let border_cg: *mut AnyObject = msg_send![&*border, CGColor]; + let _: () = msg_send![&*views.retry_progress_layer, setStrokeColor: border_cg]; + } + } + set_retry_countdown(&views.retry_progress_layer, show_retry); + // Keep the pill visible throughout an active session (so the indicator // stays up while waiting for the first transcript); only hide it when idle // with no content. diff --git a/src-tauri/src/stats.rs b/src-tauri/src/stats.rs index 51372a4..669e79d 100644 --- a/src-tauri/src/stats.rs +++ b/src-tauri/src/stats.rs @@ -24,6 +24,14 @@ pub struct HistoryEntry { pub text: String, #[serde(default)] pub chars: usize, + #[serde(default = "default_history_status")] + pub status: String, + #[serde(rename = "audioPath", default, skip_serializing_if = "Option::is_none")] + pub audio_path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub error: Option, + #[serde(rename = "retryOf", default, skip_serializing_if = "Option::is_none")] + pub retry_of: Option, } pub struct StatsService { @@ -54,7 +62,12 @@ impl StatsService { } } - pub fn record_session(&mut self, text: &str) { + pub fn record_session_with_audio( + &mut self, + text: &str, + audio_path: Option, + retry_of: Option, + ) { if text.is_empty() { return; } @@ -79,10 +92,96 @@ impl StatsService { ts: now.to_rfc3339(), text: text.to_string(), chars: char_count, + status: "success".to_string(), + audio_path, + error: None, + retry_of, }; self.append_history(&entry); } + pub fn replace_history_with_success( + &mut self, + ts: &str, + text: &str, + audio_path: Option, + ) -> bool { + if text.is_empty() { + return false; + } + + let Ok(d) = chrono::DateTime::parse_from_rfc3339(ts) else { + return false; + }; + let local = d.with_timezone(&Local); + let key = local.format("%Y-%m-%d").to_string(); + let file_path = self.history_dir.join(format!("{}.jsonl", key)); + let Ok(content) = fs::read_to_string(&file_path) else { + return false; + }; + + let mut replaced = false; + let char_count = text.len(); + let mut next_lines = Vec::new(); + for line in content.lines().filter(|line| !line.is_empty()) { + match serde_json::from_str::(line) { + Ok(mut entry) if entry.ts == ts => { + entry.text = text.to_string(); + entry.chars = char_count; + entry.status = "success".to_string(); + entry.audio_path = audio_path.clone(); + entry.error = None; + entry.retry_of = None; + if let Ok(json) = serde_json::to_string(&entry) { + next_lines.push(json); + replaced = true; + } else { + next_lines.push(line.to_string()); + } + } + Ok(entry) => { + if let Ok(json) = serde_json::to_string(&entry) { + next_lines.push(json); + } else { + next_lines.push(line.to_string()); + } + } + Err(_) => next_lines.push(line.to_string()), + } + } + + if !replaced { + return false; + } + + if fs::write(&file_path, format!("{}\n", next_lines.join("\n"))).is_err() { + return false; + } + self.record_usage(text); + true + } + + pub fn record_failure( + &mut self, + message: &str, + audio_path: Option, + retry_of: Option, + ) -> String { + let now = Local::now(); + let ts = now.to_rfc3339(); + let entry = HistoryEntry { + ts: ts.clone(), + text: message.to_string(), + chars: 0, + status: "failed".to_string(), + audio_path, + error: Some(message.to_string()), + retry_of, + }; + self.append_history(&entry); + ts + } + pub fn get_stats(&self) -> &Stats { &self.stats } @@ -123,6 +222,10 @@ impl StatsService { } pub fn delete_history(&mut self, ts: &str) { + self.delete_history_entry(ts, true); + } + + pub fn delete_history_entry(&mut self, ts: &str, delete_audio: bool) { if let Ok(d) = chrono::DateTime::parse_from_rfc3339(ts) { let local = d.with_timezone(&Local); let key = local.format("%Y-%m-%d").to_string(); @@ -131,12 +234,18 @@ impl StatsService { if file_path.exists() { if let Ok(content) = fs::read_to_string(&file_path) { let lines: Vec<&str> = content.lines().filter(|l| !l.is_empty()).collect(); + let mut removed_audio_paths = Vec::new(); let new_lines: Vec = lines .iter() - .filter(|line| { - serde_json::from_str::(line) - .map(|e| e.ts != ts) - .unwrap_or(true) + .filter(|line| match serde_json::from_str::(line) { + Ok(e) if e.ts == ts => { + if let Some(path) = e.audio_path { + removed_audio_paths.push(path); + } + false + } + Ok(_) => true, + Err(_) => true, }) .map(|s| s.to_string()) .collect(); @@ -147,12 +256,23 @@ impl StatsService { } else { let _ = fs::write(&file_path, format!("{}\n", new_lines.join("\n"))); } + if delete_audio { + for path in removed_audio_paths { + let _ = fs::remove_file(path); + } + } } } } } } + pub fn find_history(&self, ts: &str) -> Option { + self.get_history(365) + .into_iter() + .find(|entry| entry.ts == ts) + } + fn flush_stats(&mut self) { self.prune_daily_counts(); let path = self.data_dir.join("stats.json"); @@ -161,6 +281,22 @@ impl StatsService { } } + fn record_usage(&mut self, text: &str) { + if text.is_empty() { + return; + } + + let now = Local::now(); + if self.stats.first_used_at.is_none() { + self.stats.first_used_at = Some(now.to_rfc3339()); + } + self.stats.total_sessions += 1; + self.stats.total_characters += text.len() as u64; + let key = now.format("%Y-%m-%d").to_string(); + *self.stats.daily_counts.entry(key).or_insert(0) += text.len() as u64; + self.flush_stats(); + } + fn prune_daily_counts(&mut self) { let cutoff = Local::now() - chrono::Duration::days(MAX_DAILY_COUNTS_DAYS); let cutoff_key = cutoff.format("%Y-%m-%d").to_string(); @@ -194,6 +330,10 @@ fn is_date_key(s: &str) -> bool { chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d").is_ok() } +fn default_history_status() -> String { + "success".to_string() +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- @@ -233,6 +373,10 @@ mod tests { ts: "2025-01-01T00:00:00+00:00".to_string(), text: "hello".to_string(), chars: 5, + status: "success".to_string(), + audio_path: None, + error: None, + retry_of: None, }; let json = serde_json::to_string(&entry).unwrap(); assert!(json.contains("hello")); @@ -250,7 +394,7 @@ mod tests { #[test] fn record_session_increments_counters() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("hello world"); + svc.record_session_with_audio("hello world", None, None); let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 1); assert_eq!(stats.total_characters, 11); // "hello world".len() @@ -260,7 +404,7 @@ mod tests { #[test] fn record_session_empty_text_ignored() { let (mut svc, _dir) = new_stats_service(); - svc.record_session(""); + svc.record_session_with_audio("", None, None); let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 0); } @@ -268,8 +412,8 @@ mod tests { #[test] fn record_session_multiple_increments() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("first"); - svc.record_session("second"); + svc.record_session_with_audio("first", None, None); + svc.record_session_with_audio("second", None, None); let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 2); assert_eq!(stats.total_characters, 11); @@ -278,7 +422,7 @@ mod tests { #[test] fn daily_counts_populated() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("test"); + svc.record_session_with_audio("test", None, None); let stats = svc.get_stats(); assert_eq!(stats.daily_counts.len(), 1); let today = chrono::Local::now().format("%Y-%m-%d").to_string(); @@ -348,6 +492,26 @@ mod tests { assert_eq!(history[0].text, "old"); } + #[test] + fn replace_history_with_success_updates_entry_in_place() { + let today = chrono::Local::now().format("%Y-%m-%d").to_string(); + let (mut svc, _dir) = new_stats_service(); + let failure_ts = svc.record_failure("timeout", Some("/tmp/retry.wav".to_string()), None); + + assert_eq!(failure_ts[0..10], today); + assert!(svc.replace_history_with_success(&failure_ts, "重试成功", None)); + + let history = svc.get_history(365); + assert_eq!(history.len(), 1); + assert_eq!(history[0].ts, failure_ts); + assert_eq!(history[0].text, "重试成功"); + assert_eq!(history[0].status, "success"); + assert_eq!(history[0].chars, "重试成功".len()); + assert!(history[0].audio_path.is_none()); + assert!(history[0].error.is_none()); + assert_eq!(svc.get_stats().total_sessions, 1); + } + #[test] fn delete_history_removes_entry() { let dir = tempdir().unwrap(); diff --git a/web/index.html b/web/index.html index 57d6d3a..423caf3 100644 --- a/web/index.html +++ b/web/index.html @@ -34,6 +34,13 @@ + diff --git a/web/src/bridge/overlay.ts b/web/src/bridge/overlay.ts index 69b40aa..0142765 100644 --- a/web/src/bridge/overlay.ts +++ b/web/src/bridge/overlay.ts @@ -67,3 +67,8 @@ export async function sendAudioWarmupFailed(payload: { message?: string } = {}): export async function getConfig(): Promise { return invoke("get_app_config"); } + +/** Retry the latest failed recording directly from the overlay. */ +export async function retryLatestFailedTranscription(): Promise { + await invoke("retry_latest_failed_transcription"); +} diff --git a/web/src/bridge/settings.ts b/web/src/bridge/settings.ts index 8fde11b..c1f4d6d 100644 --- a/web/src/bridge/settings.ts +++ b/web/src/bridge/settings.ts @@ -177,10 +177,18 @@ export async function getHistory(daysBack = 1): Promise { return invoke("get_history", { daysBack }); } -export async function deleteHistory(ts: number): Promise { +export async function deleteHistory(ts: string): Promise { return invoke("delete_history", { ts }); } +export async function playSoundFile(filePath: string): Promise { + return invoke("play_sound_file", { filePath }); +} + +export async function retryHistoryTranscription(ts: string): Promise { + return invoke("retry_history_transcription", { ts }); +} + // ---- Prompts ---- export async function loadPrompts(): Promise { diff --git a/web/src/ui/main-overlay.ts b/web/src/ui/main-overlay.ts index 6cc04a5..922f530 100644 --- a/web/src/ui/main-overlay.ts +++ b/web/src/ui/main-overlay.ts @@ -9,6 +9,7 @@ import { getConfig, notifyAudioStopped, onOverlayEvent, + retryLatestFailedTranscription, sendAudioChunk, sendAudioWarmupFailed, sendAudioWarmupReady, @@ -28,6 +29,7 @@ interface OverlayState { hintText: string; hintLevel: HintLevel; hintVariant: string; + retryHotkey: string; appState: AppState; audioReady: boolean; mediaStream: MediaStream | null; @@ -40,6 +42,8 @@ interface OverlayState { layoutWrap: boolean; renderedWidth: number; waveBarLevels: number[]; + retryVisible: boolean; + retrying: boolean; } interface AppearanceConfig { @@ -56,6 +60,7 @@ const state: OverlayState = { hintText: "", hintLevel: "info", hintVariant: "text", + retryHotkey: "", appState: "idle", audioReady: false, mediaStream: null, @@ -68,6 +73,8 @@ const state: OverlayState = { layoutWrap: false, renderedWidth: 0, waveBarLevels: [], + retryVisible: false, + retrying: false, }; // ---- DOM elements ---- @@ -88,9 +95,12 @@ const elements = { transcript: getEl("transcript"), measureText: getEl("measureText"), statusBars: getEl("statusBars"), + retryButton: getEl("retryButton") as HTMLButtonElement, + retryLabel: getEl("retryLabel"), }; const statusBarItems = Array.from(elements.statusBars.querySelectorAll(".status-bar")); +let retryHideTimer = 0; // ---- Appearance ---- @@ -111,9 +121,7 @@ function applyAppearance(cfg: AppearanceConfig = {}): void { currentAppearance.overlayStyle = cfg.overlayStyle || "liquid"; currentAppearance.theme = cfg.theme || "system"; const isMac = cfg.platform === "macos"; - if (elements.stage) { - elements.stage.style.display = isMac ? "none" : ""; - } + syncStageVisibility(); const isVibrancy = isMac && cfg.overlayStyle === "vibrancy"; elements.bubble.classList.toggle("platform-mac", isMac); elements.bubble.classList.toggle("platform-win", !isMac); @@ -124,6 +132,11 @@ function applyAppearance(cfg: AppearanceConfig = {}): void { ); } +function syncStageVisibility(): void { + const isMac = currentAppearance.platform === "macos"; + elements.stage.style.display = isMac ? "none" : ""; +} + // ---- Waveform ---- let waveformRaf = 0; @@ -208,9 +221,15 @@ function getVisibleHintText(): string { const visualState: string = state.appState === "recording" && !state.audioReady ? "connecting" : state.appState; if (visualState === "connecting") return isZhLocale ? "准备中…" : "Preparing…"; + if (visualState === "finishing" && state.hintVariant === "retry") { + // Placeholder until the replayed transcript starts streaming in. + if (!state.finalText && !state.partialText) return isZhLocale ? "重试中…" : "Retrying…"; + return ""; + } if (visualState === "finishing" && state.hintVariant === "progress") { return isZhLocale ? "思考中…" : "Thinking…"; } + // The retry label + hotkey live inside the retry button, not in the message. return state.hintText || ""; } @@ -218,6 +237,30 @@ function shouldShowHint(): boolean { return Boolean(getVisibleHintText()); } +function clearRetryTimer(): void { + if (retryHideTimer) { + window.clearTimeout(retryHideTimer); + retryHideTimer = 0; + } +} + +function showRetryAction(): void { + clearRetryTimer(); + state.retryVisible = true; + state.retrying = false; + retryHideTimer = window.setTimeout(() => { + state.retryVisible = false; + state.retrying = false; + updateView(); + }, 5000); +} + +function hideRetryAction(): void { + clearRetryTimer(); + state.retryVisible = false; + state.retrying = false; +} + // ---- Layout ---- let resizeRaf = 0; @@ -250,7 +293,8 @@ function scheduleResize(): void { const indicatorWidth = 22 + 12; const waveformWidth = state.appState === "recording" ? 18 + 12 : 0; - const chrome = 14 + 16 + 2 + indicatorWidth + waveformWidth; + const retryWidth = state.retryVisible ? 22 + 8 : 0; + const chrome = 14 + 16 + 2 + indicatorWidth + waveformWidth + retryWidth; const textSlack = 10; const singleLineLimit = 520; const multiLineWidth = 520; @@ -308,6 +352,17 @@ function updateView(): void { elements.stage.dataset.state = visualState; elements.stage.dataset.mode = hasHint ? "hint" : "transcript"; + elements.stage.dataset.retry = + state.retryVisible && state.hintLevel === "error" ? "true" : "false"; + elements.stage.dataset.retrying = state.retrying ? "true" : "false"; + elements.retryButton.disabled = state.retrying || !state.retryVisible; + // Label + hotkey live inside the button, e.g. "重试 (R ⌥)". + elements.retryLabel.textContent = state.retryHotkey + ? `${isZhLocale ? "重试" : "Retry"} (${state.retryHotkey})` + : isZhLocale + ? "重试" + : "Retry"; + syncStageVisibility(); elements.finalText.textContent = showTranscript ? state.finalText : ""; elements.partialText.textContent = showTranscript ? state.partialText : ""; if (showTranscript) scrollTranscriptToBottom(); @@ -334,7 +389,9 @@ function resetState(): void { state.hintText = ""; state.hintLevel = "info"; state.hintVariant = "text"; + state.retryHotkey = ""; state.audioReady = false; + hideRetryAction(); state.layoutWidth = 0; state.layoutWrap = false; state.renderedWidth = 0; @@ -655,6 +712,7 @@ onOverlayEvent(async (event: OverlayEvent) => { break; case "state": state.appState = (payload as { state: AppState }).state; + if (state.appState !== "idle") hideRetryAction(); if (state.appState === "idle" || state.appState === "connecting") state.audioReady = false; if (state.appState === "idle") { // Session over: suspend the cue context. No-op if the end cue is still @@ -733,10 +791,22 @@ onOverlayEvent(async (event: OverlayEvent) => { break; } case "hint": { - const p = payload as { text?: string; level?: HintLevel; variant?: string }; + const p = payload as { + text?: string; + level?: HintLevel; + variant?: string; + retryable?: boolean; + hotkey?: string; + }; state.hintText = p.text || ""; state.hintLevel = p.level || "info"; state.hintVariant = p.variant || "text"; + state.retryHotkey = p.hotkey || ""; + if (p.retryable === true && state.hintLevel === "error" && state.hintText) { + showRetryAction(); + } else { + hideRetryAction(); + } updateView(); break; } @@ -754,6 +824,25 @@ onOverlayEvent(async (event: OverlayEvent) => { } }); +elements.retryButton.addEventListener("click", async (event) => { + event.preventDefault(); + event.stopPropagation(); + if (!state.retryVisible || state.retrying) return; + clearRetryTimer(); + state.retrying = true; + updateView(); + try { + await retryLatestFailedTranscription(); + hideRetryAction(); + } catch (error) { + state.hintText = (error as Error).message || String(error) || "重试失败"; + state.hintLevel = "error"; + state.hintVariant = "text"; + showRetryAction(); + } + updateView(); +}); + window.addEventListener("beforeunload", () => { stopAudioCapture(); }); diff --git a/web/src/ui/pages/AppSettingsPage.tsx b/web/src/ui/pages/AppSettingsPage.tsx index 0660836..ece0149 100644 --- a/web/src/ui/pages/AppSettingsPage.tsx +++ b/web/src/ui/pages/AppSettingsPage.tsx @@ -175,7 +175,6 @@ export function AppSettingsPage() { } /> + setAppBool("keep_recordings", v)} + /> + } + /> diff --git a/web/src/ui/pages/HomePage.tsx b/web/src/ui/pages/HomePage.tsx index af1cf92..1600a40 100644 --- a/web/src/ui/pages/HomePage.tsx +++ b/web/src/ui/pages/HomePage.tsx @@ -1,6 +1,12 @@ -import { Copy, Trash2 } from "lucide-react"; +import { Copy, Play, RefreshCw, Trash2 } from "lucide-react"; import { useCallback, useEffect, useState } from "react"; -import { deleteHistory, getHistory, getStats } from "@/bridge/settings"; +import { + deleteHistory, + getHistory, + getStats, + playSoundFile, + retryHistoryTranscription, +} from "@/bridge/settings"; import { formatCompact } from "@/lib/format"; import { Button } from "@/ui/components/Button"; import { Heatmap } from "@/ui/components/Heatmap"; @@ -49,8 +55,12 @@ interface Stats { totalCharacters?: number; } interface HistoryItem { - ts: number; + ts: string; text: string; + status?: "success" | "failed"; + audioPath?: string; + error?: string; + retryOf?: string; } /* ---------- component ---------- */ @@ -59,6 +69,7 @@ export function HomePage() { const [stats, setStats] = useState(null); const [history, setHistory] = useState([]); const [days, setDays] = useState(1); + const [retryingTs, setRetryingTs] = useState(null); const load = useCallback(async () => { try { @@ -153,6 +164,11 @@ export function HomePage() { const show = dk !== last; last = dk; const time = `${String(d.getHours()).padStart(2, "0")}:${String(d.getMinutes()).padStart(2, "0")}`; + const failed = item.status === "failed"; + const retrying = retryingTs === item.ts; + const displayText = failed + ? `转写失败:${item.error || item.text || "请检查网络连接"}` + : item.text; return (
{show && ( @@ -162,27 +178,90 @@ export function HomePage() {
)} -
+
{time}
-

{item.text}

+

+ {displayText} +

-
- +
+ {item.audioPath && ( + + )} + {failed ? ( + + ) : ( + + )}