diff --git a/config.yaml.example b/config.yaml.example index 7ae25a3..50707e4 100644 --- a/config.yaml.example +++ b/config.yaml.example @@ -9,6 +9,7 @@ app: hotkey_mode: toggle # toggle=按一次开始再按一次结束,hold=按住说话松开结束 remove_trailing_period: true # 自动删除识别结果末尾的句号 keep_clipboard: true # 保留识别结果在剪贴板 + keep_recordings: false # 是否保留成功录音;关闭时仅保留失败录音用于重试 theme: system # dark / light / system overlay_style: liquid # macOS 悬浮窗外观(仅 macOS 生效) sound: diff --git a/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md new file mode 100644 index 0000000..f9192c6 --- /dev/null +++ b/docs/superpowers/specs/2026-06-21-native-overlay-retry-audio-design.md @@ -0,0 +1,109 @@ +# Native Overlay, Retry, and Recording Asset Design + +- Date: 2026-06-21 +- Status: Approved +- Branch: `codex/native-cpal-capture` + +## Goal + +Make the macOS recording main path independent of WebView control logic, then make ASR failure and late-result behavior recoverable by using saved WAV recordings as retryable transcription assets. + +The user-visible behavior outside recording, overlay feedback, retry, and history playback should remain unchanged. + +## Execution Order + +1. Native overlay and native cue playback. +2. Fix late ASR result handling. +3. Add recording asset, history playback, retry, and retention policy. + +## Phase 1: Native Main Path + +On macOS, the recording main path should no longer depend on WebView for recording lifecycle control or cue playback. + +The existing native overlay remains the visual surface. It should handle actionable failure states directly, including a retry icon button. The retry control must be visually subtle and fit the current glass pill style. Its maximum visual footprint must not exceed the current recording waveform element, so the control remains refined rather than dominant. + +Failure overlay behavior: + +- Show the existing failure text style. 
+- Show a refresh-style icon button only, without text. +- Display a 5-second countdown affordance around the retry button. +- If the user does not click within 5 seconds, hide the overlay. +- The failed transcription attempt remains available in input history when a WAV exists. + +Cue playback should move to native playback on macOS so start/end cues do not depend on the overlay WebView. Windows can keep the current WebView path unless the native implementation is naturally cross-platform. + +## Phase 2: Late ASR Result Handling + +The current Doubao flow can return a partial result when `commit_and_await_final` times out after 5 seconds, while the server may continue sending a more complete result afterward. This causes premature paste of incomplete text. + +The fix should prefer correctness over premature paste: + +- Do not paste a partial result merely because the 5-second commit wait elapsed. +- If the session has not produced a reliable final result by the deadline, mark the attempt as failed or retryable instead of pasting known-incomplete text. +- If a definite final result or terminal close arrives within the accepted completion window, paste normally. +- The saved WAV should make manual retry cheap, so retryable failure is better than silently pasting partial text. + +## Phase 3: Recording Assets and Retry + +Each transcription attempt should have a durable record that can represent success or failure. + +History entries should support: + +- `status`: success or failed. +- `text`: successful final text, or a short failure description. +- `audioPath`: saved WAV path when available. +- `error`: failure reason when applicable. +- `retryOf`: optional original entry timestamp or ID. + +Successful entries continue to count toward usage statistics. Failed entries should appear in input history but should not increase total session or character counts. + +Retry behavior: + +- Retry uses the saved WAV, not the microphone. 
+- Retry can be triggered from the native failure overlay within 5 seconds. +- Retry can also be triggered from the input history on the Settings home page. +- A successful retry creates or updates a successful history record and follows the normal paste/clipboard/statistics path. +- If recording retention is disabled, the failed WAV is deleted after a retry succeeds. + +History UI behavior: + +- Successful rows show play, copy, and delete icon buttons. +- Failed rows show play, retry, and delete icon buttons. +- Buttons must match the current input-record action style: orange solid rounded-square icon buttons with white line icons. + +## Recording Retention Setting + +Add an app setting for whether to retain recordings. + +Default: disabled. + +When enabled: + +- Keep successful and failed recordings for the most recent month. +- Prune older recordings and their history references. + +When disabled: + +- Keep only recordings needed for failed retryable entries. +- Delete recordings after successful transcription or successful retry. + +## Testing + +Backend: + +- Unit tests for history serialization/backward compatibility. +- Unit tests for retention pruning decisions. +- Tests for retrying a WAV through the same ASR path where practical. +- Tests for Doubao commit timeout behavior so partial text is not treated as successful final output. + +Frontend/settings: + +- Tests for history rows with success and failure states. +- Tests for play/retry button bridge calls. + +Manual: + +- A network timeout creates a failed history entry with a WAV. +- Native overlay retry starts a transcription attempt from the WAV. +- Settings history retry works after the overlay disappears. +- A successful retry removes the failed-only WAV when retention is disabled. 
diff --git a/package.json b/package.json index 96d3160..681bcb2 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "scripts": { "tauri": "tauri", "dev": "tauri dev", + "dev:no-watch": "tauri dev --no-watch", "dev:web": "vite", "build:web": "vite build", "pack": "tsx --env-file=.env scripts/pack.ts", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index fb6741a..cdf92d4 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -59,3 +59,6 @@ wiremock = "0.6" objc2 = "0.6" objc2-app-kit = "0.3" objc2-foundation = "0.3" +# Native microphone capture is macOS-only (native_audio is cfg(macos)); keeping +# cpal off other targets avoids pulling ALSA (libasound2-dev) into the Linux CI. +cpal = "0.15" diff --git a/src-tauri/src/app_state.rs b/src-tauri/src/app_state.rs index 4f18af6..308c8ef 100644 --- a/src-tauri/src/app_state.rs +++ b/src-tauri/src/app_state.rs @@ -32,6 +32,15 @@ pub struct AppInner { /// once it attaches. Always accessed while holding `asr_session` to stay /// ordered against the drain. pub pending_audio: Mutex>>, + /// Full-session 16k mono PCM captured from the same stream sent to ASR. + /// Saved as a WAV when a recording is finalized, so a failed transcription can be retried. + pub recording_audio: Mutex>, + /// WAV path for the current recording once saved. + pub current_recording_wav: Mutex>, + /// History timestamp of the failed entry currently being retried. + pub current_retry_of: Mutex>, + /// Timestamp of the latest failed history entry with a WAV; retryable from the overlay or hotkey. + pub current_failure_ts: Mutex>, /// Resolves when the background ASR connect finishes (Ok) or fails (Err). /// `stop_recording` awaits this when the user stops before the session is ready. pub connect_rx: Mutex>>>, @@ -44,6 +53,10 @@ pub struct AppInner { /// audio, so already-recognized text is accumulated here and prepended to the /// new session's output. Reset at the start of every recording. 
pub accumulated_text: Mutex, + /// Native microphone capture used on macOS to avoid WebView/WebRTC input + /// processing. Other platforms keep the renderer getUserMedia path. + #[cfg(target_os = "macos")] + pub native_audio: Mutex>, } pub type AppHandle = Arc; @@ -67,8 +80,14 @@ pub fn create_app_state( pending_audio_warmup: Mutex::new(None), latest_transcript: Mutex::new((String::new(), String::new())), pending_audio: Mutex::new(Vec::new()), + recording_audio: Mutex::new(Vec::new()), + current_recording_wav: Mutex::new(None), + current_retry_of: Mutex::new(None), + current_failure_ts: Mutex::new(None), connect_rx: Mutex::new(None), session_epoch: std::sync::atomic::AtomicU64::new(0), accumulated_text: Mutex::new(String::new()), + #[cfg(target_os = "macos")] + native_audio: Mutex::new(None), }) } diff --git a/src-tauri/src/asr/doubao.rs b/src-tauri/src/asr/doubao.rs index 1c0b727..60b3a3b 100644 --- a/src-tauri/src/asr/doubao.rs +++ b/src-tauri/src/asr/doubao.rs @@ -629,8 +629,6 @@ impl AsrEngine for DoubaoEngine { let session = DoubaoSession { is_ready: is_ready.clone(), is_committed: is_committed.clone(), - final_text: final_text.clone(), - latest_result_text: latest_result_text.clone(), writer_tx, commit_tx: commit_tx.clone(), }; @@ -930,8 +928,6 @@ impl AsrEngine for DoubaoEngine { struct DoubaoSession { is_ready: Arc, is_committed: Arc, - final_text: Arc>, - latest_result_text: Arc>, /// Sends frames to the dedicated writer task. A single FIFO consumer keeps /// frames ordered and guarantees the last packet is written after all audio. 
writer_tx: mpsc::UnboundedSender, @@ -979,14 +975,9 @@ impl AsrSession for DoubaoSession { let (tx, rx) = tokio::sync::oneshot::channel(); *self.commit_tx.lock().await = Some(tx); - match tokio::time::timeout(std::time::Duration::from_secs(5), rx).await { + match tokio::time::timeout(std::time::Duration::from_secs(15), rx).await { Ok(Ok(text)) => Ok(text), - _ => { - // Timeout: use whatever we have - let latest = self.latest_result_text.lock().await.clone(); - let final_t = self.final_text.lock().await.clone(); - Ok(if latest.is_empty() { final_t } else { latest }) - } + _ => Err("ASR 最终结果超时,请检查网络连接".to_string()), } } diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index e61d4c7..a84444e 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -271,6 +271,23 @@ pub async fn delete_history( Ok(serde_json::json!({ "ok": true })) } +/// Retry a failed history entry by replaying its saved WAV through ASR. +#[tauri::command] +pub async fn retry_history_transcription( + app: AppHandle, + ts: String, +) -> Result { + crate::retry_history_transcription(app, ts).await +} + +/// Retry the latest failed recording from the overlay retry button. +#[tauri::command] +pub async fn retry_latest_failed_transcription( + app: AppHandle, +) -> Result { + crate::retry_latest_failed_transcription(app).await +} + /// Compute a 0..1 loudness level from f32 PCM samples for the overlay waveform. /// Mirrors the web AnalyserNode mapping (RMS + peak, mild compression). #[cfg(target_os = "macos")] @@ -289,49 +306,38 @@ fn compute_audio_level(samples: &[f32]) -> Option { Some((rms * 13.0 + peak * 2.8).powf(0.82).min(1.0)) } -/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), -/// decode to f32 samples and forward to the active ASR session. 
-#[tauri::command] -pub async fn send_audio_chunk( - _app: AppHandle, - state: State<'_, AppState>, - base64_chunk: String, -) -> Result { - use base64::Engine as _; +pub(crate) async fn append_audio_samples( + app: &AppHandle, + state: &AppState, + samples: Vec, +) -> bool { use std::sync::atomic::{AtomicU64, Ordering}; static CHUNK_COUNT: AtomicU64 = AtomicU64::new(0); let n = CHUNK_COUNT.fetch_add(1, Ordering::Relaxed); if n == 0 || n.is_multiple_of(50) { log_audio!( debug, - "Received chunk #{} ({} bytes base64)", + "Received audio chunk #{} ({} samples)", n, - base64_chunk.len() + samples.len() ); } - // Decode base64 → i16 PCM bytes → f32 samples - let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { - Ok(data) => data, - Err(_) => { - log_audio!(warn, "Chunk #{} base64 decode failed", n); - return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); - } - }; - let samples: Vec = bytes - .chunks_exact(2) - .map(|chunk| { - let sample = i16::from_le_bytes([chunk[0], chunk[1]]); - sample as f32 / 32768.0 - }) - .collect(); + state + .recording_audio + .lock() + .await + .extend_from_slice(&samples); // Drive the native waveform (macOS only) from the same PCM the ASR receives, // whether the chunk is sent immediately or buffered. #[cfg(target_os = "macos")] if let Some(level) = compute_audio_level(&samples) { - crate::overlay::set_audio_level(&_app, level); + crate::overlay::set_audio_level(app, level); } + // `app` only drives the macOS native waveform above; unused on other platforms. 
+ #[cfg(not(target_os = "macos"))] + let _ = app; // Hold the `asr_session` lock across the decision so buffering stays ordered // against the background connect task's drain (same lock), guaranteeing no @@ -340,7 +346,7 @@ pub async fn send_audio_chunk( if let Some(ref session) = *session { if session.is_ready() { session.append_audio(&samples); - return Ok(serde_json::json!({ "ok": true })); + return false; } } @@ -350,6 +356,7 @@ pub async fn send_audio_chunk( let mut pending = state.pending_audio.lock().await; if pending.len() < MAX_PENDING_CHUNKS { pending.push(samples); + return true; } else if n.is_multiple_of(50) { log_audio!( warn, @@ -358,7 +365,37 @@ pub async fn send_audio_chunk( n ); } - Ok(serde_json::json!({ "ok": true, "buffered": true })) + true +} + +/// Receive an audio chunk from the renderer (base64-encoded i16 PCM), +/// decode to f32 samples and forward to the active ASR session. +#[tauri::command] +pub async fn send_audio_chunk( + app: AppHandle, + state: State<'_, AppState>, + base64_chunk: String, +) -> Result { + use base64::Engine as _; + + // Decode base64 → i16 PCM bytes → f32 samples + let bytes = match base64::engine::general_purpose::STANDARD.decode(&base64_chunk) { + Ok(data) => data, + Err(_) => { + log_audio!(warn, "Audio chunk base64 decode failed"); + return Ok(serde_json::json!({ "ok": false, "message": "音频数据解码失败" })); + } + }; + let samples: Vec = bytes + .chunks_exact(2) + .map(|chunk| { + let sample = i16::from_le_bytes([chunk[0], chunk[1]]); + sample as f32 / 32768.0 + }) + .collect(); + + let buffered = append_audio_samples(&app, &state, samples).await; + Ok(serde_json::json!({ "ok": true, "buffered": buffered })) } /// Notify that audio has stopped in the renderer. 
diff --git a/src-tauri/src/config.rs b/src-tauri/src/config.rs index f1b4d54..3f54d47 100644 --- a/src-tauri/src/config.rs +++ b/src-tauri/src/config.rs @@ -140,6 +140,8 @@ pub struct AppSettings { pub remove_trailing_period: bool, #[serde(default = "default_true")] pub keep_clipboard: bool, + #[serde(default)] + pub keep_recordings: bool, #[serde(default = "default_theme")] pub theme: String, #[serde(default = "default_overlay_style")] @@ -812,6 +814,7 @@ impl Default for AppConfig { hotkey_mode: default_hotkey_mode(), remove_trailing_period: true, keep_clipboard: true, + keep_recordings: false, theme: default_theme(), overlay_style: default_overlay_style(), sound: None, diff --git a/src-tauri/src/hotkey.rs b/src-tauri/src/hotkey.rs index 209a7ec..3052d83 100644 --- a/src-tauri/src/hotkey.rs +++ b/src-tauri/src/hotkey.rs @@ -505,7 +505,7 @@ fn run_listener_loop(tap: &Tap, config: &HotkeyConfig, app_handle: &tauri::AppHa escape_was_pressed = true; let handle = app_handle.clone(); tauri::async_runtime::spawn(async move { - crate::cancel_recording(handle).await; + crate::on_escape(handle).await; }); } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 0d31398..24657b1 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -9,6 +9,8 @@ mod hotword; mod llm; mod migration; mod model; +#[cfg(target_os = "macos")] +mod native_audio; mod overlay; mod paste; mod stats; @@ -26,8 +28,12 @@ use tauri::{ }; /// Delay after the mic stream is ready, before entering Recording / playing the -/// start cue. Gives the browser AEC/AGC time to converge so the first words are -/// not attenuated. Trade-off: added latency between key press and "go". +/// start cue. The renderer (getUserMedia) path needs it so the browser's AEC/AGC +/// converge before the first words. Native cpal capture has no such DSP warmup, +/// so macOS uses 0. Raise it again if dropped leading words or cue glitches reappear. 
+#[cfg(target_os = "macos")] +const AUDIO_SETTLE_MS: u64 = 0; +#[cfg(not(target_os = "macos"))] const AUDIO_SETTLE_MS: u64 = 350; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -156,6 +162,8 @@ pub fn run() { commands::get_stats, commands::get_history, commands::delete_history, + commands::retry_history_transcription, + commands::retry_latest_failed_transcription, commands::send_audio_chunk, commands::audio_stopped, commands::audio_warmup_ready, @@ -262,6 +270,107 @@ fn position_overlay(app_handle: &AppHandle) { } } +fn set_overlay_retry_interaction(app_handle: &AppHandle, enabled: bool) { + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.set_ignore_cursor_events(!enabled); + } + if enabled { + // The user's app is still frontmost here; remember it so a successful retry + // can return focus before pasting (clicking the retry button activates the + // overlay otherwise). Self-capture is filtered out inside the helper. + overlay::capture_foreground_app(app_handle); + } +} + +/// Map one accelerator token to the symbol the settings UI shows. Mirrors the +/// frontend `normalizeHotkeyLabel` so the overlay label matches system settings. +fn normalize_hotkey_key(key: &str) -> &str { + match key { + "CmdOrCtrl" | "CommandOrControl" | "Command" | "Cmd" | "Meta" => "⌘", + "Control" | "Ctrl" => "⌃", + "Shift" => "⇧", + "Alt" | "Option" => "⌥", + "Space" => "␣", + "ControlLeft" => "L ⌃", + "ControlRight" => "R ⌃", + "ShiftLeft" => "L ⇧", + "ShiftRight" => "R ⇧", + "AltLeft" => "L ⌥", + "AltRight" => "R ⌥", + "MetaLeft" => "L ⌘", + "MetaRight" => "R ⌘", + other => other, + } +} + +/// Format an accelerator string ("AltRight", "Control+Space") into the symbol +/// label shown in settings ("R ⌥", "⌃ ␣"). +fn format_hotkey_label(hotkey: &str) -> String { + hotkey + .split('+') + .map(|k| normalize_hotkey_key(k.trim())) + .collect::>() + .join(" ") +} + +/// The configured main hotkey, formatted for display. 
Empty for recorded keycode +/// sequences (which have no stable accelerator string). +async fn current_hotkey_label(app_inner: &Arc) -> String { + let Ok(config) = app_inner.config_manager.load_config() else { + return String::new(); + }; + match &config.app.hotkey { + serde_norway::Value::String(s) => format_hotkey_label(s), + _ => String::new(), + } +} + +/// Emit a retryable error hint, tagged with the main hotkey label so the overlay +/// can show which key (also) triggers the retry. Centralizes every failure path. +async fn emit_retryable_error_hint( + app: &AppHandle, + app_inner: &Arc, + text: &str, +) { + let hotkey = current_hotkey_label(app_inner).await; + let _ = app.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { + "text": text, + "level": "error", + "variant": "text", + "retryable": true, + "hotkey": hotkey + } + }), + ); +} + +fn schedule_retry_overlay_hide(app_handle: AppHandle, app_inner: Arc) { + // While the retry is shown (idle), keep ESC live so it can dismiss the failure. + // set_app_state(Idle) just disabled it, so re-enable it here. + set_escape_enabled_now(&app_handle, true); + tauri::async_runtime::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + let still_idle = { + let s = app_inner.state.lock().await; + matches!(*s, app_state::AppState::Idle) + }; + if still_idle { + // The retry affordance is gone once the overlay hides, so drop the + // pending failure: the hotkey reverts to starting a new recording. 
+ *app_inner.current_failure_ts.lock().await = None; + set_escape_enabled_now(&app_handle, false); + set_overlay_retry_interaction(&app_handle, false); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + } + }); +} + fn app_state_name(state: &app_state::AppState) -> &'static str { match state { app_state::AppState::Idle => "idle", @@ -371,14 +480,19 @@ fn resolve_configured_sound_path( /// dedicated, kept-warm AudioContext, so the cue is full-volume and never /// truncated. Falls back to `afplay` only if the file cannot be read. fn emit_cue(app: &AppHandle, app_inner: &Arc, name: &str) { - use base64::Engine as _; - let Some(file_path) = resolve_configured_sound_path(app, app_inner, name) else { return; }; + #[cfg(target_os = "macos")] + { + crate::paste::play_sound(&file_path); + } + + #[cfg(not(target_os = "macos"))] match std::fs::read(&file_path) { Ok(bytes) => { + use base64::Engine as _; let data = base64::engine::general_purpose::STANDARD.encode(&bytes); let _ = app.emit( "overlay:event", @@ -401,6 +515,7 @@ fn emit_cue(app: &AppHandle, app_inner: &Arc, name: &str) { } } +#[cfg(not(target_os = "macos"))] async fn stop_renderer_audio( app: &AppHandle, app_inner: &Arc, @@ -424,6 +539,230 @@ async fn stop_renderer_audio( } } +async fn stop_audio_capture( + app: &AppHandle, + app_inner: &Arc, + timeout_ms: u64, +) { + #[cfg(target_os = "macos")] + { + native_audio::stop_capture(app_inner).await; + let _ = timeout_ms; + let _ = app; + } + + #[cfg(not(target_os = "macos"))] + stop_renderer_audio(app, app_inner, timeout_ms).await; +} + +async fn save_recording_wav( + app: &AppHandle, + app_inner: &Arc, +) -> Option { + let samples = { + let mut audio = app_inner.recording_audio.lock().await; + if audio.is_empty() { + return app_inner.current_recording_wav.lock().await.clone(); + } + std::mem::take(&mut *audio) + }; + + let data_dir = match app.path().app_data_dir() { + Ok(dir) => dir, + Err(error) => { + log_audio!( + 
warn, + "Resolve app data dir for recording WAV failed: {}", + error + ); + return None; + } + }; + let output_dir = data_dir.join("recordings"); + if let Err(error) = std::fs::create_dir_all(&output_dir) { + log_audio!( + warn, + "Create recording WAV directory failed ({}): {}", + output_dir.display(), + error + ); + return None; + } + + let ts = chrono::Local::now().format("%Y%m%d-%H%M%S%.3f"); + let path = output_dir.join(format!("voicepaste-{ts}.wav")); + match write_wav_16k_mono(&path, &samples) { + Ok(()) => { + log_audio!(info, "Recording WAV saved: {}", path.display()); + *app_inner.current_recording_wav.lock().await = Some(path.clone()); + Some(path) + } + Err(error) => { + log_audio!( + warn, + "Write recording WAV failed ({}): {}", + path.display(), + error + ); + None + } + } +} + +fn write_wav_16k_mono(path: &std::path::Path, samples: &[f32]) -> Result<(), String> { + const SAMPLE_RATE: u32 = 16_000; + const CHANNELS: u16 = 1; + const BYTES_PER_SAMPLE: u16 = 2; + + let data_bytes = samples.len() * BYTES_PER_SAMPLE as usize; + let riff_size = 36usize + .checked_add(data_bytes) + .ok_or_else(|| "WAV too large".to_string())?; + let mut wav = Vec::with_capacity(44 + data_bytes); + wav.extend_from_slice(b"RIFF"); + wav.extend_from_slice(&(riff_size as u32).to_le_bytes()); + wav.extend_from_slice(b"WAVE"); + wav.extend_from_slice(b"fmt "); + wav.extend_from_slice(&16u32.to_le_bytes()); + wav.extend_from_slice(&1u16.to_le_bytes()); + wav.extend_from_slice(&CHANNELS.to_le_bytes()); + wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes()); + wav.extend_from_slice(&(SAMPLE_RATE * CHANNELS as u32 * BYTES_PER_SAMPLE as u32).to_le_bytes()); + wav.extend_from_slice(&(CHANNELS * BYTES_PER_SAMPLE).to_le_bytes()); + wav.extend_from_slice(&(BYTES_PER_SAMPLE * 8).to_le_bytes()); + wav.extend_from_slice(b"data"); + wav.extend_from_slice(&(data_bytes as u32).to_le_bytes()); + for &sample in samples { + let pcm = (sample.clamp(-1.0, 1.0) * i16::MAX as f32) as i16; + 
wav.extend_from_slice(&pcm.to_le_bytes()); + } + + std::fs::write(path, wav).map_err(|e| e.to_string()) +} + +async fn current_recording_wav_string(app_inner: &Arc) -> Option { + app_inner + .current_recording_wav + .lock() + .await + .as_ref() + .map(|path| path.to_string_lossy().to_string()) +} + +async fn record_transcription_failure( + app: &AppHandle, + app_inner: &Arc, + message: &str, +) -> String { + let audio_path = current_recording_wav_string(app_inner).await; + let retry_of = app_inner.current_retry_of.lock().await.clone(); + let ts = app_inner + .stats + .lock() + .await + .record_failure(message, audio_path, retry_of); + *app_inner.current_failure_ts.lock().await = Some(ts.clone()); + set_overlay_retry_interaction(app, true); + ts +} + +fn prune_old_recordings(app: &AppHandle) { + let Ok(data_dir) = app.path().app_data_dir() else { + return; + }; + let recordings_dir = data_dir.join("recordings"); + let Ok(entries) = std::fs::read_dir(recordings_dir) else { + return; + }; + let cutoff = std::time::SystemTime::now() + .checked_sub(std::time::Duration::from_secs(31 * 24 * 60 * 60)) + .unwrap_or(std::time::UNIX_EPOCH); + + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("wav") { + continue; + } + let Ok(metadata) = entry.metadata() else { + continue; + }; + let Ok(modified) = metadata.modified() else { + continue; + }; + if modified < cutoff { + let _ = std::fs::remove_file(path); + } + } +} + +/// Heuristic: did this recording capture actual sound (speech) rather than +/// silence? Used to tell a genuine no-speech stop (end immediately) apart from +/// speech whose transcript was lost to a slow/failed network (keep commit + +/// retry). Biased toward "has sound" so real speech is never silently dropped. +fn recording_has_audio_signal(samples: &[f32]) -> bool { + // 16k mono. 
Native capture has no AEC, so the start cue bleeds into the mic + // at the very beginning; skip that leading window so the cue is never mistaken + // for speech. Anything the user actually says runs past it (and if they spoke + // inside it, a transcript would have arrived, short-circuiting this check). + const CUE_SKIP: usize = 11_200; // ~0.7s at 16k covers the start cue + echo tail + const MIN_VOICE: usize = 1_600; // need ~100ms of real audio after the cue + if samples.len() < CUE_SKIP + MIN_VOICE { + return false; + } + let tail = &samples[CUE_SKIP..]; + let peak = tail.iter().fold(0.0f32, |m, &s| m.max(s.abs())); + let rms = (tail.iter().map(|&s| s * s).sum::() / tail.len() as f32).sqrt(); + // A quiet mic noise floor sits well below these; speech clears both easily. + peak >= 0.02 && rms >= 0.004 +} + +/// Drop the WAV and recording bookkeeping for a recording that produced nothing +/// worth keeping (e.g. the user stopped without speaking). Nothing to retry. +async fn discard_recording_artifacts(app_inner: &Arc) { + if let Some(path) = app_inner.current_recording_wav.lock().await.take() { + let _ = std::fs::remove_file(path); + } + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + app_inner.recording_audio.lock().await.clear(); +} + +async fn record_success_and_apply_retention( + app: &AppHandle, + app_inner: &Arc, + text: &str, + keep_recordings: bool, +) { + let wav_path = app_inner.current_recording_wav.lock().await.take(); + let retry_of = app_inner.current_retry_of.lock().await.take(); + let audio_path = if keep_recordings { + wav_path + .as_ref() + .map(|path| path.to_string_lossy().to_string()) + } else { + if let Some(path) = wav_path { + let _ = std::fs::remove_file(path); + } + None + }; + let mut stats = app_inner.stats.lock().await; + if let Some(retry_ts) = retry_of.as_ref() { + if stats.replace_history_with_success(retry_ts, text, audio_path.clone()) { + drop(stats); + // Always prune: a 
never-retried failure recording is only deleted on a + // later success, otherwise it is reclaimed by the 31-day retention sweep, + // even when keep_recordings is off (only failure WAVs persist then). + prune_old_recordings(app); + return; + } + } + stats.record_session_with_audio(text, audio_path, retry_of); + drop(stats); + + prune_old_recordings(app); +} + +#[cfg(not(target_os = "macos"))] async fn wait_for_audio_warmup( app_inner: &Arc, timeout_ms: u64, @@ -555,6 +894,19 @@ struct ActivePromptId(std::sync::Mutex>); /// Handle hotkey press event. In toggle mode, toggles recording. In hold mode, starts recording. /// `prompt_id` is `Some(id)` when a prompt-template hotkey was triggered, `None` for the main hotkey. async fn on_hotkey_pressed(app_handle: AppHandle, mode: &str, prompt_id: Option) { + // Keyboard-driven retry: while a retryable failure is shown (idle, retry button + // visible), the main hotkey triggers the retry instead of a new recording, so + // the user can retry without reaching for the mouse. 
+ if prompt_id.is_none() { + let app_inner = app_handle.state::>(); + let can_retry = matches!(*app_inner.state.lock().await, app_state::AppState::Idle) + && app_inner.current_failure_ts.lock().await.is_some(); + if can_retry { + let _ = retry_latest_failed_transcription(app_handle.clone()).await; + return; + } + } + // Store the active prompt ID for the recording session if let Some(active) = app_handle.try_state::() { *active.0.lock().unwrap() = prompt_id; @@ -653,6 +1005,11 @@ async fn start_recording(app_handle: AppHandle) { } *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + app_inner.recording_audio.lock().await.clear(); + *app_inner.current_recording_wav.lock().await = None; + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + set_overlay_retry_interaction(&app_handle, false); let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); // Re-position before showing so the overlay follows the current display layout // (e.g. after an external monitor was connected/disconnected). @@ -663,20 +1020,14 @@ async fn start_recording(app_handle: AppHandle) { // 2. 
Warm up microphone capture set_app_state(&app_handle, &app_inner, app_state::AppState::Connecting).await; - let _ = app_handle.emit( - "overlay:event", - serde_json::json!({ - "type": "audio:warmup", - }), - ); - if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { + #[cfg(target_os = "macos")] + if let Err(e) = native_audio::start_capture(app_handle.clone(), Arc::clone(&app_inner)).await { *recording_state.0.lock().unwrap() = false; - stop_renderer_audio(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); } - log_rec!(warn, "Audio warmup failed: {}", e); + log_rec!(warn, "Native audio warmup failed: {}", e); let _ = app_handle.emit( "overlay:event", serde_json::json!({ @@ -687,10 +1038,37 @@ async fn start_recording(app_handle: AppHandle) { return; } + #[cfg(not(target_os = "macos"))] + { + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "audio:warmup", + }), + ); + if let Err(e) = wait_for_audio_warmup(&app_inner, 8000).await { + *recording_state.0.lock().unwrap() = false; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + log_rec!(warn, "Audio warmup failed: {}", e); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": e, "level": "error", "variant": "text" } + }), + ); + return; + } + } + // Check if recording was cancelled during warmup (hold mode: quick press-release) if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during warmup, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, 
app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -698,19 +1076,16 @@ async fn start_recording(app_handle: AppHandle) { return; } - // Settle delay before the cue: getUserMedia resolving only means the stream - // exists, not that its AEC/AGC have converged. The mic is live and DSP converges - // on real input during this wait (capture stays gated off until Recording), while - // the renderer's cue keep-alive (set up during warmup) holds the output device - // warm so the cue still plays smoothly afterwards. The cue is the user's "go" - // signal, so it must land AFTER this delay — never before, or the user would - // speak into the unconverged window and lose the first words. + // Settle delay before the cue (AUDIO_SETTLE_MS; 0 on macOS): the selected capture + // backend is live during this wait, with audio gated off until Recording. On the + // renderer path the cue keep-alive keeps the output device warm; macOS plays the + // cue natively. The cue is the user's "go" signal, so it lands after this delay. tokio::time::sleep(std::time::Duration::from_millis(AUDIO_SETTLE_MS)).await; // Re-check cancellation: the user may have released during the settle delay. 
if !*recording_state.0.lock().unwrap() { log_rec!(warn, "Cancelled during settle, aborting start"); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); @@ -747,6 +1122,7 @@ async fn start_recording(app_handle: AppHandle) { emit_cue(&app_handle, &app_inner, "start"); set_app_state(&app_handle, &app_inner, app_state::AppState::Recording).await; + #[cfg(not(target_os = "macos"))] let _ = app_handle.emit( "overlay:event", serde_json::json!({ "type": "recording:start" }), @@ -837,6 +1213,208 @@ async fn create_active_session( result.map(|(session, event_rx)| (session, event_rx, show_recording_hint)) } +pub(crate) async fn retry_history_transcription( + app_handle: AppHandle, + ts: String, +) -> Result { + let app_inner = app_handle.state::>(); + let retry_epoch = app_inner + .session_epoch + .fetch_add(1, std::sync::atomic::Ordering::SeqCst) + + 1; + // No outer wall-clock timeout: the connection phase is already bounded by the + // ASR connect timeout (5s, surfaced as a failure + retry), and the final wait + // by commit_and_await_final's own timeout. An outer cap would only risk + // cutting off a valid streaming transcription mid-flight. + retry_history_transcription_inner(app_handle, ts, retry_epoch).await +} + +/// Record a retry attempt as a failure, surface the error hint, and arm the +/// overlay retry affordance + auto-hide. Shared by every failure path of +/// `retry_history_transcription_inner`. 
+async fn fail_retry( + app_handle: &AppHandle, + app_inner: &Arc, + ts: &str, + message: &str, +) { + let failure_ts = app_inner.stats.lock().await.record_failure( + message, + current_recording_wav_string(app_inner).await, + Some(ts.to_string()), + ); + *app_inner.current_failure_ts.lock().await = Some(failure_ts); + emit_retryable_error_hint(app_handle, app_inner, message).await; + set_overlay_retry_interaction(app_handle, true); + set_app_state(app_handle, app_inner, app_state::AppState::Idle).await; + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(app_inner)); +} + +async fn retry_history_transcription_inner( + app_handle: AppHandle, + ts: String, + retry_epoch: u64, +) -> Result { + let app_inner = app_handle.state::>(); + let entry = { + let stats = app_inner.stats.lock().await; + stats + .find_history(&ts) + .ok_or_else(|| "未找到输入记录".to_string())? + }; + let audio_path = entry + .audio_path + .clone() + .ok_or_else(|| "这条记录没有可重试的录音".to_string())?; + let path = PathBuf::from(&audio_path); + let samples = read_wav_16k_mono(&path)?; + if samples.is_empty() { + return Err("录音文件为空,无法重试".to_string()); + } + + set_overlay_retry_interaction(&app_handle, false); + set_app_state(&app_handle, &app_inner, app_state::AppState::Finishing).await; + // Clear the stale failure hint + old transcript, then show a "retrying" + // placeholder while the connection is established. The overlay yields this + // placeholder to the live transcript the moment the replayed recognition + // starts streaming in (see visible_hint / getVisibleHintText), so the user + // sees "重试中" → streaming text, like a normal recording. 
+ let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); + let _ = app_handle.emit( + "overlay:event", + serde_json::json!({ + "type": "hint", + "payload": { "text": "", "level": "info", "variant": "retry" } + }), + ); + *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + *app_inner.current_recording_wav.lock().await = Some(path); + *app_inner.current_retry_of.lock().await = Some(ts.clone()); + + let config = app_inner.config_manager.load_config()?; + let hotwords = app_inner.hotword_manager.active_words(); + let (session, event_rx, _) = match create_active_session(&app_handle, &config, &hotwords).await + { + Ok(result) => result, + Err(error) => { + if !is_current_epoch(&app_inner, retry_epoch) { + return Err("重试已取消".to_string()); + } + let message = format!("{error},请检查网络连接"); + fail_retry(&app_handle, &app_inner, &ts, &message).await; + return Err(message); + } + }; + let session: Arc = Arc::from(session); + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + let events_app = app_handle.clone(); + tauri::async_runtime::spawn(async move { + manage_asr_session(events_app, event_rx, retry_epoch).await; + }); + + for chunk in samples.chunks(1600) { + session.append_audio(chunk); + } + + let text = match session.commit_and_await_final().await { + Ok(text) if !text.trim().is_empty() => text, + Ok(_) => { + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + session.close(); + let message = "重试转写没有得到文本,请检查网络连接"; + fail_retry(&app_handle, &app_inner, &ts, message).await; + return Err(message.to_string()); + } + Err(error) => { + if !is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + session.close(); + let error = format!("{error},请检查网络连接"); + fail_retry(&app_handle, &app_inner, &ts, &error).await; + return Err(error); + } + }; + + if 
!is_current_epoch(&app_inner, retry_epoch) { + session.close(); + return Err("重试已取消".to_string()); + } + // Hand focus back to the app the user was in before clicking retry, then give + // the OS a moment to switch, so the paste keystroke lands in the right window. + overlay::restore_foreground_app(&app_handle); + tokio::time::sleep(Duration::from_millis(150)).await; + finalize_and_paste(&app_handle, &app_inner, text.clone()).await; + session.close(); + *app_inner.current_failure_ts.lock().await = None; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + Ok(serde_json::json!({ "ok": true, "text": text })) +} + +pub(crate) async fn retry_latest_failed_transcription( + app_handle: AppHandle, +) -> Result { + let app_inner = app_handle.state::>(); + let ts = app_inner + .current_failure_ts + .lock() + .await + .clone() + .ok_or_else(|| "没有可重试的失败录音".to_string())?; + retry_history_transcription(app_handle, ts).await +} + +fn read_wav_16k_mono(path: &std::path::Path) -> Result, String> { + let data = std::fs::read(path).map_err(|e| format!("读取录音文件失败: {e}"))?; + if data.len() < 44 || &data[0..4] != b"RIFF" || &data[8..12] != b"WAVE" { + return Err("录音文件不是有效 WAV".to_string()); + } + let mut pos = 12usize; + let mut channels = 0u16; + let mut sample_rate = 0u32; + let mut bits = 0u16; + let mut data_range = None; + while pos + 8 <= data.len() { + let id = &data[pos..pos + 4]; + let size = u32::from_le_bytes([data[pos + 4], data[pos + 5], data[pos + 6], data[pos + 7]]) + as usize; + let start = pos + 8; + let end = start.saturating_add(size).min(data.len()); + if id == b"fmt " && size >= 16 && end <= data.len() { + channels = u16::from_le_bytes([data[start + 2], data[start + 3]]); + sample_rate = u32::from_le_bytes([ + data[start + 4], + data[start + 5], + data[start + 6], + data[start + 7], + ]); + bits = u16::from_le_bytes([data[start + 14], data[start 
+ 15]]); + } else if id == b"data" { + data_range = Some(start..end); + break; + } + pos = start + size + (size % 2); + } + if channels != 1 || sample_rate != 16_000 || bits != 16 { + return Err("仅支持 16kHz mono 16-bit WAV 重试".to_string()); + } + let range = data_range.ok_or_else(|| "WAV 缺少 data chunk".to_string())?; + Ok(data[range] + .chunks_exact(2) + .map(|chunk| i16::from_le_bytes([chunk[0], chunk[1]]) as f32 / 32768.0) + .collect()) +} + /// Connect the ASR session in the background (one retry), then attach it: flush /// any audio buffered during the connect and publish the ready session. Signals /// completion through `connect_tx` so `stop_recording` can wait when the user @@ -928,29 +1506,16 @@ async fn connect_and_attach( } *app_handle.state::().0.lock().unwrap() = false; app_inner.pending_audio.lock().await.clear(); - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + save_recording_wav(&app_handle, &app_inner).await; + let message = format!("ASR 连接失败: {},请检查网络连接", e); + record_transcription_failure(&app_handle, &app_inner, &message).await; // Emit error hint BEFORE setting idle so the overlay shows it: the // frontend's idle handler only clears "info"-level hints. - let _ = app_handle.emit("overlay:event", serde_json::json!({ - "type": "hint", - "payload": { "text": format!("ASR 连接失败: {}", e), "level": "error", "variant": "text" } - })); + emit_retryable_error_hint(&app_handle, &app_inner, &message).await; set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; // Auto-hide after a delay so the user can read it; guard: still idle. 
- let delayed_handle = app_handle.clone(); - let delayed_inner: Arc = Arc::clone(&app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); } } } @@ -967,7 +1532,15 @@ async fn stop_recording(app_handle: AppHandle) { set_app_state(&app_handle, &app_inner, app_state::AppState::Finishing).await; // 2. Stop renderer audio first so the final buffered chunk is flushed. - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; + // Snapshot whether real sound was captured before save_recording_wav drains + // the buffer: a silent stop ends immediately, but speech whose transcript was + // lost (slow/failed network) must keep the retry path even with no result yet. + let captured_audio_signal = { + let audio = app_inner.recording_audio.lock().await; + recording_has_audio_signal(&audio) + }; + save_recording_wav(&app_handle, &app_inner).await; // 3. Acquire the ready ASR session. If the background connect hasn't finished // (user stopped before it was ready), wait for it to resolve so the buffered @@ -988,16 +1561,47 @@ async fn stop_recording(app_handle: AppHandle) { *app_inner.asr_events.lock().await = None; if let Some(session) = session { - // 4. Commit and get this session's final text. + // 4. No speech case: the session connected but produced no transcript + // (no partial/final this session, nothing accumulated across reconnects) + // AND the captured audio was silent. The user stopped without speaking; + // Doubao won't emit a final for silence, so committing would block until + // the timeout and then wrongly offer a retry. 
End immediately, ESC-like. + // If audio WAS captured but no transcript arrived (slow/failed network), + // fall through to commit so the result — or a retry — is still possible. + let recognized_anything = { + let (final_t, partial_t) = app_inner.latest_transcript.lock().await.clone(); + let accumulated = app_inner.accumulated_text.lock().await.clone(); + !final_t.trim().is_empty() + || !partial_t.trim().is_empty() + || !accumulated.trim().is_empty() + }; + if !recognized_anything && !captured_audio_signal { + log_rec!(info, "Stop with no recognized speech; ending immediately"); + session.close(); + app_inner.pending_audio.lock().await.clear(); + *app_inner.accumulated_text.lock().await = String::new(); + discard_recording_artifacts(&app_inner).await; + set_overlay_retry_interaction(&app_handle, false); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + return; + } + + // 5. Commit and get this session's final text. 
let session_text = match session.commit_and_await_final().await { Ok(t) => t, - Err(_) => { - let (final_t, partial_t) = app_inner.latest_transcript.lock().await.clone(); - if !final_t.is_empty() { - final_t - } else { - partial_t - } + Err(e) => { + log_rec!(warn, "ASR commit failed: {}", e); + session.close(); + app_inner.pending_audio.lock().await.clear(); + *app_inner.accumulated_text.lock().await = String::new(); + record_transcription_failure(&app_handle, &app_inner, &e).await; + emit_retryable_error_hint(&app_handle, &app_inner, &e).await; + set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); + return; } }; log_rec!( @@ -1033,26 +1637,12 @@ async fn stop_recording(app_handle: AppHandle) { warn, "Stop with no ready ASR session; discarding buffered audio" ); - let _ = app_handle.emit("overlay:event", serde_json::json!({ - "type": "hint", - "payload": { "text": "语音服务连接失败,请重试", "level": "error", "variant": "text" } - })); + let message = "语音服务连接失败,请检查网络连接"; + record_transcription_failure(&app_handle, &app_inner, message).await; + emit_retryable_error_hint(&app_handle, &app_inner, message).await; *app_inner.accumulated_text.lock().await = String::new(); set_app_state(&app_handle, &app_inner, app_state::AppState::Idle).await; - let delayed_handle = app_handle.clone(); - let delayed_inner: Arc = Arc::clone(&app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app_handle.clone(), Arc::clone(&app_inner)); return; } } @@ -1061,6 +1651,7 @@ async fn stop_recording(app_handle: AppHandle) { *app_inner.accumulated_text.lock().await = String::new(); // 12. 
Hide overlay + set_overlay_retry_interaction(&app_handle, false); if let Some(overlay) = app_handle.get_webview_window("overlay") { let _ = overlay.hide(); } @@ -1087,6 +1678,55 @@ pub async fn toggle_recording(app_handle: AppHandle) { } /// Cancel the active recording without committing or pasting text. +/// Directly toggle the ESC-cancel shortcut, independent of the recording state +/// machine. Used to keep ESC live while a retryable failure is shown (idle). +fn set_escape_enabled_now(app: &AppHandle, enabled: bool) { + if let Some(hc) = app.try_state::() { + hotkey::set_escape_enabled(&hc, enabled); + } +} + +/// ESC handler. Routes to the right teardown for whatever is on screen: +/// an active recording, an in-flight retry, or a shown retryable failure. +pub(crate) async fn on_escape(app_handle: AppHandle) { + if is_recording(&app_handle) { + cancel_recording(app_handle).await; + return; + } + let app_inner = app_handle.state::>(); + let state = app_inner.state.lock().await.clone(); + match state { + // Retry in progress (a normal commit has no retry marker): abort it. + app_state::AppState::Finishing if app_inner.current_retry_of.lock().await.is_some() => { + abort_retry_or_failure(&app_handle, &app_inner).await; + } + // Retryable failure currently shown: dismiss it. + app_state::AppState::Idle if app_inner.current_failure_ts.lock().await.is_some() => { + abort_retry_or_failure(&app_handle, &app_inner).await; + } + _ => {} + } +} + +/// Tear down an in-flight retry or a shown retryable failure: discard any +/// in-flight result via the epoch bump, clear retry state, and hide the overlay. 
+async fn abort_retry_or_failure(app_handle: &AppHandle, app_inner: &Arc) { + app_inner + .session_epoch + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + *app_inner.current_retry_of.lock().await = None; + *app_inner.current_failure_ts.lock().await = None; + *app_inner.latest_transcript.lock().await = (String::new(), String::new()); + *app_inner.accumulated_text.lock().await = String::new(); + set_overlay_retry_interaction(app_handle, false); + let _ = app_handle.emit("overlay:event", serde_json::json!({ "type": "reset" })); + if let Some(overlay) = app_handle.get_webview_window("overlay") { + let _ = overlay.hide(); + } + // set_app_state(Idle) also re-syncs (disables) the ESC shortcut. + set_app_state(app_handle, app_inner, app_state::AppState::Idle).await; +} + async fn cancel_recording(app_handle: AppHandle) { let app_inner = app_handle.state::>(); let recording_state = app_handle.state::(); @@ -1108,13 +1748,16 @@ async fn cancel_recording(app_handle: AppHandle) { .session_epoch .fetch_add(1, std::sync::atomic::Ordering::SeqCst); app_inner.pending_audio.lock().await.clear(); + app_inner.recording_audio.lock().await.clear(); + *app_inner.current_recording_wav.lock().await = None; + *app_inner.current_retry_of.lock().await = None; // Clear the active prompt ID since the session was cancelled if let Some(active) = app_handle.try_state::() { *active.0.lock().unwrap() = None; } - stop_renderer_audio(&app_handle, &app_inner, 1200).await; + stop_audio_capture(&app_handle, &app_inner, 1200).await; if let Some(session) = app_inner.asr_session.lock().await.take() { session.close(); @@ -1186,6 +1829,11 @@ async fn manage_asr_session( final_text, partial_text, } => { + // Stop feeding the overlay once this session is superseded + // (e.g. the user pressed ESC to abort an in-flight retry). + if !is_current_epoch(&app_inner, my_epoch) { + break 'outer; + } // A real transcript means the (possibly reconnected) session is // healthy again: reset the failure counter. 
reconnect_attempts = 0; @@ -1393,36 +2041,19 @@ async fn finalize_on_failure(app: &AppHandle, app_inner: &Arc() { *active.0.lock().unwrap() = None; } set_app_state(app, app_inner, app_state::AppState::Idle).await; - let delayed_handle = app.clone(); - let delayed_inner: Arc = Arc::clone(app_inner); - tauri::async_runtime::spawn(async move { - tokio::time::sleep(Duration::from_secs(3)).await; - let still_idle = { - let s = delayed_inner.state.lock().await; - matches!(*s, app_state::AppState::Idle) - }; - if still_idle { - if let Some(overlay) = delayed_handle.get_webview_window("overlay") { - let _ = overlay.hide(); - } - } - }); + schedule_retry_overlay_hide(app.clone(), Arc::clone(app_inner)); return; } @@ -1626,8 +2257,12 @@ async fn finalize_and_paste( } } - // Record usage stats - app_inner.stats.lock().await.record_session(&final_text); + // Record usage stats and retain/delete the WAV according to user settings. + let keep_recordings = config + .as_ref() + .map(|c| c.app.keep_recordings) + .unwrap_or(false); + record_success_and_apply_retention(app_handle, app_inner, &final_text, keep_recordings).await; emit_cue(app_handle, app_inner, "end"); } @@ -1661,3 +2296,75 @@ pub fn reload_hotkey_bindings(app: &AppHandle) { let prompts = app_inner.config_manager.load_prompts(); hotkey::reload_bindings(&hc, &hotkey_str, &mode, &prompts); } + +#[cfg(test)] +mod audio_signal_tests { + use super::recording_has_audio_signal; + + #[test] + fn silence_is_not_treated_as_speech() { + let silence = vec![0.0f32; 16_000]; + assert!(!recording_has_audio_signal(&silence)); + } + + #[test] + fn quiet_noise_floor_is_not_treated_as_speech() { + // ~ -54 dBFS hum: below both gates, must not look like speech. 
+ let noise: Vec = (0..16_000) + .map(|i| if i % 2 == 0 { 0.002 } else { -0.002 }) + .collect(); + assert!(!recording_has_audio_signal(&noise)); + } + + #[test] + fn very_short_clip_is_not_treated_as_speech() { + // Under 100ms even at full amplitude is an accidental tap, not speech. + let blip = vec![0.5f32; 800]; + assert!(!recording_has_audio_signal(&blip)); + } + + #[test] + fn loud_sustained_signal_is_treated_as_speech() { + // A 0.3-amplitude tone clears both the peak and RMS gates. + let tone: Vec = (0..16_000).map(|i| 0.3 * (i as f32 * 0.2).sin()).collect(); + assert!(recording_has_audio_signal(&tone)); + } + + #[test] + fn start_cue_bleed_then_silence_is_not_treated_as_speech() { + // Loud cue in the first ~0.5s, silence afterward: must be skipped, not + // mistaken for the user speaking (no AEC in native capture). + let mut samples = vec![0.0f32; 16_000]; + for (i, s) in samples.iter_mut().enumerate().take(8_000) { + *s = 0.4 * (i as f32 * 0.3).sin(); + } + assert!(!recording_has_audio_signal(&samples)); + } +} + +#[cfg(test)] +mod hotkey_label_tests { + use super::format_hotkey_label; + + #[test] + fn function_key_passes_through() { + assert_eq!(format_hotkey_label("F13"), "F13"); + } + + #[test] + fn sided_modifier_matches_settings_symbol() { + // Mirrors the frontend normalizeHotkeyLabel ("AltRight" -> "R ⌥"). 
+ assert_eq!(format_hotkey_label("AltRight"), "R ⌥"); + } + + #[test] + fn combo_is_symbolized_and_joined() { + assert_eq!(format_hotkey_label("Control+Space"), "⌃ ␣"); + assert_eq!(format_hotkey_label("CmdOrCtrl+Shift+A"), "⌘ ⇧ A"); + } + + #[test] + fn empty_stays_empty() { + assert_eq!(format_hotkey_label(""), ""); + } +} diff --git a/src-tauri/src/native_audio.rs b/src-tauri/src/native_audio.rs new file mode 100644 index 0000000..f8268ba --- /dev/null +++ b/src-tauri/src/native_audio.rs @@ -0,0 +1,336 @@ +use crate::app_state; +use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; +use std::sync::{Arc, Mutex}; +use std::thread; +use tauri::AppHandle; + +const TARGET_SAMPLE_RATE: u32 = 16_000; +const TARGET_CHUNK_SAMPLES: usize = 1600; + +pub struct NativeAudioCapture { + stop_tx: std::sync::mpsc::Sender<()>, + input_thread: Option>, + forward_task: tauri::async_runtime::JoinHandle<()>, +} + +impl NativeAudioCapture { + async fn stop(mut self) { + let _ = self.stop_tx.send(()); + if let Some(input_thread) = self.input_thread.take() { + let _ = tokio::task::spawn_blocking(move || input_thread.join()).await; + } + let _ = self.forward_task.await; + } +} + +pub async fn start_capture( + app: AppHandle, + app_inner: Arc, +) -> Result<(), String> { + let mut slot = app_inner.native_audio.lock().await; + if slot.is_some() { + return Ok(()); + } + + let (audio_tx, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); + let (stop_tx, stop_rx) = std::sync::mpsc::channel::<()>(); + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::>(); + + let input_thread = thread::Builder::new() + .name("voicepaste-native-audio".to_string()) + .spawn(move || { + if let Err(error) = run_input_thread(audio_tx, stop_rx, ready_tx) { + log_audio!(error, "Native audio thread exited with error: {}", error); + } + }) + .map_err(|e| format!("启动原生录音线程失败: {e}"))?; + + // Await the ready signal asynchronously: the worker sends it the moment the + // input stream is built, so we never 
block a tokio worker thread. + match ready_rx.await { + Ok(Ok(())) => {} + Ok(Err(error)) => { + let _ = stop_tx.send(()); + let _ = input_thread.join(); + return Err(error); + } + Err(_) => { + let _ = input_thread.join(); + return Err("原生录音线程提前退出".to_string()); + } + } + + let forward_app = app.clone(); + let forward_inner = Arc::clone(&app_inner); + let forward_task = tauri::async_runtime::spawn(async move { + while let Some(samples) = rx.recv().await { + let state = forward_inner.state.lock().await.clone(); + if matches!( + state, + app_state::AppState::Recording | app_state::AppState::Finishing + ) { + crate::commands::append_audio_samples(&forward_app, &forward_inner, samples).await; + } + } + }); + + *slot = Some(NativeAudioCapture { + stop_tx, + input_thread: Some(input_thread), + forward_task, + }); + log_audio!(info, "Native cpal microphone capture started"); + Ok(()) +} + +pub async fn stop_capture(app_inner: &Arc) { + let capture = app_inner.native_audio.lock().await.take(); + if let Some(capture) = capture { + capture.stop().await; + log_audio!(info, "Native cpal microphone capture stopped"); + } +} + +fn run_input_thread( + tx: tokio::sync::mpsc::UnboundedSender>, + stop_rx: std::sync::mpsc::Receiver<()>, + ready_tx: tokio::sync::oneshot::Sender>, +) -> Result<(), String> { + let final_chunk = Arc::new(Mutex::new(Vec::::with_capacity(TARGET_CHUNK_SAMPLES))); + let stream = match build_input_stream(tx.clone(), Arc::clone(&final_chunk)) { + Ok(stream) => stream, + Err(error) => { + let _ = ready_tx.send(Err(error.clone())); + return Err(error); + } + }; + if let Err(error) = stream.play() { + let message = format!("启动麦克风输入流失败: {error}"); + let _ = ready_tx.send(Err(message.clone())); + return Err(message); + } + let _ = ready_tx.send(Ok(())); + stop_rx + .recv() + .map_err(|e| format!("等待停止原生录音失败: {e}"))?; + drop(stream); + if let Ok(mut chunk) = final_chunk.lock() { + if !chunk.is_empty() { + let tail = std::mem::take(&mut *chunk); + let _ = 
tx.send(tail); + } + } + Ok(()) +} + +fn build_input_stream( + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, +) -> Result { + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| "未找到默认麦克风输入设备".to_string())?; + let config = device + .default_input_config() + .map_err(|e| format!("读取默认麦克风配置失败: {e}"))?; + let sample_rate = config.sample_rate().0; + let channels = usize::from(config.channels()); + let stream_config = config.config(); + + log_audio!( + info, + "Native input device: sample_rate={}, channels={}, format={:?}", + sample_rate, + channels, + config.sample_format() + ); + + let err_fn = |err| { + log_audio!(error, "Native microphone stream error: {}", err); + }; + + match config.sample_format() { + cpal::SampleFormat::F32 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::I16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + cpal::SampleFormat::U16 => build_stream::( + &device, + &stream_config, + channels, + sample_rate, + tx, + final_chunk, + err_fn, + ), + other => Err(format!("不支持的采样格式: {other:?}")), + } +} + +fn build_stream( + device: &cpal::Device, + config: &cpal::StreamConfig, + channels: usize, + sample_rate: u32, + tx: tokio::sync::mpsc::UnboundedSender>, + final_chunk: Arc>>, + err_fn: impl FnMut(cpal::StreamError) + Send + 'static, +) -> Result +where + T: cpal::Sample + cpal::SizedSample + Send + 'static, + f32: FromNativeSample, +{ + let mut resampler = StreamingResampler::new(sample_rate, TARGET_SAMPLE_RATE); + + device + .build_input_stream( + config, + move |data: &[T], _| { + let mono = downmix_to_mono(data, channels); + let samples = resampler.push(&mono); + let Ok(mut chunk) = final_chunk.lock() else { + return; + }; + for sample in samples { + chunk.push(sample); + if chunk.len() >= TARGET_CHUNK_SAMPLES { + let full = 
std::mem::take(&mut *chunk); + if tx.send(full).is_err() { + return; + } + } + } + }, + err_fn, + None, + ) + .map_err(|e| format!("创建麦克风输入流失败: {e}")) +} + +fn downmix_to_mono(data: &[T], channels: usize) -> Vec +where + T: Copy, + f32: FromNativeSample, +{ + data.chunks(channels) + .map(|frame| { + let sum = frame + .iter() + .map(|&sample| f32::from_native_sample(sample)) + .sum::(); + sum / channels as f32 + }) + .collect() +} + +trait FromNativeSample { + fn from_native_sample(sample: T) -> f32; +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: f32) -> f32 { + sample.clamp(-1.0, 1.0) + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: i16) -> f32 { + sample as f32 / i16::MAX as f32 + } +} + +impl FromNativeSample for f32 { + fn from_native_sample(sample: u16) -> f32 { + (sample as f32 - 32768.0) / 32768.0 + } +} + +struct StreamingResampler { + from_rate: u32, + to_rate: u32, + ratio: f64, + position: f64, + input: Vec, +} + +impl StreamingResampler { + fn new(from_rate: u32, to_rate: u32) -> Self { + Self { + from_rate, + to_rate, + ratio: from_rate as f64 / to_rate as f64, + position: 0.0, + input: Vec::new(), + } + } + + fn push(&mut self, samples: &[f32]) -> Vec { + if samples.is_empty() { + return Vec::new(); + } + if self.from_rate == self.to_rate { + return samples.to_vec(); + } + + self.input.extend_from_slice(samples); + let mut output = Vec::new(); + while self.position + 1.0 < self.input.len() as f64 { + let idx = self.position.floor() as usize; + let frac = (self.position - idx as f64) as f32; + let a = self.input[idx]; + let b = self.input[idx + 1]; + output.push(a + (b - a) * frac); + self.position += self.ratio; + } + + // Keep at least the last input sample as the interpolation anchor for + // the next callback. 
With ratios such as 48k -> 16k, `position` can step + // past the current buffer length after the final emitted sample; never + // drain beyond the slice or CoreAudio's no-unwind callback will abort. + let consumed = (self.position.floor() as usize).min(self.input.len().saturating_sub(1)); + if consumed > 0 { + self.input.drain(..consumed); + self.position -= consumed as f64; + } + output + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn resampler_handles_48k_coreaudio_512_frame_callbacks() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + for _ in 0..20 { + let input = vec![0.25; 512]; + let output = resampler.push(&input); + assert!(!output.is_empty()); + } + } + + #[test] + fn resampler_keeps_last_sample_for_next_interpolation_window() { + let mut resampler = StreamingResampler::new(48_000, 16_000); + let _ = resampler.push(&vec![0.0; 512]); + assert!(!resampler.input.is_empty()); + assert!(resampler.position >= 0.0); + assert!(resampler.position < resampler.input.len() as f64 + resampler.ratio); + } +} diff --git a/src-tauri/src/overlay.rs b/src-tauri/src/overlay.rs index ee521ca..3e305c0 100644 --- a/src-tauri/src/overlay.rs +++ b/src-tauri/src/overlay.rs @@ -24,27 +24,96 @@ pub fn set_audio_level(app: &AppHandle, level: f64) { macos::set_audio_level(app, level); } +/// Remember the app the user is currently working in, so we can hand keyboard +/// focus back to it after a retry (clicking the native retry button activates the +/// overlay, which would otherwise swallow the paste). No-op off macOS. +#[allow(unused_variables)] +pub fn capture_foreground_app(app: &AppHandle) { + #[cfg(target_os = "macos")] + macos::capture_foreground_app(app); +} + +/// Reactivate the app captured by [`capture_foreground_app`] so the subsequent +/// paste lands in the window the user was in. No-op off macOS. 
+#[allow(unused_variables)] +pub fn restore_foreground_app(app: &AppHandle) { + #[cfg(target_os = "macos")] + macos::restore_foreground_app(app); +} + /// Native macOS overlay renderer. Builds and updates an AppKit pill /// (`NSGlassEffectView` → container → indicator + transcript label) living inside /// the overlay window's content view, above the transparent WebView. #[cfg(target_os = "macos")] mod macos { use objc2::rc::Retained; - use objc2::runtime::AnyObject; - use objc2::{msg_send, MainThreadMarker}; + use objc2::runtime::{AnyClass, AnyObject, ClassBuilder, Sel}; + use objc2::{class, msg_send, sel, MainThreadMarker}; use objc2_app_kit::{ - NSAppearance, NSAppearanceNameAqua, NSAppearanceNameDarkAqua, NSColor, NSFont, - NSGlassEffectView, NSGlassEffectViewStyle, NSLineBreakMode, NSProgressIndicator, - NSProgressIndicatorStyle, NSTextField, NSView, NSVisualEffectBlendingMode, - NSVisualEffectMaterial, NSVisualEffectState, NSVisualEffectView, NSWindow, + NSAppearance, NSAppearanceNameAqua, NSAppearanceNameDarkAqua, + NSApplicationActivationOptions, NSAttributedStringNSStringDrawing, NSBezelStyle, NSButton, + NSColor, NSFont, NSGlassEffectView, NSGlassEffectViewStyle, NSLineBreakMode, + NSProgressIndicator, NSProgressIndicatorStyle, NSRunningApplication, NSTextField, NSView, + NSVisualEffectBlendingMode, NSVisualEffectMaterial, NSVisualEffectState, + NSVisualEffectView, NSWindow, NSWorkspace, }; use objc2_foundation::{ - NSArray, NSAttributedString, NSMutableAttributedString, NSNumber, NSPoint, NSRange, NSRect, - NSSize, NSString, + NSArray, NSAttributedString, NSBundle, NSMutableAttributedString, NSNumber, NSPoint, + NSRange, NSRect, NSSize, NSString, }; use std::cell::RefCell; + use std::sync::{Mutex as StdMutex, OnceLock}; use tauri::{AppHandle, Manager}; + static RETRY_APP: OnceLock>> = OnceLock::new(); + /// Bundle id of the app the user was in before the overlay stole focus, so a + /// successful retry can hand keyboard focus back for the paste. 
+ static PREV_FOREGROUND_BUNDLE: OnceLock>> = OnceLock::new(); + + fn our_bundle_id() -> Option { + NSBundle::mainBundle() + .bundleIdentifier() + .map(|s| s.to_string()) + } + + pub fn capture_foreground_app(app: &AppHandle) { + let _ = app.run_on_main_thread(|| { + let ws = NSWorkspace::sharedWorkspace(); + let Some(front) = ws.frontmostApplication() else { + return; + }; + let Some(bid) = front.bundleIdentifier() else { + return; + }; + let bid = bid.to_string(); + // If we are already frontmost (e.g. re-arming retry after a failed + // attempt), keep the previously captured app instead of ourselves. + if our_bundle_id().as_deref() == Some(bid.as_str()) { + return; + } + let slot = PREV_FOREGROUND_BUNDLE.get_or_init(|| StdMutex::new(None)); + if let Ok(mut guard) = slot.lock() { + *guard = Some(bid); + } + }); + } + + pub fn restore_foreground_app(app: &AppHandle) { + let bid = PREV_FOREGROUND_BUNDLE + .get() + .and_then(|slot| slot.lock().ok().and_then(|g| g.clone())); + let Some(bid) = bid else { + return; + }; + let _ = app.run_on_main_thread(move || { + let ns_bid = NSString::from_str(&bid); + let apps = NSRunningApplication::runningApplicationsWithBundleIdentifier(&ns_bid); + if let Some(target) = apps.firstObject() { + target.activateWithOptions(NSApplicationActivationOptions::ActivateAllWindows); + } + }); + } + // --- Layout constants (mirror web/styles.css + app.js scheduleResize) --- const FONT_SIZE: f64 = 14.0; const PAD_LEFT: f64 = 14.0; @@ -67,6 +136,11 @@ mod macos { const WAVE_GAP_LEFT: f64 = 12.0; // gap between text and waveform const WAVE_MAX_H: f64 = 22.0; const WAVE_MIN_H: f64 = 3.0; + const RETRY_SIZE: f64 = 22.0; + const RETRY_MIN_W: f64 = 38.0; // floor; the button grows to fit "重试 (R ⌥)" + const RETRY_TEXT_PAD: f64 = 24.0; // horizontal padding around the button title + const RETRY_GAP_LEFT: f64 = 8.0; + const RETRY_RIGHT_INSET: f64 = 26.0; /// Logical overlay model, mirrored from overlay events (parallels app.js `state`). 
#[derive(Default)] @@ -75,7 +149,9 @@ mod macos { partial_text: String, hint_text: String, hint_level: String, // "info" | "warn" | "error" - hint_variant: String, // "text" | "progress" + hint_variant: String, // "text" | "progress" | "retry" + hint_retryable: bool, + retry_hotkey: String, // formatted main hotkey label, e.g. "R ⌥" app_state: String, // "idle" | "connecting" | "recording" | "finishing" // Sticky layout (prevents width jitter while recording/finishing). layout_width: f64, @@ -117,8 +193,13 @@ mod macos { spinner: Retained, label: Retained, bars: [Retained; WAVE_N], + retry_view: Retained, + retry_button: Retained, + _retry_target: Retained, dot_layer: Retained, // CALayer for the indicator dot ripple_layer: Retained, // CALayer halo behind the dot (recording ripple) + retry_track_layer: Retained, // CALayer for the retry ring track + retry_progress_layer: Retained, // CALayer for the retry countdown ring fade_mask: Retained, // CAGradientLayer for the multi-line top fade applied_variant: String, // "" (auto/inherit) | "light" | "dark" } @@ -128,6 +209,36 @@ mod macos { static VIEWS: RefCell> = const { RefCell::new(None) }; } + extern "C" fn retry_clicked(_this: *mut AnyObject, _sel: Sel, _sender: *mut AnyObject) { + let Some(lock) = RETRY_APP.get() else { + return; + }; + let Ok(guard) = lock.lock() else { + return; + }; + let Some(app) = guard.as_ref().cloned() else { + return; + }; + tauri::async_runtime::spawn(async move { + let _ = crate::retry_latest_failed_transcription(app).await; + }); + } + + fn retry_target_class() -> &'static AnyClass { + if let Some(cls) = AnyClass::get(c"VoicePasteRetryTarget") { + return cls; + } + let mut builder = + ClassBuilder::new(c"VoicePasteRetryTarget", class!(NSObject)).expect("class builder"); + unsafe { + builder.add_method( + sel!(retryClicked:), + retry_clicked as extern "C" fn(*mut AnyObject, Sel, *mut AnyObject), + ); + } + builder.register() + } + fn liquid_glass_available() -> bool { 
objc2::runtime::AnyClass::get(c"NSGlassEffectView").is_some() } @@ -151,6 +262,11 @@ mod macos { /// Parse an incoming overlay event and update the native pill on the main thread. pub fn dispatch(app: &AppHandle, event: &serde_json::Value) { + let slot = RETRY_APP.get_or_init(|| StdMutex::new(None)); + if let Ok(mut guard) = slot.lock() { + *guard = Some(app.clone()); + } + let kind = event.get("type").and_then(|v| v.as_str()).unwrap_or(""); // Only visual events drive the native pill. Audio lifecycle events // (audio:warmup / recording:start / recording:stop) belong to the WebView worker. @@ -183,6 +299,8 @@ mod macos { model.hint_text.clear(); model.hint_level = "info".into(); model.hint_variant = "text".into(); + model.hint_retryable = false; + model.retry_hotkey.clear(); model.layout_width = 0.0; model.layout_wrap = false; model.smoothed_level = 0.0; @@ -200,6 +318,8 @@ mod macos { { model.hint_text.clear(); model.hint_variant = "text".into(); + model.hint_retryable = false; + model.retry_hotkey.clear(); } // Collapse the waveform when not actively recording. if s != "recording" { @@ -236,6 +356,15 @@ mod macos { .and_then(|v| v.as_str()) .unwrap_or("text") .into(); + model.hint_retryable = payload + .and_then(|p| p.get("retryable")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + model.retry_hotkey = payload + .and_then(|p| p.get("hotkey")) + .and_then(|v| v.as_str()) + .unwrap_or("") + .into(); } "appearance" => { if let Some(s) = payload @@ -306,6 +435,18 @@ mod macos { "Preparing…".into() }; } + if visual_state == "finishing" && model.hint_variant == "retry" { + // Placeholder shown only until the replayed transcript starts + // streaming in; then yield to the live text below. 
+ if model.final_text.is_empty() && model.partial_text.is_empty() { + return if zh { + "重试中…".into() + } else { + "Retrying…".into() + }; + } + return String::new(); + } if visual_state == "finishing" && model.hint_variant == "progress" { return if zh { "思考中…".into() @@ -313,6 +454,8 @@ mod macos { "Thinking…".into() }; } + // The retry label + hotkey live inside the retry button, not in the + // message text, so the hint is just the error message. model.hint_text.clone() } @@ -373,6 +516,52 @@ mod macos { container.addSubview(&spinner); container.addSubview(&label); + let retry_view = NSView::new(mtm); + retry_view.setWantsLayer(true); + set_layer_color(&retry_view, &NSColor::clearColor(), RETRY_SIZE / 2.0); + let retry_track_layer = make_retry_ring_layer(); + let retry_progress_layer = make_retry_ring_layer(); + unsafe { + let retry_layer: *mut AnyObject = msg_send![&*retry_view, layer]; + if !retry_layer.is_null() { + let _: () = msg_send![retry_layer, addSublayer: &*retry_track_layer]; + let _: () = msg_send![retry_layer, addSublayer: &*retry_progress_layer]; + } + } + let retry_target = unsafe { + let cls = retry_target_class(); + let obj: *mut AnyObject = msg_send![cls, new]; + Retained::from_raw(obj).expect("retry target init") + }; + let retry_button = unsafe { + NSButton::buttonWithTitle_target_action( + &NSString::from_str("重试"), + Some(&retry_target), + Some(sel!(retryClicked:)), + mtm, + ) + }; + retry_button.setBordered(true); + retry_button.setTransparent(false); + retry_button.setShowsBorderOnlyWhileMouseInside(true); + retry_button.setBezelStyle(NSBezelStyle::AccessoryBarAction); + retry_button.setFont(Some(&NSFont::systemFontOfSize_weight(12.0, 600.0))); + retry_button.setContentTintColor(Some(&NSColor::systemRedColor())); + retry_button.setBezelColor(Some(&NSColor::colorWithSRGBRed_green_blue_alpha( + 1.0, 0.231, 0.231, 0.10, + ))); + retry_button.setAttributedTitle(&retry_title_attr("")); + unsafe { + let cell: *mut AnyObject = 
msg_send![&*retry_button, cell]; + if !cell.is_null() { + let _: () = msg_send![cell, setHighlightsBy: 1usize]; + let _: () = msg_send![cell, setShowsStateBy: 0usize]; + } + } + retry_view.addSubview(&retry_button); + retry_view.setHidden(true); + container.addSubview(&retry_view); + // Waveform bars (right side), green and rounded; positioned per render. let bars: [Retained; WAVE_N] = std::array::from_fn(|_| { let b = NSView::new(mtm); @@ -413,8 +602,13 @@ mod macos { spinner, label, bars, + retry_view, + retry_button, + _retry_target: retry_target, dot_layer, ripple_layer, + retry_track_layer, + retry_progress_layer, fade_mask: make_fade_mask(), applied_variant: "".into(), }); @@ -466,6 +660,100 @@ mod macos { } } + fn make_retry_ring_layer() -> Retained { + unsafe { + let cls = objc2::runtime::AnyClass::get(c"CAShapeLayer").expect("CAShapeLayer"); + let obj: *mut AnyObject = msg_send![cls, alloc]; + let obj: *mut AnyObject = msg_send![obj, init]; + let layer = Retained::from_raw(obj).expect("CAShapeLayer init"); + let clear = NSColor::clearColor(); + let clear_cg: *mut AnyObject = msg_send![&*clear, CGColor]; + let _: () = msg_send![&*layer, setFillColor: clear_cg]; + let _: () = msg_send![&*layer, setLineWidth: 1.6f64]; + let cap = NSString::from_str("round"); + let _: () = msg_send![&*layer, setLineCap: &*cap]; + let _: () = msg_send![&*layer, setStrokeEnd: 1.0f64]; + layer + } + } + + fn set_retry_ring_path(layer: &AnyObject, width: f64) { + unsafe { + let cls = objc2::runtime::AnyClass::get(c"NSBezierPath").expect("NSBezierPath"); + let inset = 1.9; + let left = inset; + let right = width - inset; + let top = RETRY_SIZE - inset; + let bottom = inset; + let radius = (top - bottom) / 2.0; + let center_y = RETRY_SIZE / 2.0; + let left_center = NSPoint { + x: left + radius, + y: center_y, + }; + let right_center = NSPoint { + x: right - radius, + y: center_y, + }; + let path: *mut AnyObject = msg_send![cls, bezierPath]; + let _: () = msg_send![path, moveToPoint: 
NSPoint { x: width / 2.0, y: top }]; + let _: () = msg_send![path, lineToPoint: NSPoint { x: left + radius, y: top }]; + let _: () = msg_send![ + path, + appendBezierPathWithArcWithCenter: left_center, + radius: radius, + startAngle: 90.0f64, + endAngle: 270.0f64, + clockwise: false + ]; + let _: () = msg_send![path, lineToPoint: NSPoint { x: right - radius, y: bottom }]; + let _: () = msg_send![ + path, + appendBezierPathWithArcWithCenter: right_center, + radius: radius, + startAngle: 270.0f64, + endAngle: 90.0f64, + clockwise: false + ]; + let _: () = msg_send![path, closePath]; + let cg_path: *mut AnyObject = msg_send![path, CGPath]; + let _: () = msg_send![layer, setPath: cg_path]; + } + } + + fn set_retry_countdown(layer: &AnyObject, on: bool) { + unsafe { + let key = NSString::from_str("retry-countdown"); + if on { + let existing: *mut AnyObject = msg_send![layer, animationForKey: &*key]; + if !existing.is_null() { + return; + } + let anim_cls = + objc2::runtime::AnyClass::get(c"CABasicAnimation").expect("CABasicAnimation"); + let path = NSString::from_str("strokeStart"); + let anim: *mut AnyObject = msg_send![anim_cls, animationWithKeyPath: &*path]; + let _: () = msg_send![anim, setFromValue: &*NSNumber::numberWithDouble(0.0)]; + let _: () = msg_send![anim, setToValue: &*NSNumber::numberWithDouble(1.0)]; + let _: () = msg_send![anim, setDuration: 5.0f64]; + let _: () = msg_send![anim, setRemovedOnCompletion: false]; + let fill = NSString::from_str("forwards"); + let _: () = msg_send![anim, setFillMode: &*fill]; + let tcls = objc2::runtime::AnyClass::get(c"CAMediaTimingFunction") + .expect("CAMediaTimingFunction"); + let tname = NSString::from_str("linear"); + let tf: *mut AnyObject = msg_send![tcls, functionWithName: &*tname]; + let _: () = msg_send![anim, setTimingFunction: tf]; + let _: () = msg_send![layer, addAnimation: anim, forKey: &*key]; + } else { + let _: () = msg_send![layer, removeAnimationForKey: &*key]; + let _: () = msg_send![layer, 
setOpacity: 1.0f32]; + let _: () = msg_send![layer, setStrokeStart: 0.0f64]; + let _: () = msg_send![layer, setStrokeEnd: 1.0f64]; + } + } + } + /// Add or remove the recording dot's expanding-ring "ripple" on a halo layer, /// faithfully matching the web `vp-ring` keyframes: the dot itself stays fixed /// while a ring scales out from it and fades, looping every 1.6s (ease-out). @@ -672,6 +960,28 @@ mod macos { Retained::into_super(attr) } + fn retry_title_attr(hotkey: &str) -> Retained { + // "重试 (R ⌥)" — label + the configured hotkey, matching settings symbols. + let text = if hotkey.is_empty() { + "重试".to_string() + } else { + format!("重试 ({hotkey})") + }; + let attr = NSMutableAttributedString::from_nsstring(&NSString::from_str(&text)); + let font = NSFont::systemFontOfSize_weight(12.0, 600.0); + let range = NSRange::new(0, text.encode_utf16().count()); + let color = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 1.0); + unsafe { + attr.addAttribute_value_range(objc2_app_kit::NSFontAttributeName, &font, range); + attr.addAttribute_value_range( + objc2_app_kit::NSForegroundColorAttributeName, + &color, + range, + ); + } + Retained::into_super(attr) + } + fn hint_color(level: &str) -> Retained { match level { "error" => NSColor::systemRedColor(), @@ -703,6 +1013,15 @@ mod macos { let hint = visible_hint(model); let has_hint = !hint.is_empty(); let has_text = !model.final_text.is_empty() || !model.partial_text.is_empty(); + let show_retry = model.hint_retryable && model.hint_level == "error" && has_hint; + // Build the retry button title ("重试 (R ⌥)") and size the button to fit it, + // so the label + hotkey live inside the button rather than in the message. 
+ let retry_title = retry_title_attr(&model.retry_hotkey); + let retry_w = if show_retry { + (retry_title.size().width.ceil() + RETRY_TEXT_PAD).max(RETRY_MIN_W) + } else { + 0.0 + }; VIEWS.with(|v| { let mut slot = v.borrow_mut(); @@ -749,13 +1068,18 @@ mod macos { // its width so the text never overlaps the bars. let show_wave = model.app_state == "recording"; - // Chrome around the text (left pad + indicator + gap + right pad + waveform). + // Chrome around the text (left pad + indicator + gap + right action area). let wave_reserve = if show_wave { WAVE_GAP_LEFT + WAVE_AREA_W } else { 0.0 }; - let chrome = PAD_LEFT + INDICATOR_W + GAP + PAD_RIGHT + wave_reserve; + let retry_reserve = if show_retry { + RETRY_GAP_LEFT + retry_w + (RETRY_RIGHT_INSET - PAD_RIGHT) + } else { + 0.0 + }; + let chrome = PAD_LEFT + INDICATOR_W + GAP + PAD_RIGHT + wave_reserve + retry_reserve; let text_w = measured_w + TEXT_SLACK; let next_width = if want_wrap { MULTI_LINE_WIDTH + chrome @@ -930,6 +1254,58 @@ mod macos { // Waveform bars (right), shown only while recording. 
layout_bars(views, pill_w, pill_h, model, show_wave); + views.retry_view.setHidden(!show_retry); + if show_retry { + let retry_x = pill_w - RETRY_RIGHT_INSET - retry_w; + let retry_y = ((pill_h - RETRY_SIZE) / 2.0).round(); + views.retry_view.setFrame(NSRect { + origin: NSPoint { + x: retry_x, + y: retry_y, + }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }); + views.retry_button.setFrame(NSRect { + origin: NSPoint { x: 0.0, y: 0.0 }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }); + views.retry_button.setAttributedTitle(&retry_title); + unsafe { + let bg = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.08); + let border = + NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.84); + let view_layer: *mut AnyObject = msg_send![&*views.retry_view, layer]; + if !view_layer.is_null() { + let bg_cg: *mut AnyObject = msg_send![&*bg, CGColor]; + let _: () = msg_send![view_layer, setBackgroundColor: bg_cg]; + let _: () = msg_send![view_layer, setCornerRadius: RETRY_SIZE / 2.0]; + } + let ring_frame = NSRect { + origin: NSPoint { x: 0.0, y: 0.0 }, + size: NSSize { + width: retry_w, + height: RETRY_SIZE, + }, + }; + let _: () = msg_send![&*views.retry_track_layer, setFrame: ring_frame]; + let _: () = msg_send![&*views.retry_progress_layer, setFrame: ring_frame]; + set_retry_ring_path(&views.retry_track_layer, retry_w); + set_retry_ring_path(&views.retry_progress_layer, retry_w); + let track = NSColor::colorWithSRGBRed_green_blue_alpha(1.0, 0.231, 0.231, 0.18); + let track_cg: *mut AnyObject = msg_send![&*track, CGColor]; + let _: () = msg_send![&*views.retry_track_layer, setStrokeColor: track_cg]; + let border_cg: *mut AnyObject = msg_send![&*border, CGColor]; + let _: () = msg_send![&*views.retry_progress_layer, setStrokeColor: border_cg]; + } + } + set_retry_countdown(&views.retry_progress_layer, show_retry); + // Keep the pill visible throughout an active session (so the indicator // stays up while 
waiting for the first transcript); only hide it when idle // with no content. diff --git a/src-tauri/src/stats.rs b/src-tauri/src/stats.rs index 51372a4..669e79d 100644 --- a/src-tauri/src/stats.rs +++ b/src-tauri/src/stats.rs @@ -24,6 +24,14 @@ pub struct HistoryEntry { pub text: String, #[serde(default)] pub chars: usize, + #[serde(default = "default_history_status")] + pub status: String, + #[serde(rename = "audioPath", default, skip_serializing_if = "Option::is_none")] + pub audio_path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub error: Option, + #[serde(rename = "retryOf", default, skip_serializing_if = "Option::is_none")] + pub retry_of: Option, } pub struct StatsService { @@ -54,7 +62,12 @@ impl StatsService { } } - pub fn record_session(&mut self, text: &str) { + pub fn record_session_with_audio( + &mut self, + text: &str, + audio_path: Option, + retry_of: Option, + ) { if text.is_empty() { return; } @@ -79,10 +92,96 @@ impl StatsService { ts: now.to_rfc3339(), text: text.to_string(), chars: char_count, + status: "success".to_string(), + audio_path, + error: None, + retry_of, }; self.append_history(&entry); } + pub fn replace_history_with_success( + &mut self, + ts: &str, + text: &str, + audio_path: Option, + ) -> bool { + if text.is_empty() { + return false; + } + + let Ok(d) = chrono::DateTime::parse_from_rfc3339(ts) else { + return false; + }; + let local = d.with_timezone(&Local); + let key = local.format("%Y-%m-%d").to_string(); + let file_path = self.history_dir.join(format!("{}.jsonl", key)); + let Ok(content) = fs::read_to_string(&file_path) else { + return false; + }; + + let mut replaced = false; + let char_count = text.len(); + let mut next_lines = Vec::new(); + for line in content.lines().filter(|line| !line.is_empty()) { + match serde_json::from_str::(line) { + Ok(mut entry) if entry.ts == ts => { + entry.text = text.to_string(); + entry.chars = char_count; + entry.status = "success".to_string(); + 
entry.audio_path = audio_path.clone(); + entry.error = None; + entry.retry_of = None; + if let Ok(json) = serde_json::to_string(&entry) { + next_lines.push(json); + replaced = true; + } else { + next_lines.push(line.to_string()); + } + } + Ok(entry) => { + if let Ok(json) = serde_json::to_string(&entry) { + next_lines.push(json); + } else { + next_lines.push(line.to_string()); + } + } + Err(_) => next_lines.push(line.to_string()), + } + } + + if !replaced { + return false; + } + + if fs::write(&file_path, format!("{}\n", next_lines.join("\n"))).is_err() { + return false; + } + self.record_usage(text); + true + } + + pub fn record_failure( + &mut self, + message: &str, + audio_path: Option, + retry_of: Option, + ) -> String { + let now = Local::now(); + let ts = now.to_rfc3339(); + let entry = HistoryEntry { + ts: ts.clone(), + text: message.to_string(), + chars: 0, + status: "failed".to_string(), + audio_path, + error: Some(message.to_string()), + retry_of, + }; + self.append_history(&entry); + ts + } + pub fn get_stats(&self) -> &Stats { &self.stats } @@ -123,6 +222,10 @@ impl StatsService { } pub fn delete_history(&mut self, ts: &str) { + self.delete_history_entry(ts, true); + } + + pub fn delete_history_entry(&mut self, ts: &str, delete_audio: bool) { if let Ok(d) = chrono::DateTime::parse_from_rfc3339(ts) { let local = d.with_timezone(&Local); let key = local.format("%Y-%m-%d").to_string(); @@ -131,12 +234,18 @@ impl StatsService { if file_path.exists() { if let Ok(content) = fs::read_to_string(&file_path) { let lines: Vec<&str> = content.lines().filter(|l| !l.is_empty()).collect(); + let mut removed_audio_paths = Vec::new(); let new_lines: Vec = lines .iter() - .filter(|line| { - serde_json::from_str::(line) - .map(|e| e.ts != ts) - .unwrap_or(true) + .filter(|line| match serde_json::from_str::(line) { + Ok(e) if e.ts == ts => { + if let Some(path) = e.audio_path { + removed_audio_paths.push(path); + } + false + } + Ok(_) => true, + Err(_) => true, }) .map(|s| 
s.to_string()) .collect(); @@ -147,12 +256,23 @@ impl StatsService { } else { let _ = fs::write(&file_path, format!("{}\n", new_lines.join("\n"))); } + if delete_audio { + for path in removed_audio_paths { + let _ = fs::remove_file(path); + } + } } } } } } + pub fn find_history(&self, ts: &str) -> Option { + self.get_history(365) + .into_iter() + .find(|entry| entry.ts == ts) + } + fn flush_stats(&mut self) { self.prune_daily_counts(); let path = self.data_dir.join("stats.json"); @@ -161,6 +281,22 @@ impl StatsService { } } + fn record_usage(&mut self, text: &str) { + if text.is_empty() { + return; + } + + let now = Local::now(); + if self.stats.first_used_at.is_none() { + self.stats.first_used_at = Some(now.to_rfc3339()); + } + self.stats.total_sessions += 1; + self.stats.total_characters += text.len() as u64; + let key = now.format("%Y-%m-%d").to_string(); + *self.stats.daily_counts.entry(key).or_insert(0) += text.len() as u64; + self.flush_stats(); + } + fn prune_daily_counts(&mut self) { let cutoff = Local::now() - chrono::Duration::days(MAX_DAILY_COUNTS_DAYS); let cutoff_key = cutoff.format("%Y-%m-%d").to_string(); @@ -194,6 +330,10 @@ fn is_date_key(s: &str) -> bool { chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d").is_ok() } +fn default_history_status() -> String { + "success".to_string() +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- @@ -233,6 +373,10 @@ mod tests { ts: "2025-01-01T00:00:00+00:00".to_string(), text: "hello".to_string(), chars: 5, + status: "success".to_string(), + audio_path: None, + error: None, + retry_of: None, }; let json = serde_json::to_string(&entry).unwrap(); assert!(json.contains("hello")); @@ -250,7 +394,7 @@ mod tests { #[test] fn record_session_increments_counters() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("hello world"); + svc.record_session_with_audio("hello world", None, None); 
let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 1); assert_eq!(stats.total_characters, 11); // "hello world".len() @@ -260,7 +404,7 @@ mod tests { #[test] fn record_session_empty_text_ignored() { let (mut svc, _dir) = new_stats_service(); - svc.record_session(""); + svc.record_session_with_audio("", None, None); let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 0); } @@ -268,8 +412,8 @@ mod tests { #[test] fn record_session_multiple_increments() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("first"); - svc.record_session("second"); + svc.record_session_with_audio("first", None, None); + svc.record_session_with_audio("second", None, None); let stats = svc.get_stats(); assert_eq!(stats.total_sessions, 2); assert_eq!(stats.total_characters, 11); @@ -278,7 +422,7 @@ mod tests { #[test] fn daily_counts_populated() { let (mut svc, _dir) = new_stats_service(); - svc.record_session("test"); + svc.record_session_with_audio("test", None, None); let stats = svc.get_stats(); assert_eq!(stats.daily_counts.len(), 1); let today = chrono::Local::now().format("%Y-%m-%d").to_string(); @@ -348,6 +492,26 @@ mod tests { assert_eq!(history[0].text, "old"); } + #[test] + fn replace_history_with_success_updates_entry_in_place() { + let today = chrono::Local::now().format("%Y-%m-%d").to_string(); + let (mut svc, _dir) = new_stats_service(); + let failure_ts = svc.record_failure("timeout", Some("/tmp/retry.wav".to_string()), None); + + assert_eq!(failure_ts[0..10], today); + assert!(svc.replace_history_with_success(&failure_ts, "重试成功", None)); + + let history = svc.get_history(365); + assert_eq!(history.len(), 1); + assert_eq!(history[0].ts, failure_ts); + assert_eq!(history[0].text, "重试成功"); + assert_eq!(history[0].status, "success"); + assert_eq!(history[0].chars, "重试成功".len()); + assert!(history[0].audio_path.is_none()); + assert!(history[0].error.is_none()); + assert_eq!(svc.get_stats().total_sessions, 1); + } + #[test] fn 
delete_history_removes_entry() { let dir = tempdir().unwrap(); diff --git a/web/index.html b/web/index.html index 57d6d3a..423caf3 100644 --- a/web/index.html +++ b/web/index.html @@ -34,6 +34,13 @@ + diff --git a/web/src/bridge/overlay.ts b/web/src/bridge/overlay.ts index 69b40aa..0142765 100644 --- a/web/src/bridge/overlay.ts +++ b/web/src/bridge/overlay.ts @@ -67,3 +67,8 @@ export async function sendAudioWarmupFailed(payload: { message?: string } = {}): export async function getConfig(): Promise { return invoke("get_app_config"); } + +/** Retry the latest failed recording directly from the overlay. */ +export async function retryLatestFailedTranscription(): Promise { + await invoke("retry_latest_failed_transcription"); +} diff --git a/web/src/bridge/settings.ts b/web/src/bridge/settings.ts index 8fde11b..c1f4d6d 100644 --- a/web/src/bridge/settings.ts +++ b/web/src/bridge/settings.ts @@ -177,10 +177,18 @@ export async function getHistory(daysBack = 1): Promise { return invoke("get_history", { daysBack }); } -export async function deleteHistory(ts: number): Promise { +export async function deleteHistory(ts: string): Promise { return invoke("delete_history", { ts }); } +export async function playSoundFile(filePath: string): Promise { + return invoke("play_sound_file", { filePath }); +} + +export async function retryHistoryTranscription(ts: string): Promise { + return invoke("retry_history_transcription", { ts }); +} + // ---- Prompts ---- export async function loadPrompts(): Promise { diff --git a/web/src/ui/main-overlay.ts b/web/src/ui/main-overlay.ts index bdea157..922f530 100644 --- a/web/src/ui/main-overlay.ts +++ b/web/src/ui/main-overlay.ts @@ -9,6 +9,7 @@ import { getConfig, notifyAudioStopped, onOverlayEvent, + retryLatestFailedTranscription, sendAudioChunk, sendAudioWarmupFailed, sendAudioWarmupReady, @@ -28,6 +29,7 @@ interface OverlayState { hintText: string; hintLevel: HintLevel; hintVariant: string; + retryHotkey: string; appState: AppState; 
audioReady: boolean; mediaStream: MediaStream | null; @@ -40,6 +42,8 @@ interface OverlayState { layoutWrap: boolean; renderedWidth: number; waveBarLevels: number[]; + retryVisible: boolean; + retrying: boolean; } interface AppearanceConfig { @@ -56,6 +60,7 @@ const state: OverlayState = { hintText: "", hintLevel: "info", hintVariant: "text", + retryHotkey: "", appState: "idle", audioReady: false, mediaStream: null, @@ -68,6 +73,8 @@ const state: OverlayState = { layoutWrap: false, renderedWidth: 0, waveBarLevels: [], + retryVisible: false, + retrying: false, }; // ---- DOM elements ---- @@ -88,9 +95,12 @@ const elements = { transcript: getEl("transcript"), measureText: getEl("measureText"), statusBars: getEl("statusBars"), + retryButton: getEl("retryButton") as HTMLButtonElement, + retryLabel: getEl("retryLabel"), }; const statusBarItems = Array.from(elements.statusBars.querySelectorAll(".status-bar")); +let retryHideTimer = 0; // ---- Appearance ---- @@ -111,9 +121,7 @@ function applyAppearance(cfg: AppearanceConfig = {}): void { currentAppearance.overlayStyle = cfg.overlayStyle || "liquid"; currentAppearance.theme = cfg.theme || "system"; const isMac = cfg.platform === "macos"; - if (elements.stage) { - elements.stage.style.display = isMac ? "none" : ""; - } + syncStageVisibility(); const isVibrancy = isMac && cfg.overlayStyle === "vibrancy"; elements.bubble.classList.toggle("platform-mac", isMac); elements.bubble.classList.toggle("platform-win", !isMac); @@ -124,6 +132,11 @@ function applyAppearance(cfg: AppearanceConfig = {}): void { ); } +function syncStageVisibility(): void { + const isMac = currentAppearance.platform === "macos"; + elements.stage.style.display = isMac ? "none" : ""; +} + // ---- Waveform ---- let waveformRaf = 0; @@ -208,9 +221,15 @@ function getVisibleHintText(): string { const visualState: string = state.appState === "recording" && !state.audioReady ? "connecting" : state.appState; if (visualState === "connecting") return isZhLocale ? 
"准备中…" : "Preparing…"; + if (visualState === "finishing" && state.hintVariant === "retry") { + // Placeholder until the replayed transcript starts streaming in. + if (!state.finalText && !state.partialText) return isZhLocale ? "重试中…" : "Retrying…"; + return ""; + } if (visualState === "finishing" && state.hintVariant === "progress") { return isZhLocale ? "思考中…" : "Thinking…"; } + // The retry label + hotkey live inside the retry button, not in the message. return state.hintText || ""; } @@ -218,6 +237,30 @@ function shouldShowHint(): boolean { return Boolean(getVisibleHintText()); } +function clearRetryTimer(): void { + if (retryHideTimer) { + window.clearTimeout(retryHideTimer); + retryHideTimer = 0; + } +} + +function showRetryAction(): void { + clearRetryTimer(); + state.retryVisible = true; + state.retrying = false; + retryHideTimer = window.setTimeout(() => { + state.retryVisible = false; + state.retrying = false; + updateView(); + }, 5000); +} + +function hideRetryAction(): void { + clearRetryTimer(); + state.retryVisible = false; + state.retrying = false; +} + // ---- Layout ---- let resizeRaf = 0; @@ -250,7 +293,8 @@ function scheduleResize(): void { const indicatorWidth = 22 + 12; const waveformWidth = state.appState === "recording" ? 18 + 12 : 0; - const chrome = 14 + 16 + 2 + indicatorWidth + waveformWidth; + const retryWidth = state.retryVisible ? 22 + 8 : 0; + const chrome = 14 + 16 + 2 + indicatorWidth + waveformWidth + retryWidth; const textSlack = 10; const singleLineLimit = 520; const multiLineWidth = 520; @@ -308,6 +352,17 @@ function updateView(): void { elements.stage.dataset.state = visualState; elements.stage.dataset.mode = hasHint ? "hint" : "transcript"; + elements.stage.dataset.retry = + state.retryVisible && state.hintLevel === "error" ? "true" : "false"; + elements.stage.dataset.retrying = state.retrying ? 
"true" : "false"; + elements.retryButton.disabled = state.retrying || !state.retryVisible; + // Label + hotkey live inside the button, e.g. "重试 (R ⌥)". + elements.retryLabel.textContent = state.retryHotkey + ? `${isZhLocale ? "重试" : "Retry"} (${state.retryHotkey})` + : isZhLocale + ? "重试" + : "Retry"; + syncStageVisibility(); elements.finalText.textContent = showTranscript ? state.finalText : ""; elements.partialText.textContent = showTranscript ? state.partialText : ""; if (showTranscript) scrollTranscriptToBottom(); @@ -334,7 +389,9 @@ function resetState(): void { state.hintText = ""; state.hintLevel = "info"; state.hintVariant = "text"; + state.retryHotkey = ""; state.audioReady = false; + hideRetryAction(); state.layoutWidth = 0; state.layoutWrap = false; state.renderedWidth = 0; @@ -444,6 +501,10 @@ function stopCueKeepAlive(): void { } } +function usesNativeAudioCapture(): boolean { + return currentAppearance.platform === "macos"; +} + // Create the cue context if needed, resume it, and start the keep-alive so the // output device is warm and settled by the time a cue plays. Idempotent; called // during warmup. @@ -651,6 +712,7 @@ onOverlayEvent(async (event: OverlayEvent) => { break; case "state": state.appState = (payload as { state: AppState }).state; + if (state.appState !== "idle") hideRetryAction(); if (state.appState === "idle" || state.appState === "connecting") state.audioReady = false; if (state.appState === "idle") { // Session over: suspend the cue context. 
No-op if the end cue is still @@ -674,7 +736,12 @@ onOverlayEvent(async (event: OverlayEvent) => { case "audio:warmup": try { state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + ensureCueContextWarm(); + state.audioReady = true; + } else { + await startAudioCapture(); + } sendAudioWarmupReady(); } catch (error) { const msg = (error as Error).message || String(error); @@ -689,7 +756,11 @@ onOverlayEvent(async (event: OverlayEvent) => { try { state.appState = "recording"; state.audioReady = false; - await startAudioCapture(); + if (usesNativeAudioCapture()) { + state.audioReady = true; + } else { + await startAudioCapture(); + } startWaveformAnimation(); state.hintText = ""; state.hintLevel = "info"; @@ -704,7 +775,12 @@ onOverlayEvent(async (event: OverlayEvent) => { updateView(); break; case "recording:stop": - await stopAudioCapture(); + if (usesNativeAudioCapture()) { + stopWaveformAnimation(); + state.pendingSamples = []; + } else { + await stopAudioCapture(); + } notifyAudioStopped(); break; case "transcript": { @@ -715,10 +791,22 @@ onOverlayEvent(async (event: OverlayEvent) => { break; } case "hint": { - const p = payload as { text?: string; level?: HintLevel; variant?: string }; + const p = payload as { + text?: string; + level?: HintLevel; + variant?: string; + retryable?: boolean; + hotkey?: string; + }; state.hintText = p.text || ""; state.hintLevel = p.level || "info"; state.hintVariant = p.variant || "text"; + state.retryHotkey = p.hotkey || ""; + if (p.retryable === true && state.hintLevel === "error" && state.hintText) { + showRetryAction(); + } else { + hideRetryAction(); + } updateView(); break; } @@ -736,6 +824,25 @@ onOverlayEvent(async (event: OverlayEvent) => { } }); +elements.retryButton.addEventListener("click", async (event) => { + event.preventDefault(); + event.stopPropagation(); + if (!state.retryVisible || state.retrying) return; + clearRetryTimer(); + state.retrying = true; + updateView(); + try { + 
await retryLatestFailedTranscription(); + hideRetryAction(); + } catch (error) { + state.hintText = (error as Error).message || String(error) || "重试失败"; + state.hintLevel = "error"; + state.hintVariant = "text"; + showRetryAction(); + } + updateView(); +}); + window.addEventListener("beforeunload", () => { stopAudioCapture(); }); diff --git a/web/src/ui/pages/AppSettingsPage.tsx b/web/src/ui/pages/AppSettingsPage.tsx index 0660836..ece0149 100644 --- a/web/src/ui/pages/AppSettingsPage.tsx +++ b/web/src/ui/pages/AppSettingsPage.tsx @@ -175,7 +175,6 @@ export function AppSettingsPage() { } /> + setAppBool("keep_recordings", v)} + /> + } + /> diff --git a/web/src/ui/pages/HomePage.tsx b/web/src/ui/pages/HomePage.tsx index af1cf92..1600a40 100644 --- a/web/src/ui/pages/HomePage.tsx +++ b/web/src/ui/pages/HomePage.tsx @@ -1,6 +1,12 @@ -import { Copy, Trash2 } from "lucide-react"; +import { Copy, Play, RefreshCw, Trash2 } from "lucide-react"; import { useCallback, useEffect, useState } from "react"; -import { deleteHistory, getHistory, getStats } from "@/bridge/settings"; +import { + deleteHistory, + getHistory, + getStats, + playSoundFile, + retryHistoryTranscription, +} from "@/bridge/settings"; import { formatCompact } from "@/lib/format"; import { Button } from "@/ui/components/Button"; import { Heatmap } from "@/ui/components/Heatmap"; @@ -49,8 +55,12 @@ interface Stats { totalCharacters?: number; } interface HistoryItem { - ts: number; + ts: string; text: string; + status?: "success" | "failed"; + audioPath?: string; + error?: string; + retryOf?: string; } /* ---------- component ---------- */ @@ -59,6 +69,7 @@ export function HomePage() { const [stats, setStats] = useState(null); const [history, setHistory] = useState([]); const [days, setDays] = useState(1); + const [retryingTs, setRetryingTs] = useState(null); const load = useCallback(async () => { try { @@ -153,6 +164,11 @@ export function HomePage() { const show = dk !== last; last = dk; const time = 
`${String(d.getHours()).padStart(2, "0")}:${String(d.getMinutes()).padStart(2, "0")}`; + const failed = item.status === "failed"; + const retrying = retryingTs === item.ts; + const displayText = failed + ? `转写失败:${item.error || item.text || "请检查网络连接"}` + : item.text; return (
{show && ( @@ -162,27 +178,90 @@ export function HomePage() {
)} -
+
{time}
-

{item.text}

+

+ {displayText} +

-
- +
+ {item.audioPath && ( + + )} + {failed ? ( + + ) : ( + + )}