feat: STT card mode — show recording text inside the block

ijbo · ijbo · commit af548161a73e · 2026-03-16T13:55:12.000+09:00
- Added card mode API (startForCard/stopForCard) to speechToText.js
- Transcription now appears inside STT card result area instead of editor cursor
- Disabled AI refinement in card mode to prevent LLM hallucination
- Improved hallucination filter with 20+ AI model patterns
- Improved consensus engine with timestamp-based staleness detection
- Increased WSA tiebreaker range from ±2 to ±5 points
- Fixed late-result leak after Stop by adding a 3-second grace period
- Added substring-based dedup for dual-engine result merging
- Added .stt-interim CSS for live interim text (dimmed italic)
diff --git a/changelogs/CHANGELOG-stt-card-mode.md b/changelogs/CHANGELOG-stt-card-mode.md
@@ -0,0 +1,37 @@
+# STT Card Mode — Show Recording Text Inside the Block
+
+## Summary
+
+When recording via the `{{@STT:}}` card, transcribed text now appears **inside the card's result area** instead of being auto-inserted at the editor cursor. Users can review the transcription before clicking Insert to place it in the document.
+
+## Changes
+
+### js/speechToText.js
+- Added **card mode** API: `startForCard(onText, onInterim)` and `stopForCard()`
+- When card mode is active, transcription routes to callbacks instead of `insertAtCursor()`
+- Captured card mode state at the start of `processAndInsert()` to prevent async race conditions during AI refinement
+- **Disabled AI refinement in card mode** to prevent LLM hallucination (e.g., "I'm sorry, I can't assist") — uses basic punctuation only
+- Skipped toolbar UI (status bar, cheat sheet, interim text element) in card mode
+- Skipped pause-detection auto-paragraph insertion in card mode
+- **Improved hallucination filter**: added 20+ AI model patterns (e.g., "I'm an AI", "could you repeat that", "I'm sorry, could you")
+- **Improved consensus engine**:
+  - Added timestamp-based staleness detection (results more than 1.5s apart are treated as different speech segments, and the fresher result is taken)
+  - Increased WSA tiebreaker range from ±2 to ±5 points (WSA has native real-time streaming)
+- `stopForCard()` now clears pending consensus results and the timer to prevent late Worker results from leaking through after Stop
+- Added 3-second grace period after Stop to discard late Voxtral results (prevents editor leak)
+
+### js/ai-docgen.js
+- Updated STT card Record button to use `startForCard()` with `onText` and `onInterim` callbacks
+- `onText` callback accumulates transcription chunks with substring-based deduplication
+- `onInterim` callback shows live interim text (dimmed italic) without mic emoji
+- Updated Stop button to use `stopForCard()` with proper cleanup (strips interim spans)
+- `startForCard()` force-stops any active session before starting (fixes re-recording after Clear)
+
+### css/tts.css
+- Added `.stt-interim` styles: dimmed, italic text for live interim transcription
+- Dark mode support for `.stt-interim`
+
+## Files Modified
+- `js/speechToText.js` — 129 insertions, 10 deletions
+- `js/ai-docgen.js` — 79 insertions, 26 deletions (in STT event handlers)
+- `css/tts.css` — 12 insertions
diff --git a/css/tts.css b/css/tts.css
@@ -419,3 +419,15 @@
 [data-theme="dark"] .ai-stt-result-text {
     color: var(--text-color, #e6edf3);
 }
+
+/* STT interim/live transcription text */
+.stt-interim {
+    color: #9ca3af;
+    font-style: italic;
+    opacity: 0.8;
+    transition: opacity 0.2s ease;
+}
+
+[data-theme="dark"] .stt-interim {
+    color: #6b7280;
+}
diff --git a/js/ai-docgen.js b/js/ai-docgen.js
@@ -1062,7 +1062,7 @@
             });
         });
 
-        // STT record button — start recording via mic
+        // STT record button — start recording via mic (card mode)
         container.querySelectorAll('.ai-stt-record').forEach(function (btn) {
             btn.addEventListener('click', function (e) {
                 e.preventDefault();
@@ -1079,9 +1079,6 @@
                     return;
                 }
 
-                // Start the STT engine
-                M.speechToText.start();
-
                 // Switch button state
                 this.style.display = 'none';
                 var stopBtn = card.querySelector('.ai-stt-stop');
@@ -1092,23 +1089,68 @@
                 var resultDiv = card.querySelector('.ai-stt-result');
                 var resultText = card.querySelector('.ai-stt-result-text');
                 if (resultDiv) resultDiv.style.display = '';
-                if (resultText) resultText.textContent = '🎤 Listening… speak now';
+                if (resultText) {
+                    resultText.innerHTML = '<span class="stt-interim">Listening… speak now</span>';
+                }
+
+                // Accumulated transcription for this recording session
+                var accumulated = '';
+                var lastChunkNorm = ''; // normalized last chunk for dedup
+
+                // Start in card mode — text routes to the card, not the editor
+                M.speechToText.startForCard(
+                    // onText — receives one final transcription chunk (string). Empty/whitespace chunks are ignored, a chunk that repeats the previous one is skipped, anything else is appended to the card text.
+                    function (text) {
+                        if (!text || !text.trim()) return;
+                        var chunk = text.trim();
+                        // Normalize with Unicode classes (\p{L}/\p{N}); ASCII-only \w would erase non-Latin transcripts, which disables the dedup or compares only stray ASCII remnants.
+                        // Dedup: only skip if one text contains the other (same speech from 2nd engine)
+                        var normalizedChunk = chunk.toLowerCase().replace(/[^\p{L}\p{N}_\s]/gu, '').trim();
+                        if (lastChunkNorm && normalizedChunk) {
+                            if (lastChunkNorm.includes(normalizedChunk) || normalizedChunk.includes(lastChunkNorm)) {
+                                console.log('🎤 STT card: skipping duplicate chunk', JSON.stringify(chunk));
+                                return;
+                            }
+                        }
+
+                        lastChunkNorm = normalizedChunk;
+                        accumulated += (accumulated ? ' ' : '') + chunk;
+                        if (resultText) {
+                            resultText.textContent = accumulated;
+                        }
+                    },
+                    // onInterim — receives the live interim/partial text (string; a falsy value means the interim was cleared) and re-renders the card as accumulated text plus a dimmed preview span.
+                    function (interim) {
+                        if (!resultText) return;
+                        if (!interim) {
+                            // Interim cleared — show accumulated or listening status (accumulated is raw transcript text, so it must be escaped before going through innerHTML)
+                            resultText.innerHTML = accumulated
+                                ? escapeHtml(accumulated)
+                                : '<span class="stt-interim">Listening… speak now</span>';
+                        } else {
+                            // Show accumulated + current interim preview, both HTML-escaped
+                            resultText.innerHTML = accumulated
+                                ? escapeHtml(accumulated) + ' <span class="stt-interim">' + escapeHtml(interim) + '</span>'
+                                : '<span class="stt-interim">' + escapeHtml(interim) + '</span>';
+                        }
+                    }
+                );
 
                 M.showToast && M.showToast('🎤 Recording started — speak now', 'info');
             });
         });
 
-        // STT stop button — stop recording and capture transcription
+        // STT stop button — stop recording (card mode)
         container.querySelectorAll('.ai-stt-stop').forEach(function (btn) {
             btn.addEventListener('click', function (e) {
                 e.preventDefault();
                 e.stopPropagation();
                 var card = this.closest('.ai-stt-card');
                 if (!card) return;
 
-                // Stop the STT engine
+                // Stop the STT engine in card mode
                 if (M.speechToText && M.speechToText.isListening()) {
-                    M.speechToText.stop();
+                    M.speechToText.stopForCard();
                 }
 
                 // Switch button state
@@ -1117,17 +1159,18 @@
                 if (recordBtn) recordBtn.style.display = '';
                 card.classList.remove('ai-stt-recording');
 
-                // Grab whatever was transcribed from the editor
-                // The STT engine inserts text at cursor — read the latest editor content
+                // Finalize the result area
                 var resultText = card.querySelector('.ai-stt-result-text');
-                if (resultText && resultText.textContent === '🎤 Listening… speak now') {
-                    resultText.textContent = '⏳ Processing transcription…';
-                    // Give a moment for final STT result to arrive
-                    setTimeout(function () {
-                        if (resultText.textContent === '⏳ Processing transcription…') {
-                            resultText.textContent = '(No speech detected — try again)';
-                        }
-                    }, 3000);
+                if (resultText) {
+                    // Strip any remaining interim spans to reveal final text
+                    var interimSpans = resultText.querySelectorAll('.stt-interim');
+                    interimSpans.forEach(function (s) { s.remove(); });
+                    var finalText = resultText.textContent.trim();
+                    if (!finalText) {
+                        resultText.textContent = '(No speech detected — try again)';
+                    } else {
+                        resultText.textContent = finalText;
+                    }
                 }
 
                 M.showToast && M.showToast('🎤 Recording stopped', 'info');
diff --git a/js/speechToText.js b/js/speechToText.js