Skip to content

Commit af54816

Browse files
committed
feat: STT card mode — show recording text inside the block
- Added card mode API (startForCard/stopForCard) to speechToText.js
- Transcription now appears inside the STT card result area instead of at the editor cursor
- Disabled AI refinement in card mode to prevent LLM hallucination
- Improved hallucination filter with 20+ AI model patterns
- Improved consensus engine with timestamp-based staleness detection
- Increased WSA tiebreaker range from ±2 to ±5 points
- Fixed late result leak after Stop with a 3s grace period
- Added substring-based dedup for dual-engine result merging
- Added .stt-interim CSS for live interim text (dimmed italic)
1 parent 6bbb686 commit af54816

4 files changed

Lines changed: 221 additions & 36 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# STT Card Mode — Show Recording Text Inside the Block
2+
3+
## Summary
4+
5+
When recording via the `{{@STT:}}` card, transcribed text now appears **inside the card's result area** instead of being auto-inserted at the editor cursor. Users can review the transcription before clicking Insert to place it in the document.
6+
7+
## Changes
8+
9+
### js/speechToText.js
10+
- Added **card mode** API: `startForCard(onText, onInterim)` and `stopForCard()`
11+
- When card mode is active, transcription routes to callbacks instead of `insertAtCursor()`
12+
- Captured card mode state at the start of `processAndInsert()` to prevent async race conditions during AI refinement
13+
- **Disabled AI refinement in card mode** to prevent LLM hallucination (e.g., "I'm sorry, I can't assist") — uses basic punctuation only
14+
- Skipped toolbar UI (status bar, cheat sheet, interim text element) in card mode
15+
- Skipped pause-detection auto-paragraph insertion in card mode
16+
- **Improved hallucination filter**: added 20+ AI model patterns (e.g., "I'm an AI", "could you repeat that", "I'm sorry, could you")
17+
- **Improved consensus engine**:
18+
- Added timestamp-based staleness detection: results that arrive more than 1.5s apart are treated as different speech segments, and the fresher result is used
19+
- Increased WSA tiebreaker range from ±2 to ±5 points (WSA has native real-time streaming)
20+
- `stopForCard()` now clears pending consensus results and timer to prevent late Worker results
21+
- Added a 3-second grace period after Stop during which late Voxtral results are discarded (prevents late text from leaking into the editor)
22+
23+
### js/ai-docgen.js
24+
- Updated STT card Record button to use `startForCard()` with `onText` and `onInterim` callbacks
25+
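- `onText` callback accumulates transcription chunks with substring-based deduplication: a chunk is skipped when its normalized text (lowercased, punctuation stripped) contains, or is contained in, the previous chunk's normalized text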
26+
- `onInterim` callback shows live interim text (dimmed italic) without mic emoji
27+
- Updated Stop button to use `stopForCard()` with proper cleanup (strips interim spans)
28+
- `startForCard()` force-stops any active session before starting (fixes re-recording after Clear)
29+
30+
### css/tts.css
31+
- Added `.stt-interim` styles: dimmed, italic text for live interim transcription
32+
- Dark mode support for `.stt-interim`
33+
34+
## Files Modified
35+
- `js/speechToText.js` — 129 insertions, 10 deletions
36+
- `js/ai-docgen.js` — 79 insertions, 26 deletions (in STT event handlers)
37+
- `css/tts.css` — 12 insertions

css/tts.css

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,3 +419,15 @@
419419
[data-theme="dark"] .ai-stt-result-text {
420420
color: var(--text-color, #e6edf3);
421421
}
422+
423+
/* STT interim/live transcription text */
424+
.stt-interim {
425+
color: #9ca3af;
426+
font-style: italic;
427+
opacity: 0.8;
428+
transition: opacity 0.2s ease;
429+
}
430+
431+
[data-theme="dark"] .stt-interim {
432+
color: #6b7280;
433+
}

js/ai-docgen.js

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,7 +1062,7 @@
10621062
});
10631063
});
10641064

1065-
// STT record button — start recording via mic
1065+
// STT record button — start recording via mic (card mode)
10661066
container.querySelectorAll('.ai-stt-record').forEach(function (btn) {
10671067
btn.addEventListener('click', function (e) {
10681068
e.preventDefault();
@@ -1079,9 +1079,6 @@
10791079
return;
10801080
}
10811081

1082-
// Start the STT engine
1083-
M.speechToText.start();
1084-
10851082
// Switch button state
10861083
this.style.display = 'none';
10871084
var stopBtn = card.querySelector('.ai-stt-stop');
@@ -1092,23 +1089,68 @@
10921089
var resultDiv = card.querySelector('.ai-stt-result');
10931090
var resultText = card.querySelector('.ai-stt-result-text');
10941091
if (resultDiv) resultDiv.style.display = '';
1095-
if (resultText) resultText.textContent = '🎤 Listening… speak now';
1092+
if (resultText) {
1093+
resultText.innerHTML = '<span class="stt-interim">Listening… speak now</span>';
1094+
}
1095+
1096+
// Accumulated transcription for this recording session
1097+
var accumulated = '';
1098+
var lastChunkNorm = ''; // normalized last chunk for dedup
1099+
1100+
// Start in card mode — text routes to the card, not the editor
1101+
M.speechToText.startForCard(
1102+
// onText — final transcription chunk (deduped across engines)
1103+
function (text) {
1104+
if (!text || !text.trim()) return;
1105+
var chunk = text.trim();
1106+
1107+
// Dedup: only skip if one text contains the other (same speech from 2nd engine)
1108+
var normalizedChunk = chunk.toLowerCase().replace(/[^\w\s]/g, '').trim();
1109+
if (lastChunkNorm && normalizedChunk) {
1110+
if (lastChunkNorm.includes(normalizedChunk) || normalizedChunk.includes(lastChunkNorm)) {
1111+
console.log('🎤 STT card: skipping duplicate chunk', JSON.stringify(chunk));
1112+
return;
1113+
}
1114+
}
1115+
1116+
lastChunkNorm = normalizedChunk;
1117+
accumulated += (accumulated ? ' ' : '') + chunk;
1118+
if (resultText) {
1119+
resultText.textContent = accumulated;
1120+
}
1121+
},
1122+
// onInterim — live interim/partial text
1123+
function (interim) {
1124+
if (!resultText) return;
1125+
if (!interim) {
1126+
// Interim cleared — show accumulated or listening status
1127+
resultText.innerHTML = accumulated
1128+
? accumulated
1129+
: '<span class="stt-interim">Listening… speak now</span>';
1130+
} else {
1131+
// Show accumulated + current interim preview
1132+
resultText.innerHTML = accumulated
1133+
? accumulated + ' <span class="stt-interim">' + escapeHtml(interim) + '</span>'
1134+
: '<span class="stt-interim">' + escapeHtml(interim) + '</span>';
1135+
}
1136+
}
1137+
);
10961138

10971139
M.showToast && M.showToast('🎤 Recording started — speak now', 'info');
10981140
});
10991141
});
11001142

1101-
// STT stop button — stop recording and capture transcription
1143+
// STT stop button — stop recording (card mode)
11021144
container.querySelectorAll('.ai-stt-stop').forEach(function (btn) {
11031145
btn.addEventListener('click', function (e) {
11041146
e.preventDefault();
11051147
e.stopPropagation();
11061148
var card = this.closest('.ai-stt-card');
11071149
if (!card) return;
11081150

1109-
// Stop the STT engine
1151+
// Stop the STT engine in card mode
11101152
if (M.speechToText && M.speechToText.isListening()) {
1111-
M.speechToText.stop();
1153+
M.speechToText.stopForCard();
11121154
}
11131155

11141156
// Switch button state
@@ -1117,17 +1159,18 @@
11171159
if (recordBtn) recordBtn.style.display = '';
11181160
card.classList.remove('ai-stt-recording');
11191161

1120-
// Grab whatever was transcribed from the editor
1121-
// The STT engine inserts text at cursor — read the latest editor content
1162+
// Finalize the result area
11221163
var resultText = card.querySelector('.ai-stt-result-text');
1123-
if (resultText && resultText.textContent === '🎤 Listening… speak now') {
1124-
resultText.textContent = '⏳ Processing transcription…';
1125-
// Give a moment for final STT result to arrive
1126-
setTimeout(function () {
1127-
if (resultText.textContent === '⏳ Processing transcription…') {
1128-
resultText.textContent = '(No speech detected — try again)';
1129-
}
1130-
}, 3000);
1164+
if (resultText) {
1165+
// Strip any remaining interim spans to reveal final text
1166+
var interimSpans = resultText.querySelectorAll('.stt-interim');
1167+
interimSpans.forEach(function (s) { s.remove(); });
1168+
var finalText = resultText.textContent.trim();
1169+
if (!finalText) {
1170+
resultText.textContent = '(No speech detected — try again)';
1171+
} else {
1172+
resultText.textContent = finalText;
1173+
}
11311174
}
11321175

11331176
M.showToast && M.showToast('🎤 Recording stopped', 'info');

0 commit comments

Comments
 (0)