Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 138 additions & 3 deletions openless-all/app/src-tauri/src/asr/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ pub struct WhisperBatchASR {
prompt: Option<String>,
/// OpenAI 互換でもファイル長に上限がある provider 用。None は従来通り一括送信。
max_chunk_duration_ms: Option<u64>,
/// `response_format=verbose_json` を要求してセグメント単位のメタデータ
/// (no_speech_prob / avg_logprob / compression_ratio)で幻聴を捨てるか。
/// OpenAI / Groq の Whisper は full に対応。SenseVoice / TeleSpeech 等
/// (SiliconFlow)は response_format 自体が無いので false にして従来の
/// `json` のまま送る(壊さない)。
verbose_json: bool,
buffer: Mutex<Vec<u8>>,
}

Expand All @@ -40,13 +46,15 @@ impl WhisperBatchASR {
model: String,
prompt: Option<String>,
max_chunk_duration_ms: Option<u64>,
verbose_json: bool,
) -> Self {
Self {
api_key,
base_url,
model,
prompt,
max_chunk_duration_ms,
verbose_json,
buffer: Mutex::new(Vec::new()),
}
}
Expand Down Expand Up @@ -111,6 +119,16 @@ impl WhisperBatchASR {
.part("file", wav_part)
.text("model", self.model.clone());

// verbose_json 対応プロバイダ(OpenAI / Groq)のときだけ、セグメント
// メタデータ付きの応答を要求し、temperature も 0 に固定する。非対応
// プロバイダ(SiliconFlow の SenseVoice / TeleSpeech 等)には送らず
// 従来どおりの応答にして、未知パラメータでの 4xx を避ける。
if self.verbose_json {
form = form
.text("response_format", "verbose_json")
.text("temperature", "0");
}

// `prompt` は空文字を送らない:OpenAI 互換実装によっては空文字でエラーに
// なるリスクがある(Groq は許容するが防御的にスキップ)。`trim()` で
// 空白のみのケースも除外。
Expand All @@ -137,7 +155,13 @@ impl WhisperBatchASR {
}

let json: serde_json::Value = resp.json().await.context("parse Whisper response")?;
Ok(json["text"].as_str().unwrap_or("").trim().to_string())
if self.verbose_json {
// verbose_json:セグメントのメタデータで幻聴を除いた本文を組む。
// segments が無い応答では内部で従来どおり text にフォールバック。
Ok(extract_confident_text(&json))
} else {
Ok(json["text"].as_str().unwrap_or("").trim().to_string())
}
}

pub fn cancel(&self) {
Expand All @@ -151,6 +175,70 @@ impl crate::recorder::AudioConsumer for WhisperBatchASR {
}
}

/// Builds the transcript from a `verbose_json` response, leaving out segments
/// that look like Whisper hallucinations.
///
/// Whisper is known to produce plausible-sounding text nobody actually said
/// for silent, faint or noisy stretches: the silence around a recording or the
/// microphone noise floor turns into unrelated words. Each `verbose_json`
/// segment carries `no_speech_prob`, `avg_logprob` and `compression_ratio`,
/// which are used here to discard segments that are clearly not real speech.
///
/// A segment is dropped when ANY of the following holds:
/// - `no_speech_prob > 0.6` and `avg_logprob < -0.5`: high silence probability
///   combined with low confidence, i.e. silence turned into words.
/// - `compression_ratio > 2.4`: the same phrase hallucinated over and over
///   (the standard Whisper threshold).
/// - `avg_logprob < -1.0`: extremely low confidence, i.e. noise turned into
///   words.
///
/// Dropping real speech is the worst outcome, so the thresholds are
/// conservative. A metric that is missing from a segment gets a default that
/// never triggers a drop, so providers that omit the metrics are unaffected.
///
/// Fallbacks:
/// - No usable `segments` (key missing, not an array, or an empty array, e.g.
///   the provider ignored `verbose_json`): the top-level `text` is returned
///   trimmed, matching the non-verbose behaviour.
/// - Segments exist but every one was rejected (the audio is essentially
///   silence): an empty string is returned. Falling back to `text` here would
///   bring the hallucinations back.
fn extract_confident_text(json: &serde_json::Value) -> String {
    // Above this silence probability (together with LOW_CONFIDENCE_LOGPROB)
    // the segment counts as fabricated from silence.
    const NO_SPEECH_PROB_MAX: f64 = 0.6;
    const LOW_CONFIDENCE_LOGPROB: f64 = -0.5;
    // Above this ratio the segment is treated as a repetition loop.
    const COMPRESSION_RATIO_MAX: f64 = 2.4;
    // Below this average log-probability the segment is rejected outright.
    const MIN_AVG_LOGPROB: f64 = -1.0;

    // An empty `segments` array carries no per-segment evidence at all, so it
    // is handled like a missing key instead of "everything was rejected".
    let segments = match json.get("segments").and_then(|s| s.as_array()) {
        Some(segments) if !segments.is_empty() => segments,
        _ => return json["text"].as_str().unwrap_or("").trim().to_string(),
    };

    let mut kept = String::new();
    for seg in segments {
        let text = seg.get("text").and_then(|t| t.as_str()).unwrap_or("");
        if text.trim().is_empty() {
            continue;
        }

        // Missing metrics fall back to values that can never trip a threshold.
        let metric = |key: &str, default: f64| {
            seg.get(key).and_then(|v| v.as_f64()).unwrap_or(default)
        };
        let no_speech = metric("no_speech_prob", 0.0);
        let avg_logprob = metric("avg_logprob", 0.0);
        let compression = metric("compression_ratio", 1.0);

        let is_hallucination = (no_speech > NO_SPEECH_PROB_MAX
            && avg_logprob < LOW_CONFIDENCE_LOGPROB)
            || compression > COMPRESSION_RATIO_MAX
            || avg_logprob < MIN_AVG_LOGPROB;
        if is_hallucination {
            log::warn!(
                "[whisper] 丢弃疑似幻听段落: no_speech={:.2} avg_logprob={:.2} compression={:.2} text={:?}",
                no_speech,
                avg_logprob,
                compression,
                text.trim()
            );
            continue;
        }

        // Segment text is appended untouched so the provider's own spacing
        // between segments is preserved; only the final result is trimmed.
        kept.push_str(text);
    }

    // Empty here means every segment was rejected; the caller treats an empty
    // transcript as "nothing was said".
    kept.trim().to_string()
}

/// Returns the playback duration, in milliseconds, of a raw PCM byte buffer.
///
/// The byte length is first converted to a sample count using
/// `PCM_BYTES_PER_SAMPLE`, then scaled by `PCM_SAMPLE_RATE_HZ`. Both steps use
/// integer division, so a trailing partial sample and any sub-millisecond
/// remainder are discarded.
/// NOTE(review): no channel count is factored in, so this presumably assumes
/// single-channel audio — confirm against the recorder.
fn pcm_duration_ms(pcm: &[u8]) -> u64 {
    let sample_count = pcm.len() as u64 / PCM_BYTES_PER_SAMPLE as u64;
    sample_count * 1000 / PCM_SAMPLE_RATE_HZ
}
Expand Down Expand Up @@ -555,11 +643,57 @@ mod tests {
assert_eq!(join_transcript_chunks(&chunks), "「中文」");
}

#[test]
fn extract_confident_text_drops_hallucinated_segment() {
    // Confident segment: low silence probability, high log-probability.
    let spoken = serde_json::json!({
        "text": "本当の発話",
        "no_speech_prob": 0.01,
        "avg_logprob": -0.2,
        "compression_ratio": 1.2
    });
    // High silence probability together with low confidence: must be dropped.
    let hallucinated = serde_json::json!({
        "text": "幻聴",
        "no_speech_prob": 0.9,
        "avg_logprob": -0.8,
        "compression_ratio": 1.1
    });
    let response = serde_json::json!({
        "text": "本当の発話 幻聴",
        "segments": [spoken, hallucinated]
    });

    let transcript = extract_confident_text(&response);

    assert_eq!(transcript, "本当の発話");
}

#[test]
fn extract_confident_text_keeps_all_confident_segments() {
    // Both segments are confident, so the result is their concatenation and
    // the top-level `text` is not used.
    let first_half = serde_json::json!({
        "text": "前半",
        "no_speech_prob": 0.0,
        "avg_logprob": -0.1,
        "compression_ratio": 1.0
    });
    let second_half = serde_json::json!({
        "text": "後半",
        "no_speech_prob": 0.0,
        "avg_logprob": -0.2,
        "compression_ratio": 1.0
    });
    let response = serde_json::json!({
        "text": "ignored",
        "segments": [first_half, second_half]
    });

    let transcript = extract_confident_text(&response);

    assert_eq!(transcript, "前半後半");
}

#[test]
fn extract_confident_text_falls_back_to_text_without_segments() {
    // Without a `segments` key the top-level `text` is returned, trimmed.
    let response = serde_json::json!({ "text": " 素の文字起こし " });

    let transcript = extract_confident_text(&response);

    assert_eq!(transcript, "素の文字起こし");
}

#[test]
fn extract_confident_text_missing_metrics_keeps_segment() {
    // When the provider returns no metrics the segment must not be dropped:
    // it is kept as-is, so the filter is a harmless no-op.
    let bare_segment = serde_json::json!({ "text": "保留される" });
    let response = serde_json::json!({
        "text": "x",
        "segments": [bare_segment]
    });

    let transcript = extract_confident_text(&response);

    assert_eq!(transcript, "保留される");
}

#[tokio::test]
async fn transcribe_posts_single_request_without_chunk_limit() {
let (base_url, server) = start_whisper_test_server(vec!["one"]);
let asr =
WhisperBatchASR::new("key".to_string(), base_url, "model".to_string(), None, None);
let asr = WhisperBatchASR::new(
"key".to_string(),
base_url,
"model".to_string(),
None,
None,
false,
);
let pcm = vec![0u8; 32_000 * 65];
asr.consume_pcm_chunk(&pcm);

Expand All @@ -579,6 +713,7 @@ mod tests {
"model".to_string(),
None,
Some(30_000),
false,
);
let pcm = vec![0u8; 32_000 * 65];
asr.consume_pcm_chunk(&pcm);
Expand Down
25 changes: 25 additions & 0 deletions openless-all/app/src-tauri/src/coordinator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2482,6 +2482,19 @@ fn is_whisper_compatible_provider(id: &str) -> bool {
matches!(id, "whisper" | "siliconflow" | "zhipu" | "groq")
}

/// Reports whether the provider's `/audio/transcriptions` endpoint supports
/// `response_format=verbose_json` and returns segments carrying
/// `no_speech_prob` / `avg_logprob` / `compression_ratio`, which the
/// hallucination filter relies on.
///
/// - `whisper` (OpenAI) / `groq`: native Whisper with full support, so the
///   filter is effective.
/// - `siliconflow`: the models are SenseVoice / TeleSpeech and the docs list
///   no `response_format`; sending `verbose_json` might be rejected, so it
///   stays **off** and keeps using plain `json`.
/// - `zhipu` (GLM-ASR): accepts `verbose_json` but does not produce the
///   metrics above, so filtering would do nothing. To keep the behaviour
///   change minimal it also stays **off**; only the providers with a
///   confirmed benefit (`whisper` / `groq`) are enabled.
///
/// Any other id yields `false`. The comparison is exact and case-sensitive.
fn whisper_supports_verbose_json(provider_id: &str) -> bool {
    ["whisper", "groq"]
        .iter()
        .any(|supported| *supported == provider_id)
}

/// Returns `true` when `id` equals the Bailian ASR provider id
/// (`crate::asr::bailian::PROVIDER_ID`). The comparison is exact.
fn is_bailian_provider(id: &str) -> bool {
    let bailian_id = crate::asr::bailian::PROVIDER_ID;
    bailian_id == id
}
Expand Down Expand Up @@ -2648,6 +2661,7 @@ async fn build_qa_asr_start(inner: &Arc<Inner>, active_asr: &str) -> Result<QaAs
model,
whisper_prompt,
batch_asr_chunk_limit_ms(active_asr),
whisper_supports_verbose_json(active_asr),
));
let active = ActiveAsr::Whisper(Arc::clone(&whisper));
let consumer: Arc<dyn crate::recorder::AudioConsumer> = whisper;
Expand Down Expand Up @@ -3882,6 +3896,16 @@ mod tests {
));
}

#[test]
fn verbose_json_enabled_only_for_whisper_family() {
    // verbose_json plus hallucination filtering is enabled only for providers
    // that return the full set of Whisper metrics.
    for provider in ["whisper", "groq"] {
        assert_eq!(
            (provider, whisper_supports_verbose_json(provider)),
            (provider, true)
        );
    }
    // SiliconFlow (SenseVoice/TeleSpeech) and Zhipu (GLM-ASR) keep the old
    // plain `json` behaviour.
    for provider in ["siliconflow", "zhipu"] {
        assert_eq!(
            (provider, whisper_supports_verbose_json(provider)),
            (provider, false)
        );
    }
}

#[test]
fn qa_asr_provider_kind_tracks_active_provider() {
assert_eq!(
Expand Down Expand Up @@ -4462,6 +4486,7 @@ mod tests {
"model".to_string(),
None,
None,
false,
));
*coordinator.inner.asr.lock() = Some(SessionResource::new(
session_id(2),
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src-tauri/src/coordinator/dictation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,7 @@ pub(super) async fn begin_session(inner: &Arc<Inner>) -> Result<(), String> {
model,
whisper_prompt,
batch_asr_chunk_limit_ms(&active_asr),
whisper_supports_verbose_json(&active_asr),
));
store_asr_for_session(
inner,
Expand Down
Loading