diff --git a/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws.py b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws.py
new file mode 100644
index 000000000..6e7e0789d
--- /dev/null
+++ b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# coding=utf-8
+"""
+Qwen3-ASR streaming WebSocket service.
+
+Wraps the native streaming API of the official
+example_qwen3_asr_vllm_streaming.py
+(init_streaming_state / streaming_transcribe / finish_streaming_transcribe)
+in a WebSocket service whose protocol matches Fun-ASR-Nano's
+serve_realtime_ws.py, so the same bench_streaming_ws.py can load-test both
+for a side-by-side comparison.
+
+Protocol:
+  1. Client connects to ws://host:port
+  2. Client sends the text "START" -> server replies {"event": "started"}
+  3. Client sends binary int16 PCM chunks (16 kHz, mono)
+  4. Server sends {"partial": "<current text>"} as the transcript grows
+  5. Client sends the text "STOP" -> server replies
+     {"is_final": true, "sentences": [{"text": "<final text>"}]}
+     followed by {"event": "stopped"}
+
+The architecture mirrors serve_realtime_ws.py: a single asyncio event loop in
+which streaming_transcribe / finish_streaming_transcribe are called
+synchronously and block the whole loop, so the measured concurrency behaviour
+is directly comparable with the Fun-ASR-Nano service. Production scaling
+likewise relies on multiple processes + CUDA MPS + nginx
+(see vllm_guide §6.7).
+
+For VAD, see the accompanying notes document.
+
+
+Dependencies:
+  pip install qwen-asr[vllm] websockets numpy
+Run:
+  python serve_qwen3_asr_ws.py --port 10095 --gpu-memory-utilization 0.8
+  # Optional: --chunk-size-sec sets the streaming chunk size (default 2.0).
+  # Smaller values emit text sooner/more often but cost more under
+  # concurrency (measured: 1.0 is noticeably heavier than 2.0).
+"""
+import asyncio
+import argparse
+import json
+import logging
+
+import numpy as np
+import websockets
+
+from qwen_asr import Qwen3ASRModel
+
+# websockets logs an INFO "connection open/closed" line per connection, which
+# floods the console under load; raise its level to WARNING (logging only,
+# connection behaviour is unaffected).
+logging.getLogger("websockets").setLevel(logging.WARNING)
+
+SAMPLE_RATE = 16000
+
+# Loaded once globally; all connections share the model and each holds its
+# own streaming state.
+asr = None
+
+# Streaming chunk size in seconds, set from --chunk-size-sec and passed to
+# init_streaming_state in handle_client. Default 2.0 (the official example's
+# value); smaller emits text more often at a higher concurrency cost.
+CHUNK_SIZE_SEC = 2.0
+
+
+def int16_pcm_to_float32(pcm_bytes: bytes) -> np.ndarray:
+    """Convert little-endian int16 PCM bytes to float32 samples in [-1, 1).
+
+    The bench client sends int16 little-endian PCM, while Qwen3-ASR's
+    streaming_transcribe takes float32. The dtype is spelled "<i2" so the
+    byte order does not depend on the host, and a dangling odd byte (half a
+    sample) is dropped instead of making np.frombuffer raise ValueError.
+    """
+    usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
+    samples = np.frombuffer(memoryview(pcm_bytes)[:usable], dtype="<i2")
+    return samples.astype(np.float32) / 32768.0
+
+
+async def handle_client(ws, path=None):  # path keeps the two-argument callback of older websockets working
+    """Run the START / PCM / STOP protocol for one WebSocket connection.
+
+    Text "START" creates a fresh streaming state, binary messages are fed to
+    the recognizer and answered with a {"partial": ...} message whenever the
+    text changed, and text "STOP" finalizes, sends the final result and
+    {"event": "stopped"}, then ends the session. Other text is ignored and
+    audio received before START is dropped.
+    """
+    state = None
+    last_partial = None
+    try:
+        async for msg in ws:
+            # ---- text control messages ----
+            if isinstance(msg, str):
+                if msg == "START":
+                    # One independent state per connection; parameters follow
+                    # the official example.
+                    state = asr.init_streaming_state(
+                        unfixed_chunk_num=2,
+                        unfixed_token_num=5,
+                        chunk_size_sec=CHUNK_SIZE_SEC,
+                    )
+                    last_partial = None
+                    await ws.send(json.dumps({"event": "started"}))
+
+                elif msg == "STOP":
+                    if state is not None:
+                        # Synchronous finalize that blocks the loop (same
+                        # approach as the end-of-sentence finalize in
+                        # serve_realtime_ws.py).
+                        asr.finish_streaming_transcribe(state)
+                        final_text = (state.text or "").strip()
+                        await ws.send(json.dumps({
+                            "is_final": True,
+                            "sentences": [{"text": final_text}] if final_text else [],
+                        }))
+                    await ws.send(json.dumps({"event": "stopped"}))
+                    break
+                # Any other text message is ignored.
+
+            # ---- binary audio chunks ----
+            else:
+                if state is None:
+                    continue  # no START yet: drop the audio
+                seg = int16_pcm_to_float32(msg)
+                # Synchronous call that blocks the whole event loop; this is
+                # the single-loop architecture being replicated on purpose.
+                asr.streaming_transcribe(seg, state)
+                text = state.text or ""
+                # Send a partial only when the text changed, to avoid
+                # flooding (does not affect the bench's first-word latency).
+                if text != last_partial:
+                    last_partial = text
+                    await ws.send(json.dumps({"partial": text}))
+
+    except websockets.exceptions.ConnectionClosed:
+        pass
+    except Exception:
+        logging.exception("Unexpected error in Qwen3-ASR WebSocket handler")
+
+
+async def amain(args):
+    """Load the model once, then serve WebSocket clients until cancelled."""
+    global asr, CHUNK_SIZE_SEC
+    CHUNK_SIZE_SEC = args.chunk_size_sec
+    print(f"Loading {args.model} (gpu_memory_utilization={args.gpu_memory_utilization}, chunk_size_sec={CHUNK_SIZE_SEC}) ...")
+    # Streaming is vLLM-only and no forced aligner supported (comment from the official example).
+    asr = Qwen3ASRModel.LLM(
+        model=args.model,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        max_new_tokens=32,  # small value for streaming, as in the official example
+    )
+    print(f"Serving on ws://{args.host}:{args.port} (Ctrl-C to stop)")
+    async with websockets.serve(
+        handle_client, args.host, args.port, max_size=10 * 1024 * 1024
+    ):
+        await asyncio.Future()  # run forever
+
+
+def main():
+    """Parse the command-line options and run the server."""
+    p = argparse.ArgumentParser()
+    p.add_argument("--host", default="0.0.0.0")
+    p.add_argument("--port", type=int, default=10095)
+    p.add_argument("--model", default="Qwen/Qwen3-ASR-1.7B")
+    p.add_argument("--gpu-memory-utilization", type=float, default=0.8,
+                   dest="gpu_memory_utilization")
+    p.add_argument("--chunk-size-sec", type=float, default=2.0,
+                   dest="chunk_size_sec",
+                   help="流式块大小(秒),传给 init_streaming_state。默认 2.0;越小出字越勤但并发开销越大(实测 1.0 比 2.0 明显更吃并发)")
+    args = p.parse_args()
+    asyncio.run(amain(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes.md b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes.md new file mode 100644 index 000000000..5f209069c --- /dev/null +++ b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes.md @@ -0,0 +1,142 @@ +# serve_qwen3_asr_ws.py 说明与已知问题 + +本文档记录 `serve_qwen3_asr_ws.py` 的几个容易困惑/踩坑的点。代码里对应位置只留一行 +指向本文的精简注释,细节看这里。 + +--- + +## 1. 关于 VAD + +> 在开源仓库 github.com/QwenLM/Qwen3-ASR 没有关于 `VAD` 内容。但商用 [Qwen3-ASR的文档/示例](https://help.aliyun.com/zh/model-studio/qwen-asr-realtime-interaction-process)里**是有 VAD 设置**,例如: +> ```json +> "turn_detection": { "type": "server_vad", "threshold": 0.2, "silence_duration_ms": 800 } +> ``` + +VAD 在 ASR 里其实有两种完全不同的用途,容易混为一谈: + +- **A. 切段用的 VAD(给非流式 encoder 喂分段)**:像 Fun-ASR-Nano 这类模型 encoder 是 + **非流式**的(一次要看完整一段),必须靠 VAD 把连续音频切成一句句、每句整体编码解码。 + 这种 VAD 对 Fun-ASR-Nano 这类模型 **技术必需**——不切就没法编码。 + +- **B. 
端点/轮次检测用的 VAD(判断"这一轮说完了没")**:检测说话人停顿(如静音 800ms) + 来判定"一句话/一轮结束",从而触发"锁定文本 / 发 is_final / 该回应了"。这是**产品行为** + 层面的需求,和 encoder 能不能流式无关。 + +对 **Qwen3-ASR 的开源流式 API**(本服务用的 `qwen-asr[vllm]` 的 +`init_streaming_state` / `streaming_transcribe`): + +- **不需要 A 类(切段)VAD**:它是**增量式**流式——每次只吃新增的一小段音频、状态向前 + 滚动,连续转写,不存在"先切句再解码"。"哪些字定了、哪些会变"由 `unfixed_chunk_num` / + `unfixed_token_num` 表示(尾部 N 个 chunk/token 算"未定"、会被后续音频修正,其余视为 + 已确认)——这相当于内置的 partial/锁定机制,取代了 A 类 VAD 的切段职责。所以在开源 + 流式 API 的代码里搜不到 vad,是因为它**这一层不做切段**。 + +- **仍然需要 B 类(端点)VAD —— 只是开源 API 自己不带**:自动判断"用户停顿=这一轮结束" + 这件事,`streaming_transcribe` 本身不管。商用服务在 ASR **之外**包了一层 `server_vad` + 来做(就是上面那段 `turn_detection`)。本服务目前是用客户端显式发 **`STOP`** 来代替 + 这个端点判断(bench 里音频放完即发 STOP)。**若要在真实场景自动断句/断轮,需要自己在 + 本服务之外接一个 VAD / 端点检测**(角色等同商用的 server_vad),而不是去 Qwen3-ASR + 内部找——它的开源流式 API 不含这一层。 + +**一句话**:Qwen3-ASR 增量流式**省掉了"切段 VAD"(A)**,但**"端点/轮次 VAD"(B)这个 +职责依然存在**,商用版用 `server_vad` 实现、本服务用手动 `STOP` 代替。两者不矛盾。 + +### 1.1 官方佐证:商用 Qwen-ASR-Realtime 的"VAD 模式 / Manual 模式" + +阿里云百炼的实时语音识别(Qwen-ASR-Realtime)文档明确把"断句由谁做"分成两种模式, +本质就是 `session.turn_detection` 开还是关: + +- **VAD 模式(默认,`turn_detection` 配置为 server_vad)**:服务端自动检测语音起点/终点 + 来断句,客户端只管持续发音频流,服务端在"检测到一句话结束"时自动返回最终结果。流程中 + 服务端会发 `input_audio_buffer.speech_started` / `speech_stopped` 等事件——这就是上面说的 + **B 类端点 VAD**,由服务端那一层(server_vad)实现,**不是** ASR 内核在切段。 + +- **Manual 模式(`turn_detection` 设为 null)**:由**客户端**控制断句——发完一整句音频后, + 客户端发 `input_audio_buffer.commit` 通知服务端边界。适用于客户端能明确判断语句边界的 + 场景(如"按住说话"、聊天发语音)。 + +> 对应关系:本服务 `serve_qwen3_asr_ws.py` 用客户端显式发 **`STOP`** 来标记一轮结束, +> 等价于商用的 **Manual 模式**(`turn_detection=null`,由客户端控制边界)。若要做成"服务端 +> 自动断句",就是去实现商用 **VAD 模式** 的那一层端点检测(server_vad),加在本服务的 +> 增量转写之外,而不是在 Qwen3-ASR 转写内核里找。 +> +> 文档:实时语音识别(Qwen-ASR-Realtime)交互流程 +> (help.aliyun.com/zh/model-studio/qwen-asr-realtime-interaction-process): ”服务端自动检测语音的起点和终点(断句)。开发者只需持续发送音频流,服务端会在检测到一句话结束时自动返回最终识别结果。此模式适用于实时对话、会议记录等场景。“ + +### 1.2 `chunk-size-sec` 控制流式块大小 + +`chunk-size-sec` 控制流式块大小(默认 
2.0)。值越小出字越快/越勤,但并发开销越大(实测:在 L20 上,29 秒音频、48 路并发,1.0 全部失败,2.0 全部通过)。 + +--- + +## 2. 必须用 vllm 0.14,不要用 0.19(rope_scaling / thinker_config 警告) + +本服务需要通过 `qwen-asr[vllm]` 使用 vllm 加速,[它锁定 `vllm==0.14.0`](https://github.com/QwenLM/Qwen3-ASR/blob/main/pyproject.toml)。若换成更新的 vllm 版本(比如 0.19.x),启动会**出现**: + +``` +Unrecognized keys in `rope_scaling` for 'rope_type'='default': + {'mrope_section', 'mrope_interleaved', 'interleaved'} +thinker_config is None. Initializing thinker model with default values +``` + +**根因**:vllm 在 0.14 → 0.19 之间(transformers 两版都是 4.57.6,可排除),config 解析里的 +`patch_rope_scaling_dict` 会把 `rope_type` 从 `'mrope'` 改写成 `'default'`(它把 mrope +当 legacy、假设由 vllm 内部消化 `mrope_section` 等字段): + +```python +elif rope_scaling["rope_type"] == "mrope": + assert "mrope_section" in rope_scaling + rope_scaling["rope_type"] = "default" # ← 改写 +``` + +但 Qwen3-ASR 自带的 `Qwen3ASRThinkerTextRotaryEmbedding` 期望从 `rope_scaling` 里读到 +`"mrope"` 才走多模态 RoPE 分支: + +```python +self.rope_type = config.rope_scaling.get("rope_type", "default") +``` + +被 vllm 改写成 `"default"` 后,它走了普通 RoPE,`mrope_section` / `mrope_interleaved` / +`interleaved` 这几个键没人认领 → 打印 "Unrecognized keys" 警告,且音频/文本的多模态 +位置编码退化。`thinker_config is None` 那条同源:0.19 的加载路径没正确解析 Qwen3-ASR 的 +thinker 子配置,回退到默认参数。 + +**影响与抉择**:在 0.19 上服务"能起、也能出字"(两条是 WARNING/INFO 不是 ERROR),抽查 +几条转写也"看着正常";但位置编码退化对长音频/复杂内容可能有害,且**未做 CER 量化对比**, +无法判定等价。保守起见固定用 `qwen-asr[vllm]` 自带的 `vllm==0.14.0`。 + +### 2.1 vllm 加速需要用 Qwen3ASRModel + +funasr 的 `AutoModelVLLM` 不能加速 Qwen3-ASR,必须用 `from qwen_asr import Qwen3ASRModel`。 + +> 这解答 `#3026` 的问题 + +## 3. tokenizer 的 `fix_mistral_regex` 警告(无害,可忽略) + +启动时可能出现: + +``` +The tokenizer you are loading from '.../Qwen3-ASR-1.7B' with an incorrect regex +pattern ... This will lead to incorrect tokenization. You should set the +`fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+``` + +**原因**:Qwen3-ASR 的 tokenizer 沿用了一类带已知 regex 问题的分词器实现,底层库检测到 +该 regex 模式后给出提醒,建议加 `fix_mistral_regex=True` 修正切分。本服务通过 qwen-asr +的高层 API 加载模型、并不直接构造 tokenizer,没有暴露这个开关,所以这条提醒按原样打印。 + +**影响**:实测对中文 ASR 转写结果无可见影响(**抽查**多条转写正常,未做 CER 量化)。该 +regex 修正主要影响某些特殊 token 的边界切分,对语音转写路径未观察到差异。属于"提醒级" +噪音,可忽略。若要彻底消除,需在更底层自行加载 tokenizer 时传 `fix_mistral_regex=True`, +但 qwen-asr 高层 API 当前不直接支持,且无实测必要。 + +--- + +### 顺带:另外两条启动日志(均无害) + +- `Error retrieving safetensors: Repo id must be in the form ...`:把本地模型路径当成 HF + 仓库 id 去查线上元数据,失败后重试 2 次、回退本地加载,不影响功能。可设环境变量 + `HF_HUB_OFFLINE=1` 消除。 + +- `Downcasting torch.float32 to torch.bfloat16`:权重以 fp32 存、按 bf16 加载,正常省显存 / + 提速;bf16 与 fp32 指数位同宽(动态范围一致),只是尾数变短,对推理精度的影响通常很小。这是 INFO 不是错误。 diff --git a/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes_en.md b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes_en.md new file mode 100644 index 000000000..2618f7d53 --- /dev/null +++ b/examples/industrial_data_pretraining/qwen3_asr/serve_qwen3_asr_ws_notes_en.md @@ -0,0 +1,175 @@ +# serve_qwen3_asr_ws.py — Notes & Known Issues + +This document records the confusing points / gotchas of `serve_qwen3_asr_ws.py`. The code +only keeps one-line comments pointing here; details live in this file. + +--- + +## 1. About VAD + +> The open-source repo github.com/QwenLM/Qwen3-ASR contains nothing about `VAD`. However, +> the commercial [Qwen3-ASR docs/examples](https://help.aliyun.com/zh/model-studio/qwen-asr-realtime-interaction-process) +> **do have VAD settings**, e.g.: +> ```json +> "turn_detection": { "type": "server_vad", "threshold": 0.2, "silence_duration_ms": 800 } +> ``` + +VAD actually serves two completely different purposes in ASR, which are easy to conflate: + +- **A. 
Segmentation VAD (feeding chunks to a non-streaming encoder)**: models like + Fun-ASR-Nano have a **non-streaming** encoder (it needs to see a whole segment at once), + so a VAD is required to cut the continuous audio into sentences, each encoded/decoded as a + whole. This kind of VAD is **technically mandatory** for such models — without cutting, + there is no way to encode. + +- **B. Endpoint / turn-detection VAD (deciding "has this turn finished?")**: detects speaker + pauses (e.g. 800ms of silence) to decide "a sentence/turn has ended", triggering "lock the + text / emit is_final / time to respond". This is a **product-behavior** need, unrelated to + whether the encoder is streaming. + +For **Qwen3-ASR's open-source streaming API** (this service uses `qwen-asr[vllm]`'s +`init_streaming_state` / `streaming_transcribe`): + +- **No need for type-A (segmentation) VAD**: it is **incremental** streaming — each call only + consumes the newly added slice of audio, the state rolls forward, transcribing continuously; + there is no "cut first, then decode" step. "Which characters are final vs. still changing" is + expressed by `unfixed_chunk_num` / `unfixed_token_num` (the last N chunks/tokens are + "unfixed" and may be corrected by later audio; the rest are treated as confirmed) — this acts + as a built-in partial/locking mechanism that replaces type-A VAD's segmentation role. That is + why you can't find `vad` in the open-source streaming API: it simply **doesn't do + segmentation at this layer**. + +- **Still needs type-B (endpoint) VAD — the open-source API just doesn't ship one**: + automatically deciding "user paused = this turn ended" is not handled by + `streaming_transcribe` itself. The commercial service wraps a `server_vad` layer **outside** + the ASR (the `turn_detection` block above) to do it. 
This service currently uses an explicit + **`STOP`** message from the client to substitute for that endpoint decision (in the bench, + STOP is sent once the audio finishes). **To auto-segment/auto-endpoint in a real scenario, + you need to add a VAD / endpoint detector outside this service** (the same role as commercial + `server_vad`), rather than looking for one inside Qwen3-ASR — its open-source streaming API + does not include this layer. + +**In one line**: Qwen3-ASR's incremental streaming **removes the "segmentation VAD" (A)**, but +the **"endpoint / turn VAD" (B) responsibility still exists** — the commercial version +implements it via `server_vad`, while this service substitutes a manual `STOP`. The two are not +contradictory. + +### 1.1 Official corroboration: the commercial Qwen-ASR-Realtime "VAD mode / Manual mode" + +Aliyun Bailian's real-time ASR (Qwen-ASR-Realtime) docs explicitly split "who does the +segmentation" into two modes — essentially whether `session.turn_detection` is on or off: + +- **VAD mode (default, `turn_detection` set to server_vad)**: the server automatically detects + speech start/end to segment; the client just keeps streaming audio, and the server returns the + final result automatically when it "detects a sentence has ended". In the flow the server emits + `input_audio_buffer.speech_started` / `speech_stopped` events — this is exactly the **type-B + endpoint VAD** above, implemented by that server-side (server_vad) layer, **not** the ASR core + doing segmentation. + +- **Manual mode (`turn_detection` set to null)**: the **client** controls segmentation — after + sending a full sentence of audio, the client sends `input_audio_buffer.commit` to tell the + server the boundary. Suitable for scenarios where the client can clearly determine sentence + boundaries (e.g. "push-to-talk", sending a voice message in a chat app). 
+ +> Mapping: this service `serve_qwen3_asr_ws.py` uses an explicit client **`STOP`** to mark the +> end of a turn, which is equivalent to the commercial **Manual mode** (`turn_detection=null`, +> client-controlled boundaries). To make it "server auto-segmentation", you would implement that +> endpoint-detection layer of the commercial **VAD mode** (server_vad) — added outside this +> service's incremental transcription, not found inside the Qwen3-ASR transcription core. +> +> Docs: Real-time Speech Recognition (Qwen-ASR-Realtime) interaction flow +> (help.aliyun.com/zh/model-studio/qwen-asr-realtime-interaction-process) + +### 1.2 `chunk-size-sec` controls the streaming chunk size + +`chunk-size-sec` controls the streaming chunk size (default 2.0). Smaller = faster/more frequent +output, but higher concurrency cost (measured on L20, 29s audio, 48-way concurrency: 1.0 → +all failed, 2.0 → all passed). + +--- + +## 2. Use vllm 0.14, not 0.19 (rope_scaling / thinker_config warnings) + +This service needs vllm acceleration via `qwen-asr[vllm]`, which +[pins `vllm==0.14.0`](https://github.com/QwenLM/Qwen3-ASR/blob/main/pyproject.toml). Switching to +a newer vllm (e.g. 0.19.x) will produce at startup: + +``` +Unrecognized keys in `rope_scaling` for 'rope_type'='default': + {'mrope_section', 'mrope_interleaved', 'interleaved'} +thinker_config is None. Initializing thinker model with default values +``` + +**Root cause**: between vllm 0.14 → 0.19 (transformers is 4.57.6 in both, so it's ruled out), +the config parsing in `patch_rope_scaling_dict` rewrites `rope_type` from `'mrope'` to +`'default'` (treating mrope as legacy and assuming vllm internally consumes the `mrope_section` +etc. 
fields): + +```python +elif rope_scaling["rope_type"] == "mrope": + assert "mrope_section" in rope_scaling + rope_scaling["rope_type"] = "default" # ← rewrite +``` + +But Qwen3-ASR's own `Qwen3ASRThinkerTextRotaryEmbedding` expects to read `"mrope"` from +`rope_scaling` to take the multimodal RoPE branch: + +```python +self.rope_type = config.rope_scaling.get("rope_type", "default") +``` + +After vllm rewrites it to `"default"`, it takes the plain RoPE branch, and the keys +`mrope_section` / `mrope_interleaved` / `interleaved` are left unclaimed → the "Unrecognized +keys" warning, with degraded multimodal (audio/text) positional encoding. The +`thinker_config is None` line has the same origin: 0.19's load path doesn't correctly parse +Qwen3-ASR's thinker sub-config and falls back to default parameters. + +**Impact & decision**: on 0.19 the service "starts and does emit text" (both lines are +WARNING/INFO, not ERROR), and spot-checked transcripts "look fine"; but the degraded positional +encoding may be harmful for long/complex audio, and **no CER quantitative comparison was done**, +so equivalence cannot be established. To be safe, stick with the `vllm==0.14.0` that +`qwen-asr[vllm]` ships. + +### 2.1 vllm acceleration requires Qwen3ASRModel + +funasr's `AutoModelVLLM` cannot accelerate Qwen3-ASR; you must use +`from qwen_asr import Qwen3ASRModel`. + +> This answers the question in `#3026`. + +--- + +## 3. tokenizer `fix_mistral_regex` warning (harmless, ignorable) + +At startup you may see: + +``` +The tokenizer you are loading from '.../Qwen3-ASR-1.7B' with an incorrect regex +pattern ... This will lead to incorrect tokenization. You should set the +`fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +``` + +**Cause**: Qwen3-ASR's tokenizer reuses a tokenizer implementation with a known regex issue; the +underlying library detects that regex pattern and warns, suggesting `fix_mistral_regex=True` to +fix the splitting. 
This service loads the model via qwen-asr's high-level API and does not +construct the tokenizer directly, so it does not expose this flag — hence the warning prints +as-is. + +**Impact**: measured to have no visible effect on Chinese ASR transcripts (**spot-checked** +multiple transcripts as normal; no CER quantification). That regex fix mainly affects the +boundary splitting of certain special tokens, and no difference was observed on the speech +transcription path. It's "notice-level" noise and can be ignored. To eliminate it entirely you'd +pass `fix_mistral_regex=True` when loading the tokenizer at a lower level, but the qwen-asr +high-level API doesn't directly support that, and there's no measured need. + +--- + +### Aside: two other startup log lines (both harmless) + +- `Error retrieving safetensors: Repo id must be in the form ...`: it treats the local model + path as an HF repo id to fetch online metadata, fails, retries twice, and falls back to local + loading — no functional impact. Can be silenced with the env var `HF_HUB_OFFLINE=1`. + +- `Downcasting torch.float32 to torch.bfloat16`: weights are stored as fp32 and loaded as bf16, + which normally saves VRAM / speeds things up; bf16 keeps fp32's exponent width (same dynamic range) and + only shortens the mantissa, so the accuracy impact on inference is typically small. This is INFO, not an error. 
diff --git a/tests/test_qwen3_asr_ws_example.py b/tests/test_qwen3_asr_ws_example.py
new file mode 100644
index 000000000..9b0bd9d22
--- /dev/null
+++ b/tests/test_qwen3_asr_ws_example.py
@@ -0,0 +1,75 @@
+import ast
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+EXAMPLE_DIR = ROOT / "examples" / "industrial_data_pretraining" / "qwen3_asr"
+SERVER_PATH = EXAMPLE_DIR / "serve_qwen3_asr_ws.py"
+NOTE_PATHS = [
+    EXAMPLE_DIR / "serve_qwen3_asr_ws_notes.md",
+    EXAMPLE_DIR / "serve_qwen3_asr_ws_notes_en.md",
+]
+
+
+def _is_logging_exception_call(node):
+    """Return True when *node* is a ``logging.exception(...)`` call."""
+    if not isinstance(node, ast.Call):
+        return False
+    func = node.func
+    return (
+        isinstance(func, ast.Attribute)
+        and isinstance(func.value, ast.Name)
+        and func.value.id == "logging"
+        and func.attr == "exception"
+    )
+
+
+class Qwen3AsrWebsocketExampleTest(unittest.TestCase):
+    """Static checks on the Qwen3-ASR WebSocket example and its notes."""
+
+    def test_notes_do_not_contain_review_placeholders(self):
+        """Neither notes file may still contain a leftover placeholder phrase."""
+        forbidden_phrases = ["以下是我的个人理解", "The following is my personal understanding"]
+        for note_path in NOTE_PATHS:
+            text = note_path.read_text(encoding="utf-8")
+            for phrase in forbidden_phrases:
+                with self.subTest(path=note_path.name, phrase=phrase):
+                    self.assertNotIn(phrase, text)
+
+    def test_handler_logs_unexpected_exceptions(self):
+        """handle_client must catch Exception and log it in that same handler."""
+        tree = ast.parse(SERVER_PATH.read_text(encoding="utf-8"), filename=str(SERVER_PATH))
+        handle_client = next(
+            (
+                node
+                for node in tree.body
+                if isinstance(node, ast.AsyncFunctionDef) and node.name == "handle_client"
+            ),
+            None,
+        )
+        # Fail with a readable message instead of an unhandled StopIteration.
+        self.assertIsNotNone(handle_client, "handle_client coroutine not found")
+
+        generic_handlers = [
+            node
+            for node in ast.walk(handle_client)
+            if isinstance(node, ast.ExceptHandler)
+            and isinstance(node.type, ast.Name)
+            and node.type.id == "Exception"
+        ]
+        self.assertTrue(generic_handlers, "no `except Exception` handler found")
+        # The log call has to live inside the generic handler itself; a
+        # logging.exception elsewhere in the function does not count.
+        self.assertTrue(
+            any(
+                _is_logging_exception_call(child)
+                for handler in generic_handlers
+                for child in ast.walk(handler)
+            ),
+            "`except Exception` handler does not call logging.exception",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()