From ab1a2105f675ff532443606f6f0bec5ff52b73de Mon Sep 17 00:00:00 2001
From: kenchikuliu <kenchikuliu@users.noreply.github.com>
Date: Wed, 17 Jun 2026 20:40:02 +0800
Subject: [PATCH] Add local-tts skill with checked delivery workflow

---
 README.md                             |   3 +-
 docs/README_EN.md                     |   3 +-
 install.ps1                           |   2 +-
 install.sh                            |   2 +-
 local-tts/SKILL.md                    | 161 ++++++++++++++++++++++++
 local-tts/agents/openai.yaml          |   4 +
 local-tts/references/local-results.md |  27 ++++
 local-tts/scripts/tts_benchmark.py    | 132 ++++++++++++++++++++
 local-tts/scripts/tts_generate.py     | 172 ++++++++++++++++++++++++++
 9 files changed, 502 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 install.sh
 create mode 100644 local-tts/SKILL.md
 create mode 100644 local-tts/agents/openai.yaml
 create mode 100644 local-tts/references/local-results.md
 create mode 100755 local-tts/scripts/tts_benchmark.py
 create mode 100755 local-tts/scripts/tts_generate.py
diff --git a/README.md b/README.md
index 91baad2..fe9a808 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@
 |-------|----------|----------|
 | [collaborating-with-codex](https://github.com/GuDaStudio/collaborating-with-codex) | 将编码任务委托给 Codex CLI，用于原型开发、调试和代码审查 | OpenAI Codex |
 | [collaborating-with-gemini](https://github.com/GuDaStudio/collaborating-with-gemini) | 将编码任务委托给 Gemini CLI，用于原型开发、调试和代码审查 | Google Gemini |
+| [local-tts](./local-tts) | 本地生成并检查 OmniVoice/VoxCPM 语音，交付前验证响度、静音和 ASR 可识别性 | Local GPU / TTS |
 
 </details>
 
@@ -88,7 +89,7 @@ cd skills
 ./install.sh --user --skill collaborating-with-codex
 
 # 安装多个指定 Skill
-./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini
+./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini -s local-tts
 ```
 
 **方式三：自定义安装路径**
diff --git a/docs/README_EN.md b/docs/README_EN.md
index 9f49239..9ca8c57 100644
--- a/docs/README_EN.md
+++ b/docs/README_EN.md
@@ -24,6 +24,7 @@ Star us on GitHub — your support means a lot! 🙏😊
 |-------|-------------|---------------------|
 | [collaborating-with-codex](../collaborating-with-codex) | Delegates coding tasks to Codex CLI for prototyping, debugging, and code review | OpenAI Codex |
 | [collaborating-with-gemini](../collaborating-with-gemini) | Delegates coding tasks to Gemini CLI for prototyping, debugging, and code review | Google Gemini |
+| [local-tts](../local-tts) | Generates and verifies local OmniVoice/VoxCPM speech, including loudness, silence, and ASR checks before delivery | Local GPU / TTS |
 
 ---
 
@@ -75,7 +76,7 @@ The install script provides flexible options for scope and target location.
 ./install.sh --user --skill collaborating-with-codex
 
 # Install multiple specific Skills
-./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini
+./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini -s local-tts
 ```
 
 **Option 3: Custom Installation Path**
diff --git a/install.ps1 b/install.ps1
index 4b3b90b..a1e9fd9 100644
--- a/install.ps1
+++ b/install.ps1
@@ -13,7 +13,7 @@ param(
 
 $ErrorActionPreference = "Stop"
 $ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
-$AvailableSkills = @("collaborating-with-codex", "collaborating-with-gemini")
+$AvailableSkills = @("collaborating-with-codex", "collaborating-with-gemini", "local-tts")
 
 function Write-ColorOutput {
     param([string]$Text, [string]$Color = "White")
diff --git a/install.sh b/install.sh
old mode 100644
new mode 100755
index 9d3b7eb..a1858ab
--- a/install.sh
+++ b/install.sh
@@ -12,7 +12,7 @@ BLUE='\033[0;34m'
 NC='\033[0m'
 
 # Available skills
-AVAILABLE_SKILLS=("collaborating-with-codex" "collaborating-with-gemini")
+AVAILABLE_SKILLS=("collaborating-with-codex" "collaborating-with-gemini" "local-tts")
 
 # Script directory
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
diff --git a/local-tts/SKILL.md b/local-tts/SKILL.md
new file mode 100644
index 0000000..555bb9f
--- /dev/null
+++ b/local-tts/SKILL.md
@@ -0,0 +1,161 @@
+---
+name: local-tts
+description: "Generate, verify, and benchmark the local OmniVoice-Studio and VoxCPM text-to-speech setups on this machine. Use when the user asks whether OmniVoice-Studio or voxcpm can run locally, wants sample speech generation, wants movie/video narration voiceover, wants final-effect/performance comparisons, or wants to integrate these local TTS engines into a skill or workflow."
+---
+
+# Local TTS
+
+## Quick Start
+
+Use the generation runner for normal TTS output:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine auto --device auto --text "你好，这是本地语音合成测试。" --output /tmp/local_tts.wav --report-json /tmp/local_tts.json
+```
+
+Use the benchmark runner when the user wants performance numbers:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device both --out-dir /tmp/tts_bench_run
+```
+
+Default assumptions:
+- OmniVoice-Studio repo: `/home/slam/OmniVoice-Studio`
+- Python env: `/home/slam/OmniVoice-Studio/.venv`
+- OmniVoice CLI: `/home/slam/OmniVoice-Studio/.venv/bin/omnivoice-infer`
+- VoxCPM package installed inside the same venv
+
+If `CUDA_VISIBLE_DEVICES` is set incorrectly, unset it for GPU tests. For CPU tests, the runners force `CUDA_VISIBLE_DEVICES=""`.
+
+## Workflow
+
+1. Check environment:
+   - Run `env -u CUDA_VISIBLE_DEVICES /home/slam/OmniVoice-Studio/.venv/bin/python -c "import torch, voxcpm; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"`.
+   - Run `nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader` before GPU tests.
+2. Use `tts_generate.py` for a normal short output file. Request `--report-json` when another agent or script needs structured output.
+3. For user-facing narration longer than a few sentences, use the checked-delivery workflow below instead of one long raw generation.
+4. Use `tts_benchmark.py` for wall-clock time, audio duration, RTF, sample rate, output path, and failures.
+5. Separate cold-start/compile overhead from steady-state speed when interpreting VoxCPM GPU results.
+6. Do not kill GPU processes unless the user explicitly authorizes it.
+
+## Checked Delivery Workflow
+
+Use this workflow before giving the user a local TTS result or uploading it to Baidu Netdisk.
+
+1. Split long narration into short sentence groups and generate each chunk separately.
+   - Keep each chunk to roughly 1-2 sentences.
+   - Prefer OmniVoice with `--postprocess_output True` for these chunks if calling `omnivoice-infer` directly.
+   - Use stable narration settings such as `--language Chinese --speed 0.95 --num_step 8 --instruct "男，青年，低音调"` unless the user asks for a different voice.
+2. Concatenate chunks with `ffmpeg` or another audio tool; insert only short, intentional pauses.
+3. Convert the deliverable to MP3 for compatibility:
+
+```bash
+ffmpeg -y -hide_banner -i input.wav \
+  -af 'highpass=f=80,lowpass=f=8000,loudnorm=I=-16:TP=-1.5:LRA=11' \
+  -ar 48000 -ac 1 -codec:a libmp3lame -b:a 192k output.mp3
+```
+
+4. Verify technical decode and loudness:
+
+```bash
+ffprobe -v error -show_entries format=duration,size,bit_rate:stream=codec_name,sample_rate,channels -of json output.mp3
+ffmpeg -hide_banner -nostats -i output.mp3 -af silencedetect=noise=-35dB:d=0.8,volumedetect -f null -
+```
+
+5. Verify audible speech with ASR before delivery. Use faster-whisper from the OmniVoice venv; on this dual-GPU machine, `CUDA_VISIBLE_DEVICES=1` selects the RTX 4090:
+
+```bash
+CUDA_VISIBLE_DEVICES=1 /home/slam/OmniVoice-Studio/.venv/bin/python - <<'PY'
+from faster_whisper import WhisperModel
+path = "output.mp3"
+model = WhisperModel("small", device="cuda", compute_type="float16")
+segments, info = model.transcribe(path, language="zh", beam_size=5, vad_filter=False)
+texts = []
+last_end = 0.0
+for seg in segments:
+    texts.append(seg.text.strip())
+    last_end = max(last_end, float(seg.end))
+    print(f"[{seg.start:.2f}-{seg.end:.2f}] {seg.text.strip()}")
+print({"language": info.language, "duration": info.duration, "last_end": last_end, "text": "".join(texts)})
+PY
+```
+
+Pass criteria:
+- `ffprobe` duration and file size are plausible for the requested output.
+- `volumedetect` mean volume is not near silence, and `silencedetect` does not show large unintended blank spans.
+- ASR returns recognizable speech across the whole file, especially near the end.
+- Do not upload or share the file until these checks pass.
+- Prefer delivering the checked MP3. Keep the WAV as a working artifact unless the user explicitly asks for WAV.
+
+Known good pattern from the 4090 test: chunked OmniVoice generation, postprocessing enabled, final MP3 at 48 kHz mono 192 kbps, then ASR verification. A 64.8 second checked sample transcribed continuously from start to finish and avoided the earlier mostly blank long-WAV failure.
+
+## Generation
+
+Auto-select engine and device:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine auto --device auto --text "Hello." --output /tmp/local_tts.wav --report-json /tmp/local_tts.json
+```
+
+Force OmniVoice:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine omnivoice --device cuda --text "Hello." --output /tmp/local_tts.wav
+```
+
+Force VoxCPM:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine voxcpm --device cuda --text "Hello." --output /tmp/local_tts.wav
+```
+
+Movie narration preset:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py \
+  --engine omnivoice \
+  --device cuda \
+  --language Chinese \
+  --speed 0.95 \
+  --num-step 8 \
+  --instruct "男，青年，低音调" \
+  --text "在这个看似平静的小镇里，一场被隐藏多年的秘密，正在悄悄浮出水面。" \
+  --output /tmp/movie_narration.wav \
+  --report-json /tmp/movie_narration.json
+```
+
+Use `--duration` when a narration segment must match a fixed video slot. For suspense narration, keep `--speed` between `0.9` and `1.0`, and prefer `--num-step 8`, which gives better quality than the fastest benchmark setting (`--num-step 4`).
+
+OmniVoice `--instruct` accepts fixed labels, not arbitrary prose. Chinese labels must be separated by full-width commas (`，`), for example `男，青年，低音调`, `女，中年，中音调`, or `男，中年，低音调`.
+
+## Benchmark
+
+CPU only:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device cpu --out-dir /tmp/tts_bench_cpu
+```
+
+GPU only:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device cuda --out-dir /tmp/tts_bench_gpu
+```
+
+Custom text:
+
+```bash
+python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device both --text "你好，这是本地语音合成测试。" --out-dir /tmp/tts_bench_zh
+```
+
+## Interpretation
+
+Use `elapsed_sec / audio_duration_sec` as RTF. Lower is faster. RTF is only a rough comparison when engines produce different output durations or sample rates.
+
+OmniVoice-Studio is the app/integration layer. Prefer it when the user needs a service, UI, API, or multi-engine workflow.
+
+VoxCPM is a direct model library. Prefer it when the user needs direct programmatic TTS calls and accepts model-level setup/latency tradeoffs.
+
+`tts_generate.py --engine auto` currently chooses OmniVoice as the stable default integration path. Force `--engine voxcpm` when direct VoxCPM behavior is specifically needed.
+
+Read `references/local-results.md` when the user asks what has already been tested on this machine.
diff --git a/local-tts/agents/openai.yaml b/local-tts/agents/openai.yaml
new file mode 100644
index 0000000..cfdd931
--- /dev/null
+++ b/local-tts/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Local TTS"
+  short_description: "Generate and verify local TTS audio"
+  default_prompt: "Use $local-tts to generate checked local narration, convert it to a compatible MP3, and verify speech quality before delivery."
diff --git a/local-tts/references/local-results.md b/local-tts/references/local-results.md
new file mode 100644
index 0000000..d62445b
--- /dev/null
+++ b/local-tts/references/local-results.md
@@ -0,0 +1,27 @@
+# Local Results
+
+Machine state observed on 2026-06-16:
+
+- GPU: NVIDIA GeForce RTX 4060 Ti
+- Torch in `/home/slam/OmniVoice-Studio/.venv`: `2.8.0+cu128`
+- CUDA runtime reported by torch: `12.8`
+- `env -u CUDA_VISIBLE_DEVICES` detects one CUDA GPU.
+- VoxCPM package is installed in `/home/slam/OmniVoice-Studio/.venv`.
+- OmniVoice-Studio diagnostics and backend health check passed previously.
+
+Benchmark text:
+
+```text
+Hello from the local text to speech benchmark. This is a short performance test.
+```
+
+Observed benchmark:
+
+| Engine | Device | Elapsed | Audio | RTF | Sample rate |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| OmniVoice | CPU | 59.9s | 5.04s | 11.88 | 24000 |
+| VoxCPM | CPU | 70.8s | 10.16s | 6.97 | 16000 |
+| OmniVoice | CUDA | 10.9s | 5.04s | 2.17 | 24000 |
+| VoxCPM | CUDA | 55.9s | 10.16s | 5.50 | 16000 |
+
+Interpret with care: outputs differ in duration and sample rate, and VoxCPM GPU includes heavy first-run torch compile/warmup overhead.
diff --git a/local-tts/scripts/tts_benchmark.py b/local-tts/scripts/tts_benchmark.py
new file mode 100755
index 0000000..55341ff
--- /dev/null
+++ b/local-tts/scripts/tts_benchmark.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+import wave
+from pathlib import Path
+
+
+DEFAULT_TEXT = "Hello from the local text to speech benchmark. This is a short performance test."  # used when --text is omitted
+DEFAULT_ROOT = Path("/home/slam/OmniVoice-Studio")  # default value of --root
+
+
+def wav_info(path: Path) -> dict:
+    """Return duration in seconds, sample rate, channel count and byte size of a WAV file."""
+    with wave.open(str(path), "rb") as reader:
+        n_frames, hz = reader.getnframes(), reader.getframerate()
+        n_channels = reader.getnchannels()
+    return {
+        "audio_duration_sec": n_frames / float(hz) if hz else 0.0,
+        "sample_rate": hz,
+        "channels": n_channels,
+        "bytes": path.stat().st_size,
+    }
+
+
+def run_cmd(name: str, cmd: list[str], output: Path, env: dict[str, str], cwd: Path) -> dict:
+    """Run one TTS command, time it, and describe the WAV it wrote, if any.
+
+    ``ok`` requires exit code 0 *and* a freshly written ``output`` file.
+    """
+    if output.exists():
+        output.unlink()  # never let a stale file from an earlier run pass for this run's audio
+    start = time.perf_counter()
+    proc = subprocess.run(cmd, cwd=str(cwd), env=env, text=True, capture_output=True)
+    elapsed = time.perf_counter() - start
+    produced = output.exists()
+    result = {
+        "name": name, "ok": proc.returncode == 0 and produced,
+        "returncode": proc.returncode, "elapsed_sec": elapsed, "output": str(output),
+        "stdout_tail": proc.stdout[-2000:], "stderr_tail": proc.stderr[-4000:],
+    }
+    if produced:
+        info = wav_info(output)
+        result.update(info)
+        result["rtf"] = elapsed / info["audio_duration_sec"] if info["audio_duration_sec"] else None
+    return result
+
+
+def device_env(device: str) -> dict[str, str]:
+    """Copy the process environment, hiding every CUDA device when ``device`` is "cpu"."""
+    hide_gpus = {"CUDA_VISIBLE_DEVICES": ""} if device == "cpu" else {}
+    merged = {**os.environ, **hide_gpus}
+    return merged
+
+
+def build_voxcpm_script(path: Path, text: str, output: Path) -> None:
+    """Write a one-shot VoxCPM script; UTF-8 is forced because Python reads source as UTF-8."""
+    path.write_text(
+        "from voxcpm import VoxCPM\nimport soundfile as sf\n"
+        "model = VoxCPM.from_pretrained('openbmb/VoxCPM-0.5B', load_denoiser=False)\n"
+        f"wav = model.generate(text={text!r}, normalize=True, denoise=False, inference_timesteps=1, max_length=128, retry_badcase=False)\n"
+        f"sf.write({str(output)!r}, wav, 16000)\n",
+        encoding="utf-8")
+
+
+def benchmark_device(root: Path, out_dir: Path, text: str, device: str) -> list[dict]:
+    """Benchmark OmniVoice, then VoxCPM, on ``device`` and return both result dicts.
+
+    Both engines run from ``root`` using its ``.venv`` executables and write
+    ``omnivoice_<device>.wav`` / ``voxcpm_<device>.wav`` into ``out_dir``.
+    """
+    venv_bin = root / ".venv" / "bin"
+    run_env = device_env(device)
+    omni_wav = out_dir / f"omnivoice_{device}.wav"
+    vox_wav = out_dir / f"voxcpm_{device}.wav"
+    vox_runner = out_dir / f"run_voxcpm_{device}_once.py"
+    build_voxcpm_script(vox_runner, text, vox_wav)
+
+    target = "cpu" if device == "cpu" else "cuda"
+    # Flag/value pairs for omnivoice-infer, written together so they cannot drift apart.
+    omni_cmd = [
+        str(venv_bin / "omnivoice-infer"),
+        *("--text", text),
+        *("--output", str(omni_wav)),
+        *("--device", target),
+        *("--num_step", "4"),
+        *("--denoise", "False"),
+        *("--postprocess_output", "False"),
+    ]
+
+    omni_result = run_cmd(f"omnivoice_{device}", omni_cmd, omni_wav, run_env, root)
+    vox_cmd = [str(venv_bin / "python"), str(vox_runner)]
+    vox_result = run_cmd(f"voxcpm_{device}", vox_cmd, vox_wav, run_env, root)
+    return [omni_result, vox_result]
+
+
+def main() -> int:
+    """Run the benchmark CLI and write ``report.json`` into the output directory.
+
+    Returns 0 when every run reports ``ok``, otherwise 1.
+    """
+    parser = argparse.ArgumentParser(description="Benchmark local OmniVoice-Studio and VoxCPM TTS.")
+    parser.add_argument("--root", default=str(DEFAULT_ROOT), help="OmniVoice-Studio repo path.")
+    parser.add_argument("--out-dir", default="/tmp/tts_bench_run", help="Output directory for wav files and report.json.")
+    parser.add_argument("--text", default=DEFAULT_TEXT, help="Text to synthesize.")
+    parser.add_argument("--device", choices=["cpu", "cuda", "both"], default="cpu", help="Device mode to test.")
+    args = parser.parse_args()
+
+    root = Path(args.root).expanduser().resolve()
+    out_dir = Path(args.out_dir).expanduser().resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    devices = ["cpu", "cuda"] if args.device == "both" else [args.device]
+    results: list[dict] = []
+    for device in devices:
+        results.extend(benchmark_device(root, out_dir, args.text, device))
+
+    report = {"root": str(root), "text": args.text, "device": args.device, "results": results}
+    # ensure_ascii=False keeps CJK text readable, so write UTF-8 rather than the locale encoding.
+    payload = json.dumps(report, indent=2, ensure_ascii=False)
+    (out_dir / "report.json").write_text(payload, encoding="utf-8")
+    print(payload)
+    return 0 if all(r["ok"] for r in results) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s int return value becomes the process exit status
diff --git a/local-tts/scripts/tts_generate.py b/local-tts/scripts/tts_generate.py
new file mode 100755
index 0000000..1a0ebca
--- /dev/null
+++ b/local-tts/scripts/tts_generate.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import time
+import wave
+from pathlib import Path
+
+
+ROOT = Path("/home/slam/OmniVoice-Studio")  # OmniVoice-Studio checkout; also the cwd for spawned engines
+PY = ROOT / ".venv/bin/python"  # venv interpreter used for the torch probe and the VoxCPM helper script
+OMNI = ROOT / ".venv/bin/omnivoice-infer"  # OmniVoice command-line entry point
+DEFAULT_TEXT = "Hello from local TTS."  # synthesized when --text is omitted
+
+
+def run(cmd: list[str], env: dict[str, str], cwd: Path) -> tuple[int, float]:
+    """Run ``cmd`` without capturing its output and return ``(exit code, elapsed seconds)``."""
+    began = time.perf_counter()
+    return subprocess.run(cmd, cwd=str(cwd), env=env).returncode, time.perf_counter() - began
+
+
+def wav_info(path: Path) -> dict:
+    """Describe a WAV file, or return ``{}`` when it is missing or unreadable.
+
+    A failed run can leave a truncated file; returning ``{}`` lets the caller still report.
+    """
+    try:
+        with wave.open(str(path), "rb") as f:
+            frames, rate, channels = f.getnframes(), f.getframerate(), f.getnchannels()
+        size = path.stat().st_size
+    except (OSError, EOFError, wave.Error):
+        return {}
+    duration = frames / float(rate) if rate else 0.0
+    return {"audio_duration_sec": duration, "sample_rate": rate, "channels": channels, "bytes": size}
+
+
+def detect_cuda(env: dict[str, str]) -> bool:
+    """Report whether torch inside the OmniVoice venv can see a CUDA device under ``env``."""
+    # An empty CUDA_VISIBLE_DEVICES hides every GPU, so skip spawning the probe.
+    if env.get("CUDA_VISIBLE_DEVICES") == "":
+        return False
+    probe_code = "import torch; print(int(torch.cuda.is_available()))"
+    probe = subprocess.run([str(PY), "-c", probe_code], cwd=str(ROOT), env=env, text=True, capture_output=True)
+    if probe.returncode != 0:
+        return False
+    answer = probe.stdout.strip()
+    return answer == "1"
+
+
+def has_cjk(text: str) -> bool:  # True when any character lies in U+4E00..U+9FFF
+    return any(0x4E00 <= ord(ch) <= 0x9FFF for ch in text)
+
+
+def pick_device(device: str, env: dict[str, str]) -> str:
+    """Return an explicit ``device`` unchanged; resolve "auto" to "cuda" or "cpu" by probing."""
+    wants_probe = device == "auto"
+    return ("cuda" if detect_cuda(env) else "cpu") if wants_probe else device
+
+
+def pick_engine(engine: str, text: str) -> str:
+    """Return an explicit ``engine`` unchanged; "auto" always resolves to "omnivoice"."""
+    # ``text`` stays in the signature for callers: CJK and non-CJK input both map to OmniVoice.
+    return "omnivoice" if engine == "auto" else engine
+
+
+def write_report(path: str | None, result: dict) -> None:
+    """Write ``result`` as UTF-8 JSON to ``path`` (parents created); do nothing when ``path`` is empty."""
+    if path:
+        target = Path(path).expanduser().resolve()
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def str_or_none(value: str | None) -> str:  # map None to the literal "None", pass strings through
+    return value if value is not None else "None"
+
+
+def main() -> int:
+    """Generate one WAV with OmniVoice or VoxCPM, print a JSON summary, and return the engine's exit code.
+
+    The same summary is written to ``--report-json`` when that option is given.
+    """
+    parser = argparse.ArgumentParser(description="Generate one local TTS WAV with OmniVoice or VoxCPM.")
+    parser.add_argument("--text", default=DEFAULT_TEXT)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--engine", choices=["auto", "omnivoice", "voxcpm"], default="auto")
+    parser.add_argument("--device", choices=["auto", "cpu", "cuda"], default="auto")
+    parser.add_argument("--num-step", type=int, default=4)
+    parser.add_argument("--voxcpm-inference-timesteps", type=int, default=1)
+    parser.add_argument("--max-length", type=int, default=128)
+    parser.add_argument("--omni-model", default="k2-fsa/OmniVoice")
+    parser.add_argument("--language", default=None)
+    parser.add_argument("--instruct", default=None)
+    parser.add_argument("--speed", type=float, default=None)
+    parser.add_argument("--duration", type=float, default=None)
+    parser.add_argument("--report-json", default=None)
+    args = parser.parse_args()
+
+    output = Path(args.output).expanduser().resolve()
+    output.parent.mkdir(parents=True, exist_ok=True)
+    # Drop any previous file so a failed run cannot report the metrics of stale audio.
+    if output.exists():
+        output.unlink()
+
+    base_env = os.environ.copy()
+    device = pick_device(args.device, base_env)
+    env = base_env.copy()
+    # Hide every GPU from the child process when the run is meant to be CPU-only.
+    if device == "cpu":
+        env["CUDA_VISIBLE_DEVICES"] = ""
+    engine = pick_engine(args.engine, args.text)
+
+    result = {
+        "engine": engine,
+        "device": device,
+        "text": args.text,
+        "output": str(output),
+    }
+
+    if engine == "omnivoice":
+        # Flag/value pairs for omnivoice-infer, written together so they cannot drift apart.
+        cmd = [
+            str(OMNI),
+            *("--model", args.omni_model),
+            *("--text", args.text),
+            *("--output", str(output)),
+            *("--device", device),
+            *("--num_step", str(args.num_step)),
+            *("--denoise", "False"),
+            *("--postprocess_output", "False"),
+        ]
+        if args.language:
+            cmd += ["--language", args.language]
+        if args.instruct:
+            cmd += ["--instruct", args.instruct]
+        if args.speed is not None:
+            cmd += ["--speed", str(args.speed)]
+        if args.duration is not None:
+            cmd += ["--duration", str(args.duration)]
+        returncode, elapsed = run(cmd, env, ROOT)
+    else:
+        script = output.with_suffix(".voxcpm.py")
+        # UTF-8 explicitly: Python reads source files as UTF-8 whatever the locale encoding is.
+        script.write_text(
+            "from voxcpm import VoxCPM\n"
+            "import soundfile as sf\n"
+            "model = VoxCPM.from_pretrained('openbmb/VoxCPM-0.5B', load_denoiser=False)\n"
+            f"wav = model.generate(text={args.text!r}, normalize=True, denoise=False, inference_timesteps={args.voxcpm_inference_timesteps}, max_length={args.max_length}, retry_badcase=False)\n"
+            f"sf.write({str(output)!r}, wav, 16000)\n",
+            encoding="utf-8",
+        )
+        returncode, elapsed = run([str(PY), str(script)], env, ROOT)
+
+    result["returncode"] = returncode
+    result["elapsed_sec"] = elapsed
+    result.update(wav_info(output))
+    if result.get("audio_duration_sec"):
+        result["rtf"] = elapsed / result["audio_duration_sec"]
+    # Echo only the voice options the caller actually supplied.
+    for key in ("language", "instruct", "speed", "duration"):
+        if getattr(args, key) is not None:
+            result[key] = getattr(args, key)
+    write_report(args.report_json, result)
+    print(json.dumps(result, indent=2, ensure_ascii=False))
+    return returncode
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s int return value becomes the process exit status