From ab1a2105f675ff532443606f6f0bec5ff52b73de Mon Sep 17 00:00:00 2001 From: kenchikuliu Date: Wed, 17 Jun 2026 20:40:02 +0800 Subject: [PATCH] Add local-tts skill with checked delivery workflow --- README.md | 3 +- docs/README_EN.md | 3 +- install.ps1 | 2 +- install.sh | 2 +- local-tts/SKILL.md | 161 ++++++++++++++++++++++++ local-tts/agents/openai.yaml | 4 + local-tts/references/local-results.md | 27 ++++ local-tts/scripts/tts_benchmark.py | 132 ++++++++++++++++++++ local-tts/scripts/tts_generate.py | 172 ++++++++++++++++++++++++++ 9 files changed, 502 insertions(+), 4 deletions(-) mode change 100644 => 100755 install.sh create mode 100644 local-tts/SKILL.md create mode 100644 local-tts/agents/openai.yaml create mode 100644 local-tts/references/local-results.md create mode 100755 local-tts/scripts/tts_benchmark.py create mode 100755 local-tts/scripts/tts_generate.py diff --git a/README.md b/README.md index 91baad2..fe9a808 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ |-------|----------|----------| | [collaborating-with-codex](https://github.com/GuDaStudio/collaborating-with-codex) | 将编码任务委托给 Codex CLI,用于原型开发、调试和代码审查 | OpenAI Codex | | [collaborating-with-gemini](https://github.com/GuDaStudio/collaborating-with-gemini) | 将编码任务委托给 Gemini CLI,用于原型开发、调试和代码审查 | Google Gemini | +| [local-tts](./local-tts) | 本地生成并检查 OmniVoice/VoxCPM 语音,交付前验证响度、静音和 ASR 可识别性 | Local GPU / TTS | @@ -88,7 +89,7 @@ cd skills ./install.sh --user --skill collaborating-with-codex # 安装多个指定 Skill -./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini +./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini -s local-tts ``` **方式三:自定义安装路径** diff --git a/docs/README_EN.md b/docs/README_EN.md index 9f49239..9ca8c57 100644 --- a/docs/README_EN.md +++ b/docs/README_EN.md @@ -24,6 +24,7 @@ Star us on GitHub — your support means a lot! 
🙏😊 |-------|-------------|---------------------| | [collaborating-with-codex](../collaborating-with-codex) | Delegates coding tasks to Codex CLI for prototyping, debugging, and code review | OpenAI Codex | | [collaborating-with-gemini](../collaborating-with-gemini) | Delegates coding tasks to Gemini CLI for prototyping, debugging, and code review | Google Gemini | +| [local-tts](../local-tts) | Generates and verifies local OmniVoice/VoxCPM speech, including loudness, silence, and ASR checks before delivery | Local GPU / TTS | --- @@ -75,7 +76,7 @@ The install script provides flexible options for scope and target location. ./install.sh --user --skill collaborating-with-codex # Install multiple specific Skills -./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini +./install.sh --user -s collaborating-with-codex -s collaborating-with-gemini -s local-tts ``` **Option 3: Custom Installation Path** diff --git a/install.ps1 b/install.ps1 index 4b3b90b..a1e9fd9 100644 --- a/install.ps1 +++ b/install.ps1 @@ -13,7 +13,7 @@ param( $ErrorActionPreference = "Stop" $ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path -$AvailableSkills = @("collaborating-with-codex", "collaborating-with-gemini") +$AvailableSkills = @("collaborating-with-codex", "collaborating-with-gemini", "local-tts") function Write-ColorOutput { param([string]$Text, [string]$Color = "White") diff --git a/install.sh b/install.sh old mode 100644 new mode 100755 index 9d3b7eb..a1858ab --- a/install.sh +++ b/install.sh @@ -12,7 +12,7 @@ BLUE='\033[0;34m' NC='\033[0m' # Available skills -AVAILABLE_SKILLS=("collaborating-with-codex" "collaborating-with-gemini") +AVAILABLE_SKILLS=("collaborating-with-codex" "collaborating-with-gemini" "local-tts") # Script directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/local-tts/SKILL.md b/local-tts/SKILL.md new file mode 100644 index 0000000..555bb9f --- /dev/null +++ b/local-tts/SKILL.md @@ -0,0 +1,161 @@ +--- +name: 
local-tts +description: "Generate, verify, and benchmark the local OmniVoice-Studio and VoxCPM text-to-speech setups on this machine. Use when the user asks whether OmniVoice-Studio or voxcpm can run locally, wants sample speech generation, wants movie/video narration voiceover, wants final-effect/performance comparisons, or wants to integrate these local TTS engines into a skill or workflow." +--- + +# Local TTS + +## Quick Start + +Use the generation runner for normal TTS output: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine auto --device auto --text "你好,这是本地语音合成测试。" --output /tmp/local_tts.wav --report-json /tmp/local_tts.json +``` + +Use the benchmark runner when the user wants performance numbers: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device both --out-dir /tmp/tts_bench_run +``` + +Default assumptions: +- OmniVoice-Studio repo: `/home/slam/OmniVoice-Studio` +- Python env: `/home/slam/OmniVoice-Studio/.venv` +- OmniVoice CLI: `/home/slam/OmniVoice-Studio/.venv/bin/omnivoice-infer` +- VoxCPM package installed inside the same venv + +If `CUDA_VISIBLE_DEVICES` is set incorrectly, unset it for GPU tests. For CPU tests, the runners force `CUDA_VISIBLE_DEVICES=""`. + +## Workflow + +1. Check environment: + - Run `env -u CUDA_VISIBLE_DEVICES /home/slam/OmniVoice-Studio/.venv/bin/python -c "import torch, voxcpm; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"`. + - Run `nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader` before GPU tests. +2. Use `tts_generate.py` for a normal short output file. Request `--report-json` when another agent or script needs structured output. +3. For user-facing narration longer than a few sentences, use the checked-delivery workflow below instead of one long raw generation. +4. Use `tts_benchmark.py` for wall-clock time, audio duration, RTF, sample rate, output path, and failures. +5. 
Separate cold-start/compile overhead from steady-state speed when interpreting VoxCPM GPU results. +6. Do not kill GPU processes unless the user explicitly authorizes it. + +## Checked Delivery Workflow + +Use this workflow before giving the user a local TTS result or uploading it to Baidu Netdisk. + +1. Split long narration into short sentence groups and generate each chunk separately. + - Keep a chunk to roughly 1-2 sentences. + - Prefer OmniVoice with `--postprocess_output True` for these chunks if calling `omnivoice-infer` directly. + - Use stable narration settings such as `--language Chinese --speed 0.95 --num_step 8 --instruct "男,青年,低音调"` unless the user asks for a different voice. +2. Concatenate chunks with `ffmpeg` or another audio tool; insert only short, intentional pauses. +3. Convert the deliverable to MP3 for compatibility: + +```bash +ffmpeg -y -hide_banner -i input.wav \ + -af 'highpass=f=80,lowpass=f=8000,loudnorm=I=-16:TP=-1.5:LRA=11' \ + -ar 48000 -ac 1 -codec:a libmp3lame -b:a 192k output.mp3 +``` + +4. Verify technical decode and loudness: + +```bash +ffprobe -v error -show_entries format=duration,size,bit_rate:stream=codec_name,sample_rate,channels -of json output.mp3 +ffmpeg -hide_banner -nostats -i output.mp3 -af silencedetect=noise=-35dB:d=0.8,volumedetect -f null - +``` + +5. Verify audible speech with ASR before delivery. 
Use faster-whisper from the OmniVoice venv; on this dual-GPU machine, `CUDA_VISIBLE_DEVICES=1` selects the RTX 4090: + +```bash +CUDA_VISIBLE_DEVICES=1 /home/slam/OmniVoice-Studio/.venv/bin/python - <<'PY' +from faster_whisper import WhisperModel +path = "output.mp3" +model = WhisperModel("small", device="cuda", compute_type="float16") +segments, info = model.transcribe(path, language="zh", beam_size=5, vad_filter=False) +texts = [] +last_end = 0.0 +for seg in segments: + texts.append(seg.text.strip()) + last_end = max(last_end, float(seg.end)) + print(f"[{seg.start:.2f}-{seg.end:.2f}] {seg.text.strip()}") +print({"language": info.language, "duration": info.duration, "last_end": last_end, "text": "".join(texts)}) +PY +``` + +Pass criteria: +- `ffprobe` duration and file size are plausible for the requested output. +- `volumedetect` mean volume is not near silence, and `silencedetect` does not show large unintended blank spans. +- ASR returns recognizable speech across the whole file, especially near the end. +- Do not upload or share the file until these checks pass. +- Prefer delivering the checked MP3. Keep the WAV as a working artifact unless the user explicitly asks for WAV. + +Known good pattern from the 4090 test: chunked OmniVoice generation, postprocessing enabled, final MP3 at 48 kHz mono 192 kbps, then ASR verification. A 64.8 second checked sample transcribed continuously from start to finish and avoided the earlier mostly blank long-WAV failure. + +## Generation + +Auto-select engine and device: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine auto --device auto --text "Hello." --output /tmp/local_tts.wav --report-json /tmp/local_tts.json +``` + +Force OmniVoice: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine omnivoice --device cuda --text "Hello." 
--output /tmp/local_tts.wav +``` + +Force VoxCPM: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py --engine voxcpm --device cuda --text "Hello." --output /tmp/local_tts.wav +``` + +Movie narration preset: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_generate.py \ + --engine omnivoice \ + --device cuda \ + --language Chinese \ + --speed 0.95 \ + --num-step 8 \ + --instruct "男,青年,低音调" \ + --text "在这个看似平静的小镇里,一场被隐藏多年的秘密,正在悄悄浮出水面。" \ + --output /tmp/movie_narration.wav \ + --report-json /tmp/movie_narration.json +``` + +Use `--duration` when a narration segment must match a fixed video slot. Prefer `--speed 0.9` to `1.0` for suspense narration, and `--num-step 8` for better quality than the fastest benchmark setting. + +OmniVoice `--instruct` accepts fixed labels, not arbitrary prose. Chinese labels must use full-width comma, for example `男,青年,低音调`, `女,中年,中音调`, or `男,中年,低音调`. + +## Benchmark + +CPU only: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device cpu --out-dir /tmp/tts_bench_cpu +``` + +GPU only: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device cuda --out-dir /tmp/tts_bench_gpu +``` + +Custom text: + +```bash +python /home/slam/.codex/skills/local-tts/scripts/tts_benchmark.py --device both --text "你好,这是本地语音合成测试。" --out-dir /tmp/tts_bench_zh +``` + +## Interpretation + +Use `elapsed_sec / audio_duration_sec` as RTF. Lower is faster. RTF is only a rough comparison when engines produce different output durations or sample rates. + +OmniVoice-Studio is the app/integration layer. Prefer it when the user needs a service, UI, API, or multi-engine workflow. + +VoxCPM is a direct model library. Prefer it when the user needs direct programmatic TTS calls and accepts model-level setup/latency tradeoffs. + +`tts_generate.py --engine auto` currently chooses OmniVoice as the stable default integration path. 
Force `--engine voxcpm` when direct VoxCPM behavior is specifically needed. + +Read `references/local-results.md` when the user asks what has already been tested on this machine. diff --git a/local-tts/agents/openai.yaml b/local-tts/agents/openai.yaml new file mode 100644 index 0000000..cfdd931 --- /dev/null +++ b/local-tts/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Local TTS" + short_description: "Generate and verify local TTS audio" + default_prompt: "Use $local-tts to generate checked local narration, convert it to a compatible MP3, and verify speech quality before delivery." diff --git a/local-tts/references/local-results.md b/local-tts/references/local-results.md new file mode 100644 index 0000000..d62445b --- /dev/null +++ b/local-tts/references/local-results.md @@ -0,0 +1,27 @@ +# Local Results + +Machine state observed on 2026-06-16: + +- GPU: NVIDIA GeForce RTX 4060 Ti +- Torch in `/home/slam/OmniVoice-Studio/.venv`: `2.8.0+cu128` +- CUDA runtime reported by torch: `12.8` +- `env -u CUDA_VISIBLE_DEVICES` detects one CUDA GPU. +- VoxCPM package is installed in `/home/slam/OmniVoice-Studio/.venv`. +- OmniVoice-Studio diagnostics and backend health check passed previously. + +Benchmark text: + +```text +Hello from the local text to speech benchmark. This is a short performance test. +``` + +Observed benchmark: + +| Engine | Device | Elapsed | Audio | RTF | Sample rate | +| --- | ---: | ---: | ---: | ---: | ---: | +| OmniVoice | CPU | 59.9s | 5.04s | 11.88 | 24000 | +| VoxCPM | CPU | 70.8s | 10.16s | 6.97 | 16000 | +| OmniVoice | CUDA | 10.9s | 5.04s | 2.17 | 24000 | +| VoxCPM | CUDA | 55.9s | 10.16s | 5.50 | 16000 | + +Interpret with care: outputs differ in duration and sample rate, and VoxCPM GPU includes heavy first-run torch compile/warmup overhead. 
diff --git a/local-tts/scripts/tts_benchmark.py b/local-tts/scripts/tts_benchmark.py
new file mode 100755
index 0000000..55341ff
--- /dev/null
+++ b/local-tts/scripts/tts_benchmark.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""Benchmark the local OmniVoice-Studio CLI and VoxCPM on CPU and/or CUDA."""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+import wave
+from pathlib import Path
+
+
+DEFAULT_TEXT = "Hello from the local text to speech benchmark. This is a short performance test."
+DEFAULT_ROOT = Path("/home/slam/OmniVoice-Studio")
+
+
+def wav_info(path: Path) -> dict:
+    """Return duration, sample rate, channel count and file size of a WAV file.
+
+    Raises ``wave.Error`` or ``EOFError`` when the file is not a readable WAV.
+    """
+    with wave.open(str(path), "rb") as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        # Guard against a zero sample rate in a malformed header.
+        duration = frames / float(rate) if rate else 0.0
+        return {
+            "audio_duration_sec": duration,
+            "sample_rate": rate,
+            "channels": f.getnchannels(),
+            "bytes": path.stat().st_size,
+        }
+
+
+def run_cmd(name: str, cmd: list[str], output: Path, env: dict[str, str], cwd: Path) -> dict:
+    """Run one TTS command and return its timing, exit status and audio stats.
+
+    Any pre-existing ``output`` is deleted first so the stats always describe
+    the file produced by this run. ``ok`` is true only when the command exits
+    with status 0 and the output file exists.
+    """
+    if output.exists():
+        output.unlink()
+    start = time.perf_counter()
+    proc = subprocess.run(cmd, cwd=str(cwd), env=env, text=True, capture_output=True)
+    elapsed = time.perf_counter() - start
+    result = {
+        "name": name,
+        # A zero exit status without an audio file is still a failed run.
+        "ok": proc.returncode == 0 and output.exists(),
+        "returncode": proc.returncode,
+        "elapsed_sec": elapsed,
+        "output": str(output),
+        "stdout_tail": proc.stdout[-2000:],
+        "stderr_tail": proc.stderr[-4000:],
+    }
+    if output.exists():
+        try:
+            info = wav_info(output)
+        except (wave.Error, EOFError) as exc:
+            # Keep the benchmark and its report alive when one engine writes
+            # a file the stdlib ``wave`` module cannot parse.
+            result["wav_error"] = str(exc)
+        else:
+            result.update(info)
+            duration = info["audio_duration_sec"]
+            result["rtf"] = elapsed / duration if duration else None
+    return result
+
+
+def device_env(device: str) -> dict[str, str]:
+    """Return a copy of the process environment for the requested device.
+
+    For ``"cpu"`` the copy sets ``CUDA_VISIBLE_DEVICES`` to an empty string;
+    for any other value the environment is passed through unchanged.
+    """
+    env = os.environ.copy()
+    if device == "cpu":
+        env["CUDA_VISIBLE_DEVICES"] = ""
+    return env
+
+
+def build_voxcpm_script(path: Path, text: str, output: Path) -> None:
+    """Write a one-shot VoxCPM synthesis script to ``path``.
+
+    The generated script synthesizes ``text`` and saves it to ``output``.
+    """
+    path.write_text(
+        "from voxcpm import VoxCPM\n"
+        "import soundfile as sf\n"
+        "model = VoxCPM.from_pretrained('openbmb/VoxCPM-0.5B', load_denoiser=False)\n"
+        f"wav = model.generate(text={text!r}, normalize=True, denoise=False, inference_timesteps=1, max_length=128, retry_badcase=False)\n"
+        f"sf.write({str(output)!r}, wav, 16000)\n",
+        # Python reads source files as UTF-8, so do not depend on the locale
+        # encoding when the text contains non-ASCII characters.
+        encoding="utf-8",
+    )
+
+
+def benchmark_device(root: Path, out_dir: Path, text: str, device: str) -> list[dict]:
+    """Benchmark OmniVoice and then VoxCPM on one device; return both results."""
+    py = root / ".venv/bin/python"
+    omni = root / ".venv/bin/omnivoice-infer"
+    env = device_env(device)
+    omni_out = out_dir / f"omnivoice_{device}.wav"
+    vox_out = out_dir / f"voxcpm_{device}.wav"
+    vox_script = out_dir / f"run_voxcpm_{device}_once.py"
+    build_voxcpm_script(vox_script, text, vox_out)
+
+    omni_cmd = [
+        str(omni),
+        "--text",
+        text,
+        "--output",
+        str(omni_out),
+        "--device",
+        "cpu" if device == "cpu" else "cuda",
+        "--num_step",
+        "4",
+        "--denoise",
+        "False",
+        "--postprocess_output",
+        "False",
+    ]
+
+    return [
+        run_cmd(f"omnivoice_{device}", omni_cmd, omni_out, env, root),
+        run_cmd(f"voxcpm_{device}", [str(py), str(vox_script)], vox_out, env, root),
+    ]
+
+
+def main() -> int:
+    """Parse arguments, run the benchmarks and write ``report.json``.
+
+    Returns 0 when every run succeeded, otherwise 1.
+    """
+    parser = argparse.ArgumentParser(description="Benchmark local OmniVoice-Studio and VoxCPM TTS.")
+    parser.add_argument("--root", default=str(DEFAULT_ROOT), help="OmniVoice-Studio repo path.")
+    parser.add_argument("--out-dir", default="/tmp/tts_bench_run", help="Output directory for wav files and report.json.")
+    parser.add_argument("--text", default=DEFAULT_TEXT, help="Text to synthesize.")
+    parser.add_argument("--device", choices=["cpu", "cuda", "both"], default="cpu", help="Device mode to test.")
+    args = parser.parse_args()
+
+    root = Path(args.root).expanduser().resolve()
+    out_dir = Path(args.out_dir).expanduser().resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    devices = ["cpu", "cuda"] if args.device == "both" else [args.device]
+    results: list[dict] = []
+    for device in devices:
+        results.extend(benchmark_device(root, out_dir, args.text, device))
+
+    report = {
+        "root": str(root),
+        "text": args.text,
+        "device": args.device,
+        "results": results,
+    }
+    report_path = out_dir / "report.json"
+    # ensure_ascii=False keeps non-ASCII text readable, so write UTF-8 explicitly.
+    report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(json.dumps(report, indent=2, ensure_ascii=False))
+    return 0 if all(r["ok"] for r in results) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/local-tts/scripts/tts_generate.py b/local-tts/scripts/tts_generate.py
new file mode 100755
index 0000000..1a0ebca
--- /dev/null
+++ b/local-tts/scripts/tts_generate.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Generate one local TTS WAV with OmniVoice or VoxCPM and report its stats."""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import time
+import wave
+from pathlib import Path
+
+
+ROOT = Path("/home/slam/OmniVoice-Studio")
+PY = ROOT / ".venv/bin/python"
+OMNI = ROOT / ".venv/bin/omnivoice-infer"
+DEFAULT_TEXT = "Hello from local TTS."
+
+
+def run(cmd: list[str], env: dict[str, str], cwd: Path) -> tuple[int, float]:
+    """Run ``cmd`` and return ``(returncode, elapsed_seconds)``.
+
+    Output is not captured, so the child writes straight to this process's
+    stdout and stderr.
+    """
+    start = time.perf_counter()
+    proc = subprocess.run(cmd, cwd=str(cwd), env=env)
+    return proc.returncode, time.perf_counter() - start
+
+
+def wav_info(path: Path) -> dict:
+    """Return WAV stats for ``path``, or ``{}`` when they cannot be read.
+
+    An empty dict is returned when the file is missing or is not a WAV the
+    stdlib ``wave`` module can parse, so report generation never crashes.
+    """
+    if not path.exists():
+        return {}
+    try:
+        with wave.open(str(path), "rb") as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            # Guard against a zero sample rate in a malformed header.
+            duration = frames / float(rate) if rate else 0.0
+            return {
+                "audio_duration_sec": duration,
+                "sample_rate": rate,
+                "channels": f.getnchannels(),
+                "bytes": path.stat().st_size,
+            }
+    except (wave.Error, EOFError):
+        return {}
+
+
+def detect_cuda(env: dict[str, str]) -> bool:
+    """Return True when torch in the OmniVoice venv reports CUDA as available.
+
+    An empty ``CUDA_VISIBLE_DEVICES`` short-circuits to False without
+    spawning the interpreter.
+    """
+    if env.get("CUDA_VISIBLE_DEVICES") == "":
+        return False
+    proc = subprocess.run(
+        [str(PY), "-c", "import torch; print(int(torch.cuda.is_available()))"],
+        cwd=str(ROOT),
+        env=env,
+        text=True,
+        capture_output=True,
+    )
+    return proc.returncode == 0 and proc.stdout.strip() == "1"
+
+
+def has_cjk(text: str) -> bool:
+    """Return True when ``text`` has a character in the range U+4E00-U+9FFF."""
+    return any("\u4e00" <= ch <= "\u9fff" for ch in text)
+
+
+def pick_device(device: str, env: dict[str, str]) -> str:
+    """Resolve ``"auto"`` to ``"cuda"`` or ``"cpu"``; return other values as-is."""
+    if device != "auto":
+        return device
+    return "cuda" if detect_cuda(env) else "cpu"
+
+
+def pick_engine(engine: str, text: str) -> str:
+    """Resolve ``"auto"`` to a concrete engine name; return other values as-is.
+
+    ``"auto"`` always selects OmniVoice. ``text`` is accepted for interface
+    stability but does not currently influence the choice.
+    """
+    if engine != "auto":
+        return engine
+    return "omnivoice"
+
+
+def write_report(path: str | None, result: dict) -> None:
+    """Write ``result`` as JSON to ``path``; do nothing when ``path`` is empty."""
+    if not path:
+        return
+    report_path = Path(path).expanduser().resolve()
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    # ensure_ascii=False keeps non-ASCII text readable, so write UTF-8 explicitly.
+    report_path.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def str_or_none(value: str | None) -> str:
+    """Return ``value``, or the literal string ``"None"`` when it is None."""
+    return "None" if value is None else value
+
+
+def main() -> int:
+    """Parse arguments, synthesize one WAV and print/write a JSON report.
+
+    Returns the engine process's exit status, or 1 when it exited with 0 but
+    produced no output file.
+    """
+    parser = argparse.ArgumentParser(description="Generate one local TTS WAV with OmniVoice or VoxCPM.")
+    parser.add_argument("--text", default=DEFAULT_TEXT)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--engine", choices=["auto", "omnivoice", "voxcpm"], default="auto")
+    parser.add_argument("--device", choices=["auto", "cpu", "cuda"], default="auto")
+    parser.add_argument("--num-step", type=int, default=4)
+    parser.add_argument("--voxcpm-inference-timesteps", type=int, default=1)
+    parser.add_argument("--max-length", type=int, default=128)
+    parser.add_argument("--omni-model", default="k2-fsa/OmniVoice")
+    parser.add_argument("--language", default=None)
+    parser.add_argument("--instruct", default=None)
+    parser.add_argument("--speed", type=float, default=None)
+    parser.add_argument("--duration", type=float, default=None)
+    parser.add_argument("--report-json", default=None)
+    args = parser.parse_args()
+
+    output = Path(args.output).expanduser().resolve()
+    output.parent.mkdir(parents=True, exist_ok=True)
+    # Drop any stale file so a failed run cannot report an earlier run's audio.
+    if output.is_file():
+        output.unlink()
+
+    base_env = os.environ.copy()
+    device = pick_device(args.device, base_env)
+    env = base_env.copy()
+    if device == "cpu":
+        env["CUDA_VISIBLE_DEVICES"] = ""
+    engine = pick_engine(args.engine, args.text)
+
+    result = {
+        "engine": engine,
+        "device": device,
+        "text": args.text,
+        "output": str(output),
+    }
+
+    if engine == "omnivoice":
+        cmd = [
+            str(OMNI),
+            "--model",
+            args.omni_model,
+            "--text",
+            args.text,
+            "--output",
+            str(output),
+            "--device",
+            device,
+            "--num_step",
+            str(args.num_step),
+            "--denoise",
+            "False",
+            "--postprocess_output",
+            "False",
+        ]
+        if args.language:
+            cmd += ["--language", args.language]
+        if args.instruct:
+            cmd += ["--instruct", args.instruct]
+        if args.speed is not None:
+            cmd += ["--speed", str(args.speed)]
+        if args.duration is not None:
+            cmd += ["--duration", str(args.duration)]
+        returncode, elapsed = run(cmd, env, ROOT)
+    else:
+        script = output.with_suffix(".voxcpm.py")
+        script.write_text(
+            "from voxcpm import VoxCPM\n"
+            "import soundfile as sf\n"
+            "model = VoxCPM.from_pretrained('openbmb/VoxCPM-0.5B', load_denoiser=False)\n"
+            f"wav = model.generate(text={args.text!r}, normalize=True, denoise=False, inference_timesteps={args.voxcpm_inference_timesteps}, max_length={args.max_length}, retry_badcase=False)\n"
+            f"sf.write({str(output)!r}, wav, 16000)\n",
+            # Python reads source files as UTF-8, so do not depend on the
+            # locale encoding when the text contains non-ASCII characters.
+            encoding="utf-8",
+        )
+        returncode, elapsed = run([str(PY), str(script)], env, ROOT)
+
+    result["returncode"] = returncode
+    result["elapsed_sec"] = elapsed
+    result.update(wav_info(output))
+    if result.get("audio_duration_sec"):
+        result["rtf"] = elapsed / result["audio_duration_sec"]
+    if args.language is not None:
+        result["language"] = args.language
+    if args.instruct is not None:
+        result["instruct"] = args.instruct
+    if args.speed is not None:
+        result["speed"] = args.speed
+    if args.duration is not None:
+        result["duration"] = args.duration
+    write_report(args.report_json, result)
+    print(json.dumps(result, indent=2, ensure_ascii=False))
+    # A zero exit status without an audio file is still a failure.
+    if returncode == 0 and not output.exists():
+        return 1
+    return returncode
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())