From 663375480371619b2a1bd894052e4dee3fa0f01d Mon Sep 17 00:00:00 2001
From: Landis Code <250052214+landiscode@users.noreply.github.com>
Date: Thu, 21 May 2026 01:46:07 -0700
Subject: [PATCH] Add whisper.cpp Sapat transcription guide

Signed-off-by: Landis Code <250052214+landiscode@users.noreply.github.com>
---
 authors/landiscode.md                         |   8 +
 ...0260521_definition_local_speech_to_text.md |  20 ++
 ..._whispercpp_sapat_transcription_daytona.md | 287 ++++++++++++++++++
 ...p_sapat_transcription_daytona_workflow.svg |  54 ++++
 4 files changed, 369 insertions(+)
 create mode 100644 authors/landiscode.md
 create mode 100644 definitions/20260521_definition_local_speech_to_text.md
 create mode 100644 guides/20260521_whispercpp_sapat_transcription_daytona.md
 create mode 100644 guides/assets/20260521_whispercpp_sapat_transcription_daytona_workflow.svg

diff --git a/authors/landiscode.md b/authors/landiscode.md
new file mode 100644
index 00000000..adf12dc2
--- /dev/null
+++ b/authors/landiscode.md
@@ -0,0 +1,8 @@
+Author: Landis Code
+Title: Software engineer and automation builder
+Description: Landis Code builds developer tooling, workflow automations, and AI-assisted engineering systems. Their work focuses on practical integrations that keep setup reproducible, credentials out of source control, and everyday development loops easier to verify.
+Company Name:
+Company Description:
+Author Image: <https://github.com/landiscode.png>
+Company Logo Dark:
+Company Logo White:
diff --git a/definitions/20260521_definition_local_speech_to_text.md b/definitions/20260521_definition_local_speech_to_text.md
new file mode 100644
index 00000000..00b2ee4c
--- /dev/null
+++ b/definitions/20260521_definition_local_speech_to_text.md
@@ -0,0 +1,20 @@
+---
+title: 'Local Speech-to-Text'
+description: 'Speech recognition that runs on local compute instead of sending audio to a hosted transcription API.'
+date: 2026-05-21
+author: 'Landis Code'
+---
+
+# Local Speech-to-Text
+
+## Definition
+
+Local speech-to-text is the process of converting audio into text on local compute, such as a developer workstation, container, or workspace, instead of sending the audio to a hosted transcription API.
+
+## Context and Usage
+
+Local speech-to-text is useful when recordings contain sensitive customer, product, or incident information that should not leave a controlled environment.
+It can also reduce API dependencies during development because the transcription engine, model file, and command-line options are part of the local workflow.
+
+The tradeoff is operational ownership.
+Teams must install the transcription binary, choose and store model files, allocate enough CPU or GPU resources, and validate the output quality themselves.
diff --git a/guides/20260521_whispercpp_sapat_transcription_daytona.md b/guides/20260521_whispercpp_sapat_transcription_daytona.md
new file mode 100644
index 00000000..07668ee7
--- /dev/null
+++ b/guides/20260521_whispercpp_sapat_transcription_daytona.md
@@ -0,0 +1,287 @@
+---
+title: "Run Local whisper.cpp Transcription With Sapat"
+description: "Build a reproducible Daytona workflow for private local transcription with Sapat and whisper.cpp."
+date: 2026-05-21
+author: "Landis Code"
+tags: ["daytona", "sapat", "whisper.cpp", "transcription"]
+---
+
+# Run Local whisper.cpp Transcription With Sapat
+
+## Introduction
+
+Engineering recordings are useful, but they are also easy to mishandle.
+A product demo can contain unreleased feature names, a customer call can include personally identifiable details, and an incident review can mention internal URLs or credentials.
+Hosted speech-to-text APIs are convenient, but some teams need a [local speech-to-text](../definitions/20260521_definition_local_speech_to_text.md) path for the first transcript pass.
+
+This guide shows how to run local transcription in a Daytona workspace with [Sapat](https://github.com/nkkko/sapat) and [whisper.cpp](https://github.com/ggml-org/whisper.cpp).
+Sapat already handles the file workflow: it accepts a video file, converts the media with `ffmpeg`, sends the normalized audio to a provider, and writes a sidecar `.txt` transcript.
+The companion Sapat pull request adds a `whispercpp` provider that shells out to a configured whisper.cpp CLI binary and model file.
+
+Daytona makes this workflow repeatable.
+Instead of keeping the whisper.cpp binary, model path, and shell commands on one laptop, the team can capture the setup in a workspace and share the exact commands.
+That does not make transcripts automatically safe, but it does make the review loop easier to reproduce and debug.
+
+## TL;DR
+
+- Add `--api whispercpp` to Sapat for local whisper.cpp transcription.
+- Configure `WHISPER_CPP_BINARY` and `WHISPER_CPP_MODEL_PATH` in `.env`.
+- Keep recordings and transcripts out of Git until they are reviewed.
+- Use Daytona so the binary, model path, Python environment, and validation steps live together.
+- Treat local transcription as private by default, not publication-ready by default.
+
+![whisper.cpp Sapat workflow](assets/20260521_whispercpp_sapat_transcription_daytona_workflow.svg)
+
+## What You Will Build
+
+You will create a Daytona workspace that can transcribe recordings without calling a hosted speech-to-text API.
+The workflow has five pieces:
+
+- A Daytona workspace created from the Sapat repository.
+- A local whisper.cpp CLI binary, usually `whisper-cli`.
+- A local whisper.cpp model file such as `ggml-base.en.bin`.
+- A Sapat branch with the `--api whispercpp` provider.
+- A repeatable command that writes a `.txt` transcript beside the input recording.
+
+The companion implementation is in [nibzard/sapat#42](https://github.com/nibzard/sapat/pull/42).
+It adds a `WhisperCppTranscription` provider, wires the provider into the CLI, documents the environment variables, and includes mocked tests so validation does not require downloading a model.
+
+## Prerequisites
+
+Install the Daytona CLI and make sure Docker is running.
+If you are new to Daytona, start with the official [installation guide](https://www.daytona.io/docs/installation/installation/).
+
+You also need:
+
+- GitHub access to clone Sapat.
+- Python available in the workspace.
+- `ffmpeg`, because Sapat normalizes input media before transcription.
+- A whisper.cpp binary, or `git`, `cmake`, and a C/C++ compiler to build one in the workspace.
+- A whisper.cpp model file.
+- A short test recording that does not contain secrets, customer data, or private incident details.
+
+For the first smoke test, use a short clip.
+Ten to twenty seconds is enough to validate file conversion, provider routing, model loading, and transcript output.
+
+## Create the Daytona Workspace
+
+Create a workspace from the Sapat repository:
+
+```bash
+daytona create https://github.com/nkkko/sapat --code
+```
+
+Open the workspace shell and install Sapat in editable mode:
+
+```bash
+python -m venv .venv
+. .venv/bin/activate
+python -m pip install -e .
+```
+
+If `ffmpeg` is not available in the workspace image, install it before running Sapat.
+On a Debian-based image, use:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y ffmpeg
+```
+
+Check that the CLI is available:
+
+```bash
+sapat --help
+```
+
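+If the companion pull request has not been merged yet, apply its branch first, for example with `git fetch origin pull/42/head:whispercpp-provider && git checkout whispercpp-provider`. After the companion provider branch is applied, the `--api` option should include `whispercpp` beside `openai`, `groq`, and `azure`.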
+
+## Add whisper.cpp to the Workspace
+
+There are two common ways to make whisper.cpp available.
+You can build the upstream project in the workspace, or you can provide a prebuilt binary through your base image.
+The important part is that Sapat receives the absolute path to a working CLI binary and model file.
+
+A source build keeps the setup explicit:
+
+```bash
+cd /workspaces
+git clone https://github.com/ggml-org/whisper.cpp.git
+cd whisper.cpp
+cmake -B build
+cmake --build build -j
+```
+
+Download a model that fits the workspace resources.
+For a quick English smoke test, the base English model is a reasonable starting point:
+
+```bash
+bash ./models/download-ggml-model.sh base.en
+```
+
+The resulting paths usually look like this:
+
+```text
+/workspaces/whisper.cpp/build/bin/whisper-cli
+/workspaces/whisper.cpp/models/ggml-base.en.bin
+```
+
+Use a larger model only when the workspace has enough CPU, memory, and time for it.
+Local transcription moves provider cost out of the API bill, but the compute still has to happen somewhere.
+
+## Configure Sapat
+
+Return to the Sapat project root (the build steps above leave the shell in the whisper.cpp checkout) and create a `.env` file there.
+Do not commit it.
+
+```bash
+cat > .env <<'EOF'
+WHISPER_CPP_BINARY=/workspaces/whisper.cpp/build/bin/whisper-cli
+WHISPER_CPP_MODEL_PATH=/workspaces/whisper.cpp/models/ggml-base.en.bin
+WHISPER_CPP_THREADS=4
+EOF
+```
+
+The provider reads three values:
+
+| Variable | Required | Purpose |
+| --- | --- | --- |
+| `WHISPER_CPP_BINARY` | No | Path to the CLI binary. Defaults to `whisper-cli`. |
+| `WHISPER_CPP_MODEL_PATH` | Yes | Path to a local `ggml` whisper.cpp model file. |
+| `WHISPER_CPP_THREADS` | No | Thread count passed to the CLI with `-t`. |
+
+The model path is required because a missing model should fail before Sapat starts converting files.
+That early error is much easier to debug than a late CLI failure after a batch has already started.
+
+## Understand the Provider Patch
+
+The `whispercpp` provider follows the same Sapat provider shape as the hosted API integrations, but the last step is local:
+
+1. Validate that the source audio file exists.
+2. Validate that `WHISPER_CPP_MODEL_PATH` points to a local model file.
+3. Convert Sapat's temporary MP3 into a 16 kHz mono WAV file.
+4. Run the configured whisper.cpp binary with `-m`, `-f`, `-otxt`, `-of`, and `-nt`.
+5. Read the generated `.txt` output and return it to Sapat.
+
+The provider passes `--language` through as `-l` and passes Sapat's prompt through as `--prompt`.
+It intentionally does not implement Sapat's `--correct` chat pass.
+Correction requires a separate language model workflow, while this provider is scoped to local automatic speech recognition.
+
+That boundary is useful.
+It keeps the local path local, avoids surprise API calls, and makes transcript review an explicit step instead of an invisible post-processing side effect.
+
+## Run a Single Transcription
+
+Place a short recording in the workspace, then run:
+
+```bash
+sapat ./recordings/demo.mp4 \
+  --api whispercpp \
+  --quality M \
+  --language en \
+  --prompt "Product demo with API names and command output"
+```
+
+Sapat will create a temporary MP3, convert it to WAV for whisper.cpp, run the local model, save `demo.txt`, and remove the temporary MP3.
+Open the transcript and check three things:
+
+- The transcript exists beside the original recording.
+- Product names and acronyms look close enough for review.
+- The text does not contain material that should be redacted before sharing.
+
+For a directory of recordings, pass the directory path:
+
+```bash
+sapat ./recordings --api whispercpp --quality M --language en
+```
+
+Current Sapat directory processing looks for `.mp4` files.
+If your sources are WAV, M4A, or WebM, run them one file at a time or convert them before batch processing.
+
+## Choose a Model and Review Policy
+
+Local transcription makes provider selection feel simpler because there is no API key, but model choice still matters.
+Small models are fast enough for smoke tests and rough indexing.
+Larger models usually produce better transcripts, especially for noisy audio, accented speech, or dense technical terms.
+
+Start with this policy:
+
+| Use case | Suggested model size | Review expectation |
+| --- | --- | --- |
+| CLI smoke test | `base.en` | Confirm routing and output only. |
+| Internal demo notes | `small` or `medium` | Review names, commands, and acronyms. |
+| Incident or customer recordings | `medium` or larger | Review every paragraph before sharing. |
+
+The Sapat `--prompt` option is still useful locally.
+Use it to provide product names, acronyms, speaker context, or command names that appear in the recording.
+Do not put secrets in the prompt.
+
+## Validate Before a Team Uses It
+
+Before handing the workflow to a team, run the implementation checks from the companion Sapat PR:
+
+```bash
+.venv/bin/python -m unittest discover -s tests -v
+.venv/bin/python -m compileall src tests
+.venv/bin/python -m sapat.script --help
+git diff --check
+```
+
+Those tests do not call a live whisper.cpp binary.
+They mock subprocess calls and verify command construction, generated output handling, missing model configuration, and CLI routing.
+That keeps validation cheap and avoids downloading model weights in CI.
+
+Then run one live smoke test with a non-sensitive recording.
+Keep the input short, confirm the transcript, and save the exact command in the project notes.
+
+## Troubleshooting
+
+**Problem:** `WHISPER_CPP_MODEL_PATH must point to a whisper.cpp model file.`
+
+**Solution:** Add `WHISPER_CPP_MODEL_PATH` to `.env` and make sure it points to an existing `ggml` model file.
+
+**Problem:** `whisper.cpp binary 'whisper-cli' was not found.`
+
+**Solution:** Set `WHISPER_CPP_BINARY` to the absolute path of the compiled binary, or add the binary directory to `PATH`.
+
+**Problem:** The command fails on `ffmpeg`.
+
+**Solution:** Install `ffmpeg` in the workspace.
+Sapat depends on it before any transcription provider runs.
+
+**Problem:** Transcription works but takes too long.
+
+**Solution:** Try a smaller model, increase `WHISPER_CPP_THREADS`, or use a workspace with more CPU resources.
+For GPU acceleration, use a workspace image and whisper.cpp build that matches your hardware target.
+
+**Problem:** `--correct` fails with whisper.cpp.
+
+**Solution:** Run `--api whispercpp` without `--correct`, review the transcript manually, and use a separate approved correction workflow if your team needs one.
+
+## Review and Security Checklist
+
+Local transcription reduces network exposure, but it does not remove review responsibility.
+Add a checklist to your team workflow:
+
+- Store raw recordings in a folder ignored by Git.
+- Store model files outside the repository or in an ignored directory.
+- Keep `.env` out of commits.
+- Review transcripts for credentials, customer names, private URLs, and unreleased details.
+- Commit only sanitized notes or excerpts.
+- Record the exact Sapat command and model path used for each batch.
+
+This makes the transcript useful without treating it as harmless text.
+
+## Conclusion
+
+You now have a reproducible Daytona workflow for local whisper.cpp-backed Sapat transcription.
+The workspace contains the Python environment, `ffmpeg`, the whisper.cpp binary, the model path, and the validation commands, so another engineer can rerun the process without guessing what was installed on a laptop.
+
+The companion provider PR adds the missing `whispercpp` integration to Sapat.
+From there, the team can choose a model size, run a private first transcript pass, and review the resulting text before it becomes an issue comment, release note, incident summary, or searchable internal document.
+
+## References
+
+- [Sapat repository](https://github.com/nkkko/sapat)
+- [Companion whisper.cpp provider PR](https://github.com/nibzard/sapat/pull/42)
+- [whisper.cpp repository](https://github.com/ggml-org/whisper.cpp)
+- [whisper.cpp CLI README](https://github.com/ggml-org/whisper.cpp/blob/master/examples/cli/README.md)
diff --git a/guides/assets/20260521_whispercpp_sapat_transcription_daytona_workflow.svg b/guides/assets/20260521_whispercpp_sapat_transcription_daytona_workflow.svg
new file mode 100644
index 00000000..f5e74a43
--- /dev/null
+++ b/guides/assets/20260521_whispercpp_sapat_transcription_daytona_workflow.svg
@@ -0,0 +1,54 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1120" height="420" viewBox="0 0 1120 420" role="img" aria-labelledby="title desc">
+  <title id="title">Daytona, Sapat, and whisper.cpp transcription workflow</title>
+  <desc id="desc">A four-step workflow from a recording in Daytona, through Sapat and local whisper.cpp, to a transcript that is reviewed before sharing, with a validation checklist beneath the steps.</desc>
+  <defs>
+    <marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto" markerUnits="strokeWidth">
+      <path d="M2,2 L10,6 L2,10 Z" fill="#335c67" />
+    </marker>
+    <style>
+      .panel { fill: #f8fafc; stroke: #335c67; stroke-width: 2; rx: 14; }
+      .accent { fill: #d9ed92; stroke: #335c67; stroke-width: 2; rx: 14; }
+      .title { font: 700 22px system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; fill: #12343b; }
+      .text { font: 500 15px system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; fill: #12343b; }
+      .small { font: 500 13px system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; fill: #335c67; }
+      .line { stroke: #335c67; stroke-width: 3; fill: none; marker-end: url(#arrow); }
+    </style>
+  </defs>
+
+  <rect width="1120" height="420" fill="#ffffff" />
+  <text x="56" y="58" class="title">Private transcription loop in a Daytona workspace</text>
+  <text x="56" y="84" class="small">Keep the recording, model, transcript, and validation commands in one reproducible environment.</text>
+
+  <rect x="56" y="132" width="170" height="130" class="panel" />
+  <text x="82" y="168" class="title">Recording</text>
+  <text x="82" y="200" class="text">Demo, call,</text>
+  <text x="82" y="222" class="text">or walkthrough</text>
+  <text x="82" y="246" class="small">Ignored by Git</text>
+
+  <path d="M232 198 H298" class="line" />
+
+  <rect x="310" y="132" width="170" height="130" class="accent" />
+  <text x="344" y="168" class="title">Sapat</text>
+  <text x="344" y="200" class="text">Normalize audio</text>
+  <text x="344" y="222" class="text">with ffmpeg</text>
+  <text x="344" y="246" class="small">MP3 to WAV</text>
+
+  <path d="M486 198 H552" class="line" />
+
+  <rect x="564" y="132" width="170" height="130" class="panel" />
+  <text x="594" y="168" class="title">whisper.cpp</text>
+  <text x="594" y="200" class="text">Local CLI</text>
+  <text x="594" y="222" class="text">and model file</text>
+  <text x="594" y="246" class="small">No API key</text>
+
+  <path d="M740 198 H806" class="line" />
+
+  <rect x="818" y="132" width="170" height="130" class="accent" />
+  <text x="844" y="168" class="title">Transcript</text>
+  <text x="844" y="200" class="text">Sidecar .txt</text>
+  <text x="844" y="222" class="text">beside media</text>
+  <text x="844" y="246" class="small">Review before sharing</text>
+
+  <rect x="56" y="310" width="932" height="58" class="panel" />
+  <text x="82" y="346" class="text">Validation: mocked subprocess tests, compile check, CLI help, and one short non-sensitive smoke recording.</text>
+</svg>