Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,23 @@ make app # macOS DMG

## Usage

Set at least one API key, then run zee:

```bash
export GROQ_API_KEY=your_key # batch mode (Groq Whisper)
export OPENAI_API_KEY=your_key # batch mode (OpenAI Whisper)
export DEEPGRAM_API_KEY=your_key # streaming mode (Deepgram)
export MISTRAL_API_KEY=your_key # batch mode (Mistral Voxtral)
zee # starts in menu bar, hold Ctrl+Shift+Space to record
zee -stream # words appear as you speak
```

> **Note:** `export` only works in the current terminal session. To make API keys available to `Zee.app` when launched from Spotlight or Applications, use `launchctl`:
> ```bash
> launchctl setenv GROQ_API_KEY your_key
> ```
> Add this line to your `~/.zshrc` so the variable is set again after every reboot (`launchctl setenv` does not persist across restarts).

zee runs as a system tray app in the menu bar. Hold `Ctrl+Shift+Space` to record, release to transcribe. Result auto-pastes into the focused window.

Use the tray menu to switch microphones, providers, and languages — or use `-setup` for initial device selection.
Expand Down
8 changes: 6 additions & 2 deletions log/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type Metrics struct {
TotalTimeMs float64
MemoryAllocMB float64
MemoryPeakMB float64
InferenceMs float64
}

func ResolveDir(flagPath string) (string, error) {
Expand Down Expand Up @@ -197,8 +198,11 @@ func TranscriptionMetrics(m Metrics, mode, format, provider string, connReused b
Float64("ttfb_ms", m.TTFBMs).
Float64("total_ms", m.TotalTimeMs).
Float64("mem_mb", m.MemoryAllocMB).
Float64("peak_mb", m.MemoryPeakMB).
Msg("transcription")
Float64("peak_mb", m.MemoryPeakMB)
if m.InferenceMs > 0 {
ev = ev.Float64("inference_ms", m.InferenceMs)
}
ev.Msg("transcription")
}

func TranscriptionText(text string) {
Expand Down
7 changes: 7 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ func run() {
groqKey := os.Getenv("GROQ_API_KEY")
openaiKey := os.Getenv("OPENAI_API_KEY")
dgKey := os.Getenv("DEEPGRAM_API_KEY")
mistralKey := os.Getenv("MISTRAL_API_KEY")

type providerDef struct {
name, label, key string
Expand All @@ -371,6 +372,7 @@ func run() {
{"groq", "Groq", groqKey, transcriber.GroqModels, func() transcriber.Transcriber { return transcriber.NewGroq(groqKey) }},
{"openai", "OpenAI", openaiKey, transcriber.OpenAIModels, func() transcriber.Transcriber { return transcriber.NewOpenAI(openaiKey) }},
{"deepgram", "Deepgram", dgKey, transcriber.DeepgramModels, func() transcriber.Transcriber { return transcriber.NewDeepgram(dgKey) }},
{"mistral", "Mistral", mistralKey, transcriber.MistralModels, func() transcriber.Transcriber { return transcriber.NewMistral(mistralKey) }},
}

var trayModels []tray.Model
Expand All @@ -389,6 +391,8 @@ func run() {
}
}

tray.SetLanguages(transcriber.AllLanguages())

tray.SetModels(trayModels, func(provider, model string) {
configMu.Lock()
defer configMu.Unlock()
Expand All @@ -413,6 +417,8 @@ func run() {
if !streamEnabled {
activeFormat = *formatFlag
}

tray.SetLanguages(newTr.SupportedLanguages())
})

tray.SetLanguage(*langFlag, func(code string) {
Expand Down Expand Up @@ -711,6 +717,7 @@ func finishTranscription(sess transcriber.Session, clipCh chan string, updatesDo
TotalTimeMs: bs.TotalTimeMs,
MemoryAllocMB: result.MemoryAllocMB,
MemoryPeakMB: result.MemoryPeakMB,
InferenceMs: bs.InferenceMs,
}
transcriptionsMu.Lock()
transcriptionCount++
Expand Down
1 change: 1 addition & 0 deletions transcriber/batch_session.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ func (bs *batchSession) Close() (SessionResult, error) {
ConnReused: netMetrics.ConnReused,
TLSProtocol: netMetrics.TLSProtocol,
Confidence: result.Confidence,
InferenceMs: result.InferenceMs,
},
Metrics: bs.formatMetrics(rawSize, encodedSize, compressionPct, audioDuration, result),
}
Expand Down
13 changes: 11 additions & 2 deletions transcriber/deepgram.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,20 @@ func NewDeepgram(apiKey string) *Deepgram {
}
}

func (d *Deepgram) Name() string { return "deepgram" }
var nova3Langs = langsFromCodes([]string{
"bg", "ca", "zh", "cs", "da", "nl", "en", "et", "fi", "fr",
"de", "el", "hi", "hu", "id", "it", "ja", "ko", "lv", "lt",
"ms", "no", "pl", "pt", "ro", "ru", "sk", "es", "sv", "th",
"tr", "uk", "vi",
})

var DeepgramModels = []ModelInfo{
{ID: "nova-3", Label: "Nova-3 (stream)", Stream: true},
{ID: "nova-3", Label: "Nova-3 (stream)", Stream: true, Languages: nova3Langs},
}

func (d *Deepgram) SupportedLanguages() []Language { return modelLanguages(DeepgramModels, d.GetModel()) }
func (d *Deepgram) Name() string { return "deepgram" }

func (d *Deepgram) Models() []ModelInfo { return DeepgramModels }

func (d *Deepgram) NewSession(ctx context.Context, cfg SessionConfig) (Session, error) {
Expand Down
3 changes: 2 additions & 1 deletion transcriber/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ func NewFake(text string, err error) *FakeTranscriber {
return &FakeTranscriber{text: text, err: err}
}

func (f *FakeTranscriber) Name() string { return "fake" }
func (f *FakeTranscriber) Name() string { return "fake" }
func (f *FakeTranscriber) SupportedLanguages() []Language { return nil }
func (f *FakeTranscriber) SetLanguage(lang string) { f.lang = lang }
func (f *FakeTranscriber) GetLanguage() string { return f.lang }
func (f *FakeTranscriber) Models() []ModelInfo { return nil }
Expand Down
19 changes: 14 additions & 5 deletions transcriber/groq.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,18 @@ const (
ModelWhisperV3 = "whisper-large-v3"
)

var whisperLangs = langsFromCodes([]string{
"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr",
"cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el",
"he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
"lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl",
"pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl",
"ta", "th", "tr", "uk", "ur", "vi", "cy",
})

var GroqModels = []ModelInfo{
{ID: ModelWhisperV3Turbo, Label: "Whisper V3 Turbo", Stream: false},
{ID: ModelWhisperV3, Label: "Whisper V3", Stream: false},
{ID: ModelWhisperV3Turbo, Label: "Whisper V3 Turbo", Stream: false, Languages: whisperLangs},
{ID: ModelWhisperV3, Label: "Whisper V3", Stream: false, Languages: whisperLangs},
}

type Groq struct {
Expand All @@ -36,9 +45,9 @@ func NewGroq(apiKey string) *Groq {
}
}

func (g *Groq) Models() []ModelInfo { return GroqModels }

func (g *Groq) Name() string { return "groq" }
func (g *Groq) SupportedLanguages() []Language { return modelLanguages(GroqModels, g.GetModel()) }
func (g *Groq) Models() []ModelInfo { return GroqModels }
func (g *Groq) Name() string { return "groq" }

func (g *Groq) NewSession(_ context.Context, cfg SessionConfig) (Session, error) {
go g.client.Warm()
Expand Down
106 changes: 106 additions & 0 deletions transcriber/mistral.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package transcriber

import (
"bytes"
"context"
"encoding/json"
"fmt"
"mime/multipart"
"net/http"
"strconv"
)

var voxtralLangs = langsFromCodes([]string{
"ar", "zh", "nl", "en", "fr", "de", "hi", "it", "ja", "ko",
"pt", "ru", "es",
})

var MistralModels = []ModelInfo{
{ID: "voxtral-mini-latest", Label: "Voxtral Mini", Stream: false, Languages: voxtralLangs},
}

type Mistral struct {
baseTranscriber
apiKey string
}

func NewMistral(apiKey string) *Mistral {
apiURL := "https://api.mistral.ai/v1/audio/transcriptions"
return &Mistral{
baseTranscriber: baseTranscriber{
client: NewTracedClient(apiURL),
apiURL: apiURL,
model: "voxtral-mini-latest",
},
apiKey: apiKey,
}
}

func (m *Mistral) SupportedLanguages() []Language { return modelLanguages(MistralModels, m.GetModel()) }
func (m *Mistral) Name() string { return "mistral" }
func (m *Mistral) Models() []ModelInfo { return MistralModels }

func (m *Mistral) NewSession(_ context.Context, cfg SessionConfig) (Session, error) {
go m.client.Warm()
if cfg.Stream {
return nil, fmt.Errorf("mistral does not support streaming transcription")
}
return newBatchSession(cfg, m.transcribe)
}

func (m *Mistral) transcribe(audioData []byte, format, lang string) (*Result, error) {
var body bytes.Buffer
writer := multipart.NewWriter(&body)

part, err := writer.CreateFormFile("file", "audio."+format)
if err != nil {
return nil, err
}
if _, err := part.Write(audioData); err != nil {
return nil, err
}

writer.WriteField("model", m.GetModel())
if lang != "" {
writer.WriteField("language", lang)
}
writer.Close()

req, err := http.NewRequest("POST", m.apiURL, &body)
if err != nil {
return nil, err
}

req.Header.Set("Authorization", "Bearer "+m.apiKey)
req.Header.Set("Content-Type", writer.FormDataContentType())

resp, err := m.client.Do(req)
if err != nil {
return nil, err
}

if resp.StatusCode != 200 {
return nil, fmt.Errorf("mistral API error %d: %s", resp.StatusCode, string(resp.Body))
}

var mResp struct {
Text string `json:"text"`
Language string `json:"language"`
Duration float64 `json:"duration"`
}
if err := json.Unmarshal(resp.Body, &mResp); err != nil {
return nil, fmt.Errorf("mistral response parse error: %w", err)
}

remaining := firstNonEmpty(resp.Header, "x-ratelimit-remaining-req-minute")
limit := firstNonEmpty(resp.Header, "x-ratelimit-limit-req-minute")
inferenceMs, _ := strconv.ParseFloat(firstNonEmpty(resp.Header, "x-envoy-upstream-service-time"), 64)

return &Result{
Text: mResp.Text,
Metrics: resp.Metrics,
RateLimit: remaining + "/" + limit,
Duration: mResp.Duration,
InferenceMs: inferenceMs,
}, nil
}
15 changes: 13 additions & 2 deletions transcriber/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,20 @@ func NewOpenAI(apiKey string) *OpenAI {
}
}

func (o *OpenAI) Name() string { return "openai" }
func (o *OpenAI) SupportedLanguages() []Language { return modelLanguages(OpenAIModels, o.GetModel()) }
func (o *OpenAI) Name() string { return "openai" }

var gpt4oTranscribeLangs = langsFromCodes([]string{
"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr",
"cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el",
"he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
"lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl",
"pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl",
"ta", "th", "tr", "uk", "ur", "vi", "cy",
})

var OpenAIModels = []ModelInfo{
{ID: "gpt-4o-transcribe", Label: "GPT-4o Transcribe", Stream: false},
{ID: "gpt-4o-transcribe", Label: "GPT-4o Transcribe", Stream: false, Languages: gpt4oTranscribeLangs},
}

func (o *OpenAI) Models() []ModelInfo { return OpenAIModels }
Expand Down
1 change: 1 addition & 0 deletions transcriber/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type BatchStats struct {
ConnReused bool
TLSProtocol string
Confidence float64
InferenceMs float64
}

type StreamStats struct {
Expand Down
Loading
Loading