From efce09331c0130f2177a41ba252f092803856040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Tue, 31 Mar 2026 19:16:28 +0300 Subject: [PATCH 1/2] add elevenlabs scribe v2 support --- main.go | 2 + transcriber/elevenlabs.go | 138 +++++++++++++++++++++++++++++++++++++ transcriber/transcriber.go | 42 +++++++---- tray/tray.go | 4 +- tray/tray_darwin.go | 6 +- 5 files changed, 170 insertions(+), 22 deletions(-) create mode 100644 transcriber/elevenlabs.go diff --git a/main.go b/main.go index b2a39c4..f292c71 100644 --- a/main.go +++ b/main.go @@ -361,6 +361,7 @@ func run() { openaiKey := os.Getenv("OPENAI_API_KEY") dgKey := os.Getenv("DEEPGRAM_API_KEY") mistralKey := os.Getenv("MISTRAL_API_KEY") + elevenLabsKey := os.Getenv("ELEVENLABS_API_KEY") type providerDef struct { name, label, key string @@ -372,6 +373,7 @@ func run() { {"openai", "OpenAI", openaiKey, transcriber.OpenAIModels, func() transcriber.Transcriber { return transcriber.NewOpenAI(openaiKey) }}, {"deepgram", "Deepgram", dgKey, transcriber.DeepgramModels, func() transcriber.Transcriber { return transcriber.NewDeepgram(dgKey) }}, {"mistral", "Mistral", mistralKey, transcriber.MistralModels, func() transcriber.Transcriber { return transcriber.NewMistral(mistralKey) }}, + {"elevenlabs", "ElevenLabs", elevenLabsKey, transcriber.ElevenLabsModels, func() transcriber.Transcriber { return transcriber.NewElevenLabs(elevenLabsKey) }}, } var trayModels []tray.Model diff --git a/transcriber/elevenlabs.go b/transcriber/elevenlabs.go new file mode 100644 index 0000000..b838ba4 --- /dev/null +++ b/transcriber/elevenlabs.go @@ -0,0 +1,138 @@ +package transcriber + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "mime/multipart" + "net/http" +) + +const ModelScribeV2 = "scribe_v2" + +var scribeV2Langs = langsFromCodes([]string{ + "af", "am", "ar", "hy", "as", "az", "be", "bn", "bs", "bg", + "my", "ca", "ny", "hr", "cs", "da", "nl", "en", "et", "fi", + "fr", "gl", "ka", "de", "el", "gu", "ha", 
"he", "hi", "hu", + "is", "ig", "id", "ga", "it", "ja", "jv", "kn", "kk", "km", + "ko", "ku", "ky", "lo", "lv", "ln", "lt", "lb", "mk", "ms", + "ml", "mt", "zh", "mi", "mr", "mn", "ne", "no", "oc", "or", + "ps", "fa", "pl", "pt", "pa", "ro", "ru", "sr", "sn", "sd", + "sk", "sl", "so", "es", "sw", "sv", "ta", "tg", "te", "th", + "tr", "uk", "ur", "uz", "vi", "cy", "wo", "xh", "zu", +}) + +var ElevenLabsModels = []ModelInfo{ + {ID: ModelScribeV2, Label: "Scribe V2", Stream: false, Languages: scribeV2Langs}, +} + +type ElevenLabs struct { + baseTranscriber + apiKey string +} + +func NewElevenLabs(apiKey string) *ElevenLabs { + apiURL := "https://api.elevenlabs.io/v1/speech-to-text" + return &ElevenLabs{ + baseTranscriber: baseTranscriber{ + client: NewTracedClient(apiURL), + apiURL: apiURL, + model: ModelScribeV2, + }, + apiKey: apiKey, + } +} + +func (e *ElevenLabs) SupportedLanguages() []Language { + return modelLanguages(ElevenLabsModels, e.GetModel()) +} +func (e *ElevenLabs) Name() string { return "elevenlabs" } +func (e *ElevenLabs) Models() []ModelInfo { return ElevenLabsModels } + +func (e *ElevenLabs) NewSession(_ context.Context, cfg SessionConfig) (Session, error) { + go e.client.Warm() + if cfg.Stream { + return nil, fmt.Errorf("elevenlabs does not support streaming transcription") + } + return newBatchSession(cfg, e.transcribe) +} + +type elevenLabsResponse struct { + Text string `json:"text"` + LanguageCode string `json:"language_code"` + LanguageProbability float64 `json:"language_probability"` + Words []struct { + Text string `json:"text"` + Type string `json:"type"` + Start float64 `json:"start"` + End float64 `json:"end"` + LogProb float64 `json:"logprob"` + } `json:"words"` +} + +func (e *ElevenLabs) transcribe(audioData []byte, format, lang string) (*Result, error) { + var body bytes.Buffer + writer := multipart.NewWriter(&body) + + part, err := writer.CreateFormFile("file", "audio."+format) + if err != nil { + return nil, err + } + if _, err := 
part.Write(audioData); err != nil { + return nil, err + } + + writer.WriteField("model_id", e.GetModel()) + if lang != "" { + writer.WriteField("language_code", lang) + } + writer.WriteField("tag_audio_events", "false") + writer.Close() + + req, err := http.NewRequest("POST", e.apiURL, &body) + if err != nil { + return nil, err + } + + req.Header.Set("xi-api-key", e.apiKey) + req.Header.Set("Content-Type", writer.FormDataContentType()) + + resp, err := e.client.Do(req) + if err != nil { + return nil, err + } + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("elevenlabs API error %d: %s", resp.StatusCode, string(resp.Body)) + } + + var elResp elevenLabsResponse + if err := json.Unmarshal(resp.Body, &elResp); err != nil { + return nil, fmt.Errorf("elevenlabs response parse error: %w", err) + } + + var avgLogProb float64 + var wordCount int + var duration float64 + for _, w := range elResp.Words { + if w.Type == "word" { + avgLogProb += w.LogProb + wordCount++ + if w.End > duration { + duration = w.End + } + } + } + if wordCount > 0 { + avgLogProb /= float64(wordCount) + } + + return &Result{ + Text: elResp.Text, + Metrics: resp.Metrics, + Confidence: elResp.LanguageProbability, + AvgLogProb: avgLogProb, + Duration: duration, + }, nil +} diff --git a/transcriber/transcriber.go b/transcriber/transcriber.go index 0221579..a46a00b 100644 --- a/transcriber/transcriber.go +++ b/transcriber/transcriber.go @@ -84,21 +84,29 @@ type Transcriber interface { // langLabels maps ISO-639-1 codes to display names. 
var langLabels = map[string]string{ - "af": "Afrikaans", "ar": "Arabic", "hy": "Armenian", "az": "Azerbaijani", - "be": "Belarusian", "bs": "Bosnian", "bg": "Bulgarian", "ca": "Catalan", - "zh": "Chinese", "hr": "Croatian", "cs": "Czech", "da": "Danish", - "nl": "Dutch", "en": "English", "et": "Estonian", "fi": "Finnish", - "fr": "French", "gl": "Galician", "de": "German", "el": "Greek", + "af": "Afrikaans", "am": "Amharic", "ar": "Arabic", "hy": "Armenian", + "as": "Assamese", "az": "Azerbaijani", "be": "Belarusian", "bn": "Bengali", + "bs": "Bosnian", "bg": "Bulgarian", "my": "Burmese", "ca": "Catalan", + "ny": "Chichewa", "zh": "Chinese", "hr": "Croatian", "cs": "Czech", + "da": "Danish", "nl": "Dutch", "en": "English", "et": "Estonian", + "fi": "Finnish", "fr": "French", "gl": "Galician", "ka": "Georgian", + "de": "German", "el": "Greek", "gu": "Gujarati", "ha": "Hausa", "he": "Hebrew", "hi": "Hindi", "hu": "Hungarian", "is": "Icelandic", - "id": "Indonesian", "it": "Italian", "ja": "Japanese", "kn": "Kannada", - "kk": "Kazakh", "ko": "Korean", "lv": "Latvian", "lt": "Lithuanian", - "mk": "Macedonian", "ms": "Malay", "mr": "Marathi", "mi": "Maori", - "ne": "Nepali", "no": "Norwegian", "fa": "Persian", "pl": "Polish", - "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "sr": "Serbian", - "sk": "Slovak", "sl": "Slovenian", "es": "Spanish", "sw": "Swahili", - "sv": "Swedish", "tl": "Tagalog", "ta": "Tamil", "th": "Thai", - "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "vi": "Vietnamese", - "cy": "Welsh", + "ig": "Igbo", "id": "Indonesian", "ga": "Irish", "it": "Italian", + "ja": "Japanese", "jv": "Javanese", "kn": "Kannada", "kk": "Kazakh", + "km": "Khmer", "ko": "Korean", "ku": "Kurdish", "ky": "Kyrgyz", + "lo": "Lao", "lv": "Latvian", "ln": "Lingala", "lt": "Lithuanian", + "lb": "Luxembourgish", "mk": "Macedonian", "ms": "Malay", "ml": "Malayalam", + "mt": "Maltese", "mi": "Maori", "mr": "Marathi", "mn": "Mongolian", + "ne": "Nepali", "no": "Norwegian", 
"oc": "Occitan", "or": "Odia", + "ps": "Pashto", "fa": "Persian", "pl": "Polish", "pt": "Portuguese", + "pa": "Punjabi", "ro": "Romanian", "ru": "Russian", "sr": "Serbian", + "sn": "Shona", "sd": "Sindhi", "sk": "Slovak", "sl": "Slovenian", + "so": "Somali", "es": "Spanish", "sw": "Swahili", "sv": "Swedish", + "ta": "Tamil", "tg": "Tajik", "te": "Telugu", "th": "Thai", + "tl": "Tagalog", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", + "uz": "Uzbek", "vi": "Vietnamese", "cy": "Welsh", "wo": "Wolof", + "xh": "Xhosa", "zu": "Zulu", } func langsFromCodes(codes []string) []Language { @@ -170,6 +178,7 @@ func New() (Transcriber, error) { openaiKey := os.Getenv("OPENAI_API_KEY") groqKey := os.Getenv("GROQ_API_KEY") mistralKey := os.Getenv("MISTRAL_API_KEY") + elevenLabsKey := os.Getenv("ELEVENLABS_API_KEY") if dgKey != "" { return NewDeepgram(dgKey), nil @@ -183,6 +192,9 @@ func New() (Transcriber, error) { if mistralKey != "" { return NewMistral(mistralKey), nil } + if elevenLabsKey != "" { + return NewElevenLabs(elevenLabsKey), nil + } - return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, or MISTRAL_API_KEY environment variable") + return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, MISTRAL_API_KEY, or ELEVENLABS_API_KEY environment variable") } diff --git a/tray/tray.go b/tray/tray.go index 559b667..e353f8b 100644 --- a/tray/tray.go +++ b/tray/tray.go @@ -145,9 +145,9 @@ func statusText() string { lang = langCode } if provider == "" { - return "n/a" + return "𝘻𝘦𝘦" } - return provider + " · " + model + " · " + lang + return "𝘻𝘦𝘦 — " + provider + " · " + model + " · " + lang } func updateStatus() { diff --git a/tray/tray_darwin.go b/tray/tray_darwin.go index 203deb2..e7e72b5 100644 --- a/tray/tray_darwin.go +++ b/tray/tray_darwin.go @@ -48,7 +48,7 @@ func updateRecordingIcon(rec bool) { if rec { systray.SetIcon(iconRecHi) if mRecord != nil { - mRecord.SetTitle("🔴 Stop Recording 
(Shift+Control+Space)") + mRecord.SetTitle("● Stop Recording (Shift+Control+Space)") } } else { systray.SetTemplateIcon(iconIdleHi, iconIdle) @@ -176,8 +176,6 @@ func onReady() { } }) - systray.AddSeparator() - mCopy = systray.AddMenuItem("Copy Last Recorded Text", "Copy last transcription to clipboard") mCopy.Disable() mCopy.Click(func() { @@ -186,8 +184,6 @@ func onReady() { } }) - systray.AddSeparator() - mSettings = systray.AddMenuItem("Settings", "Settings") mDevices = mSettings.AddSubMenuItem("Devices", "Select input device") From 032fe1c66fb0d860f23a52a2ec90a8f510e04daf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Tue, 31 Mar 2026 19:22:43 +0300 Subject: [PATCH 2/2] tweak tray --- tray/tray_darwin.go | 52 ++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/tray/tray_darwin.go b/tray/tray_darwin.go index e7e72b5..fb5c68a 100644 --- a/tray/tray_darwin.go +++ b/tray/tray_darwin.go @@ -186,30 +186,6 @@ func onReady() { mSettings = systray.AddMenuItem("Settings", "Settings") - mDevices = mSettings.AddSubMenuItem("Devices", "Select input device") - - deviceMu.Lock() - mDefaultDevice = mDevices.AddSubMenuItemCheckbox("System Default", "Use system default device", deviceSel == "") - mDefaultDevice.Click(func() { - deviceMu.Lock() - cb := deviceCb - deviceMu.Unlock() - if cb != nil { - cb("") - } - deviceMu.Lock() - for _, it := range deviceItems { - it.Uncheck() - } - mDefaultDevice.Check() - deviceMu.Unlock() - }) - deviceItems = make([]*systray.MenuItem, 0, len(deviceNames)) - for i, name := range deviceNames { - item := addDeviceItem(mDevices, i, name, name == deviceSel) - deviceItems = append(deviceItems, item) - } - deviceMu.Unlock() mAutoPaste = mSettings.AddSubMenuItemCheckbox("Auto-paste", "Auto-paste transcribed text", autoPasteOn) mAutoPaste.Click(func() { if mAutoPaste.Checked() { @@ -237,6 +213,34 @@ func onReady() { } }) + sep := 
mSettings.AddSubMenuItem("─────────", "") + sep.Disable() + + mDevices = mSettings.AddSubMenuItem("Microphone", "Select input device") + + deviceMu.Lock() + mDefaultDevice = mDevices.AddSubMenuItemCheckbox("System Default", "Use system default device", deviceSel == "") + mDefaultDevice.Click(func() { + deviceMu.Lock() + cb := deviceCb + deviceMu.Unlock() + if cb != nil { + cb("") + } + deviceMu.Lock() + for _, it := range deviceItems { + it.Uncheck() + } + mDefaultDevice.Check() + deviceMu.Unlock() + }) + deviceItems = make([]*systray.MenuItem, 0, len(deviceNames)) + for i, name := range deviceNames { + item := addDeviceItem(mDevices, i, name, name == deviceSel) + deviceItems = append(deviceItems, item) + } + deviceMu.Unlock() + modelMu.Lock() if len(models) > 0 { mBackend = mSettings.AddSubMenuItem("Model", "Select transcription model")