BasedHardware · Git-on-my-level · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -108,7 +108,7 @@ jobs:
         if: steps.changed.outputs.has_frontend == 'true' || steps.changed.outputs.has_personas == 'true'
         uses: actions/setup-node@v4
         with:
-          node-version: '18'
+          node-version: '22'
           cache: 'npm'
           cache-dependency-path: |
             web/frontend/package-lock.json

diff --git a/.gitignore b/.gitignore
@@ -9,7 +9,9 @@ myenv/
 venv/
 .DS_Store
 dump/
-/scripts/
+/scripts/*
+!/scripts/desktop_assemblyai_e2e.py
+!/scripts/ASSEMBLYAI_BACKGROUND_E2E_AGENT_PROMPT.md
 *.zip
 *.wav
 node_modules
@@ -27,10 +29,15 @@ yarn.lock
 .packages
 .pub-cache/
 .pub/
+.swiftpm/
+
+# Build / compile artifacts (cross-ecosystem; prefer these over per-tool *.o/*.d rules)
 build/
 dist/
 .build/
-.swiftpm/
+target/
+**/.build-agent-*/
+**/.build-hybrid-*/
 
 # VS Code
 .vscode/*

diff --git a/AGENTS.md b/AGENTS.md
@@ -49,7 +49,8 @@ backend (main.py)
   ├── ws ──► pusher (pusher/)
   ├── ──────► diarizer (diarizer/)
   ├── ──────► vad (modal/)
-  └── ──────► deepgram (self-hosted or cloud)
+  ├── ──────► deepgram (self-hosted or cloud)
+  └── ──────► assemblyai (cloud, background async when enabled)
 
 pusher
   ├── ──────► diarizer (diarizer/)
@@ -63,12 +64,13 @@ notifications-job (modal/job.py)  [cron]
 
 Helm charts: `backend/charts/{backend-listen,pusher,diarizer,vad,deepgram-self-hosted,agent-proxy}/`
 
-- **backend** (`main.py`) — REST API. Streams audio to pusher via WebSocket (`utils/pusher.py`). Calls diarizer for speaker embeddings (`utils/stt/speaker_embedding.py`). Calls vad for voice activity detection and speaker identification (`utils/stt/vad.py`, `utils/stt/speech_profile.py`). Calls deepgram for STT (`utils/stt/streaming.py`).
+- **backend** (`main.py`) — REST API. Streams realtime/listen audio to pusher via WebSocket (`utils/pusher.py`). Calls diarizer for speaker embeddings (`utils/stt/speaker_embedding.py`). Calls vad for voice activity detection and speaker identification (`utils/stt/vad.py`, `utils/stt/speech_profile.py`). Calls deepgram for realtime and Hold-to-Talk STT (`utils/stt/streaming.py`) and for prerecorded fallback. Calls AssemblyAI for explicitly enabled async/background prerecorded workloads through `utils/stt/provider_service.py`.
 - **pusher** (`pusher/main.py`) — Receives audio via binary WebSocket protocol. Calls diarizer and deepgram for speaker sample extraction (`utils/speaker_identification.py` → `utils/speaker_sample.py`).
 - **agent-proxy** (`agent-proxy/main.py`) — GKE. WebSocket proxy at `wss://agent.omi.me/v1/agent/ws`. Validates Firebase ID token, looks up `agentVm` in Firestore, proxies bidirectionally to VM's `ws://<ip>:8080/ws`. VM credentials never leave the server.
 - **diarizer** (`diarizer/main.py`) — GPU. Speaker embeddings at `/v2/embedding`. Called by backend and pusher (`HOSTED_SPEAKER_EMBEDDING_API_URL`).
 - **vad** (`modal/main.py`) — GPU. `/v1/vad` (voice activity detection) and `/v1/speaker-identification` (speaker matching). Called by backend only (`HOSTED_VAD_API_URL`, `HOSTED_SPEECH_PROFILE_API_URL`).
-- **deepgram** — STT. Streaming uses self-hosted (`DEEPGRAM_SELF_HOSTED_URL`) or cloud based on `DEEPGRAM_SELF_HOSTED_ENABLED` (`utils/stt/streaming.py`). Pre-recorded always uses Deepgram cloud (`utils/stt/pre_recorded.py`). Called by backend and pusher.
+- **deepgram** — STT. Streaming uses self-hosted (`DEEPGRAM_SELF_HOSTED_URL`) or cloud based on `DEEPGRAM_SELF_HOSTED_ENABLED` (`utils/stt/streaming.py`). Prerecorded Deepgram cloud remains the default and fallback provider through `utils/stt/provider_service.py`/`utils/stt/pre_recorded.py`. Called by backend and pusher.
+- **assemblyai** — Async/background STT. Used only by backend for feature-flagged prerecorded workloads (`sync`, `background`, `postprocess`) through `utils/stt/assemblyai_adapter.py`; provider speaker labels remain session-local metadata.
 - **notifications-job** (`modal/job.py`) — Cron job, reads Firestore/Redis, sends push notifications.
 
 Keep this map up to date. When adding, removing, or changing inter-service calls, update this section and the matching section in `CLAUDE.md`.

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -52,7 +52,8 @@ backend (main.py)
   ├── ws ──► pusher (pusher/)
   ├── ──────► diarizer (diarizer/)
   ├── ──────► vad (modal/)
-  └── ──────► deepgram (self-hosted or cloud)
+  ├── ──────► deepgram (self-hosted or cloud)
+  └── ──────► assemblyai (cloud, background async when enabled)
 
 pusher
   ├── ──────► diarizer (diarizer/)

diff --git a/app/lib/backend/schema/transcript_segment.dart b/app/lib/backend/schema/transcript_segment.dart
@@ -33,6 +33,13 @@ class TranscriptSegment {
   List<Translation> translations = [];
   bool speechProfileProcessed;
   String? sttProvider;
+  String? sttModel;
+  String? providerClusterId;
+  String? providerSpeakerLabel;
+  String speakerIdentityState;
+  double? speakerIdentityConfidence;
+  String? speakerIdentitySource;
+  String? speakerIdentityVersion;
 
   TranscriptSegment({
     required this.id,
@@ -45,11 +52,22 @@ class TranscriptSegment {
     required this.translations,
     this.speechProfileProcessed = true,
     this.sttProvider,
+    this.sttModel,
+    this.providerClusterId,
+    this.providerSpeakerLabel,
+    this.speakerIdentityState = 'legacy_ambiguous',
+    this.speakerIdentityConfidence,
+    this.speakerIdentitySource,
+    this.speakerIdentityVersion,
   }) {
     final parts = speaker?.split('_') ?? [];
     speakerId = parts.length > 1 ? (int.tryParse(parts[1]) ?? 0) : 0;
   }
 
+  bool get hasExplicitUnknownSpeakerIdentity => speakerIdentityState == 'unknown';
+
+  bool get hasLegacySpeakerLabel => speaker != null && speaker!.isNotEmpty;
+
   @override
   String toString() {
     return 'TranscriptSegment: {id: $id text: $text, speaker: $speakerId, isUser: $isUser, start: $start, end: $end}';
@@ -66,14 +84,23 @@ class TranscriptSegment {
     return TranscriptSegment(
       id: (json['id'] ?? '') as String,
       text: json['text'] as String,
-      speaker: (json['speaker'] ?? 'SPEAKER_00') as String,
+      speaker: json['speaker'] as String?,
       isUser: (json['is_user'] ?? false) as bool,
       personId: json['person_id'],
       start: double.tryParse(json['start'].toString()) ?? 0.0,
       end: double.tryParse(json['end'].toString()) ?? 0.0,
       translations: json['translations'] != null ? Translation.fromJsonList(json['translations'] as List<dynamic>) : [],
       speechProfileProcessed: (json['speech_profile_processed'] ?? true) as bool,
       sttProvider: json['stt_provider'] as String?,
+      sttModel: json['stt_model'] as String?,
+      providerClusterId: json['provider_cluster_id'] as String?,
+      providerSpeakerLabel: json['provider_speaker_label'] as String?,
+      speakerIdentityState: (json['speaker_identity_state'] ?? 'legacy_ambiguous') as String,
+      speakerIdentityConfidence: json['speaker_identity_confidence'] != null
+          ? double.tryParse(json['speaker_identity_confidence'].toString())
+          : null,
+      speakerIdentitySource: json['speaker_identity_source'] as String?,
+      speakerIdentityVersion: json['speaker_identity_version'] as String?,
     );
   }
 
@@ -88,6 +115,13 @@ class TranscriptSegment {
       'end': end,
       'translations': translations.map((t) => t.toJson()).toList(),
       if (sttProvider != null) 'stt_provider': sttProvider,
+      if (sttModel != null) 'stt_model': sttModel,
+      if (providerClusterId != null) 'provider_cluster_id': providerClusterId,
+      if (providerSpeakerLabel != null) 'provider_speaker_label': providerSpeakerLabel,
+      'speaker_identity_state': speakerIdentityState,
+      if (speakerIdentityConfidence != null) 'speaker_identity_confidence': speakerIdentityConfidence,
+      if (speakerIdentitySource != null) 'speaker_identity_source': speakerIdentitySource,
+      if (speakerIdentityVersion != null) 'speaker_identity_version': speakerIdentityVersion,
     };
   }
 
@@ -195,7 +229,7 @@ class TranscriptSegment {
         if (segment.personId != null && peopleMap.containsKey(segment.personId)) {
           speakerName = peopleMap[segment.personId]!;
         } else {
-          var displayId = '${getDisplaySpeakerId(segment.speakerId, segments)}';
+          var displayId = getDisplaySpeakerIdForSegment(segment, segments);
           speakerName = speakerLabelBuilder != null ? speakerLabelBuilder(displayId) : 'Speaker $displayId';
         }
         transcript += '$timestampStr $speakerName: $segmentText ';
@@ -242,4 +276,25 @@ class TranscriptSegment {
     // Normalize: subtract minimum and add 1 to make it 1-indexed
     return speakerId - minSpeakerId + 1;
   }
+
+  static String getDisplaySpeakerIdForSegment(TranscriptSegment segment, List<TranscriptSegment> segments) {
+    if (segment.hasLegacySpeakerLabel) {
+      return '${getDisplaySpeakerId(segment.speakerId, segments)}';
+    }
+
+    final providerClusterId = segment.providerClusterId;
+    if (providerClusterId != null && providerClusterId.isNotEmpty) {
+      final clusterIds = <String>[];
+      for (final item in segments) {
+        final clusterId = item.providerClusterId;
+        if (!item.isUser && clusterId != null && clusterId.isNotEmpty && !clusterIds.contains(clusterId)) {
+          clusterIds.add(clusterId);
+        }
+      }
+      final index = clusterIds.indexOf(providerClusterId);
+      if (index >= 0) return '${index + 1}';
+    }
+
+    return '?';
+  }
 }
diff --git a/app/lib/pages/conversation_detail/page.dart b/app/lib/pages/conversation_detail/page.dart
@@ -1456,7 +1456,7 @@ class _TranscriptWidgetsState extends State<TranscriptWidgets> with AutomaticKee
                 final person =
                     segment.personId != null ? SharedPreferencesUtil().getPersonById(segment.personId!) : null;
                 final speakerName = person?.name ??
-                    context.l10n.speakerWithId('${TranscriptSegment.getDisplaySpeakerId(segment.speakerId, segments)}');
+                    context.l10n.speakerWithId(TranscriptSegment.getDisplaySpeakerIdForSegment(segment, segments));
                 PlatformManager.instance.analytics.editSegmentTextStarted();
                 bool saved = false;
                 showEditSegmentBottomSheet(

diff --git a/app/lib/widgets/transcript.dart b/app/lib/widgets/transcript.dart
@@ -508,7 +508,7 @@ class _TranscriptWidgetState extends State<TranscriptWidget> {
                                   ? 'omi'
                                   : (person?.name ??
                                       context.l10n.speakerWithId(
-                                        '${TranscriptSegment.getDisplaySpeakerId(data.speakerId, widget.segments)}',
+                                        TranscriptSegment.getDisplaySpeakerIdForSegment(data, widget.segments),
                                       )),
                               style: TextStyle(
                                 color: data.speakerId == omiSpeakerId || person != null

diff --git a/app/test/widgets/transcript_test.dart b/app/test/widgets/transcript_test.dart
@@ -42,6 +42,63 @@ void main() {
     );
   }
 
+  group('TranscriptSegment compatibility metadata', () {
+    test('decodes provider speaker metadata without changing legacy fields', () {
+      final segment = TranscriptSegment.fromJson({
+        'id': 'seg-provider',
+        'text': 'Hello',
+        'speaker': null,
+        'speaker_id': 0,
+        'is_user': false,
+        'person_id': null,
+        'start': 0.0,
+        'end': 1.0,
+        'stt_provider': 'provider-a',
+        'stt_model': 'async-large',
+        'provider_cluster_id': 'speaker-alpha',
+        'provider_speaker_label': null,
+        'speaker_identity_state': 'unknown',
+        'speaker_identity_confidence': null,
+        'speaker_identity_source': null,
+        'speaker_identity_version': 'v1',
+      });
+
+      expect(segment.speaker, isNull);
+      expect(segment.speakerId, 0);
+      expect(segment.sttProvider, 'provider-a');
+      expect(segment.sttModel, 'async-large');
+      expect(segment.providerClusterId, 'speaker-alpha');
+      expect(segment.speakerIdentityState, 'unknown');
+      expect(segment.speakerIdentityVersion, 'v1');
+    });
+
+    test('uses provider cluster labels when legacy speaker label is absent', () {
+      final first = TranscriptSegment.fromJson({
+        'id': 'seg-provider-a',
+        'text': 'Hello',
+        'speaker': null,
+        'is_user': false,
+        'start': 0.0,
+        'end': 1.0,
+        'provider_cluster_id': 'speaker-alpha',
+        'speaker_identity_state': 'unknown',
+      });
+      final second = TranscriptSegment.fromJson({
+        'id': 'seg-provider-b',
+        'text': 'Hi',
+        'speaker': null,
+        'is_user': false,
+        'start': 1.0,
+        'end': 2.0,
+        'provider_cluster_id': 'speaker-beta',
+        'speaker_identity_state': 'unknown',
+      });
+
+      expect(TranscriptSegment.getDisplaySpeakerIdForSegment(first, [first, second]), '1');
+      expect(TranscriptSegment.getDisplaySpeakerIdForSegment(second, [first, second]), '2');
+    });
+  });
+
   group('Speaker label display', () {
     testWidgets('shows person name when personId is set and in cache', (tester) async {
       final now = DateTime.now();

diff --git a/backend/.env.template b/backend/.env.template
@@ -14,6 +14,17 @@ REDIS_DB_PASSWORD=
 
 DEEPGRAM_API_KEY=
 
+# AssemblyAI async prerecorded STT. Enabled by default for eligible workloads: sync, background, postprocess.
+ASSEMBLYAI_API_KEY=
+ASSEMBLYAI_PRERECORDED_STT_ENABLED=true
+ASSEMBLYAI_PRERECORDED_STT_WORKLOADS=sync,background,postprocess
+ASSEMBLYAI_PRERECORDED_STT_FALLBACK_ENABLED=true
+ASSEMBLYAI_STT_MODEL=universal-2
+ASSEMBLYAI_BASE_URL=https://api.assemblyai.com
+ASSEMBLYAI_POLL_INTERVAL_SECONDS=3
+ASSEMBLYAI_MAX_POLL_SECONDS=900
+ASSEMBLYAI_SMOKE_AUDIO_URL=
+
 ADMIN_KEY=
 OPENAI_API_KEY=
 

diff --git a/backend/database/conversations.py b/backend/database/conversations.py
@@ -874,6 +874,36 @@ def update_conversation_segments(
         return
 
 
+def update_conversation_segments_and_background_chunks(
+    uid: str,
+    conversation_id: str,
+    segments: List[dict],
+    background_processed_chunks: Dict[str, dict],
+    finished_at: datetime = None,
+    data_protection_level: str = None,
+):
+    doc_ref = db.collection('users').document(uid).collection(conversations_collection).document(conversation_id)
+    if data_protection_level is not None:
+        doc_level = data_protection_level
+    else:
+        doc_snapshot = doc_ref.get(field_paths=['data_protection_level'])
+        if not doc_snapshot.exists:
+            return
+        doc_level = doc_snapshot.to_dict().get('data_protection_level', 'standard')
+    update_payload = {
+        'transcript_segments': segments,
+        'background_processed_chunks': background_processed_chunks,
+    }
+    if finished_at:
+        update_payload['finished_at'] = finished_at
+    prepared_payload = _prepare_conversation_for_write(update_payload, uid, doc_level)
+    try:
+        doc_ref.update(prepared_payload)
+    except NotFound:
+        # Document was deleted between cache read and write — safe to skip
+        return
+
+
 # ***********************************
 # ********** VISIBILITY *************
 # ***********************************