r3dbars · r3dbars · Jun 16, 2026 · Jun 13, 2026 · Jun 14, 2026 · Jun 15, 2026
diff --git a/Sources/TranscriptedCore/Pipeline/TranscriptionPipeline.swift b/Sources/TranscriptedCore/Pipeline/TranscriptionPipeline.swift
@@ -144,8 +144,9 @@ extension Transcription {
             let rawSegments = try await diarization.diarizeOffline(samples: systemSamples, sampleRate: 16000)
 
             // Post-process diarization segments, but skip the broad pairwise merge
-            // phase for PyAnnote/VBx output. Small-cluster absorption and DB-informed
-            // split still run for noise cleanup and known-speaker corrections.
+            // phase for PyAnnote/VBx output. Small-cluster absorption, same-voice
+            // consolidation (collapses one over-segmented voice so the user names
+            // each person once), and DB-informed split still run.
             let existingProfiles = speakerDB.allSpeakers()
             let speakerSegments = EmbeddingClusterer.postProcess(
                 segments: rawSegments,

diff --git a/Sources/TranscriptedCore/Speaker/EmbeddingClusterer.swift b/Sources/TranscriptedCore/Speaker/EmbeddingClusterer.swift
@@ -1,5 +1,5 @@
 // EmbeddingClusterer.swift
-// Post-processes diarization speaker segments to fix two failure modes:
+// Post-processes diarization speaker segments to fix three failure modes:
 //
 // Supports both Sortformer (streaming) and PyAnnote (offline) pipelines.
 //
@@ -9,7 +9,15 @@
 //    Note: Skipped for PyAnnote offline output, where VBx clustering already
 //    handles speaker merging/fragmentation.
 //
-// 2. Merging: Different speakers collapsed into one diarizer ID.
+// 2. Over-segmentation: One real voice split across several clusters that each
+//    accumulate enough speech to survive small-cluster absorption. This is why
+//    a one-on-one call can surface 4-7 "speakers" to name for a single person.
+//    Fixed by same-voice consolidation — agglomeratively merge clusters whose
+//    mean embeddings are as similar as the "same known person" auto-accept bar,
+//    recomputing centroids after each merge so genuinely distinct speakers in a
+//    crowded meeting do not chain-collapse.
+//
+// 3. Merging: Different speakers collapsed into one diarizer ID.
 //    Fixed by DB-informed split — compare per-segment embeddings against
 //    known speaker profiles and split clusters that contain 2+ distinct voices.
 
@@ -18,19 +26,42 @@ import Accelerate
 
 public enum EmbeddingClusterer {
 
+    /// Cosine-similarity bar for the same-voice consolidation pass. Must equal
+    /// `SpeakerNamingPolicy.autoAcceptSimilarityThreshold` (0.88): consolidation
+    /// should only collapse two clusters into one person when they are at least as
+    /// similar as we'd demand to silently auto-accept them as the same known person.
+    /// `EmbeddingClustererTests.testConsolidationThresholdMatchesAutoAcceptBar`
+    /// asserts the two stay equal, so changing one without the other fails CI
+    /// instead of silently drifting.
+    public static let sameVoiceConsolidationThreshold: Float = 0.88
+
+    /// Lower bar used only to detect "these centroids may belong to different
+    /// known speakers" before consolidation. This mirrors the lowest adaptive
+    /// profile-match threshold used by `TranscriptionPipeline`, so we preserve
+    /// plausible known-speaker conflicts for the later naming/review path.
+    private static let knownProfileConflictThreshold: Float = 0.70
+
     /// Post-process diarization segments: merge fragmented speakers,
     /// absorb tiny orphan clusters, then split clusters that contain
     /// multiple known DB voices.
     ///
     /// - Parameter pairwiseMergeThreshold: Cosine similarity threshold for merging
     ///   fragmented speaker clusters. Pass `nil` to skip only the pairwise merge
-    ///   phase; small-cluster absorption and DB-informed split still run.
+    ///   phase; small-cluster absorption, same-voice consolidation, and
+    ///   DB-informed split still run.
     ///   Sortformer default: 0.85 (conservative). Offline PyAnnote callers pass
     ///   `nil` because VBx already handles the base merge/fragmentation case.
+    /// - Parameter consolidationThreshold: Cosine similarity threshold for the
+    ///   same-voice consolidation pass that collapses over-segmented large
+    ///   clusters of one speaker. Pass `nil` to skip it. Defaults to the
+    ///   `SpeakerNamingPolicy` auto-accept bar (0.88) so two clusters only merge
+    ///   when they are more similar than we'd demand to auto-accept them as the
+    ///   same known person.
     public static func postProcess(
         segments: [SpeakerSegment],
         existingProfiles: [SpeakerProfile],
-        pairwiseMergeThreshold: Float? = 0.85
+        pairwiseMergeThreshold: Float? = 0.85,
+        consolidationThreshold: Float? = sameVoiceConsolidationThreshold
     ) -> [SpeakerSegment] {
         guard segments.count >= 2 else { return segments }
         var result: [SpeakerSegment]
@@ -40,6 +71,13 @@ public enum EmbeddingClusterer {
             result = segments
         }
         result = absorbSmallClusters(segments: result)
+        if let consolidationThreshold {
+            result = consolidateSameVoiceClusters(
+                segments: result,
+                threshold: consolidationThreshold,
+                existingProfiles: existingProfiles
+            )
+        }
         result = dbInformedSplit(segments: result, profiles: existingProfiles)
         return result
     }
@@ -263,6 +301,150 @@ public enum EmbeddingClusterer {
         }
     }
 
+    // MARK: - Same-Voice Consolidation
+
+    /// Consolidate clusters that are almost certainly the same voice, even when
+    /// each cluster is large enough to survive `absorbSmallClusters`.
+    ///
+    /// Offline VBx clustering sometimes splits one remote participant across
+    /// several speaker IDs that each accumulate well over `minClusterDuration`
+    /// of speech. `absorbSmallClusters` never touches them because it only folds
+    /// short clusters into large ones, so a one-on-one call can surface 4-7
+    /// "speakers" the user has to name for a single person.
+    ///
+    /// This pass compares the mean embedding of every surviving cluster pair and
+    /// merges those above `threshold`. Two safeguards keep genuine
+    /// multi-speaker meetings intact:
+    /// - The threshold is high (0.88 by default — the `SpeakerNamingPolicy`
+    ///   auto-accept bar). Distinct speakers rarely exceed ~0.6 cosine
+    ///   similarity, so only near-identical voices merge.
+    /// - Merging is agglomerative with recomputed centroids: after A and B
+    ///   merge, the combined centroid must still clear `threshold` against C
+    ///   before C joins. This avoids the transitive A≈B, B≈C → A+B+C collapse
+    ///   that made the broad pairwise merge unsafe on VBx output.
+    static func consolidateSameVoiceClusters(
+        segments: [SpeakerSegment],
+        threshold: Float = sameVoiceConsolidationThreshold,
+        existingProfiles: [SpeakerProfile] = []
+    ) -> [SpeakerSegment] {
+        let distinctIds = Set(segments.map { $0.speakerId })
+        guard distinctIds.count >= 2 else { return segments }
+
+        // Collect embeddings per speaker. Prefer quality-filtered samples but
+        // fall back to all samples so every cluster has a centroid to compare.
+        var qualityEmbeddings: [Int: [[Float]]] = [:]
+        var allEmbeddings: [Int: [[Float]]] = [:]
+        for segment in segments {
+            guard let embedding = segment.embedding, !embedding.isEmpty else { continue }
+            allEmbeddings[segment.speakerId, default: []].append(embedding)
+            if segment.qualityScore >= 0.3, segment.duration >= 1.0 {
+                qualityEmbeddings[segment.speakerId, default: []].append(embedding)
+            }
+        }
+
+        // Live clusters: the raw embeddings backing each centroid, so we can
+        // recompute the centroid after every merge.
+        var clusterEmbeddings: [Int: [[Float]]] = [:]
+        for id in distinctIds {
+            let quality = qualityEmbeddings[id] ?? []
+            let embeddings = quality.isEmpty ? (allEmbeddings[id] ?? []) : quality
+            if !embeddings.isEmpty {
+                clusterEmbeddings[id] = embeddings
+            }
+        }
+        guard clusterEmbeddings.count >= 2 else { return segments }
+
+        var centroids: [Int: [Float]] = [:]
+        for (id, embeddings) in clusterEmbeddings {
+            centroids[id] = Transcription.computeMeanEmbedding(embeddings)
+        }
+
+        // old speaker ID → canonical surviving ID (identity to start).
+        var mergeMap: [Int: Int] = [:]
+        for id in clusterEmbeddings.keys { mergeMap[id] = id }
+
+        // Repeatedly merge the single most-similar pair above threshold,
+        // recomputing the merged centroid each round until nothing qualifies.
+        while centroids.count >= 2 {
+            let liveIds = centroids.keys.sorted()
+            var bestSim = threshold
+            var bestPair: (keep: Int, drop: Int)?
+            for i in 0..<liveIds.count {
+                for j in (i + 1)..<liveIds.count {
+                    let a = liveIds[i], b = liveIds[j]
+                    guard let ea = centroids[a], let eb = centroids[b] else { continue }
+                    let sim = Float(Transcription.cosineSimilarityStatic(ea, eb))
+                    if hasKnownProfileConflict(
+                        ea,
+                        eb,
+                        profiles: existingProfiles
+                    ) {
+                        continue
+                    }
+                    if sim > bestSim {
+                        bestSim = sim
+                        bestPair = (keep: a, drop: b)  // liveIds sorted, so a < b
+                    }
+                }
+            }
+
+            guard let pair = bestPair else { break }
+            clusterEmbeddings[pair.keep, default: []].append(contentsOf: clusterEmbeddings[pair.drop] ?? [])
+            clusterEmbeddings[pair.drop] = nil
+            centroids[pair.keep] = Transcription.computeMeanEmbedding(clusterEmbeddings[pair.keep] ?? [])
+            centroids[pair.drop] = nil
+            for (old, canonical) in mergeMap where canonical == pair.drop {
+                mergeMap[old] = pair.keep
+            }
+            AppLogger.transcription.info("Consolidated same-voice clusters", [
+                "merged": "spk\(pair.drop)",
+                "into": "spk\(pair.keep)",
+                "similarity": String(format: "%.3f", bestSim)
+            ])
+        }
+
+        guard mergeMap.contains(where: { $0.key != $0.value }) else { return segments }
+
+        return segments.map { segment in
+            let newId = mergeMap[segment.speakerId] ?? segment.speakerId
+            guard newId != segment.speakerId else { return segment }
+            return SpeakerSegment(
+                speakerId: newId,
+                startTime: segment.startTime,
+                endTime: segment.endTime,
+                embedding: segment.embedding,
+                qualityScore: segment.qualityScore
+            )
+        }
+    }
+
+    private static func hasKnownProfileConflict(
+        _ lhs: [Float],
+        _ rhs: [Float],
+        profiles: [SpeakerProfile]
+    ) -> Bool {
+        guard !profiles.isEmpty else { return false }
+        let lhsMatches = knownProfileMatches(for: lhs, profiles: profiles)
+        let rhsMatches = knownProfileMatches(for: rhs, profiles: profiles)
+        guard !lhsMatches.isEmpty, !rhsMatches.isEmpty else { return false }
+
+        return lhsMatches.union(rhsMatches).count > 1
+    }
+
+    private static func knownProfileMatches(
+        for embedding: [Float],
+        profiles: [SpeakerProfile]
+    ) -> Set<UUID> {
+        Set(profiles.compactMap { profile in
+            guard profile.disputeCount == 0,
+                  profile.embedding.count == embedding.count else {
+                return nil
+            }
+            let similarity = Float(Transcription.cosineSimilarityStatic(embedding, profile.embedding))
+            return similarity >= knownProfileConflictThreshold ? profile.id : nil
+        })
+    }
+
     // MARK: - DB-Informed Split
 
     /// Split clusters that contain 2+ known DB voices.

diff --git a/Sources/TranscriptedCore/Speaker/SpeakerNamingPolicy.swift b/Sources/TranscriptedCore/Speaker/SpeakerNamingPolicy.swift
@@ -1,10 +1,17 @@
 import Foundation
 
 public enum SpeakerNamingPolicy {
+    /// Cosine-similarity bar above which a returning known speaker is auto-accepted
+    /// as the same person without asking the user to confirm. This is the canonical
+    /// "same known person" threshold; `EmbeddingClusterer.sameVoiceConsolidationThreshold`
+    /// is tied to it (guarded by `EmbeddingClustererTests`) so same-voice consolidation
+    /// never merges two clusters we would not also auto-accept as one another.
+    public static let autoAcceptSimilarityThreshold: Double = 0.88
+
     public static func shouldAutoAccept(profile: SpeakerProfile, similarity: Double) -> Bool {
         profile.displayName != nil
             && profile.disputeCount == 0
-            && similarity > 0.88
+            && similarity > autoAcceptSimilarityThreshold
             && profile.callCount > 4
     }