Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Sources/TranscriptedCore/Pipeline/TranscriptionPipeline.swift
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,9 @@ extension Transcription {
let rawSegments = try await diarization.diarizeOffline(samples: systemSamples, sampleRate: 16000)

// Post-process diarization segments, but skip the broad pairwise merge
// phase for PyAnnote/VBx output. Small-cluster absorption and DB-informed
// split still run for noise cleanup and known-speaker corrections.
// phase for PyAnnote/VBx output. Small-cluster absorption, same-voice
// consolidation (collapses one over-segmented voice so the user names
// each person once), and DB-informed split still run.
let existingProfiles = speakerDB.allSpeakers()
let speakerSegments = EmbeddingClusterer.postProcess(
segments: rawSegments,
Expand Down
190 changes: 186 additions & 4 deletions Sources/TranscriptedCore/Speaker/EmbeddingClusterer.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// EmbeddingClusterer.swift
// Post-processes diarization speaker segments to fix two failure modes:
// Post-processes diarization speaker segments to fix three failure modes:
//
// Supports both Sortformer (streaming) and PyAnnote (offline) pipelines.
//
Expand All @@ -9,7 +9,15 @@
// Note: Skipped for PyAnnote offline output, where VBx clustering already
// handles speaker merging/fragmentation.
//
// 2. Merging: Different speakers collapsed into one diarizer ID.
// 2. Over-segmentation: One real voice split across several clusters that each
// accumulate enough speech to survive small-cluster absorption. This is why
// a one-on-one call can surface 4-7 "speakers" to name for a single person.
// Fixed by same-voice consolidation — agglomeratively merge clusters whose
// mean embeddings are as similar as the "same known person" auto-accept bar,
// recomputing centroids after each merge so genuinely distinct speakers in a
// crowded meeting do not chain-collapse.
//
// 3. Merging: Different speakers collapsed into one diarizer ID.
// Fixed by DB-informed split — compare per-segment embeddings against
// known speaker profiles and split clusters that contain 2+ distinct voices.

Expand All @@ -18,19 +26,42 @@ import Accelerate

public enum EmbeddingClusterer {

/// Cosine-similarity bar for the same-voice consolidation pass. Must equal
/// `SpeakerNamingPolicy.autoAcceptSimilarityThreshold` (0.88): consolidation
/// should only collapse two clusters into one person when they are at least as
/// similar as we'd demand to silently auto-accept them as the same known person.
/// `EmbeddingClustererTests.testConsolidationThresholdMatchesAutoAcceptBar`
/// asserts the two stay equal, so changing one without the other fails CI
/// instead of silently drifting.
public static let sameVoiceConsolidationThreshold: Float = 0.88

/// Lower bar used only to detect "these centroids may belong to different
/// known speakers" before consolidation. This mirrors the lowest adaptive
/// profile-match threshold used by `TranscriptionPipeline`, so we preserve
/// plausible known-speaker conflicts for the later naming/review path.
private static let knownProfileConflictThreshold: Float = 0.70

/// Post-process diarization segments: merge fragmented speakers,
/// absorb tiny orphan clusters, then split clusters that contain
/// multiple known DB voices.
///
/// - Parameter pairwiseMergeThreshold: Cosine similarity threshold for merging
/// fragmented speaker clusters. Pass `nil` to skip only the pairwise merge
/// phase; small-cluster absorption and DB-informed split still run.
/// phase; small-cluster absorption, same-voice consolidation, and
/// DB-informed split still run.
/// Sortformer default: 0.85 (conservative). Offline PyAnnote callers pass
/// `nil` because VBx already handles the base merge/fragmentation case.
/// - Parameter consolidationThreshold: Cosine similarity threshold for the
/// same-voice consolidation pass that collapses over-segmented large
/// clusters of one speaker. Pass `nil` to skip it. Defaults to the
/// `SpeakerNamingPolicy` auto-accept bar (0.88) so two clusters only merge
/// when they are more similar than we'd demand to auto-accept them as the
/// same known person.
public static func postProcess(
segments: [SpeakerSegment],
existingProfiles: [SpeakerProfile],
pairwiseMergeThreshold: Float? = 0.85
pairwiseMergeThreshold: Float? = 0.85,
consolidationThreshold: Float? = sameVoiceConsolidationThreshold
) -> [SpeakerSegment] {
guard segments.count >= 2 else { return segments }
var result: [SpeakerSegment]
Expand All @@ -40,6 +71,13 @@ public enum EmbeddingClusterer {
result = segments
}
result = absorbSmallClusters(segments: result)
if let consolidationThreshold {
result = consolidateSameVoiceClusters(
segments: result,
threshold: consolidationThreshold,
existingProfiles: existingProfiles
)
}
result = dbInformedSplit(segments: result, profiles: existingProfiles)
return result
}
Expand Down Expand Up @@ -263,6 +301,150 @@ public enum EmbeddingClusterer {
}
}

// MARK: - Same-Voice Consolidation

/// Consolidate clusters that are almost certainly the same voice, even when
/// each cluster is large enough to survive `absorbSmallClusters`.
///
/// Offline VBx clustering sometimes splits one remote participant across
/// several speaker IDs that each accumulate well over `minClusterDuration`
/// of speech. `absorbSmallClusters` never touches them because it only folds
/// short clusters into large ones, so a one-on-one call can surface 4-7
/// "speakers" the user has to name for a single person.
///
/// This pass compares the mean embedding of every surviving cluster pair and
/// merges those above `threshold`. Two safeguards keep genuine
/// multi-speaker meetings intact:
/// - The threshold is high (0.88 by default — the `SpeakerNamingPolicy`
/// auto-accept bar). Distinct speakers rarely exceed ~0.6 cosine
/// similarity, so only near-identical voices merge.
/// - Merging is agglomerative with recomputed centroids: after A and B
/// merge, the combined centroid must still clear `threshold` against C
/// before C joins. This avoids the transitive A≈B, B≈C → A+B+C collapse
/// that made the broad pairwise merge unsafe on VBx output.
static func consolidateSameVoiceClusters(
segments: [SpeakerSegment],
threshold: Float = sameVoiceConsolidationThreshold,
existingProfiles: [SpeakerProfile] = []
) -> [SpeakerSegment] {
let distinctIds = Set(segments.map { $0.speakerId })
guard distinctIds.count >= 2 else { return segments }

// Collect embeddings per speaker. Prefer quality-filtered samples but
// fall back to all samples so every cluster has a centroid to compare.
var qualityEmbeddings: [Int: [[Float]]] = [:]
var allEmbeddings: [Int: [[Float]]] = [:]
for segment in segments {
guard let embedding = segment.embedding, !embedding.isEmpty else { continue }
allEmbeddings[segment.speakerId, default: []].append(embedding)
if segment.qualityScore >= 0.3, segment.duration >= 1.0 {
qualityEmbeddings[segment.speakerId, default: []].append(embedding)
}
}

// Live clusters: the raw embeddings backing each centroid, so we can
// recompute the centroid after every merge.
var clusterEmbeddings: [Int: [[Float]]] = [:]
for id in distinctIds {
let quality = qualityEmbeddings[id] ?? []
let embeddings = quality.isEmpty ? (allEmbeddings[id] ?? []) : quality
if !embeddings.isEmpty {
clusterEmbeddings[id] = embeddings
}
}
guard clusterEmbeddings.count >= 2 else { return segments }

var centroids: [Int: [Float]] = [:]
for (id, embeddings) in clusterEmbeddings {
centroids[id] = Transcription.computeMeanEmbedding(embeddings)
}

// old speaker ID → canonical surviving ID (identity to start).
var mergeMap: [Int: Int] = [:]
for id in clusterEmbeddings.keys { mergeMap[id] = id }

// Repeatedly merge the single most-similar pair above threshold,
// recomputing the merged centroid each round until nothing qualifies.
while centroids.count >= 2 {
let liveIds = centroids.keys.sorted()
var bestSim = threshold
var bestPair: (keep: Int, drop: Int)?
for i in 0..<liveIds.count {
for j in (i + 1)..<liveIds.count {
let a = liveIds[i], b = liveIds[j]
guard let ea = centroids[a], let eb = centroids[b] else { continue }
let sim = Float(Transcription.cosineSimilarityStatic(ea, eb))
if hasKnownProfileConflict(
ea,
eb,
profiles: existingProfiles
) {
continue
}
if sim > bestSim {
bestSim = sim
bestPair = (keep: a, drop: b) // liveIds sorted, so a < b
}
}
}

guard let pair = bestPair else { break }
clusterEmbeddings[pair.keep, default: []].append(contentsOf: clusterEmbeddings[pair.drop] ?? [])
clusterEmbeddings[pair.drop] = nil
centroids[pair.keep] = Transcription.computeMeanEmbedding(clusterEmbeddings[pair.keep] ?? [])
centroids[pair.drop] = nil
for (old, canonical) in mergeMap where canonical == pair.drop {
mergeMap[old] = pair.keep
}
AppLogger.transcription.info("Consolidated same-voice clusters", [
"merged": "spk\(pair.drop)",
"into": "spk\(pair.keep)",
"similarity": String(format: "%.3f", bestSim)
])
}

guard mergeMap.contains(where: { $0.key != $0.value }) else { return segments }

return segments.map { segment in
let newId = mergeMap[segment.speakerId] ?? segment.speakerId
guard newId != segment.speakerId else { return segment }
return SpeakerSegment(
speakerId: newId,
startTime: segment.startTime,
endTime: segment.endTime,
embedding: segment.embedding,
qualityScore: segment.qualityScore
)
}
}

private static func hasKnownProfileConflict(
_ lhs: [Float],
_ rhs: [Float],
profiles: [SpeakerProfile]
) -> Bool {
guard !profiles.isEmpty else { return false }
let lhsMatches = knownProfileMatches(for: lhs, profiles: profiles)
let rhsMatches = knownProfileMatches(for: rhs, profiles: profiles)
guard !lhsMatches.isEmpty, !rhsMatches.isEmpty else { return false }

return lhsMatches.union(rhsMatches).count > 1
}

private static func knownProfileMatches(
for embedding: [Float],
profiles: [SpeakerProfile]
) -> Set<UUID> {
Set(profiles.compactMap { profile in
guard profile.disputeCount == 0,
profile.embedding.count == embedding.count else {
return nil
}
let similarity = Float(Transcription.cosineSimilarityStatic(embedding, profile.embedding))
return similarity >= knownProfileConflictThreshold ? profile.id : nil
})
}

// MARK: - DB-Informed Split

/// Split clusters that contain 2+ known DB voices.
Expand Down
9 changes: 8 additions & 1 deletion Sources/TranscriptedCore/Speaker/SpeakerNamingPolicy.swift
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import Foundation

public enum SpeakerNamingPolicy {
/// Cosine-similarity bar above which a returning known speaker is auto-accepted
/// as the same person without asking the user to confirm. This is the canonical
/// "same known person" threshold; `EmbeddingClusterer.sameVoiceConsolidationThreshold`
/// is tied to it (guarded by `EmbeddingClustererTests`) so same-voice consolidation
/// never merges two clusters we would not also auto-accept as one another.
public static let autoAcceptSimilarityThreshold: Double = 0.88

public static func shouldAutoAccept(profile: SpeakerProfile, similarity: Double) -> Bool {
profile.displayName != nil
&& profile.disputeCount == 0
&& similarity > 0.88
&& similarity > autoAcceptSimilarityThreshold
&& profile.callCount > 4
}

Expand Down
Loading
Loading