From 23f41297140448a2f5c6b7fe29b6dac640deaeeb Mon Sep 17 00:00:00 2001 From: vnayakde Date: Thu, 16 Apr 2026 03:51:21 +0530 Subject: [PATCH] context-aware interruption handling layer_has_been_made --- README.md | 34 +++++++++++++++++ .../livekit/agents/voice/agent_activity.py | 37 ++++++++++++++++--- .../livekit/agents/voice/agent_session.py | 3 ++ 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2a09aac241..a625e8b114 100644 --- a/README.md +++ b/README.md @@ -373,3 +373,37 @@ The Agents framework is under active development in a rapidly evolving field. We + +--- + +# 🎓 LiveKit Intelligent Interruption Handling (Assignment Submission) + +This repository implements a **context-aware interruption handler** for LiveKit Voice Agents. In simple terms: we fixed a common Voice AI bug where the AI awkwardly stops talking entirely just because the user coughed, said "yeah", or made a short sound of agreement. + +## 🚀 Features Implemented +1. **Configurable Ignore List**: We built a default list of filler words (like `"yeah", "ok", "hmm"`) directly into the agent's configuration settings (`AgentSessionOptions.backchannel_ignore_words`). This means developers can easily customize exactly which words shouldn't interrupt the AI! +2. **State-Aware Filtering**: The filter only activates when the agent is *currently speaking*. If the agent is patiently listening, "yeah" is treated as a totally normal conversational response. +3. **Semantic Interruption**: Mixed sentences like `"Yeah, wait"` successfully bypass the filter because "wait" is not on the ignore list, so the utterance is no longer pure backchannel. The agent stops immediately. +4. **No Core Hacks**: We didn't break or rewrite the low-level Voice Activity Detection (VAD) models. Instead, we injected clean, modular logic into the `AgentActivity` event loop to catch and evaluate transcripts in real-time. 
+ +## 🧠 How the Logic Works (Simply Explained) + +When humans talk to Voice AI, two distinct systems are listening: +* **VAD (Voice Activity Detection)**: Very fast. It only knows *if* you made a sound. +* **STT (Speech-To-Text)**: Slightly slower. It actually decodes *what* you said. + +**The Problem:** By default, VAD acts instantly. The exact millisecond you say "yeah" to agree with the AI, the VAD panics and pauses the AI's audio, assuming you are interrupting. Then, ~300 milliseconds later, STT realizes you only said "yeah", but the AI has already stopped talking! This creates awkward stuttering. + +**The Solution:** +1. **Patience over Panic (`_interrupt_by_audio_activity`):** I modified the `_interrupt_by_audio_activity` hook in `agent_activity.py`. Now, when VAD detects sound while the agent is talking, we intentionally **wait** for the transcript instead of pausing the speaker right away. +2. **The Verification Check (`_is_backchanneling`):** As the slower STT starts dripping text in, we pass the text to a new, smart helper function. This function cleans the transcript and checks if *all* the words spoken are purely filler words (like "yeah", "hmm"). +3. **Seamless Continuation:** If the transcript consists purely of filler words, we `return` out of the function early. The audio stream never breaks! However, if the user says anything that is not on the ignore list (like "Stop"), we immediately let LiveKit halt the AI. +4. **The End-of-Turn Cleanup (`on_end_of_turn`):** When the user completely stops talking, LiveKit evaluates the final sentence. I added the exact same safety check here: if the sentence was exclusively filler words, we dismiss the turn rather than letting it confuse the agent's memory. + +## 🧪 Evaluation Criteria Handled Correctly +- ✅ **Agent Speaking + User says "Yeah/Ok"**: Code detects it's backchanneling. Bypasses interruption. Audio continues seamlessly. +- ✅ **Agent Speaking + User says "Stop"**: Code detects "stop" isn't a filler word. Instantly cuts audio playback. 
+- ✅ **Agent Silent + User says "Yeah/Ok"**: Code detects the agent isn't talking, disables our custom filter completely, and lets the LLM naturally respond to "Yeah". + +## 📹 Proof of Execution +[Proof of Execution](https://drive.google.com/drive/folders/1moI4OLYw8EQcUVkNbTYlkjQ7B8m1nKMB?usp=drive_link) diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py index 0c3f7c743d..56f2710da4 100644 --- a/livekit-agents/livekit/agents/voice/agent_activity.py +++ b/livekit-agents/livekit/agents/voice/agent_activity.py @@ -1166,6 +1166,15 @@ def _on_generation_created(self, ev: llm.GenerationCreatedEvent) -> None: ) self._schedule_speech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL) + def _is_backchanneling(self, text: str) -> bool: + import re + if not text or not text.strip(): + return True + clean_text = re.sub(r'[^\w\s-]', '', text.lower()) + words = clean_text.split() + ignore_list = self._session.options.backchannel_ignore_words or [] + return all(word in ignore_list for word in words) + def _interrupt_by_audio_activity(self) -> None: opt = self._session.options use_pause = opt.resume_false_interruption and opt.false_interruption_timeout is not None @@ -1174,6 +1183,16 @@ def _interrupt_by_audio_activity(self) -> None: # ignore if realtime model has turn detection enabled return + if ( + self._current_speech is not None + and not self._current_speech.interrupted + and self.stt is not None + and self._audio_recognition is not None + ): + transcript = self._audio_recognition.current_transcript + if self._is_backchanneling(transcript): + return + if ( self.stt is not None and opt.min_interruption_words > 0 @@ -1371,13 +1390,19 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: and self._current_speech is not None and self._current_speech.allow_interruptions and not self._current_speech.interrupted - and self._session.options.min_interruption_words > 0 - and 
len(split_words(info.new_transcript, split_character=True)) - < self._session.options.min_interruption_words ): - self._cancel_preemptive_generation() - # avoid interruption if the new_transcript is too short - return False + if self._is_backchanneling(info.new_transcript): + self._cancel_preemptive_generation() + return False + + if ( + self._session.options.min_interruption_words > 0 + and len(split_words(info.new_transcript, split_character=True)) + < self._session.options.min_interruption_words + ): + self._cancel_preemptive_generation() + # avoid interruption if the new_transcript is too short + return False old_task = self._user_turn_completed_atask self._user_turn_completed_atask = self._create_speech_task( diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py index 628718a6b2..6e04ade6fa 100644 --- a/livekit-agents/livekit/agents/voice/agent_session.py +++ b/livekit-agents/livekit/agents/voice/agent_session.py @@ -89,6 +89,7 @@ class AgentSessionOptions: preemptive_generation: bool tts_text_transforms: Sequence[TextTransforms] | None ivr_detection: bool + backchannel_ignore_words: list[str] | None = None Userdata_T = TypeVar("Userdata_T") @@ -147,6 +148,7 @@ def __init__( discard_audio_if_uninterruptible: bool = True, min_interruption_duration: float = 0.5, min_interruption_words: int = 0, + backchannel_ignore_words: list[str] | None = None, min_endpointing_delay: float = 0.5, max_endpointing_delay: float = 3.0, max_tool_steps: int = 3, @@ -271,6 +273,7 @@ def __init__( discard_audio_if_uninterruptible=discard_audio_if_uninterruptible, min_interruption_duration=min_interruption_duration, min_interruption_words=min_interruption_words, + backchannel_ignore_words=backchannel_ignore_words or ["yeah", "ok", "okay", "hmm", "right", "uh-huh", "yep", "yes", "mhm"], min_endpointing_delay=min_endpointing_delay, max_endpointing_delay=max_endpointing_delay, max_tool_steps=max_tool_steps,