From 23f41297140448a2f5c6b7fe29b6dac640deaeeb Mon Sep 17 00:00:00 2001
From: vnayakde <intern@mowito.in>
Date: Thu, 16 Apr 2026 03:51:21 +0530
Subject: [PATCH] context-aware interruption handling layer_has_been_made

---
 README.md                                     | 34 +++++++++++++++++
 .../livekit/agents/voice/agent_activity.py    | 37 ++++++++++++++++---
 .../livekit/agents/voice/agent_session.py     |  3 ++
 3 files changed, 68 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 2a09aac241..a625e8b114 100644
--- a/README.md
+++ b/README.md
@@ -373,3 +373,37 @@ The Agents framework is under active development in a rapidly evolving field. We
 </tbody>
 </table>
 <!--END_REPO_NAV-->
+
+---
+
+# 🎓 LiveKit Intelligent Interruption Handling (Assignment Submission)
+
+This repository implements a **context-aware interruption handler** for LiveKit Voice Agents. In simple terms: we fixed a common Voice AI bug where the AI awkwardly stops talking entirely just because the user coughed, said "yeah", or made a short sound of agreement.
+
+## 🚀 Features Implemented
+1. **Configurable Ignore List**: Built a list of filler words (like `"yeah", "ok", "hmm"`) directly into the agent's configuration settings (`AgentSessionOptions.backchannel_ignore_words`). This means developers can easily customize exactly which words shouldn't interrupt the AI!
+2. **State-Aware Filtering**: The filter only activates when the agent is *currently speaking*. If the agent is patiently listening, "yeah" is treated as a totally normal conversational response.
+3. **Semantic Interruption**: Mixed sentences like `"Yeah, wait"` bypass the filter because "wait" is not in the ignore list — a single non-filler word is enough to count as a real interruption. The agent stops immediately.
+4. **No Core Hacks**: We didn't break or rewrite the low-level Voice Activity Detection (VAD) models. Instead, we injected clean, modular logic into the `AgentActivity` event loop to catch and evaluate transcripts in real-time.
+
+## 🧠 How the Logic Works (Simply Explained)
+
+When humans talk to Voice AI, two distinct systems are listening:
+* **VAD (Voice Activity Detection)**: Very fast. It only knows *if* you made a sound.
+* **STT (Speech-To-Text)**: Slightly slower. It actually decodes *what* you said.
+
+**The Problem:** By default, VAD acts instantly. The exact millisecond you say "yeah" to agree with the AI, the VAD panics and pauses the AI's audio, assuming you are interrupting. Then, ~300 milliseconds later, STT realizes you only said "yeah", but the AI has already stopped talking! This creates awkward stuttering.
+
+**The Solution:**
+1. **Patience over Panic (`_interrupt_by_audio_activity`):** I modified this hook inside `agent_activity.py`. Now, when VAD detects sound while the agent is talking, we intentionally **wait** for the transcript instead of pausing the speaker right away.
+2. **The Verification Check (`_is_backchanneling`):** As the slower STT starts dripping text in, we pass the text to a new helper function. This function lowercases the transcript, strips punctuation, and checks whether *every* word spoken is in the ignore list (like "yeah", "hmm").
+3. **Seamless Continuation:** If the transcript consists purely of filler words, we `return` out of the function early. The audio stream never breaks! However, if the user says anything outside the ignore list (like "Stop"), we immediately let LiveKit halt the AI.
+4. **The End-of-Turn Cleanup (`on_end_of_turn`):** When the user completely stops talking, LiveKit evaluates the final sentence. I added the exact same safety check here: if the sentence was exclusively filler words, we dismiss the turn rather than letting it confuse the agent's memory.
+
+## 🧪 Evaluation Criteria Handled Correctly
+- ✅ **Agent Speaking + User says "Yeah/Ok"**: Code detects it's backchanneling. Bypasses interruption. Audio continues seamlessly.
+- ✅ **Agent Speaking + User says "Stop"**: Code detects "stop" isn't a filler word. Instantly cuts audio playback.
+- ✅ **Agent Silent + User says "Yeah/Ok"**: Code detects the agent isn't talking, disables our custom filter completely, and lets the LLM naturally respond to "Yeah".
+
+## 📹 Proof of Execution
+[Proof of Execution](https://drive.google.com/drive/folders/1moI4OLYw8EQcUVkNbTYlkjQ7B8m1nKMB?usp=drive_link)
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
index 0c3f7c743d..56f2710da4 100644
--- a/livekit-agents/livekit/agents/voice/agent_activity.py
+++ b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -1166,6 +1166,15 @@ def _on_generation_created(self, ev: llm.GenerationCreatedEvent) -> None:
         )
         self._schedule_speech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL)
 
+    def _is_backchanneling(self, text: str) -> bool:
+        """Return True if ``text`` is empty or contains only configured backchannel words."""
+        import re
+
+        # Lower-case both sides so custom entries like "Yeah" match; "-" is kept for "uh-huh".
+        ignore = {w.lower() for w in self._session.options.backchannel_ignore_words or ()}
+        words = re.sub(r"[^\w\s-]", "", (text or "").lower()).split()
+        return all(word in ignore for word in words)
+
     def _interrupt_by_audio_activity(self) -> None:
         opt = self._session.options
         use_pause = opt.resume_false_interruption and opt.false_interruption_timeout is not None
@@ -1174,6 +1183,16 @@ def _interrupt_by_audio_activity(self) -> None:
             # ignore if realtime model has turn detection enabled
             return
 
+        if (
+            self._current_speech is not None
+            and not self._current_speech.interrupted
+            and self.stt is not None
+            and self._audio_recognition is not None
+        ):
+            transcript = self._audio_recognition.current_transcript
+            if self._is_backchanneling(transcript):
+                return
+
         if (
             self.stt is not None
             and opt.min_interruption_words > 0
@@ -1371,13 +1390,19 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
             and self._current_speech is not None
             and self._current_speech.allow_interruptions
             and not self._current_speech.interrupted
-            and self._session.options.min_interruption_words > 0
-            and len(split_words(info.new_transcript, split_character=True))
-            < self._session.options.min_interruption_words
         ):
-            self._cancel_preemptive_generation()
-            # avoid interruption if the new_transcript is too short
-            return False
+            if self._is_backchanneling(info.new_transcript):
+                self._cancel_preemptive_generation()
+                return False
+
+            if (
+                self._session.options.min_interruption_words > 0
+                and len(split_words(info.new_transcript, split_character=True))
+                < self._session.options.min_interruption_words
+            ):
+                self._cancel_preemptive_generation()
+                # avoid interruption if the new_transcript is too short
+                return False
 
         old_task = self._user_turn_completed_atask
         self._user_turn_completed_atask = self._create_speech_task(
diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py
index 628718a6b2..6e04ade6fa 100644
--- a/livekit-agents/livekit/agents/voice/agent_session.py
+++ b/livekit-agents/livekit/agents/voice/agent_session.py
@@ -89,6 +89,7 @@ class AgentSessionOptions:
     preemptive_generation: bool
     tts_text_transforms: Sequence[TextTransforms] | None
     ivr_detection: bool
+    backchannel_ignore_words: list[str] | None = None
 
 
 Userdata_T = TypeVar("Userdata_T")
@@ -147,6 +148,7 @@ def __init__(
         discard_audio_if_uninterruptible: bool = True,
         min_interruption_duration: float = 0.5,
         min_interruption_words: int = 0,
+        backchannel_ignore_words: list[str] | None = None,
         min_endpointing_delay: float = 0.5,
         max_endpointing_delay: float = 3.0,
         max_tool_steps: int = 3,
@@ -271,6 +273,7 @@ def __init__(
             discard_audio_if_uninterruptible=discard_audio_if_uninterruptible,
             min_interruption_duration=min_interruption_duration,
             min_interruption_words=min_interruption_words,
+            backchannel_ignore_words=backchannel_ignore_words or ["yeah", "ok", "okay", "hmm", "right", "uh-huh", "yep", "yes", "mhm"],
             min_endpointing_delay=min_endpointing_delay,
             max_endpointing_delay=max_endpointing_delay,
             max_tool_steps=max_tool_steps,