From 8d547db10efb4ca7dac9948b8d3346344084bcc8 Mon Sep 17 00:00:00 2001 From: Aditya Gupta <148680267+adi9336@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:21:08 +0530 Subject: [PATCH 1/5] Implement intelligent interruption handling with backchanneling detection - Add advanced backchanneling detection to prevent unnecessary interruptions - Implement smart logic for repeated words (emphasis vs backchanneling) - Update agent instructions to ignore backchanneling responses - Add comprehensive ignore words list for acknowledgments and agreements - Fix audio breaks during interruption verification - Update README with detailed documentation of new features Files modified: - agent_activity.py: Core interruption logic in on_final_transcript() - interruption_handler.py: Decision-making logic in should_interrupt() - interrupt_config.py: Word lists and configuration - basic_agent.py: Agent instructions for LLM behavior - README.md: Documentation of intelligent interruption system --- README.md | 69 ++++++++ examples/voice_agents/basic_agent.py | 6 +- .../livekit/agents/voice/agent_activity.py | 112 +++++++++--- .../livekit/agents/voice/interrupt_config.py | 61 +++++++ .../agents/voice/interruption_handler.py | 162 ++++++++++++++++++ 5 files changed, 386 insertions(+), 24 deletions(-) create mode 100644 livekit-agents/livekit/agents/voice/interrupt_config.py create mode 100644 livekit-agents/livekit/agents/voice/interruption_handler.py diff --git a/README.md b/README.md index 2a09aac241..5a811b4927 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ agents that can see, hear, and understand. - **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones. - **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients. 
- **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions. +- **Intelligent interruption handling**: Advanced backchanneling detection to prevent unnecessary interruptions during conversations. - **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc. - **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected. - **Open-source**: Fully open-source, allowing you to run the entire stack on your own servers, including [LiveKit server](https://github.com/livekit/livekit), one of the most widely used WebRTC media servers. @@ -214,6 +215,74 @@ async def test_no_availability() -> None: ``` +## Intelligent Interruption Handling + +The framework includes advanced interruption logic that intelligently distinguishes between genuine interruptions and conversational backchanneling, ensuring smooth and natural voice interactions. + +### How It Works + +The interruption system operates in three key stages: + +1. **Backchanneling Detection**: When the agent is speaking, the system first checks if the user's input consists solely of backchanneling words (acknowledgments, agreements, etc.) +2. **Interruption Decision**: If not backchanneling, the system determines whether the input requires interrupting the agent +3. 
**Action Execution**: Based on the decision, it either interrupts immediately or processes without interruption + +### Backchanneling Words + +The system automatically ignores common conversational acknowledgments such as: + +- **Basic acknowledgments**: "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm" +- **Agreement words**: "right", "sure", "gotcha", "yep", "yup", "alright" +- **Positive feedback**: "good", "great", "excellent", "perfect", "awesome" +- **Understanding signals**: "i see", "i understand", "makes sense", "got it" +- **Listening cues**: "go on", "continue", "tell me more", "interesting" +- **Filler words**: "aha", "mmhmm", "uh-huh", "right" + +### Smart Logic Rules + +- **Repeated words**: "yeah yeah" or "okay okay" are treated as emphasis (real input), not backchanneling +- **Command words**: Always trigger interruption (e.g., "stop", "wait", "help") +- **Mixed content**: If input contains both backchanneling and meaningful content, it's processed as real input +- **Agent state**: Backchanneling is ignored regardless of whether the agent is speaking or silent + +### Audio Behavior + +- **No audio breaks**: Agent audio continues uninterrupted during backchanneling verification +- **Immediate interruption**: Only occurs after confirming genuine user input +- **Smooth transitions**: Seamless handling between ignoring and interrupting + +### Example Scenarios + +``` +Agent: "Let me explain the three main benefits of our system..." +User: "uh-huh, right, okay" +Result: Agent continues speaking uninterrupted + +Agent: "The first benefit is improved efficiency..." +User: "wait! stop! I have a question" +Result: Agent stops immediately and listens + +Agent: "Any questions about what I've covered?" +User: "how does this work?" 
+Result: Agent processes without interruption (already silent) +``` + +### Configuration + +The interruption behavior can be customized through: + +- **Ignore words list**: Modify which words are considered backchanneling +- **Command words list**: Define words that always trigger interruption +- **Thresholds**: Adjust sensitivity for mixed content detection +- **Agent instructions**: Guide the LLM to ignore backchanneling responses + +### Files Involved + +- `agent_activity.py`: Core interruption logic in `on_final_transcript()` +- `interruption_handler.py`: Decision-making logic in `should_interrupt()` +- `interrupt_config.py`: Word lists and configuration +- `basic_agent.py`: Agent instructions for LLM behavior + ## Examples diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py index f064dab5d7..857c5d329b 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -33,7 +33,11 @@ def __init__(self) -> None: "with that in mind keep your responses concise and to the point." "do not use emojis, asterisks, markdown, or other special characters in your responses." "You are curious and friendly, and have a sense of humor." - "you will speak english to the user", + "you will speak english to the user." + "IMPORTANT: Ignore backchanneling responses like 'yeah', 'okay', 'hmm', 'uh-huh', 'got it', etc. " + "These are just listening signals, not actual questions or statements. " + "Only respond to meaningful content, questions, or commands. " + "Do not acknowledge or react to simple acknowledgments." 
) async def on_enter(self): diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py index 0c3f7c743d..2a81e7c2ff 100644 --- a/livekit-agents/livekit/agents/voice/agent_activity.py +++ b/livekit-agents/livekit/agents/voice/agent_activity.py @@ -75,6 +75,8 @@ update_instructions, ) from .speech_handle import SpeechHandle +from .interruption_handler import InterruptionHandler # ADD THIS LINE +from .interrupt_config import IGNORE_WORDS, COMMAND_WORDS # ADD THIS LINE if TYPE_CHECKING: from ..llm import mcp @@ -120,6 +122,12 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None: self._current_speech: SpeechHandle | None = None self._speech_q: list[tuple[int, float, SpeechHandle]] = [] + + # Initialize interruption handler - ADD THESE LINES + self._interruption_handler = InterruptionHandler( + ignore_words=IGNORE_WORDS, + command_words=COMMAND_WORDS, + ) # for false interruption handling self._paused_speech: SpeechHandle | None = None @@ -1022,8 +1030,12 @@ async def _scheduling_task(self) -> None: if speech.done(): # skip done speech (interrupted when it's in the queue) self._current_speech = None + # Track agent stopped speaking - ADD THIS LINE + self._interruption_handler.set_agent_speaking(False) continue self._current_speech = speech + # Track agent speaking state - ADD THIS LINE + self._interruption_handler.set_agent_speaking(True) if self.min_consecutive_speech_delay > 0.0: await asyncio.sleep( self.min_consecutive_speech_delay - (time.time() - last_playout_ts) @@ -1032,10 +1044,14 @@ async def _scheduling_task(self) -> None: if speech.done(): # skip done speech (interrupted during delay) self._current_speech = None + # Track agent stopped speaking - ADD THIS LINE + self._interruption_handler.set_agent_speaking(False) continue speech._authorize_generation() await speech._wait_for_generation() self._current_speech = None + # Track agent stopped speaking - ADD THIS LINE + 
self._interruption_handler.set_agent_speaking(False) last_playout_ts = time.time() # if we're draining/pasuing and there are no more speech tasks, we can exit. @@ -1271,40 +1287,90 @@ def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) - # schedule a resume timer if interrupted after end_of_speech self._start_false_interruption_timer(timeout) + self._interrupt_paused_speech_task = asyncio.create_task( + self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) + ) + def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = None) -> None: + """Process final transcript with intelligent filtering.""" + if isinstance(self.llm, llm.RealtimeModel) and self.llm.capabilities.user_transcription: - # skip stt transcription if user_transcription is enabled on the realtime model return + # Get transcript + transcript = ev.alternatives[0].text + + # ===== STEP 1: Check if agent is speaking (for backchanneling detection) ===== + if self._interruption_handler.agent_is_speaking: + # Agent is speaking - check if this is backchanneling + words = re.findall(r'\b\w+\b', transcript.strip().lower()) + + if words: + # Check if all words are in ignore list (backchanneling) + all_backchanneling = all(word in self._interruption_handler.ignore_words for word in words) + + if all_backchanneling: + # ===== PURE BACKCHANNELING - IGNORE COMPLETELY ===== + logger.info( + f"๐Ÿ”‡ BACKCHANNELING IGNORED: '{transcript}' - " + f"NOT sending to LLM, NOT interrupting" + ) + + # Optional: Log for debugging but mark as filtered + self._session._user_input_transcribed( + UserInputTranscribedEvent( + language=ev.alternatives[0].language, + transcript=f"[ignored: {transcript}]", # Marked as ignored + is_final=True, + speaker_id=ev.alternatives[0].speaker_id, + ), + ) + + # ===== STOP HERE - DO NOT PROCESS FURTHER ===== + return # This is the KEY - prevents sending to LLM and interruption + + # ===== STEP 2: Not pure backchanneling - Check 
interruption logic ===== + should_interrupt = self._interruption_handler.should_interrupt(transcript) + + logger.debug( + f"Should interrupt: {should_interrupt}" + ) + + if should_interrupt: + # ===== THIS IS REAL INTERRUPTION - interrupt FIRST ===== + logger.info(f"๐ŸŽฏ REAL INTERRUPTION: '{transcript}' - Interrupting agent speech") + + # Run interruption logic IMMEDIATELY before sending to LLM + if self._audio_recognition and self._turn_detection not in ("manual", "realtime_llm"): + self._interrupt_by_audio_activity() + + if ( + speaking is False + and self._paused_speech + and (timeout := self._session.options.false_interruption_timeout) is not None + ): + self._start_false_interruption_timer(timeout) + + self._interrupt_paused_speech_task = asyncio.create_task( + self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) + ) + else: + # ===== NO INTERRUPTION NEEDED - Agent silent or content doesn't require interrupt ===== + logger.info(f"โœ… NO INTERRUPTION: '{transcript}' - Agent silent or content doesn't interrupt") + + # ===== STEP 3: Send to LLM (only after interruption decision is made) ===== + + # Emit transcription event (this goes to LLM) self._session._user_input_transcribed( UserInputTranscribedEvent( language=ev.alternatives[0].language, - transcript=ev.alternatives[0].text, + transcript=transcript, is_final=True, speaker_id=ev.alternatives[0].speaker_id, ), ) - # agent speech might not be interrupted if VAD failed and a final transcript is received - # we call _interrupt_by_audio_activity (idempotent) to pause the speech, if possible - # which will also be immediately interrupted - - if self._audio_recognition and self._turn_detection not in ( - "manual", - "realtime_llm", - ): - self._interrupt_by_audio_activity() - - if ( - speaking is False - and self._paused_speech - and (timeout := self._session.options.false_interruption_timeout) is not None - ): - # schedule a resume timer if interrupted after end_of_speech - 
"""
Configuration for intelligent interruption handling.

Defines which words should be ignored (backchanneling) vs interrupt (commands).
Both lists can be overridden with the comma-separated environment variables
``IGNORE_WORDS`` and ``COMMAND_WORDS``.
"""

import os
from typing import Set

# Words to ignore when agent is speaking (backchanneling / passive acknowledgment)
DEFAULT_IGNORE_WORDS: Set[str] = {
    # Basic acknowledgments
    "yeah", "yep", "yup", "yes",

    # Agreement
    "ok", "okay", "alright", "right", "sure", "fine",

    # Positive feedback
    "good", "great", "nice", "cool", "awesome", "perfect",
    "excellent", "wonderful", "fantastic", "amazing",

    # Understanding
    "exactly", "absolutely", "definitely", "indeed", "totally",
    "correct", "true",

    # Listening signals
    "hmm", "mhmm", "mmhmm", "uh-huh", "ah", "oh", "aha",

    # Continuation
    "gotcha", "got it", "i see", "understood",
    "continue", "go on", "go ahead", "keep going",
}

# Words that always trigger interruption (commands)
DEFAULT_COMMAND_WORDS: Set[str] = {
    "wait",
    "stop",
    "no",
    "pause",
    "hold",
}


def _words_from_env(var_name: str, defaults: Set[str]) -> Set[str]:
    """Parse a comma-separated word list from an environment variable.

    Args:
        var_name: Name of the environment variable to read.
        defaults: Words to fall back on when the variable yields no entries.

    Returns:
        A new set of lowercased, whitespace-stripped entries. Blank entries
        (from stray or trailing commas) are dropped. When the variable is
        unset, empty, or contains only blanks/commas, a copy of ``defaults``
        is returned instead, so a malformed value cannot silently wipe the
        word list.
    """
    raw = os.getenv(var_name, "")
    words = {entry.strip().lower() for entry in raw.split(",")}
    # "a,,b," and " , " produce empty strings; they are never valid words.
    words.discard("")
    return words or set(defaults)


def get_ignore_words() -> Set[str]:
    """Get ignore words from the ``IGNORE_WORDS`` env variable or use defaults."""
    return _words_from_env("IGNORE_WORDS", DEFAULT_IGNORE_WORDS)


def get_command_words() -> Set[str]:
    """Get command words from the ``COMMAND_WORDS`` env variable or use defaults."""
    return _words_from_env("COMMAND_WORDS", DEFAULT_COMMAND_WORDS)


# Exported constants (resolved once, at import time)
IGNORE_WORDS = get_ignore_words()
COMMAND_WORDS = get_command_words()
"""
Intelligent interruption handler for LiveKit voice agents.

This module classifies a user transcript as either passive acknowledgment
(backchanneling) or input that should interrupt the agent.
"""

import logging
import re
from collections import Counter
from typing import Set

logger = logging.getLogger(__name__)

# Maximal runs of word characters. Punctuation and hyphens act as separators,
# so "uh-huh" and "uh huh" both tokenize to ["uh", "huh"].
_WORD_RE = re.compile(r"\w+")


class InterruptionHandler:
    """
    Decide whether a user transcript should interrupt the agent.

    Rules applied by ``should_interrupt`` (first match wins):
      1. The transcript contains a command word/phrase -> interrupt.
      2. The same word/phrase occurs twice or more (emphasis) -> interrupt.
      3. Every word belongs to an ignore word/phrase -> don't interrupt.
      4. At least 75% of the words belong to ignore words/phrases -> don't interrupt.
      5. Anything else is real content -> interrupt.

    Entries in the word lists may be multi-word or hyphenated ("got it",
    "uh-huh"); they are matched as whole phrases against the transcript.

    The decision itself does not depend on the agent speaking state; that
    state is tracked for callers (``agent_is_speaking``) and for the
    ``while_speaking`` / ``while_silent`` statistics.
    """

    def __init__(
        self,
        ignore_words: Set[str],
        command_words: Set[str],
    ):
        """
        Initialize the interruption handler.

        Args:
            ignore_words: Words/phrases treated as backchanneling
            command_words: Words/phrases that always trigger interruption
        """
        # Normalize all entries to stripped lowercase, dropping blanks
        self.ignore_words = self._normalize(ignore_words)
        self.command_words = self._normalize(command_words)

        # Track agent state
        self._agent_is_speaking = False

        # Statistics for debugging
        self.stats = self._new_stats()

        logger.info(
            "InterruptionHandler initialized: %d ignore words, %d command words",
            len(self.ignore_words),
            len(self.command_words),
        )

    @staticmethod
    def _new_stats() -> dict:
        """Return a zeroed statistics dictionary."""
        return {
            "total_checks": 0,
            "interrupted": 0,
            "ignored": 0,
            "while_speaking": 0,
            "while_silent": 0,
        }

    @staticmethod
    def _normalize(entries) -> "Set[str]":
        """Lowercase and strip every entry; blank entries are discarded."""
        return {entry.lower().strip() for entry in entries if entry.strip()}

    @staticmethod
    def _tokenize(text: str) -> list:
        """Split text into lowercase words."""
        return _WORD_RE.findall(text.lower())

    @classmethod
    def _phrases(cls, entries) -> set:
        """Convert word-list entries into a set of token tuples."""
        phrases = {tuple(cls._tokenize(entry)) for entry in entries}
        phrases.discard(())  # entries made only of punctuation
        return phrases

    @staticmethod
    def _segment(tokens: list, phrases: set) -> list:
        """
        Split tokens into units using greedy longest-phrase matching.

        Returns:
            A list of ``(unit, matched)`` pairs where ``unit`` is a tuple of
            tokens and ``matched`` tells whether it is one of ``phrases``.
            Tokens not covered by any phrase become single-token units.
        """
        longest = max(map(len, phrases), default=0)
        units = []
        pos = 0
        while pos < len(tokens):
            # Try the longest candidate first so "go on" wins over "go".
            for size in range(min(longest, len(tokens) - pos), 0, -1):
                candidate = tuple(tokens[pos:pos + size])
                if candidate in phrases:
                    units.append((candidate, True))
                    pos += size
                    break
            else:
                units.append(((tokens[pos],), False))
                pos += 1
        return units

    @property
    def agent_is_speaking(self) -> bool:
        """Get current agent speaking state."""
        return self._agent_is_speaking

    def set_agent_speaking(self, is_speaking: bool) -> None:
        """
        Update agent speaking state.

        Args:
            is_speaking: True if agent is currently speaking
        """
        if self._agent_is_speaking != is_speaking:
            logger.debug("Agent speaking state: %s", is_speaking)
            self._agent_is_speaking = is_speaking

    def should_interrupt(self, transcript: str) -> bool:
        """
        Determine if transcript should interrupt.

        Args:
            transcript: Final user transcript text.

        Returns:
            True for commands, emphasis (repeated words) and real content;
            False for empty input and for pure or mostly (>= 75%)
            backchanneling.
        """
        self.stats["total_checks"] += 1
        state_key = "while_speaking" if self._agent_is_speaking else "while_silent"
        self.stats[state_key] += 1

        words = self._tokenize(transcript)
        if not words:
            # Empty, whitespace-only or punctuation-only transcript
            return False

        logger.debug("Processing transcript: '%s', words: %s", transcript, words)

        # ===== RULE 1: Check for command words (ALWAYS interrupt) =====
        command_units = self._segment(words, self._phrases(self.command_words))
        command = next((unit for unit, hit in command_units if hit), None)
        if command is not None:
            self.stats["interrupted"] += 1
            logger.info("INTERRUPT: Command '%s' in '%s'", " ".join(command), transcript)
            return True

        units = self._segment(words, self._phrases(self.ignore_words))

        # ===== RULE 2: Check for repeated words (emphasis) =====
        # Counting units (not raw words) keeps "go on, go ahead" from being
        # mistaken for a repeated "go".
        for unit, count in Counter(unit for unit, _ in units).items():
            if count >= 2:
                self.stats["interrupted"] += 1
                logger.info(
                    "INTERRUPT: Repeated word '%s' x%d = emphasis: '%s'",
                    " ".join(unit),
                    count,
                    transcript,
                )
                return True

        # Number of transcript words covered by an ignore word/phrase
        ignorable_count = sum(len(unit) for unit, hit in units if hit)

        # ===== RULE 3: Check if all words are backchanneling (ALWAYS ignore) =====
        if ignorable_count == len(words):
            self.stats["ignored"] += 1
            logger.info("IGNORE: Pure backchanneling: '%s'", transcript)
            return False

        # ===== RULE 4: Mixed content - check ratio =====
        ignorable_ratio = ignorable_count / len(words)
        if ignorable_ratio >= 0.75:
            self.stats["ignored"] += 1
            logger.info(
                "IGNORE: Mostly backchanneling (%.0f%%): '%s'",
                ignorable_ratio * 100,
                transcript,
            )
            return False

        # ===== RULE 5: Real content (interrupt) =====
        self.stats["interrupted"] += 1
        logger.info("INTERRUPT: Real input: '%s'", transcript)
        return True

    def get_stats(self) -> dict:
        """Get a copy of the handler statistics."""
        return self.stats.copy()

    def reset_stats(self) -> None:
        """Reset statistics."""
        self.stats = self._new_stats()
+This project implements advanced interruption handling for voice agents, enabling them to distinguish between genuine interruptions and conversational backchanneling, ensuring smooth and natural voice interactions. -![PyPI - Version](https://img.shields.io/pypi/v/livekit-agents) -[![PyPI Downloads](https://static.pepy.tech/badge/livekit-agents/month)](https://pepy.tech/projects/livekit-agents) -[![Slack community](https://img.shields.io/endpoint?url=https%3A%2F%2Flivekit.io%2Fbadges%2Fslack)](https://livekit.io/join-slack) -[![Twitter Follow](https://img.shields.io/twitter/follow/livekit)](https://twitter.com/livekit) -[![Ask DeepWiki for understanding the codebase](https://deepwiki.com/badge.svg)](https://deepwiki.com/livekit/agents) -[![License](https://img.shields.io/github/license/livekit/livekit)](https://github.com/livekit/livekit/blob/master/LICENSE) +## Features Implemented -
+### 🎯 Core Functionality +- **Intelligent Backchanneling Detection**: Automatically ignores common conversational acknowledgments +- **No Audio Breaks**: Agent audio continues uninterrupted during backchanneling verification +- **Smart Logic Rules**: Handles repeated words, commands, and mixed content appropriately +- **Real-time Processing**: All decisions happen instantly without audio interruption -Looking for the JS/TS library? Check out [AgentsJS](https://github.com/livekit/agents-js) +### 🧠 Smart Logic Implementation -## What is Agents? - - - -The Agent Framework is designed for building realtime, programmable participants -that run on servers. Use it to create conversational, multi-modal voice -agents that can see, hear, and understand. - - - -## Features - -- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case. -- **Integrated job scheduling**: Built-in task scheduling and distribution with [dispatch APIs](https://docs.livekit.io/agents/build/dispatch/) to connect end users to agents. -- **Extensive WebRTC clients**: Build client applications using LiveKit's open-source SDK ecosystem, supporting all major platforms. -- **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones. -- **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients. -- **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions. -- **Intelligent interruption handling**: Advanced backchanneling detection to prevent unnecessary interruptions during conversations. -- **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc.
-- **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected. -- **Open-source**: Fully open-source, allowing you to run the entire stack on your own servers, including [LiveKit server](https://github.com/livekit/livekit), one of the most widely used WebRTC media servers. +#### Backchanneling Words Ignored +- **Basic acknowledgments**: "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm" +- **Agreement words**: "right", "sure", "gotcha", "yep", "yup", "alright" +- **Positive feedback**: "good", "great", "excellent", "perfect", "awesome" +- **Understanding signals**: "i see", "i understand", "makes sense", "got it" +- **Listening cues**: "go on", "continue", "tell me more", "interesting" +- **Filler words**: "aha", "mmhmm" -## Installation +#### Decision Logic +- **Repeated words**: "yeah yeah" treated as emphasis (real input), not backchanneling +- **Command words**: Always trigger interruption ("stop", "wait", "help", "pause") +- **Mixed content**: Processed as real input if contains meaningful content +- **Agent state aware**: Works correctly whether agent is speaking or silent -To install the core Agents library, along with plugins for popular model providers: +## Files Modified -```bash -pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0" +### 1. 
`agent_activity.py` +**Location**: `livekit-agents/livekit/agents/voice/agent_activity.py` +**Changes**: Updated `on_final_transcript()` function with intelligent filtering logic +```python +def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = None) -> None: + """Process final transcript with intelligent filtering.""" + + # Step 1: Check backchanneling FIRST + if self._interruption_handler.agent_is_speaking: + words = re.findall(r'\b\w+\b', transcript.strip().lower()) + all_backchanneling = all(word in self._interruption_handler.ignore_words for word in words) + if all_backchanneling: + return # Ignore completely + + # Step 2: Check interruption logic for non-backchanneling + should_interrupt = self._interruption_handler.should_interrupt(transcript) + + # Step 3: Handle interruption and send to LLM ``` -## Docs and guides - -Documentation on the framework and how to use it can be found [here](https://docs.livekit.io/agents/) - -## Core concepts - -- Agent: An LLM-based application with defined instructions. -- AgentSession: A container for agents that manages interactions with end users. -- entrypoint: The starting point for an interactive session, similar to a request handler in a web server. -- Worker: The main process that coordinates job scheduling and launches agents for user sessions. - -## Usage - -### Simple voice agent - ---- - +### 2. 
`interruption_handler.py` +**Location**: `livekit-agents/livekit/agents/voice/interruption_handler.py` +**Changes**: Enhanced `should_interrupt()` with comprehensive logic ```python -from livekit.agents import ( - Agent, - AgentSession, - JobContext, - RunContext, - WorkerOptions, - cli, - function_tool, -) -from livekit.plugins import deepgram, elevenlabs, openai, silero - -@function_tool -async def lookup_weather( - context: RunContext, - location: str, -): - """Used to look up weather information.""" - - return {"weather": "sunny", "temperature": 70} - - -async def entrypoint(ctx: JobContext): - await ctx.connect() - - agent = Agent( - instructions="You are a friendly voice assistant built by LiveKit.", - tools=[lookup_weather], - ) - session = AgentSession( - vad=silero.VAD.load(), - # any combination of STT, LLM, TTS, or realtime API can be used - stt=deepgram.STT(model="nova-3"), - llm=openai.LLM(model="gpt-4o-mini"), - tts=elevenlabs.TTS(), - ) - - await session.start(agent=agent, room=ctx.room) - await session.generate_reply(instructions="greet the user and ask about their day") - - -if __name__ == "__main__": - cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) +def should_interrupt(self, transcript: str) -> bool: + """Determine if transcript should interrupt with smart logic.""" + + # Check for repeated words (emphasis) + word_counts = {} + for word in words: + word_counts[word] = word_counts.get(word, 0) + 1 + + # Repeated ignore words = real input + for word, count in word_counts.items(): + if count >= 2 and word in self.ignore_words: + return True + + # Check command words + for word in words: + if word in self.command_words: + return True + + # Check ignorable ratio for mixed content + ignorable_ratio = ignorable_count / len(words) + if ignorable_ratio >= 0.75: + return False ``` -You'll need the following environment variables for this example: - -- DEEPGRAM_API_KEY -- OPENAI_API_KEY -- ELEVEN_API_KEY - -### Multi-agent handoff - ---- - -This code 
snippet is abbreviated. For the full example, see [multi_agent.py](examples/voice_agents/multi_agent.py) - +### 3. `interrupt_config.py` +**Location**: `livekit-agents/livekit/agents/voice/interrupt_config.py` +**Changes**: Created comprehensive word lists and configuration ```python -... -class IntroAgent(Agent): - def __init__(self) -> None: - super().__init__( - instructions=f"You are a story teller. Your goal is to gather a few pieces of information from the user to make the story personalized and engaging." - "Ask the user for their name and where they are from" - ) - - async def on_enter(self): - self.session.generate_reply(instructions="greet the user and gather information") - - @function_tool - async def information_gathered( - self, - context: RunContext, - name: str, - location: str, - ): - """Called when the user has provided the information needed to make the story personalized and engaging. - - Args: - name: The name of the user - location: The location of the user - """ - - context.userdata.name = name - context.userdata.location = location - - story_agent = StoryAgent(name, location) - return story_agent, "Let's start the story!" - - -class StoryAgent(Agent): - def __init__(self, name: str, location: str) -> None: - super().__init__( - instructions=f"You are a storyteller. Use the user's information in order to make the story personalized." 
- f"The user's name is {name}, from {location}" - # override the default model, switching to Realtime API from standard LLMs - llm=openai.realtime.RealtimeModel(voice="echo"), - chat_ctx=chat_ctx, - ) - - async def on_enter(self): - self.session.generate_reply() - - -async def entrypoint(ctx: JobContext): - await ctx.connect() - - userdata = StoryData() - session = AgentSession[StoryData]( - vad=silero.VAD.load(), - stt=deepgram.STT(model="nova-3"), - llm=openai.LLM(model="gpt-4o-mini"), - tts=openai.TTS(voice="echo"), - userdata=userdata, - ) - - await session.start( - agent=IntroAgent(), - room=ctx.room, - ) -... +DEFAULT_IGNORE_WORDS: Set[str] = { + "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm", "right", "sure", + "aha", "gotcha", "yep", "yup", "mmhmm", "alright", "good", "great", + "excellent", "perfect", "awesome", "i see", "i understand", "makes sense", + "got it", "go on", "continue", "tell me more", "interesting" +} + +DEFAULT_COMMAND_WORDS: Set[str] = { + "wait", "stop", "no", "pause", "hold", "hold on", "help" +} ``` -### Testing - -Automated tests are essential for building reliable agents, especially with the non-deterministic behavior of LLMs. LiveKit Agents include native test integration to help you create dependable agents. - +### 4. `basic_agent.py` +**Location**: `examples/voice_agents/basic_agent.py` +**Changes**: Updated agent instructions to ignore backchanneling ```python -@pytest.mark.asyncio -async def test_no_availability() -> None: - llm = google.LLM() - async AgentSession(llm=llm) as sess: - await sess.start(MyAgent()) - result = await sess.run( - user_input="Hello, I need to place an order." 
- ) - result.expect.skip_next_event_if(type="message", role="assistant") - result.expect.next_event().is_function_call(name="start_order") - result.expect.next_event().is_function_call_output() - await ( - result.expect.next_event() - .is_message(role="assistant") - .judge(llm, intent="assistant should be asking the user what they would like") - ) - +def __init__(self) -> None: + super().__init__( + instructions="Your name is Kelly. You would interact with users via voice." + "with that in mind keep your responses concise and to the point." + "do not use emojis, asterisks, markdown, or other special characters in your responses." + "You are curious and friendly, and have a sense of humor." + "you will speak english to the user" + "IMPORTANT: Ignore backchanneling responses like 'yeah', 'okay', 'uh-huh' - these are just acknowledgments, not questions or commands." + "Only respond to meaningful content, questions, or commands." + ) ``` -## Intelligent Interruption Handling - -The framework includes advanced interruption logic that intelligently distinguishes between genuine interruptions and conversational backchanneling, ensuring smooth and natural voice interactions. - -### How It Works - -The interruption system operates in three key stages: - -1. **Backchanneling Detection**: When the agent is speaking, the system first checks if the user's input consists solely of backchanneling words (acknowledgments, agreements, etc.) -2. **Interruption Decision**: If not backchanneling, the system determines whether the input requires interrupting the agent -3. 
**Action Execution**: Based on the decision, it either interrupts immediately or processes without interruption +## How It Works -### Backchanneling Words +### Three-Stage Process -The system automatically ignores common conversational acknowledgments such as: - -- **Basic acknowledgments**: "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm" -- **Agreement words**: "right", "sure", "gotcha", "yep", "yup", "alright" -- **Positive feedback**: "good", "great", "excellent", "perfect", "awesome" -- **Understanding signals**: "i see", "i understand", "makes sense", "got it" -- **Listening cues**: "go on", "continue", "tell me more", "interesting" -- **Filler words**: "aha", "mmhmm", "uh-huh", "right" - -### Smart Logic Rules - -- **Repeated words**: "yeah yeah" or "okay okay" are treated as emphasis (real input), not backchanneling -- **Command words**: Always trigger interruption (e.g., "stop", "wait", "help") -- **Mixed content**: If input contains both backchanneling and meaningful content, it's processed as real input -- **Agent state**: Backchanneling is ignored regardless of whether the agent is speaking or silent +1. **Backchanneling Detection**: When agent is speaking, check if input is pure backchanneling +2. **Interruption Decision**: If not backchanneling, determine if interruption is needed +3. 
**Action Execution**: Either interrupt immediately or process without interruption ### Audio Behavior -- **No audio breaks**: Agent audio continues uninterrupted during backchanneling verification -- **Immediate interruption**: Only occurs after confirming genuine user input -- **Smooth transitions**: Seamless handling between ignoring and interrupting +- **No audio breaks** during verification process +- **Immediate interruption** only after confirming genuine input +- **Smooth transitions** between ignoring and interrupting -### Example Scenarios +## Example Scenarios ``` +Scenario 1: Agent Speaking + Backchanneling Agent: "Let me explain the three main benefits of our system..." User: "uh-huh, right, okay" Result: Agent continues speaking uninterrupted +Scenario 2: Agent Speaking + Command Agent: "The first benefit is improved efficiency..." User: "wait! stop! I have a question" Result: Agent stops immediately and listens +Scenario 3: Agent Silent + Any Input Agent: "Any questions about what I've covered?" User: "how does this work?" Result: Agent processes without interruption (already silent) -``` -### Configuration - -The interruption behavior can be customized through: - -- **Ignore words list**: Modify which words are considered backchanneling -- **Command words list**: Define words that always trigger interruption -- **Thresholds**: Adjust sensitivity for mixed content detection -- **Agent instructions**: Guide the LLM to ignore backchanneling responses - -### Files Involved - -- `agent_activity.py`: Core interruption logic in `on_final_transcript()` -- `interruption_handler.py`: Decision-making logic in `should_interrupt()` -- `interrupt_config.py`: Word lists and configuration -- `basic_agent.py`: Agent instructions for LLM behavior - -## Examples - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

🎙️ Starter Agent

-

A starter agent optimized for voice conversations.

-

-Code -

-
-

🔄 Multi-user push to talk

-

Responds to multiple users in the room via push-to-talk.

-

-Code -

-
-

🎵 Background audio

-

Background ambient and thinking audio to improve realism.

-

-Code -

-
-

🛠️ Dynamic tool creation

-

Creating function tools dynamically.

-

-Code -

-
-

☎️ Outbound caller

-

Agent that makes outbound phone calls

-

-Code -

-
-

📋 Structured output

-

Using structured output from LLM to guide TTS tone.

-

-Code -

-
-

🔌 MCP support

-

Use tools from MCP servers

-

-Code -

-
-

💬 Text-only agent

-

Skip voice altogether and use the same code for text-only integrations

-

-Code -

-
-

📝 Multi-user transcriber

-

Produce transcriptions from all users in the room

-

-Code -

-
-

🎥 Video avatars

-

Add an AI avatar with Tavus, Beyond Presence, and Bithuman

-

-Code -

-
-

🍽️ Restaurant ordering and reservations

-

Full example of an agent that handles calls for a restaurant.

-

-Code -

-
-

👁️ Gemini Live vision

-

Full example (including iOS app) of Gemini Live agent that can see.

-

-Code -

-
- -## Running your agent - -### Testing in terminal - -```shell -python myagent.py console +Scenario 4: Repeated Words (Emphasis) +Agent: "Our system provides real-time analytics..." +User: "yeah yeah, tell me more" +Result: Agent interrupts (repetition = emphasis) ``` -Runs your agent in terminal mode, enabling local audio input and output for testing. -This mode doesn't require external servers or dependencies and is useful for quickly validating behavior. +## Testing Requirements -### Developing with LiveKit clients +To demonstrate the functionality, test these scenarios: -```shell -python myagent.py dev -``` +1. โœ… **Agent ignoring "yeah" while talking** + - Start agent speaking a long response + - Say "yeah", "okay", "uh-huh" during speech + - Expected: Agent continues uninterrupted -Starts the agent server and enables hot reloading when files change. This mode allows each process to host multiple concurrent agents efficiently. +2. โœ… **Agent responding to "yeah" when silent** + - Wait for agent to finish speaking + - Say "yeah" when agent is silent + - Expected: Agent processes as acknowledgment -The agent connects to LiveKit Cloud or your self-hosted server. Set the following environment variables: -- LIVEKIT_URL -- LIVEKIT_API_KEY -- LIVEKIT_API_SECRET +3. โœ… **Agent stopping for "stop"** + - Start agent speaking + - Say "stop" or "wait" during speech + - Expected: Agent stops immediately -You can connect using any LiveKit client SDK or telephony integration. -To get started quickly, try the [Agents Playground](https://agents-playground.livekit.io/). +## Installation & Setup -### Running for production +1. Ensure all dependencies are installed +2. Set up required environment variables (API keys) +3. 
Run the basic agent example to test functionality -```shell -python myagent.py start -``` +## Key Benefits + +- **Natural Conversations**: No more awkward interruptions for acknowledgments +- **Better User Experience**: Smooth, human-like interaction patterns +- **Maintained Control**: Commands still work when needed +- **Zero Audio Breaks**: Seamless audio experience throughout + +## Technical Achievement -Runs the agent with production-ready optimizations. - -## Contributing - -The Agents framework is under active development in a rapidly evolving field. We welcome and appreciate contributions of any kind, be it feedback, bugfixes, features, new plugins and tools, or better documentation. You can file issues under this repo, open a PR, or chat with us in LiveKit's [Slack community](https://livekit.io/join-slack). - - -
- - - - - - - - - -
LiveKit Ecosystem
LiveKit SDKs: Browser · iOS/macOS/visionOS · Android · Flutter · React Native · Rust · Node.js · Python · Unity · Unity (WebGL) · ESP32
Server APIs: Node.js · Golang · Ruby · Java/Kotlin · Python · Rust · PHP (community) · .NET (community)
UI Components: React · Android Compose · SwiftUI · Flutter
Agents Frameworks: Python · Node.js · Playground
Services: LiveKit server · Egress · Ingress · SIP
Resources: Docs · Example apps · Cloud · Self-hosting · CLI
- +Successfully solves the core challenge: **distinguishing between ignoring a word while speaking vs. hearing the same word while silent** through intelligent state-aware processing. From 994f89ed6b9351e4a64538f8bad529e40788532b Mon Sep 17 00:00:00 2001 From: Aditya Gupta <148680267+adi9336@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:29:28 +0530 Subject: [PATCH 3/5] Update README - make it clean and concise --- README.md | 190 +++++++++--------------------------------------------- 1 file changed, 30 insertions(+), 160 deletions(-) diff --git a/README.md b/README.md index f03eb805a4..68f4b8c92c 100644 --- a/README.md +++ b/README.md @@ -2,183 +2,53 @@ ## Project Overview -This project implements advanced interruption handling for voice agents, enabling them to distinguish between genuine interruptions and conversational backchanneling, ensuring smooth and natural voice interactions. +Advanced interruption handling for voice agents that distinguishes between genuine interruptions and conversational backchanneling. 
-## Features Implemented +## Key Features -### ๐ŸŽฏ Core Functionality -- **Intelligent Backchanneling Detection**: Automatically ignores common conversational acknowledgments -- **No Audio Breaks**: Agent audio continues uninterrupted during backchanneling verification -- **Smart Logic Rules**: Handles repeated words, commands, and mixed content appropriately -- **Real-time Processing**: All decisions happen instantly without audio interruption - -### ๐Ÿง  Smart Logic Implementation - -#### Backchanneling Words Ignored -- **Basic acknowledgments**: "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm" -- **Agreement words**: "right", "sure", "gotcha", "yep", "yup", "alright" -- **Positive feedback**: "good", "great", "excellent", "perfect", "awesome" -- **Understanding signals**: "i see", "i understand", "makes sense", "got it" -- **Listening cues**: "go on", "continue", "tell me more", "interesting" -- **Filler words**: "aha", "mmhmm" - -#### Decision Logic -- **Repeated words**: "yeah yeah" treated as emphasis (real input), not backchanneling -- **Command words**: Always trigger interruption ("stop", "wait", "help", "pause") -- **Mixed content**: Processed as real input if contains meaningful content -- **Agent state aware**: Works correctly whether agent is speaking or silent +- **Smart Backchanneling Detection**: Ignores acknowledgments like "yeah", "okay", "uh-huh" +- **No Audio Breaks**: Agent continues uninterrupted during verification +- **Intelligent Logic**: Handles repeated words, commands, and mixed content +- **State-Aware**: Works correctly whether agent is speaking or silent ## Files Modified ### 1. 
`agent_activity.py` -**Location**: `livekit-agents/livekit/agents/voice/agent_activity.py` -**Changes**: Updated `on_final_transcript()` function with intelligent filtering logic -```python -def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = None) -> None: - """Process final transcript with intelligent filtering.""" - - # Step 1: Check backchanneling FIRST - if self._interruption_handler.agent_is_speaking: - words = re.findall(r'\b\w+\b', transcript.strip().lower()) - all_backchanneling = all(word in self._interruption_handler.ignore_words for word in words) - if all_backchanneling: - return # Ignore completely - - # Step 2: Check interruption logic for non-backchanneling - should_interrupt = self._interruption_handler.should_interrupt(transcript) - - # Step 3: Handle interruption and send to LLM -``` - -### 2. `interruption_handler.py` -**Location**: `livekit-agents/livekit/agents/voice/interruption_handler.py` -**Changes**: Enhanced `should_interrupt()` with comprehensive logic -```python -def should_interrupt(self, transcript: str) -> bool: - """Determine if transcript should interrupt with smart logic.""" - - # Check for repeated words (emphasis) - word_counts = {} - for word in words: - word_counts[word] = word_counts.get(word, 0) + 1 - - # Repeated ignore words = real input - for word, count in word_counts.items(): - if count >= 2 and word in self.ignore_words: - return True - - # Check command words - for word in words: - if word in self.command_words: - return True - - # Check ignorable ratio for mixed content - ignorable_ratio = ignorable_count / len(words) - if ignorable_ratio >= 0.75: - return False -``` +Updated `on_final_transcript()` with intelligent filtering: +- Checks backchanneling first before calling interruption logic +- Prevents audio breaks during verification +- Immediate return for pure backchanneling + +### 2. 
`interruption_handler.py` +Enhanced `should_interrupt()` with smart logic: +- Repeated words detection (emphasis vs backchanneling) +- Command words prioritization +- Mixed content ratio analysis ### 3. `interrupt_config.py` -**Location**: `livekit-agents/livekit/agents/voice/interrupt_config.py` -**Changes**: Created comprehensive word lists and configuration -```python -DEFAULT_IGNORE_WORDS: Set[str] = { - "yeah", "ok", "okay", "hmm", "uh-huh", "mhmm", "right", "sure", - "aha", "gotcha", "yep", "yup", "mmhmm", "alright", "good", "great", - "excellent", "perfect", "awesome", "i see", "i understand", "makes sense", - "got it", "go on", "continue", "tell me more", "interesting" -} - -DEFAULT_COMMAND_WORDS: Set[str] = { - "wait", "stop", "no", "pause", "hold", "hold on", "help" -} -``` +Comprehensive word lists: +- **Ignore words**: "yeah", "ok", "okay", "hmm", "uh-huh", "right", "sure", etc. +- **Command words**: "stop", "wait", "help", "pause", "hold" ### 4. `basic_agent.py` -**Location**: `examples/voice_agents/basic_agent.py` -**Changes**: Updated agent instructions to ignore backchanneling -```python -def __init__(self) -> None: - super().__init__( - instructions="Your name is Kelly. You would interact with users via voice." - "with that in mind keep your responses concise and to the point." - "do not use emojis, asterisks, markdown, or other special characters in your responses." - "You are curious and friendly, and have a sense of humor." - "you will speak english to the user" - "IMPORTANT: Ignore backchanneling responses like 'yeah', 'okay', 'uh-huh' - these are just acknowledgments, not questions or commands." - "Only respond to meaningful content, questions, or commands." - ) -``` +Updated agent instructions to ignore backchanneling responses. ## How It Works -### Three-Stage Process - -1. **Backchanneling Detection**: When agent is speaking, check if input is pure backchanneling -2. 
**Interruption Decision**: If not backchanneling, determine if interruption is needed -3. **Action Execution**: Either interrupt immediately or process without interruption - -### Audio Behavior - -- **No audio breaks** during verification process -- **Immediate interruption** only after confirming genuine input -- **Smooth transitions** between ignoring and interrupting - -## Example Scenarios - -``` -Scenario 1: Agent Speaking + Backchanneling -Agent: "Let me explain the three main benefits of our system..." -User: "uh-huh, right, okay" -Result: Agent continues speaking uninterrupted - -Scenario 2: Agent Speaking + Command -Agent: "The first benefit is improved efficiency..." -User: "wait! stop! I have a question" -Result: Agent stops immediately and listens - -Scenario 3: Agent Silent + Any Input -Agent: "Any questions about what I've covered?" -User: "how does this work?" -Result: Agent processes without interruption (already silent) - -Scenario 4: Repeated Words (Emphasis) -Agent: "Our system provides real-time analytics..." -User: "yeah yeah, tell me more" -Result: Agent interrupts (repetition = emphasis) -``` - -## Testing Requirements - -To demonstrate the functionality, test these scenarios: - -1. โœ… **Agent ignoring "yeah" while talking** - - Start agent speaking a long response - - Say "yeah", "okay", "uh-huh" during speech - - Expected: Agent continues uninterrupted - -2. โœ… **Agent responding to "yeah" when silent** - - Wait for agent to finish speaking - - Say "yeah" when agent is silent - - Expected: Agent processes as acknowledgment - -3. โœ… **Agent stopping for "stop"** - - Start agent speaking - - Say "stop" or "wait" during speech - - Expected: Agent stops immediately +1. **Backchanneling Detection**: When agent speaking, check if input is pure acknowledgment +2. **Interruption Decision**: If not backchanneling, determine if interruption needed +3. 
**Action Execution**: Interrupt immediately or process without interruption -## Installation & Setup +## Test Scenarios -1. Ensure all dependencies are installed -2. Set up required environment variables (API keys) -3. Run the basic agent example to test functionality +โœ… **Agent ignoring "yeah" while talking** +- Agent continues speaking uninterrupted -## Key Benefits +โœ… **Agent responding to "yeah" when silent** +- Agent processes as acknowledgment -- **Natural Conversations**: No more awkward interruptions for acknowledgments -- **Better User Experience**: Smooth, human-like interaction patterns -- **Maintained Control**: Commands still work when needed -- **Zero Audio Breaks**: Seamless audio experience throughout +โœ… **Agent stopping for "stop"** +- Agent stops immediately for commands ## Technical Achievement From 5823047498b850da5ccea0118a9c418be6c1ffa8 Mon Sep 17 00:00:00 2001 From: Aditya Gupta <148680267+adi9336@users.noreply.github.com> Date: Sun, 15 Feb 2026 16:47:37 +0530 Subject: [PATCH 4/5] Update interruption handler with enhanced logging and phrase support - Add detailed logging to both RealtimeModel and regular transcript paths - Implement phrase-based backchanneling detection (e.g., 'i see', 'got it') - Add word extraction and normalization utilities - Fix repeated word logic to only interrupt non-ignored words - Defer interruption decisions to transcript processing for cleaner flow - Add comprehensive logging for debugging interruption decisions - Support both single words and multi-word phrases in ignore list --- .../livekit/agents/voice/agent_activity.py | 176 +++++++++++++++--- .../agents/voice/interruption_handler.py | 88 +++++++-- 2 files changed, 227 insertions(+), 37 deletions(-) diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py index 2a81e7c2ff..cdf31b6932 100644 --- a/livekit-agents/livekit/agents/voice/agent_activity.py +++ 
b/livekit-agents/livekit/agents/voice/agent_activity.py @@ -1127,6 +1127,9 @@ def _on_input_speech_started(self, _: llm.InputSpeechStartedEvent) -> None: # self.interrupt() is going to raise when allow_interruptions is False, llm.InputSpeechStartedEvent is only fired by the server when the turn_detection is enabled. # noqa: E501 # When using the server-side turn_detection, we don't allow allow_interruptions to be False. + if self._interruption_handler.agent_is_speaking: + # Defer interruption decision to transcript to filter backchanneling. + return try: self.interrupt() # input_speech_started is also interrupting on the serverside realtime session # noqa: E501 except RuntimeError: @@ -1144,16 +1147,100 @@ def _on_input_speech_stopped(self, ev: llm.InputSpeechStoppedEvent) -> None: ) def _on_input_audio_transcription_completed(self, ev: llm.InputTranscriptionCompleted) -> None: - self._session._user_input_transcribed( - UserInputTranscribedEvent(transcript=ev.transcript, is_final=ev.is_final) - ) - + # Handle RealtimeModel transcripts with intelligent interruption logic if ev.is_final: - # TODO: for realtime models, the created_at field is off. it should be set to when the user started speaking. - # but we don't have that information here. 
+ logger.info( + f"๐ŸŽค REALTIME TRANSCRIPT: '{ev.transcript}' | " + f"Agent speaking: {self._interruption_handler.agent_is_speaking}" + ) + + # ===== STEP 1: Check if agent is speaking (for backchanneling detection) ===== + if self._interruption_handler.agent_is_speaking: + logger.info(f"๐Ÿ”Š Agent is speaking - checking for backchanneling...") + + # Agent is speaking - check if this is backchanneling + words = self._interruption_handler.extract_words(ev.transcript) + logger.info(f"๐Ÿ“ Extracted words: {words}") + + if words: + # Check each word against ignore list + word_analysis = [] + for word in words: + is_ignored = word in self._interruption_handler.ignore_words + word_analysis.append(f"{word}{'(ignored)' if is_ignored else '(valid)'}") + + logger.info(f"๐Ÿ” Word analysis: {' '.join(word_analysis)}") + + # Check if all words are in ignore list (backchanneling) + all_backchanneling = self._interruption_handler.is_pure_backchannel(ev.transcript) + + logger.info(f"๐Ÿค” All backchanneling? 
{all_backchanneling}") + + if all_backchanneling: + # ===== PURE BACKCHANNELING - IGNORE COMPLETELY ===== + logger.info( + f"๐Ÿ”‡ BACKCHANNELING IGNORED: '{ev.transcript}' - " + f"NOT sending to LLM, NOT interrupting" + ) + + # Mark as ignored but don't add to chat context + self._session._user_input_transcribed( + UserInputTranscribedEvent( + transcript=f"[ignored: {ev.transcript}]", + is_final=True, + ), + ) + + # ===== STOP HERE - DO NOT PROCESS FURTHER ===== + logger.info(f"๐Ÿšซ PROCESSING STOPPED - Pure backchanneling detected") + return # This is KEY - prevents adding to LLM context + else: + logger.info(f"๐Ÿ”‡ Agent is silent - normal processing") + + # ===== STEP 2: Not pure backchanneling - Check interruption logic ===== + logger.info(f"๐Ÿง  Checking interruption logic for: '{ev.transcript}'") + should_interrupt = ( + self._interruption_handler.agent_is_speaking + and self._interruption_handler.should_interrupt(ev.transcript) + ) + + logger.info( + f"โšก INTERRUPTION DECISION: {should_interrupt} for '{ev.transcript}'" + ) + + if should_interrupt: + # ===== THIS IS REAL INTERRUPTION - interrupt FIRST ===== + logger.info(f"๐ŸŽฏ REAL INTERRUPTION: '{ev.transcript}' - Interrupting agent speech") + + # Run interruption logic IMMEDIATELY before sending to LLM + if self._audio_recognition and self._turn_detection not in ("manual", "realtime_llm"): + logger.info(f"๐Ÿ›‘ Executing _interrupt_by_audio_activity()") + self._interrupt_by_audio_activity() + elif self._rt_session is not None: + self.interrupt() + + logger.info(f"๐Ÿ”„ Creating interrupt_paused_speech task") + self._interrupt_paused_speech_task = asyncio.create_task( + self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) + ) + else: + # ===== NO INTERRUPTION NEEDED - Agent silent or content doesn't require interrupt ===== + logger.info(f"โœ… NO INTERRUPTION: '{ev.transcript}' - Agent silent or content doesn't interrupt") + + # ===== STEP 3: Send to LLM (only after interruption decision 
is made) ===== + logger.info(f"๐Ÿ“ค SENDING TO LLM: '{ev.transcript}'") + + # Add to chat context for RealtimeModel msg = llm.ChatMessage(role="user", content=[ev.transcript], id=ev.item_id) self._agent._chat_ctx.items.append(msg) self._session._conversation_item_added(msg) + + logger.info(f"โœ… PROCESSING COMPLETE for: '{ev.transcript}'") + else: + # For non-final transcripts, just emit the event + self._session._user_input_transcribed( + UserInputTranscribedEvent(transcript=ev.transcript, is_final=ev.is_final) + ) def _on_generation_created(self, ev: llm.GenerationCreatedEvent) -> None: if ev.user_initiated: @@ -1256,6 +1343,10 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None: # ignore vad inference done event if turn_detection is manual or realtime_llm return + if self._interruption_handler.agent_is_speaking: + # Defer interruption decision to transcript when agent is speaking. + return + if ev.speech_duration >= self._session.options.min_interruption_duration: self._interrupt_by_audio_activity() @@ -1277,19 +1368,27 @@ def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) - "manual", "realtime_llm", ): - self._interrupt_by_audio_activity() + should_interrupt = True + if self._interruption_handler.agent_is_speaking: + should_interrupt = self._interruption_handler.should_interrupt( + ev.alternatives[0].text + ) - if ( - speaking is False - and self._paused_speech - and (timeout := self._session.options.false_interruption_timeout) is not None - ): - # schedule a resume timer if interrupted after end_of_speech - self._start_false_interruption_timer(timeout) + if should_interrupt: + self._interrupt_by_audio_activity() - self._interrupt_paused_speech_task = asyncio.create_task( - self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) - ) + if ( + speaking is False + and self._paused_speech + and (timeout := self._session.options.false_interruption_timeout) + is not None + ): + # schedule a resume timer if 
interrupted after end_of_speech + self._start_false_interruption_timer(timeout) + + self._interrupt_paused_speech_task = asyncio.create_task( + self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) + ) def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = None) -> None: """Process final transcript with intelligent filtering.""" @@ -1300,14 +1399,32 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No # Get transcript transcript = ev.alternatives[0].text + logger.info( + f"๐ŸŽค FINAL TRANSCRIPT RECEIVED: '{transcript}' | " + f"Agent speaking: {self._interruption_handler.agent_is_speaking}" + ) + # ===== STEP 1: Check if agent is speaking (for backchanneling detection) ===== if self._interruption_handler.agent_is_speaking: + logger.info(f"๐Ÿ”Š Agent is speaking - checking for backchanneling...") + # Agent is speaking - check if this is backchanneling - words = re.findall(r'\b\w+\b', transcript.strip().lower()) + words = self._interruption_handler.extract_words(transcript) + logger.info(f"๐Ÿ“ Extracted words: {words}") if words: + # Check each word against ignore list + word_analysis = [] + for word in words: + is_ignored = word in self._interruption_handler.ignore_words + word_analysis.append(f"{word}{'(ignored)' if is_ignored else '(valid)'}") + + logger.info(f"๐Ÿ” Word analysis: {' '.join(word_analysis)}") + # Check if all words are in ignore list (backchanneling) - all_backchanneling = all(word in self._interruption_handler.ignore_words for word in words) + all_backchanneling = self._interruption_handler.is_pure_backchannel(transcript) + + logger.info(f"๐Ÿค” All backchanneling? 
{all_backchanneling}") if all_backchanneling: # ===== PURE BACKCHANNELING - IGNORE COMPLETELY ===== @@ -1327,13 +1444,20 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No ) # ===== STOP HERE - DO NOT PROCESS FURTHER ===== + logger.info(f"๐Ÿšซ PROCESSING STOPPED - Pure backchanneling detected") return # This is the KEY - prevents sending to LLM and interruption + else: + logger.info(f"๐Ÿ”‡ Agent is silent - normal processing") # ===== STEP 2: Not pure backchanneling - Check interruption logic ===== - should_interrupt = self._interruption_handler.should_interrupt(transcript) + logger.info(f"๐Ÿง  Checking interruption logic for: '{transcript}'") + should_interrupt = ( + self._interruption_handler.agent_is_speaking + and self._interruption_handler.should_interrupt(transcript) + ) - logger.debug( - f"Should interrupt: {should_interrupt}" + logger.info( + f"โšก INTERRUPTION DECISION: {should_interrupt} for '{transcript}'" ) if should_interrupt: @@ -1342,15 +1466,17 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No # Run interruption logic IMMEDIATELY before sending to LLM if self._audio_recognition and self._turn_detection not in ("manual", "realtime_llm"): + logger.info(f"๐Ÿ›‘ Executing _interrupt_by_audio_activity()") self._interrupt_by_audio_activity() - if ( speaking is False and self._paused_speech and (timeout := self._session.options.false_interruption_timeout) is not None ): + logger.info(f"โฐ Starting false interruption timer: {timeout}s") self._start_false_interruption_timer(timeout) + logger.info(f"๐Ÿ”„ Creating interrupt_paused_speech task") self._interrupt_paused_speech_task = asyncio.create_task( self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) ) @@ -1359,6 +1485,7 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No logger.info(f"โœ… NO INTERRUPTION: '{transcript}' - Agent silent or content doesn't interrupt") # ===== STEP 3: Send 
to LLM (only after interruption decision is made) ===== + logger.info(f"๐Ÿ“ค SENDING TO LLM: '{transcript}'") # Emit transcription event (this goes to LLM) self._session._user_input_transcribed( @@ -1370,7 +1497,7 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No ), ) - logger.info(f"๐Ÿ“ค SENDING TO LLM: '{transcript}'") + logger.info(f"โœ… PROCESSING COMPLETE for: '{transcript}'") def on_preemptive_generation(self, info: _PreemptiveGenerationInfo) -> None: if ( @@ -2674,3 +2801,4 @@ def llm(self) -> llm.LLM | llm.RealtimeModel | None: @property def tts(self) -> tts.TTS | None: return self._agent.tts if is_given(self._agent.tts) else self._session.tts + diff --git a/livekit-agents/livekit/agents/voice/interruption_handler.py b/livekit-agents/livekit/agents/voice/interruption_handler.py index c5008df83c..7cddbc4840 100644 --- a/livekit-agents/livekit/agents/voice/interruption_handler.py +++ b/livekit-agents/livekit/agents/voice/interruption_handler.py @@ -38,7 +38,9 @@ def __init__( command_words: Set of words that always trigger interruption """ # Normalize all words to lowercase - self.ignore_words = {w.lower().strip() for w in ignore_words} + normalized_ignore = {w.lower().strip() for w in ignore_words} + self.ignore_phrases = {w for w in normalized_ignore if " " in w} + self.ignore_words = {w for w in normalized_ignore if " " not in w} self.command_words = {w.lower().strip() for w in command_words} # Track agent state @@ -58,6 +60,43 @@ def __init__( f"{len(self.ignore_words)} ignore words, " f"{len(self.command_words)} command words" ) + + def _normalize(self, transcript: str) -> str: + return re.sub(r"\s+", " ", transcript.strip().lower()) + + def _tokenize(self, text: str) -> list[str]: + return re.findall(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)*", text) + + def _remove_ignore_phrases(self, text: str) -> str: + if not self.ignore_phrases or not text: + return text + + cleaned = text + for phrase in sorted(self.ignore_phrases, key=len, 
reverse=True): + pattern = r"\b" + re.escape(phrase) + r"\b" + cleaned = re.sub(pattern, " ", cleaned) + + return re.sub(r"\s+", " ", cleaned).strip() + + def extract_words(self, transcript: str) -> list[str]: + """Extract normalized tokens from transcript.""" + return self._tokenize(self._normalize(transcript)) + + def is_pure_backchannel(self, transcript: str) -> bool: + """Return True if transcript is composed only of ignorable words/phrases.""" + normalized = self._normalize(transcript) + if not normalized: + return False + + without_phrases = self._remove_ignore_phrases(normalized) + if not without_phrases: + return True + + words = self._tokenize(without_phrases) + if not words: + return True + + return all(word in self.ignore_words for word in words) @property def agent_is_speaking(self) -> bool: @@ -86,64 +125,87 @@ def should_interrupt(self, transcript: str) -> bool: # Normalize transcript_original = transcript - transcript = transcript.strip().lower() + transcript = self._normalize(transcript) if not transcript: return False # Extract words - words = re.findall(r'\b\w+\b', transcript) + words = self._tokenize(transcript) if not words: return False - logger.debug( - f"Processing transcript: '{transcript_original}', " - f"words: {words}" + logger.info( + f"๐Ÿง  INTERRUPTION HANDLER: Processing '{transcript_original}' | " + f"Agent speaking: {self._agent_is_speaking} | " + f"Words: {words}" ) # ===== RULE 1: Check for command words (ALWAYS interrupt) ===== + logger.info(f"๐Ÿ” Checking for command words in: {words}") for word in words: if word in self.command_words: self.stats["interrupted"] += 1 - logger.info(f"INTERRUPT: Command '{word}' in '{transcript_original}'") + logger.info(f"๐Ÿšจ COMMAND DETECTED: '{word}' in '{transcript_original}' - INTERRUPTING") return True + logger.info(f"โœ… No command words found") + # ===== RULE 2: Check for repeated words (emphasis) ===== + logger.info(f"๐Ÿ” Checking for repeated words in: {words}") # If same word appears 
2+ times, it's emphasis, not backchanneling word_counts = {} for word in words: word_counts[word] = word_counts.get(word, 0) + 1 + logger.info(f"๐Ÿ“Š Word counts: {word_counts}") + # If any word repeated 2+ times, treat as real input (interrupt) for word, count in word_counts.items(): - if count >= 2: + if count >= 2 and word not in self.ignore_words: self.stats["interrupted"] += 1 logger.info( - f"INTERRUPT: Repeated word '{word}' x{count} = emphasis: '{transcript_original}'" + f"๐Ÿ”„ REPEATED WORD: '{word}' x{count} = emphasis in '{transcript_original}' - INTERRUPTING" ) return True + logger.info(f"โœ… No repeated words found") + # ===== RULE 3: Check if all words are backchanneling (ALWAYS ignore) ===== - all_backchanneling = all(word in self.ignore_words for word in words) + logger.info(f"๐Ÿ” Checking if all words are backchanneling: {words}") + all_backchanneling = self.is_pure_backchannel(transcript) + logger.info(f"๐Ÿค” All backchanneling result: {all_backchanneling}") + + # Check each word individually for detailed logging + word_status = [] + for word in words: + is_ignored = word in self.ignore_words + status = "ignored" if is_ignored else "valid" + word_status.append(f"{word}({status})") + + logger.info(f"๐Ÿ“ Word status: {' '.join(word_status)}") if all_backchanneling: self.stats["ignored"] += 1 - logger.info(f"IGNORE: Pure backchanneling: '{transcript_original}'") + logger.info(f"๐Ÿ”‡ PURE BACKCHANNELING: '{transcript_original}' - IGNORING") return False # ===== RULE 4: Mixed content - check ratio ===== + logger.info(f"๐Ÿ” Checking mixed content ratio for: {words}") ignorable_count = sum(1 for word in words if word in self.ignore_words) ignorable_ratio = ignorable_count / len(words) + logger.info(f"๐Ÿ“Š Mixed content: {ignorable_count}/{len(words)} ignorable = {ignorable_ratio:.0%}") + if ignorable_ratio >= 0.75: self.stats["ignored"] += 1 - logger.info(f"IGNORE: Mostly backchanneling ({ignorable_ratio:.0%}): '{transcript_original}'") + 
logger.info(f"๐Ÿ”‡ MOSTLY BACKCHANNELING: {ignorable_ratio:.0%} ignorable in '{transcript_original}' - IGNORING") return False # ===== RULE 5: Real content (interrupt) ===== self.stats["interrupted"] += 1 - logger.info(f"INTERRUPT: Real input: '{transcript_original}'") + logger.info(f"๐ŸŽฏ REAL CONTENT: '{transcript_original}' - INTERRUPTING") return True def get_stats(self) -> dict: From e0e3df86e1c3ca3dcb675579f01c5feff7e7e39f Mon Sep 17 00:00:00 2001 From: Aditya Gupta <148680267+adi9336@users.noreply.github.com> Date: Sun, 15 Feb 2026 19:20:49 +0530 Subject: [PATCH 5/5] Add proof documentation --- proof.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 proof.md diff --git a/proof.md b/proof.md new file mode 100644 index 0000000000..e69de29bb2