From 6c7ea70f999a06dde6ee7e4abe67cb495fdefe58 Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Sun, 1 Feb 2026 21:55:26 +0530 Subject: [PATCH 1/8] Updated session settings for interrupt handling --- examples/voice_agents/basic_agent.py | 97 +++++++-------- examples/voice_agents/test.py | 172 +++++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 57 deletions(-) create mode 100644 examples/voice_agents/test.py diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py index f064dab5d7..6371ab0243 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -3,16 +3,9 @@ from dotenv import load_dotenv from livekit.agents import ( - Agent, - AgentServer, - AgentSession, - JobContext, - JobProcess, - MetricsCollectedEvent, - RunContext, - cli, - metrics, - room_io, + Agent, AgentServer, AgentSession, JobContext, JobProcess, + cli, metrics, UserInputTranscribedEvent, AgentStateChangedEvent, + UserStateChangedEvent, SpeechCreatedEvent, AgentFalseInterruptionEvent ) from livekit.agents.llm import function_tool from livekit.plugins import silero @@ -21,7 +14,7 @@ # uncomment to enable Krisp background voice/noise cancellation # from livekit.plugins import noise_cancellation -logger = logging.getLogger("basic-agent") +logger = logging.getLogger("intelligent-kelly") load_dotenv() @@ -35,33 +28,13 @@ def __init__(self) -> None: "You are curious and friendly, and have a sense of humor." 
"you will speak english to the user", ) - + self.is_speaking = False + self.current_speech_handle = None async def on_enter(self): - # when the agent is added to the session, it'll generate a reply - # according to its instructions - self.session.generate_reply() - - # all functions annotated with @function_tool will be passed to the LLM when this - # agent is active - @function_tool - async def lookup_weather( - self, context: RunContext, location: str, latitude: str, longitude: str - ): - """Called when the user asks for weather related information. - Ensure the user's location (city or region) is provided. - When given a location, please estimate the latitude and longitude of the location and - do not ask the user for them. - - Args: - location: The location they are asking for - latitude: The latitude of the location, do not ask user for it - longitude: The longitude of the location, do not ask user for it - """ - - logger.info(f"Looking up weather for {location}") - - return "sunny with a temperature of 70 degrees." 
- + + await self.session.generate_reply() + + server = AgentServer() @@ -75,31 +48,41 @@ def prewarm(proc: JobProcess): @server.rtc_session() async def entrypoint(ctx: JobContext): - # each log entry will include these fields - ctx.log_context_fields = { - "room": ctx.room.name, - } + # === CRITICAL: Configure false interruption handling === session = AgentSession( - # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand - # See all available models at https://docs.livekit.io/agents/models/stt/ stt="deepgram/nova-3", - # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response - # See all available models at https://docs.livekit.io/agents/models/llm/ - llm="openai/gpt-4.1-mini", - # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear - # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/ + llm="openai/gpt-4o-mini", tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc", - # VAD and turn detection are used to determine when the user is speaking and when the agent should respond - # See more at https://docs.livekit.io/agents/build/turns - turn_detection=MultilingualModel(), vad=ctx.proc.userdata["vad"], - # allow the LLM to generate a response while waiting for the end of turn - # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation - preemptive_generation=True, - # sometimes background noise could interrupt the agent session, these are considered false positive interruptions - # when it's detected, you may resume the agent's speech - resume_false_interruption=True, + turn_detection=MultilingualModel(), + + # === PRIMARY DEFENSE: Built-in False Interruption Handling === + # These parameters tell LiveKit to wait for real speech before committing to an interruption + allow_interruptions=True, + + # Require at least 0.8 seconds of detected speech 
before interrupting + # This filters out most short fillers like "yeah", "ok", "hmm" + min_interruption_duration=0.8, + + # Require at least 2 words to be transcribed before confirming interruption + # Single filler words won't trigger interruption + min_interruption_words=2, + + # Wait up to 1 second for transcription after VAD detects speech + # If no real words are transcribed, treat it as a false interruption false_interruption_timeout=1.0, + + # Automatically resume speaking if interruption was false positive + # This is THE KEY FEATURE for seamless continuation + resume_false_interruption=True, + + # Don't buffer audio while agent is speaking + discard_audio_if_uninterruptible=True, + + # Other important settings + preemptive_generation=False, + min_endpointing_delay=0.6, + max_endpointing_delay=3.0, ) # log metrics as they are emitted, and total usage after session is over diff --git a/examples/voice_agents/test.py b/examples/voice_agents/test.py new file mode 100644 index 0000000000..38e2906df7 --- /dev/null +++ b/examples/voice_agents/test.py @@ -0,0 +1,172 @@ +import logging +import re +from dotenv import load_dotenv +from livekit.agents import ( + Agent, AgentServer, AgentSession, JobContext, JobProcess, + cli, metrics, UserInputTranscribedEvent, AgentStateChangedEvent, + UserStateChangedEvent, SpeechCreatedEvent, AgentFalseInterruptionEvent +) +from livekit.plugins import silero, deepgram, openai, cartesia +from livekit.plugins.turn_detector.multilingual import MultilingualModel + +logger = logging.getLogger("intelligent-kelly") +logger.setLevel(logging.INFO) +load_dotenv() + +# CONFIGURATION: Easily adjustable lists for modularity +STOP_WORDS = {"wait", "no", "stop", "finish", "hold", "pause", "hold on"} +FILLER_WORDS = { + "uhhuh", "okay", "alright", "mhm", "yeah", "yep", "yup", + "hmm", "right", "uh", "um", "ah", "gotit", "isee", "ok", "k", + "sure", "yes", "interesting", "really", "wow", "ohh", "ooh", + "aha", "uhuh", "mhmm", "gotcha", "nice" +} + 
+class MyAgent(Agent): + def __init__(self) -> None: + super().__init__( + instructions=( + "Your name is Kelly. Keep responses concise and witty. " + "If you hear acknowledgements like 'yeah' or 'ok' while explaining something, " + "that means the user is listening - keep going! " + "Only stop if they explicitly ask you to wait or stop." + ), + ) + # More reliable state tracking + self.is_speaking = False + self.current_speech_handle = None + + async def on_enter(self): + await self.session.generate_reply() + +server = AgentServer() + +def prewarm(proc: JobProcess): + proc.userdata["vad"] = silero.VAD.load() + +server.setup_fnc = prewarm + +@server.rtc_session() +async def entrypoint(ctx: JobContext): + # === CRITICAL: Configure false interruption handling === + session = AgentSession( + stt="deepgram/nova-3", + llm="openai/gpt-4o-mini", + tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc", + vad=ctx.proc.userdata["vad"], + turn_detection=MultilingualModel(), + + # === PRIMARY DEFENSE: Built-in False Interruption Handling === + # These parameters tell LiveKit to wait for real speech before committing to an interruption + allow_interruptions=True, + + # Require at least 0.8 seconds of detected speech before interrupting + # This filters out most short fillers like "yeah", "ok", "hmm" + min_interruption_duration=0.8, + + # Require at least 2 words to be transcribed before confirming interruption + # Single filler words won't trigger interruption + min_interruption_words=2, + + # Wait up to 1 second for transcription after VAD detects speech + # If no real words are transcribed, treat it as a false interruption + false_interruption_timeout=1.0, + + # Automatically resume speaking if interruption was false positive + # This is THE KEY FEATURE for seamless continuation + resume_false_interruption=True, + + # Don't buffer audio while agent is speaking + discard_audio_if_uninterruptible=True, + + # Other important settings + preemptive_generation=False, + 
min_endpointing_delay=0.6, + max_endpointing_delay=3.0, + ) + + kelly = MyAgent() + + # === STATE TRACKING === + + @session.on("speech_created") + def on_speech_created(ev: SpeechCreatedEvent): + """Track when Kelly actually starts speaking audio""" + kelly.is_speaking = True + kelly.current_speech_handle = ev.speech_handle + logger.info("🎤 KELLY STARTED SPEAKING") + + @session.on("agent_state_changed") + def on_agent_state_changed(ev: AgentStateChangedEvent): + """Track agent state transitions""" + logger.info(f"🎭 AGENT STATE: {ev.old_state} → {ev.new_state}") + + # Kelly is no longer speaking when she returns to listening + if ev.new_state == "listening": + kelly.is_speaking = False + kelly.current_speech_handle = None + logger.info("👂 Kelly finished and is now listening") + + @session.on("user_state_changed") + def on_user_state_changed(ev: UserStateChangedEvent): + """Track user state for debugging""" + logger.info(f"👤 USER STATE: {ev.old_state} → {ev.new_state}") + + @session.on("agent_false_interruption") + def on_false_interruption(ev: AgentFalseInterruptionEvent): + """Log when false interruptions are detected and resumed""" + if ev.resumed: + logger.info("✅ FALSE INTERRUPTION: Kelly resumed seamlessly") + else: + logger.warning("⚠️ FALSE INTERRUPTION: Not auto-resumed") + + # === SECONDARY FILTER: Transcript-Level Suppression === + + @session.on("user_input_transcribed") + def on_user_input_transcribed(ev: UserInputTranscribedEvent): + """ + This handler provides a SECONDARY layer of defense. + It runs after STT completes, so it can't prevent the initial VAD interruption, + but it can prevent filler words from reaching the LLM and generating responses. 
+ """ + if not ev.is_final or not ev.transcript: + return + + # Normalize text: lowercase, remove punctuation + clean_text = re.sub(r'[^\w\s]', '', ev.transcript.lower()).strip() + words = set(clean_text.split()) + raw_phrase = clean_text.replace(" ", "") # For "uhhuh" type phrases + + if not words: + return + + logger.info(f"📝 TRANSCRIPT: '{clean_text}' | Kelly speaking: {kelly.is_speaking}") + + # === CASE 1: Command words ALWAYS cause interruption === + if words.intersection(STOP_WORDS): + logger.info(f"🛑 COMMAND DETECTED: '{clean_text}' → Calling interrupt()") + session.interrupt() + return + + # === CASE 2: Pure filler words while Kelly is speaking === + # These should be ignored - don't let them reach the LLM + if kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): + logger.info(f"🔇 FILLER SUPPRESSED: '{clean_text}' ignored during speech") + # By returning here without calling session.interrupt() or forwarding to LLM, + # we ensure this transcript is effectively dropped + return + + # === CASE 3: Lone filler when Kelly is silent === + # Still suppress these - they shouldn't trigger LLM processing + if not kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): + logger.info(f"🍃 LONE FILLER: '{clean_text}' suppressed (Kelly idle)") + return + + # === CASE 4: Real user input (commands, questions, statements) === + logger.info(f"✅ VALID INPUT: '{clean_text}' forwarded to LLM") + # Let this proceed normally - it will reach the LLM + + await session.start(agent=kelly, room=ctx.room) + +if __name__ == "__main__": + cli.run_app(server) \ No newline at end of file From 473e580c60dd49621d1ab0bd2958dbf500f874e7 Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Sun, 1 Feb 2026 21:58:16 +0530 Subject: [PATCH 2/8] Added State tracking --- examples/voice_agents/basic_agent.py | 68 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/examples/voice_agents/basic_agent.py 
b/examples/voice_agents/basic_agent.py index 6371ab0243..f6174584ec 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -48,58 +48,56 @@ def prewarm(proc: JobProcess): @server.rtc_session() async def entrypoint(ctx: JobContext): - # === CRITICAL: Configure false interruption handling === + session = AgentSession( stt="deepgram/nova-3", llm="openai/gpt-4o-mini", tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc", vad=ctx.proc.userdata["vad"], turn_detection=MultilingualModel(), - - # === PRIMARY DEFENSE: Built-in False Interruption Handling === - # These parameters tell LiveKit to wait for real speech before committing to an interruption allow_interruptions=True, - - # Require at least 0.8 seconds of detected speech before interrupting - # This filters out most short fillers like "yeah", "ok", "hmm" min_interruption_duration=0.8, - - # Require at least 2 words to be transcribed before confirming interruption - # Single filler words won't trigger interruption min_interruption_words=2, - - # Wait up to 1 second for transcription after VAD detects speech - # If no real words are transcribed, treat it as a false interruption false_interruption_timeout=1.0, - - # Automatically resume speaking if interruption was false positive - # This is THE KEY FEATURE for seamless continuation resume_false_interruption=True, - - # Don't buffer audio while agent is speaking discard_audio_if_uninterruptible=True, - - # Other important settings preemptive_generation=False, min_endpointing_delay=0.6, max_endpointing_delay=3.0, ) + kelly = MyAgent() + @session.on("speech_created") + def on_speech_created(ev: SpeechCreatedEvent): + """Track when Kelly actually starts speaking audio""" + kelly.is_speaking = True + kelly.current_speech_handle = ev.speech_handle + logger.info("🎤 KELLY STARTED SPEAKING") + + @session.on("agent_state_changed") + def on_agent_state_changed(ev: AgentStateChangedEvent): + """Track agent state transitions""" + 
logger.info(f"🎭 AGENT STATE: {ev.old_state} → {ev.new_state}") + + # Kelly is no longer speaking when she returns to listening + if ev.new_state == "listening": + kelly.is_speaking = False + kelly.current_speech_handle = None + logger.info("👂 Kelly finished and is now listening") + + @session.on("user_state_changed") + def on_user_state_changed(ev: UserStateChangedEvent): + """Track user state for debugging""" + logger.info(f"👤 USER STATE: {ev.old_state} → {ev.new_state}") + + @session.on("agent_false_interruption") + def on_false_interruption(ev: AgentFalseInterruptionEvent): + """Log when false interruptions are detected and resumed""" + if ev.resumed: + logger.info("✅ FALSE INTERRUPTION: Kelly resumed seamlessly") + else: + logger.warning("⚠️ FALSE INTERRUPTION: Not auto-resumed") - # log metrics as they are emitted, and total usage after session is over - usage_collector = metrics.UsageCollector() - - @session.on("metrics_collected") - def _on_metrics_collected(ev: MetricsCollectedEvent): - metrics.log_metrics(ev.metrics) - usage_collector.collect(ev.metrics) - - async def log_usage(): - summary = usage_collector.get_summary() - logger.info(f"Usage: {summary}") - - # shutdown callbacks are triggered when the session is over - ctx.add_shutdown_callback(log_usage) - + await session.start( agent=MyAgent(), room=ctx.room, From 3bf5414effb6e790ca544661687d179d3f29ead7 Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Sun, 1 Feb 2026 22:01:52 +0530 Subject: [PATCH 3/8] Added ignoring for fillers and stopping or stop words --- examples/voice_agents/basic_agent.py | 60 +++++++++++++++++++++------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py index f6174584ec..2576d13999 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -1,5 +1,5 @@ import logging - +import re from dotenv import load_dotenv from livekit.agents import ( @@ -13,7 
+13,13 @@ # uncomment to enable Krisp background voice/noise cancellation # from livekit.plugins import noise_cancellation - +STOP_WORDS = {"wait", "no", "stop", "finish", "hold", "pause", "hold on"} +FILLER_WORDS = { + "uhhuh", "okay", "alright", "mhm", "yeah", "yep", "yup", + "hmm", "right", "uh", "um", "ah", "gotit", "isee", "ok", "k", + "sure", "yes", "interesting", "really", "wow", "ohh", "ooh", + "aha", "uhuh", "mhmm", "gotcha", "nice" +} logger = logging.getLogger("intelligent-kelly") load_dotenv() @@ -98,17 +104,43 @@ def on_false_interruption(ev: AgentFalseInterruptionEvent): logger.warning("⚠️ FALSE INTERRUPTION: Not auto-resumed") - await session.start( - agent=MyAgent(), - room=ctx.room, - room_options=room_io.RoomOptions( - audio_input=room_io.AudioInputOptions( - # uncomment to enable the Krisp BVC noise cancellation - # noise_cancellation=noise_cancellation.BVC(), - ), - ), - ) - + @session.on("user_input_transcribed") + def on_user_input_transcribed(ev: UserInputTranscribedEvent): + """ + This handler provides a SECONDARY layer of defense. + It runs after STT completes, so it can't prevent the initial VAD interruption, + but it can prevent filler words from reaching the LLM and generating responses. 
+ """ + if not ev.is_final or not ev.transcript: + return + + clean_text = re.sub(r'[^\w\s]', '', ev.transcript.lower()).strip() + words = set(clean_text.split()) + raw_phrase = clean_text.replace(" ", "") # For "uhhuh" type phrases + + if not words: + return + + logger.info(f"📝 TRANSCRIPT: '{clean_text}' | Kelly speaking: {kelly.is_speaking}") + + + if words.intersection(STOP_WORDS): + logger.info(f"🛑 COMMAND DETECTED: '{clean_text}' → Calling interrupt()") + session.interrupt() + return + + if kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): + logger.info(f"🔇 FILLER SUPPRESSED: '{clean_text}' ignored during speech") + + return + + if not kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): + logger.info(f"🍃 LONE FILLER: '{clean_text}' suppressed (Kelly idle)") + return + + logger.info(f"✅ VALID INPUT: '{clean_text}' forwarded to LLM") + + await session.start(agent=kelly, room=ctx.room) if __name__ == "__main__": - cli.run_app(server) + cli.run_app(server) \ No newline at end of file From 421add1b47445d8bee03b69a60d2cc16e3308aca Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Mon, 2 Feb 2026 00:48:10 +0530 Subject: [PATCH 4/8] Medium VAD thresholds with transcript handler --- examples/voice_agents/basic_agent.py | 275 +++++++++++++++++++-------- 1 file changed, 196 insertions(+), 79 deletions(-) diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py index 2576d13999..cd75b790d7 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -1,145 +1,262 @@ +""" +THE ACTUAL WORKING HYBRID SOLUTION + +Problem statement: +- "okay" said slowly (1.5s) should NOT interrupt +- "stop" said quickly (0.5s) SHOULD interrupt immediately +- Can't use duration-based filtering because it fails one of these cases + +Solution: +- Use MEDIUM thresholds (catches most fillers, but some slip through) +- Have transcript handler RESUME if interrupted 
by filler +- Have transcript handler FORCE INTERRUPT if command detected but VAD didn't trigger + +This way: +- Fast "stop" (0.5s) → Below threshold → VAD doesn't interrupt → Transcript handler forces interrupt ✅ +- Slow "okay" (1.5s) → Above threshold → VAD interrupts → Transcript handler RESUMES ✅ +- Fast "okay" (0.3s) → Below threshold → VAD doesn't interrupt → Transcript suppresses ✅ +""" + import logging import re +import asyncio +from typing import Optional from dotenv import load_dotenv - from livekit.agents import ( - Agent, AgentServer, AgentSession, JobContext, JobProcess, - cli, metrics, UserInputTranscribedEvent, AgentStateChangedEvent, - UserStateChangedEvent, SpeechCreatedEvent, AgentFalseInterruptionEvent + Agent, AgentServer, AgentSession, JobContext, JobProcess, + cli, UserInputTranscribedEvent, AgentStateChangedEvent, + UserStateChangedEvent ) -from livekit.agents.llm import function_tool -from livekit.plugins import silero +from livekit.plugins import silero, deepgram, openai, cartesia from livekit.plugins.turn_detector.multilingual import MultilingualModel -# uncomment to enable Krisp background voice/noise cancellation -# from livekit.plugins import noise_cancellation -STOP_WORDS = {"wait", "no", "stop", "finish", "hold", "pause", "hold on"} +logger = logging.getLogger("intelligent-kelly") +logger.setLevel(logging.INFO) +load_dotenv() + +# CONFIGURATION +STOP_WORDS = {"wait", "stop", "finish", "hold", "pause", "halt"} FILLER_WORDS = { - "uhhuh", "okay", "alright", "mhm", "yeah", "yep", "yup", + "uhhuh", "okay", "alright", "mhm", "yeah", "yep", "yup", "hmm", "right", "uh", "um", "ah", "gotit", "isee", "ok", "k", "sure", "yes", "interesting", "really", "wow", "ohh", "ooh", - "aha", "uhuh", "mhmm", "gotcha", "nice" + "aha", "mhmm", "gotcha", "nice", "oh", "all", "got", "it", "i", "see" } -logger = logging.getLogger("intelligent-kelly") +FILLER_PHRASES = {"all right", "got it", "i see", "uh huh", "oh okay", "oh ok"} -load_dotenv() +def 
is_filler_input(transcript: str) -> bool: + """Check if transcript is purely a filler acknowledgment""" + clean = transcript.lower().strip() + clean_no_punct = re.sub(r'[^\w\s]', '', clean) + + if clean_no_punct in FILLER_PHRASES: + return True + if clean_no_punct.replace(" ", "") in FILLER_WORDS: + return True + + words = clean_no_punct.split() + if words and all(word in FILLER_WORDS for word in words): + return True + return False +def contains_command(transcript: str) -> bool: + """Check if transcript contains an explicit stop command""" + clean = transcript.lower().strip() + clean_no_punct = re.sub(r'[^\w\s]', '', clean) + words = clean_no_punct.split() + + if not words: + return False + + # Direct command (starts with stop word) + if words[0] in STOP_WORDS: + return True + + # Command after brief acknowledgment: "yeah wait", "okay stop" + if len(words) >= 2: + for i in range(len(words) - 1): + if words[i] in FILLER_WORDS and words[i + 1] in STOP_WORDS: + return True + if words[i] in {"but", "and"} and words[i + 1] in STOP_WORDS: + return True + + # Avoid false positives in longer sentences + # "I have no idea" should NOT be a command + if len(words) > 3 and any(w in STOP_WORDS for w in words): + # Only treat as command if stop word is in first 2 positions + return any(words[i] in STOP_WORDS for i in range(min(2, len(words)))) + + return False -class MyAgent(Agent): +class IntelligentAgent(Agent): def __init__(self) -> None: super().__init__( - instructions="Your name is Kelly. You would interact with users via voice." - "with that in mind keep your responses concise and to the point." - "do not use emojis, asterisks, markdown, or other special characters in your responses." - "You are curious and friendly, and have a sense of humor." - "you will speak english to the user", + instructions=( + "Your name is Kelly. Keep responses concise and witty. 
" + "When users say things like 'yeah' or 'okay' while you're speaking, " + "it means they're listening - keep going! " + "Only stop if they explicitly say 'wait', 'stop', or 'hold on'." + ), ) self.is_speaking = False - self.current_speech_handle = None + self.was_interrupted_by_vad = False + self.last_speech_content = "" + async def on_enter(self): - await self.session.generate_reply() - - server = AgentServer() - def prewarm(proc: JobProcess): proc.userdata["vad"] = silero.VAD.load() - server.setup_fnc = prewarm - @server.rtc_session() async def entrypoint(ctx: JobContext): - session = AgentSession( stt="deepgram/nova-3", llm="openai/gpt-4o-mini", tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc", vad=ctx.proc.userdata["vad"], turn_detection=MultilingualModel(), + + # === HYBRID STRATEGY === + # Medium-low threshold: Catches most fillers but allows quick commands allow_interruptions=True, - min_interruption_duration=0.8, - min_interruption_words=2, - false_interruption_timeout=1.0, - resume_false_interruption=True, - discard_audio_if_uninterruptible=True, + min_interruption_duration=0.6, # 0.6s - faster than most fillers, slower than most commands + min_interruption_words=2, # Require 2 words minimum + + # Enable auto-resume for false positives + false_interruption_timeout=1.0, # Wait 1s for transcript + resume_false_interruption=True, # Auto-resume if false positive + preemptive_generation=False, - min_endpointing_delay=0.6, - max_endpointing_delay=3.0, + min_endpointing_delay=0.5, + max_endpointing_delay=2.5, ) - kelly = MyAgent() + + kelly = IntelligentAgent() + + logger.info("=" * 80) + logger.info("🚀 HYBRID INTELLIGENT INTERRUPTION HANDLER") + logger.info("⚙️ Strategy:") + logger.info(" - Medium VAD thresholds (0.6s, 2 words)") + logger.info(" - Auto-resume on false interruptions") + logger.info(" - Manual interrupt on commands that slip through") + logger.info(" - Transcript suppression for fillers") + logger.info("=" * 80) + + # Track 
interruption state + vad_just_interrupted = False + @session.on("speech_created") - def on_speech_created(ev: SpeechCreatedEvent): - """Track when Kelly actually starts speaking audio""" + def on_speech_created(ev): + nonlocal vad_just_interrupted kelly.is_speaking = True - kelly.current_speech_handle = ev.speech_handle + kelly.was_interrupted_by_vad = False + vad_just_interrupted = False + + # Store what Kelly is saying for potential resume + if hasattr(ev, 'speech_handle') and hasattr(ev.speech_handle, 'text'): + kelly.last_speech_content = ev.speech_handle.text + logger.info("🎤 KELLY STARTED SPEAKING") @session.on("agent_state_changed") - def on_agent_state_changed(ev: AgentStateChangedEvent): - """Track agent state transitions""" + def on_agent_state_changed(ev): + nonlocal vad_just_interrupted + logger.info(f"🎭 AGENT STATE: {ev.old_state} → {ev.new_state}") - # Kelly is no longer speaking when she returns to listening + # Detect if Kelly was interrupted while speaking + if ev.old_state == "speaking" and ev.new_state == "listening": + if kelly.is_speaking: + kelly.was_interrupted_by_vad = True + vad_just_interrupted = True + logger.info("⚠️ KELLY INTERRUPTED - waiting for transcript to decide action...") + if ev.new_state == "listening": kelly.is_speaking = False - kelly.current_speech_handle = None - logger.info("👂 Kelly finished and is now listening") @session.on("user_state_changed") - def on_user_state_changed(ev: UserStateChangedEvent): - """Track user state for debugging""" + def on_user_state_changed(ev): logger.info(f"👤 USER STATE: {ev.old_state} → {ev.new_state}") - @session.on("agent_false_interruption") - def on_false_interruption(ev: AgentFalseInterruptionEvent): - """Log when false interruptions are detected and resumed""" - if ev.resumed: - logger.info("✅ FALSE INTERRUPTION: Kelly resumed seamlessly") - else: - logger.warning("⚠️ FALSE INTERRUPTION: Not auto-resumed") - + # Try to register false interruption handler + try: + 
@session.on("agent_false_interruption") + def on_false_interruption(ev): + if hasattr(ev, 'resumed') and ev.resumed: + logger.info("✅ FALSE INTERRUPTION AUTO-RESUMED by LiveKit") + except: + logger.warning("⚠️ False interruption event not available in this LiveKit version") @session.on("user_input_transcribed") - def on_user_input_transcribed(ev: UserInputTranscribedEvent): - """ - This handler provides a SECONDARY layer of defense. - It runs after STT completes, so it can't prevent the initial VAD interruption, - but it can prevent filler words from reaching the LLM and generating responses. - """ + def on_user_input_transcribed(ev): + nonlocal vad_just_interrupted + if not ev.is_final or not ev.transcript: return clean_text = re.sub(r'[^\w\s]', '', ev.transcript.lower()).strip() - words = set(clean_text.split()) - raw_phrase = clean_text.replace(" ", "") # For "uhhuh" type phrases - if not words: - return + logger.info(f"📝 TRANSCRIPT: '{clean_text}' | Kelly speaking: {kelly.is_speaking} | Just interrupted: {vad_just_interrupted}") - logger.info(f"📝 TRANSCRIPT: '{clean_text}' | Kelly speaking: {kelly.is_speaking}") + # === CASE 1: Kelly was just interrupted by VAD === + if kelly.was_interrupted_by_vad or vad_just_interrupted: + + if contains_command(clean_text): + logger.info(f"🛑 REAL COMMAND after VAD interrupt: '{clean_text}' - staying stopped") + kelly.was_interrupted_by_vad = False + vad_just_interrupted = False + # Allow normal processing - the interrupt was correct + return + + elif is_filler_input(clean_text): + logger.info(f"🔄 FALSE INTERRUPT: '{clean_text}' was just a filler - should resume") + kelly.was_interrupted_by_vad = False + vad_just_interrupted = False + + # LiveKit's resume_false_interruption should handle this automatically + # But we still suppress the transcript from reaching LLM + return + + else: + logger.info(f"✅ REAL INPUT after interrupt: '{clean_text}' - valid interruption") + kelly.was_interrupted_by_vad = False + vad_just_interrupted 
= False + # Allow normal processing + return + # === CASE 2: Kelly is currently speaking (VAD didn't interrupt yet) === + if kelly.is_speaking: + + if contains_command(clean_text): + logger.info(f"🛑 COMMAND while speaking: '{clean_text}' - forcing interrupt NOW") + session.interrupt() + return + + elif is_filler_input(clean_text): + logger.info(f"🔇 FILLER while speaking: '{clean_text}' - completely ignored") + # Don't interrupt, don't pass to LLM + return + + else: + logger.info(f"💬 REAL INPUT while speaking: '{clean_text}' - allowing interrupt") + session.interrupt() + return - if words.intersection(STOP_WORDS): - logger.info(f"🛑 COMMAND DETECTED: '{clean_text}' → Calling interrupt()") - session.interrupt() - return + # === CASE 3: Kelly is idle === + if not kelly.is_speaking: + + if is_filler_input(clean_text): + logger.info(f"🍃 FILLER while idle: '{clean_text}' - suppressed") + return + + logger.info(f"✅ VALID INPUT while idle: '{clean_text}'") + # Normal processing - if kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): - logger.info(f"🔇 FILLER SUPPRESSED: '{clean_text}' ignored during speech") - - return - - if not kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): - logger.info(f"🍃 LONE FILLER: '{clean_text}' suppressed (Kelly idle)") - return - - logger.info(f"✅ VALID INPUT: '{clean_text}' forwarded to LLM") - await session.start(agent=kelly, room=ctx.room) if __name__ == "__main__": From 27a5bb93aece07ca422cce69befbcd80e13bcdef Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Mon, 2 Feb 2026 19:50:03 +0530 Subject: [PATCH 5/8] changed to stop commmand from command in logging info --- examples/voice_agents/basic_agent.py | 2 +- examples/voice_agents/test.py | 172 --------------------------- 2 files changed, 1 insertion(+), 173 deletions(-) delete mode 100644 examples/voice_agents/test.py diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py index 
cd75b790d7..4eb6876cee 100644 --- a/examples/voice_agents/basic_agent.py +++ b/examples/voice_agents/basic_agent.py @@ -233,7 +233,7 @@ def on_user_input_transcribed(ev): if kelly.is_speaking: if contains_command(clean_text): - logger.info(f"🛑 COMMAND while speaking: '{clean_text}' - forcing interrupt NOW") + logger.info(f"🛑 STOP COMMAND while speaking: '{clean_text}' - forcing interrupt NOW") session.interrupt() return diff --git a/examples/voice_agents/test.py b/examples/voice_agents/test.py deleted file mode 100644 index 38e2906df7..0000000000 --- a/examples/voice_agents/test.py +++ /dev/null @@ -1,172 +0,0 @@ -import logging -import re -from dotenv import load_dotenv -from livekit.agents import ( - Agent, AgentServer, AgentSession, JobContext, JobProcess, - cli, metrics, UserInputTranscribedEvent, AgentStateChangedEvent, - UserStateChangedEvent, SpeechCreatedEvent, AgentFalseInterruptionEvent -) -from livekit.plugins import silero, deepgram, openai, cartesia -from livekit.plugins.turn_detector.multilingual import MultilingualModel - -logger = logging.getLogger("intelligent-kelly") -logger.setLevel(logging.INFO) -load_dotenv() - -# CONFIGURATION: Easily adjustable lists for modularity -STOP_WORDS = {"wait", "no", "stop", "finish", "hold", "pause", "hold on"} -FILLER_WORDS = { - "uhhuh", "okay", "alright", "mhm", "yeah", "yep", "yup", - "hmm", "right", "uh", "um", "ah", "gotit", "isee", "ok", "k", - "sure", "yes", "interesting", "really", "wow", "ohh", "ooh", - "aha", "uhuh", "mhmm", "gotcha", "nice" -} - -class MyAgent(Agent): - def __init__(self) -> None: - super().__init__( - instructions=( - "Your name is Kelly. Keep responses concise and witty. " - "If you hear acknowledgements like 'yeah' or 'ok' while explaining something, " - "that means the user is listening - keep going! " - "Only stop if they explicitly ask you to wait or stop." 
- ), - ) - # More reliable state tracking - self.is_speaking = False - self.current_speech_handle = None - - async def on_enter(self): - await self.session.generate_reply() - -server = AgentServer() - -def prewarm(proc: JobProcess): - proc.userdata["vad"] = silero.VAD.load() - -server.setup_fnc = prewarm - -@server.rtc_session() -async def entrypoint(ctx: JobContext): - # === CRITICAL: Configure false interruption handling === - session = AgentSession( - stt="deepgram/nova-3", - llm="openai/gpt-4o-mini", - tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc", - vad=ctx.proc.userdata["vad"], - turn_detection=MultilingualModel(), - - # === PRIMARY DEFENSE: Built-in False Interruption Handling === - # These parameters tell LiveKit to wait for real speech before committing to an interruption - allow_interruptions=True, - - # Require at least 0.8 seconds of detected speech before interrupting - # This filters out most short fillers like "yeah", "ok", "hmm" - min_interruption_duration=0.8, - - # Require at least 2 words to be transcribed before confirming interruption - # Single filler words won't trigger interruption - min_interruption_words=2, - - # Wait up to 1 second for transcription after VAD detects speech - # If no real words are transcribed, treat it as a false interruption - false_interruption_timeout=1.0, - - # Automatically resume speaking if interruption was false positive - # This is THE KEY FEATURE for seamless continuation - resume_false_interruption=True, - - # Don't buffer audio while agent is speaking - discard_audio_if_uninterruptible=True, - - # Other important settings - preemptive_generation=False, - min_endpointing_delay=0.6, - max_endpointing_delay=3.0, - ) - - kelly = MyAgent() - - # === STATE TRACKING === - - @session.on("speech_created") - def on_speech_created(ev: SpeechCreatedEvent): - """Track when Kelly actually starts speaking audio""" - kelly.is_speaking = True - kelly.current_speech_handle = ev.speech_handle - logger.info("🎤 
KELLY STARTED SPEAKING") - - @session.on("agent_state_changed") - def on_agent_state_changed(ev: AgentStateChangedEvent): - """Track agent state transitions""" - logger.info(f"🎭 AGENT STATE: {ev.old_state} → {ev.new_state}") - - # Kelly is no longer speaking when she returns to listening - if ev.new_state == "listening": - kelly.is_speaking = False - kelly.current_speech_handle = None - logger.info("👂 Kelly finished and is now listening") - - @session.on("user_state_changed") - def on_user_state_changed(ev: UserStateChangedEvent): - """Track user state for debugging""" - logger.info(f"👤 USER STATE: {ev.old_state} → {ev.new_state}") - - @session.on("agent_false_interruption") - def on_false_interruption(ev: AgentFalseInterruptionEvent): - """Log when false interruptions are detected and resumed""" - if ev.resumed: - logger.info("✅ FALSE INTERRUPTION: Kelly resumed seamlessly") - else: - logger.warning("⚠️ FALSE INTERRUPTION: Not auto-resumed") - - # === SECONDARY FILTER: Transcript-Level Suppression === - - @session.on("user_input_transcribed") - def on_user_input_transcribed(ev: UserInputTranscribedEvent): - """ - This handler provides a SECONDARY layer of defense. - It runs after STT completes, so it can't prevent the initial VAD interruption, - but it can prevent filler words from reaching the LLM and generating responses. 
- """ - if not ev.is_final or not ev.transcript: - return - - # Normalize text: lowercase, remove punctuation - clean_text = re.sub(r'[^\w\s]', '', ev.transcript.lower()).strip() - words = set(clean_text.split()) - raw_phrase = clean_text.replace(" ", "") # For "uhhuh" type phrases - - if not words: - return - - logger.info(f"📝 TRANSCRIPT: '{clean_text}' | Kelly speaking: {kelly.is_speaking}") - - # === CASE 1: Command words ALWAYS cause interruption === - if words.intersection(STOP_WORDS): - logger.info(f"🛑 COMMAND DETECTED: '{clean_text}' → Calling interrupt()") - session.interrupt() - return - - # === CASE 2: Pure filler words while Kelly is speaking === - # These should be ignored - don't let them reach the LLM - if kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): - logger.info(f"🔇 FILLER SUPPRESSED: '{clean_text}' ignored during speech") - # By returning here without calling session.interrupt() or forwarding to LLM, - # we ensure this transcript is effectively dropped - return - - # === CASE 3: Lone filler when Kelly is silent === - # Still suppress these - they shouldn't trigger LLM processing - if not kelly.is_speaking and (raw_phrase in FILLER_WORDS or words.issubset(FILLER_WORDS)): - logger.info(f"🍃 LONE FILLER: '{clean_text}' suppressed (Kelly idle)") - return - - # === CASE 4: Real user input (commands, questions, statements) === - logger.info(f"✅ VALID INPUT: '{clean_text}' forwarded to LLM") - # Let this proceed normally - it will reach the LLM - - await session.start(agent=kelly, room=ctx.room) - -if __name__ == "__main__": - cli.run_app(server) \ No newline at end of file From 5fc30e258cedd64ab1fafa00a3d3765e68e263ee Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Mon, 2 Feb 2026 20:32:58 +0530 Subject: [PATCH 6/8] Added readme for provided soln and demo transcript logs --- .../voice_agents/INTERREUPT_HANDLER_README.md | 243 ++++++++++++++++++ examples/voice_agents/demo_logs.txt | 157 +++++++++++ 2 files 
changed, 400 insertions(+) create mode 100644 examples/voice_agents/INTERREUPT_HANDLER_README.md create mode 100644 examples/voice_agents/demo_logs.txt diff --git a/examples/voice_agents/INTERREUPT_HANDLER_README.md b/examples/voice_agents/INTERREUPT_HANDLER_README.md new file mode 100644 index 0000000000..c06aa11c4c --- /dev/null +++ b/examples/voice_agents/INTERREUPT_HANDLER_README.md @@ -0,0 +1,243 @@ +# LiveKit Intelligent Interruption Handling + +A voice agent that knows the difference between "yeah, I'm listening" and "wait, stop talking." + +## Table of Contents + +- [Problem](#problem) +- [Solution](#solution) +- [Installation](#installation) +- [Configuration](#configuration) +- [Usage](#usage) +- [How It Works](#how-it-works) +- [Demo](#demo) +- [Customization](#customization) +- [Troubleshooting](#troubleshooting) + +--- + +## Problem + +LiveKit's default Voice Activity Detection treats every sound as an interruption. When you say "yeah" or "okay" to show you're listening (backchanneling), the agent thinks you want to interrupt and stops talking mid-sentence. + +The goal was to fix this: make the agent smart enough to ignore casual acknowledgments while still responding immediately to actual commands like "wait" or "stop." + +The catch: the agent can't pause or stutter when it hears a filler word. It needs to keep talking smoothly, like a human would. + +--- + +## Solution + +The solution uses three layers: + +**1. VAD Threshold Tuning** +Set the interruption thresholds higher (0.6 seconds, 2 words minimum). This filters out most quick "yeah" or "okay" responses before they even trigger an interruption. + +**2. Auto-Resume** +If VAD does interrupt on a filler word, LiveKit's `resume_false_interruption` feature kicks in and the agent continues talking without missing a beat. + +**3. Transcript Analysis** +When speech is detected, we wait for the actual transcription to come through. Then we check: is this a filler word or a real command? 
If it's a filler, suppress it. If it's a command like "stop", force an interruption even if VAD missed it. + +### Why This Works + +There's a timing problem: VAD detects sound almost instantly (~10ms), but speech-to-text takes longer (~200-500ms). By the time we know what was said, VAD might have already interrupted. + +The solution handles this by: +- Fast fillers (under 0.6s) are detected by VAD but never trigger an interruption in the first place +- Slow fillers (over 0.6s) trigger an interruption but get auto-resumed when we realize they're fillers +- Commands work at any speed because we manually interrupt when we detect them in the transcript + +--- + +## Installation + +### Requirements + +- Python 3.9+ +- LiveKit account (free tier works fine) + +### Install Dependencies + +```bash +pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0" +``` + +--- + +## Configuration + +Create a `.env` file in the project root (copy from `.env.example`): + +```bash +# Required - Get these from LiveKit Cloud +LIVEKIT_API_SECRET="your-api-secret-here" +LIVEKIT_API_KEY="your-api-key-here" +LIVEKIT_URL="wss://your-project.livekit.cloud" + +# Optional - For model providers +OPENAI_API_KEY="your-openai-key" +DEEPGRAM_API_KEY="your-deepgram-key" +CARTESIA_API_KEY="your-cartesia-key" +``` + +To get LiveKit credentials: +1. Go to [cloud.livekit.io](https://cloud.livekit.io) +2. Create a project +3. Copy the API key, secret, and WebSocket URL from Settings + + + +--- + +## Usage + +Run the agent locally: + +```bash +python basic_agent.py dev +``` + +This starts the agent in development mode, connected to your LiveKit project. To talk to it directly from your terminal instead, run `python basic_agent.py console` (the mode used to produce `demo_logs.txt`). Then just start talking - it'll respond. + +--- + +## How It Works + +### The Flow + +``` +User says "yeah" while agent talks + ↓ +VAD detects speech in ~10ms + ↓ +Is it under 0.6 seconds? → Agent keeps talking +Is it over 0.6 seconds? → VAD interrupts + ↓ +STT finishes transcribing (~200-500ms later) + ↓ +Our code checks the transcript: + - Filler word? → Resume immediately + - Command word? 
→ Stay stopped + ↓ +Result: No awkward pauses +``` + +### State-Based Logic + +The same word behaves differently based on context: + +| What user says | Agent state | What happens | +|----------------|-------------|--------------| +| "yeah" / "okay" / "hmm" | Speaking | Ignored completely | +| "wait" / "stop" | Speaking | Stops immediately | +| "yeah" / "okay" | Silent | Suppressed (doesn't trigger response) | +| "tell me more" | Silent | Normal response | + +### Filler Detection + +We check if something is a filler in two ways: + +1. **No-space check**: "uh-huh" becomes "uhhuh" and matches our list +2. **All-words check**: "all right" splits to ["all", "right"], both are in our list, so it's a filler + +This handles things like "got it", "i see", "all right" automatically. + +### Command Detection + +For commands, we look for: +- Stop word at the start: "stop", "wait please" +- Filler + stop word: "yeah wait", "okay stop" +- Conjunction + stop word: "but wait", "and hold on" + +We skip long sentences (over 3 words) to avoid false positives like "I have no idea". + +--- + +## Demo + +**[https://drive.google.com/file/d/1IHqKgqxAG2ZHRwWsDxhksyp3En5qv5iM/view?usp=sharing]** + +The demo shows: +- Agent continuing to talk when hearing "yeah", "okay", "uh-huh" +- Agent stopping immediately when hearing "stop" or "wait" +- Handling mixed inputs like "yeah but wait" + +See `demo_logs.txt` for the full transcript from test runs. 
+ +--- + +## Customization + +### Adding More Filler Words + +Edit `FILLER_WORDS` in `basic_agent.py`: + +```python +FILLER_WORDS = { + "yeah", "yep", "ok", "hmm", "uh", "um", + # Add your own here + "absolutely", "indeed", "totally" +} +``` + +### Adding More Command Words + +Edit `STOP_WORDS`: + +```python +STOP_WORDS = { + "wait", "stop", "hold", "pause", + # Add your own + "interrupt", "enough" +} +``` + +### Tuning Sensitivity + +In the `AgentSession` configuration: + +```python +min_interruption_duration=0.6, # Lower = more responsive, higher = fewer interruptions +min_interruption_words=2, # Minimum words before triggering interrupt +``` + +Tuning guide: +- Want fewer interruptions? Set duration to 0.8-1.0 seconds +- Want faster response to commands? Set duration to 0.4-0.5 seconds +- Want stricter filtering? Increase min_interruption_words to 3 + +--- + +## Troubleshooting + +### Agent doesn't stop for commands + +Check the logs for `🛑 STOP COMMAND` messages. If you don't see them: +- Make sure the word is in `STOP_WORDS` +- Try saying shorter commands (3 words or fewer; longer sentences are skipped by command detection) +- Verify `session.interrupt()` is being called + +### Agent pauses briefly on fillers + +This means the auto-resume isn't working: +- Increase `min_interruption_duration` to 0.8 +- Check that `resume_false_interruption=True` in the session config +- Look for "FALSE INTERRUPTION AUTO-RESUMED" in logs + +### Fillers still triggering responses + +Check what STT is actually transcribing (look at the `📝 TRANSCRIPT` logs). Sometimes "yeah" comes through as "yea" or other variations. Add those to `FILLER_WORDS`. 
+ + +--- + +## Additional Resources + +- [LiveKit Agents Docs](https://docs.livekit.io/agents/) +- [Assignment Repo](https://github.com/Dark-Sys-Jenkins/agents-assignment) + +--- + +**Last Updated**: February 2026 +**LiveKit Version**: 1.3.12 \ No newline at end of file diff --git a/examples/voice_agents/demo_logs.txt b/examples/voice_agents/demo_logs.txt new file mode 100644 index 0000000000..22fb3c02fc --- /dev/null +++ b/examples/voice_agents/demo_logs.txt @@ -0,0 +1,157 @@ +PS C:\Users\riddh\Desktop\agents-assignment> cd examples/voice_agents +PS C:\Users\riddh\Desktop\agents-assignment\examples\voice_agents> python basic_agent.py console + Agents Starting console mode 🚀 + + 19:53:10.327 DEBUG asyncio Using proactor: IocpProactor + 19:53:10.333 INFO livekit.agents starting worker {"version": "1.3.12", "rtc-version": "1.0.23"} + 19:53:10.335 INFO livekit.agents starting inference executor + 19:53:10.383 INFO livekit.agents initializing process {"pid": 24660, "inference": true} + 19:53:30.649 INFO livekit.agents process initialized {"pid": 24660, "inference": true, "elapsed_time": 20.26} + 19:53:30.655 INFO livekit.agents HTTP server listening on :57067 + 19:53:30.692 INFO livekit.agents initializing job runner {"tid": 13084} + 19:53:30.942 DEBUG asyncio Using proactor: IocpProactor + 19:53:30.945 INFO livekit.agents job runner initialized {"tid": 13084, "elapsed_time": 0.25} + 19:53:31.974 INFO intelligent-kelly ================================================================================ + 19:53:31.977 INFO intelligent-kelly 🚀 HYBRID INTELLIGENT INTERRUPTION HANDLER + 19:53:31.979 INFO intelligent-kelly ⚙️ Strategy: + 19:53:31.981 INFO intelligent-kelly - Medium VAD thresholds (0.6s, 2 words) + 19:53:31.992 INFO intelligent-kelly - Auto-resume on false interruptions + 19:53:31.999 INFO intelligent-kelly - Manual interrupt on commands that slip through + 19:53:32.010 INFO intelligent-kelly - Transcript suppression for fillers + 19:53:32.025 INFO 
intelligent-kelly ================================================================================ + 19:53:32.064 DEBUG livekit.agents http_session(): creating a new httpclient ctx + 19:53:32.220 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:53:32.677 INFO intelligent-kelly 🎭 AGENT STATE: initializing → listening + 19:53:32.682 DEBUG livekit.agents using audio io: `Console` -> `AgentSession` -> `TranscriptSynchronizer` -> `Console` + 19:53:32.687 DEBUG livekit.agents using transcript io: `AgentSession` -> `TranscriptSynchronizer` + 19:53:32.723 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:53:35.010 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:53:39.969 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:53:41.662 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:53:44.565 INFO intelligent-kelly 📝 TRANSCRIPT: 'tell me about retrieval augmented generation' | Kelly speaking: False | Just interrupted: False + 19:53:44.570 INFO intelligent-kelly ✅ VALID INPUT while idle: 'tell me about retrieval augmented generation' + 19:53:44.574 DEBUG livekit.agents received user transcript {"user_transcript": "Tell me about retrieval augmented generation.", "language": "en", "transcript_delay": 0.5072145462036133} + 19:53:44.662 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:53:44.786 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.2723883092403412, "duration": 0.111, "input": "<|im_start|>assistant\ngot it ready to sprinkle some wit on your queries whats on your +mind<|im_end|>\n<|im_start|>user\ntell me about retrieval augmented generation"} + 19:53:44.794 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:53:44.809 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:53:46.667 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:53:52.478 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:53:53.334 INFO intelligent-kelly 📝 TRANSCRIPT: 'okay' | 
Kelly speaking: True | Just interrupted: False + 19:53:53.338 INFO intelligent-kelly 🔇 FILLER while speaking: 'okay' - completely ignored + 19:53:53.342 DEBUG livekit.agents received user transcript {"user_transcript": "Okay.", "language": "en", "transcript_delay": 0.4681355953216553} + 19:53:53.463 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:53:53.568 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.14005683362483978, "duration": 0.089, "input": "<|im_start|>assistant\ngot it ready to sprinkle some wit on your queries whats on your +mind<|im_end|>\n<|im_start|>user\ntell me about retrieval augmented generation okay"} + 19:53:59.666 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:00.242 INFO intelligent-kelly 📝 TRANSCRIPT: 'okay' | Kelly speaking: True | Just interrupted: False + 19:54:00.247 INFO intelligent-kelly 🔇 FILLER while speaking: 'okay' - completely ignored + 19:54:00.252 DEBUG livekit.agents received user transcript {"user_transcript": "Okay.", "language": "en", "transcript_delay": 0.08392596244812012} + 19:54:00.291 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:00.301 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... 
+ 19:54:01.170 INFO intelligent-kelly 📝 TRANSCRIPT: 'more' | Kelly speaking: False | Just interrupted: True + 19:54:01.174 INFO intelligent-kelly ✅ REAL INPUT after interrupt: 'more' - valid interruption + 19:54:01.179 DEBUG livekit.agents received user transcript {"user_transcript": "More.", "language": "en", "transcript_delay": 0.5154080390930176} + 19:54:01.282 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:01.522 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.04437876120209694, "duration": 0.228, "input": "<|im_start|>assistant\ngot it ready to sprinkle some wit on your queries whats on your +mind<|im_end|>\n<|im_start|>user\ntell me about retrieval augmented generation<|im_end|>\n<|im_start|>assistant\nretrieval augmented generation rag is like having your cake and eating it toocombining the best +of retrieval systems with generative models it pulls relevant information from a database or documents to enhance response quality generating answers<|im_end|>\n<|im_start|>user\nokay okay more"} + 19:54:01.529 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:54:01.541 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:54:02.441 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:54:13.964 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:14.665 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:15.043 INFO intelligent-kelly 📝 TRANSCRIPT: 'stop' | Kelly speaking: True | Just interrupted: False + 19:54:15.047 INFO intelligent-kelly 🛑 STOP COMMAND while speaking: 'stop' - forcing interrupt NOW + 19:54:15.054 DEBUG livekit.agents received user transcript {"user_transcript": "Stop.", "language": "en", "transcript_delay": 0.8940985202789307} + 19:54:15.070 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:15.076 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... 
+ 19:54:15.318 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.03504103049635887, "duration": 0.252, "input": "<|im_start|>assistant\ngot it ready to sprinkle some wit on your queries whats on your +mind<|im_end|>\n<|im_start|>user\ntell me about retrieval augmented generation<|im_end|>\n<|im_start|>assistant\nretrieval augmented generation rag is like having your cake and eating it toocombining the best +of retrieval systems with generative models it pulls relevant information from a database or documents to enhance response quality generating answers<|im_end|>\n<|im_start|>user\nokay okay more stop"} + 19:54:15.326 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:54:15.339 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:54:16.760 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:54:16.940 DEBUG livekit.agents flush audio emitter due to slow audio generation + 19:54:17.206 DEBUG livekit.agents flush audio emitter due to slow audio generation + 19:54:19.765 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:19.770 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... 
+ 19:54:20.959 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:24.253 INFO intelligent-kelly 📝 TRANSCRIPT: 'tell me about graph retrieval augmented generation' | Kelly speaking: False | Just interrupted: True + 19:54:24.258 INFO intelligent-kelly ✅ REAL INPUT after interrupt: 'tell me about graph retrieval augmented generation' - valid interruption + 19:54:24.262 DEBUG livekit.agents received user transcript {"user_transcript": "Tell me about graph retrieval augmented generation.", "language": "en", "transcript_delay": 0.3029599189758301} + 19:54:24.556 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:24.912 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.24526239931583405, "duration": 0.341, "input": "<|im_start|>assistant\nretrieval augmented generation rag is like having your cake and eating +it toocombining the best of retrieval systems with generative models it pulls relevant information from a database or documents to enhance response quality generating answers<|im_end|>\n<|im_start|>user\nokay +okay more<|im_end|>\n<|im_start|>assistant\nin rag the model first retrieves relevant snippets based on the input query then it uses those snippets to generate a more informed contextual response think of it +as a smart assistant that checks its notes before giving you an answerno more vague<|im_end|>\n<|im_start|>user\nstop<|im_end|>\n<|im_start|>assistant\nalright just let me know when you want to dive back +in<|im_end|>\n<|im_start|>user\ntell me about graph retrieval augmented generation"} + 19:54:24.921 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:54:24.934 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:54:25.850 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:54:32.261 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:33.063 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:33.122 INFO intelligent-kelly 📝 TRANSCRIPT: 'uhhuh' | Kelly 
speaking: True | Just interrupted: False + 19:54:33.128 INFO intelligent-kelly 🔇 FILLER while speaking: 'uhhuh' - completely ignored + 19:54:33.132 DEBUG livekit.agents received user transcript {"user_transcript": "Uh-huh.", "language": "en", "transcript_delay": 0.661771297454834} + 19:54:33.473 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.46071529388427734, "duration": 0.331, "input": "<|im_start|>user\nokay okay more<|im_end|>\n<|im_start|>assistant\nin rag the model first +retrieves relevant snippets based on the input query then it uses those snippets to generate a more informed contextual response think of it as a smart assistant that checks its notes before giving you an +answerno more vague<|im_end|>\n<|im_start|>user\nstop<|im_end|>\n<|im_start|>assistant\nalright just let me know when you want to dive back in<|im_end|>\n<|im_start|>user\ntell me about graph retrieval +augmented generation uh-huh"} + 19:54:39.264 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:39.770 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:39.775 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... 
+ 19:54:40.260 INFO intelligent-kelly 📝 TRANSCRIPT: 'yeah' | Kelly speaking: False | Just interrupted: True + 19:54:40.264 INFO intelligent-kelly 🔄 FALSE INTERRUPT: 'yeah' was just a filler - should resume + 19:54:40.268 DEBUG livekit.agents received user transcript {"user_transcript": "Yeah,", "language": "en", "transcript_delay": 0.38580894470214844} + 19:54:40.466 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:40.792 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.023591401055455208, "duration": 0.314, "input": "<|im_start|>assistant\nin rag the model first retrieves relevant snippets based on the input +query then it uses those snippets to generate a more informed contextual response think of it as a smart assistant that checks its notes before giving you an answerno more +vague<|im_end|>\n<|im_start|>user\nstop<|im_end|>\n<|im_start|>assistant\nalright just let me know when you want to dive back in<|im_end|>\n<|im_start|>user\ntell me about graph retrieval augmented +generation<|im_end|>\n<|im_start|>assistant\ngraph retrieval augmented generation graph rag takes the rag concept and gives it a social network makeover it involves using graph structureswhere data points are +nodes linked by relationshipsallowing for more nuanced context retrieval this means the model can understand complex relationships<|im_end|>\n<|im_start|>user\nuh-huh yeah"} + 19:54:40.801 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:54:40.817 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:54:41.808 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:54:48.169 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 19:54:49.045 INFO intelligent-kelly 📝 TRANSCRIPT: 'wait' | Kelly speaking: True | Just interrupted: False + 19:54:49.048 INFO intelligent-kelly 🛑 STOP COMMAND while speaking: 'wait' - forcing interrupt NOW + 19:54:49.056 DEBUG livekit.agents received user transcript {"user_transcript": "Wait.", "language": 
"en", "transcript_delay": 0.5885117053985596} + 19:54:49.071 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:49.080 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... + 19:54:49.089 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 19:54:49.423 DEBUG livekit.…_detector eou prediction + {"eou_probability": 0.042842086404561996, "duration": 0.323, "input": "<|im_start|>assistant\nalright just let me know when you want to dive back +in<|im_end|>\n<|im_start|>user\ntell me about graph retrieval augmented generation<|im_end|>\n<|im_start|>assistant\ngraph retrieval augmented generation graph rag takes the rag concept and gives it a social +network makeover it involves using graph structureswhere data points are nodes linked by relationshipsallowing for more nuanced context retrieval this means the model can understand complex +relationships<|im_end|>\n<|im_start|>user\nuh-huh yeah<|im_end|>\n<|im_start|>assistant\nwith graph rag the system can pull not just relevant info but also grasp context from interconnected data its like +having a super-smart friend who<|im_end|>\n<|im_start|>user\nwait"} + 19:54:49.431 INFO intelligent-kelly 🎤 KELLY STARTED SPEAKING + 19:54:49.448 INFO intelligent-kelly 🎭 AGENT STATE: listening → thinking + 19:54:50.287 INFO intelligent-kelly 🎭 AGENT STATE: thinking → speaking + 19:54:53.471 INFO intelligent-kelly 🎭 AGENT STATE: speaking → listening + 19:54:53.475 INFO intelligent-kelly ⚠️ KELLY INTERRUPTED - waiting for transcript to decide action... 
+ 19:55:08.472 INFO intelligent-kelly 👤 USER STATE: listening → away + 20:00:52.988 INFO intelligent-kelly 👤 USER STATE: away → speaking + 20:00:56.464 INFO intelligent-kelly 📝 TRANSCRIPT: 'hello so videos we do' | Kelly speaking: False | Just interrupted: True + 20:00:56.469 INFO intelligent-kelly ✅ REAL INPUT after interrupt: 'hello so videos we do' - valid interruption + 20:00:56.473 DEBUG livekit.agents received user transcript {"user_transcript": "Hello? So videos we do.", "language": "en", "transcript_delay": 0.3855757713317871} + 20:00:56.599 INFO intelligent-kelly 👤 USER STATE: speaking → listening + 20:00:56.693 INFO intelligent-kelly 👤 USER STATE: listening → speaking + 20:01:02.763 INFO intelligent-kelly 📝 TRANSCRIPT: 'one minute fifty two seconds' | Kelly speaking: False | Just interrupted: False + 20:01:02.769 INFO intelligent-kelly ✅ VALID INPUT while idle: 'one minute fifty two seconds' + 20:01:02.773 DEBUG livekit.agents received user transcript {"user_transcript": "One minute fifty two seconds", "language": "en", "transcript_delay": 0.08300256729125977} + 20:01:05.375 INFO intelligent-kelly 📝 TRANSCRIPT: 'video' | Kelly speaking: False | Just interrupted: False + 20:01:05.380 INFO intelligent-kelly ✅ VALID INPUT while idle: 'video' + 20:01:05.385 DEBUG livekit.agents received user transcript {"user_transcript": "video", "language": "en", "transcript_delay": 0.08710217475891113} + 20:01:08.090 INFO intelligent-kelly 👤 USER STATE: speaking → listening + + Headset (AirBass Earbuds) ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ \ No newline at end of file From 23451f6ecce576d58ac4b10f2e5ff5152923883e Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Mon, 2 Feb 2026 20:35:00 +0530 Subject: [PATCH 7/8] Refined read me --- examples/voice_agents/INTERREUPT_HANDLER_README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/voice_agents/INTERREUPT_HANDLER_README.md b/examples/voice_agents/INTERREUPT_HANDLER_README.md index c06aa11c4c..c2b729feff 100644 --- 
a/examples/voice_agents/INTERREUPT_HANDLER_README.md +++ b/examples/voice_agents/INTERREUPT_HANDLER_README.md @@ -75,10 +75,6 @@ LIVEKIT_API_SECRET="your-api-secret-here" LIVEKIT_API_KEY="your-api-key-here" LIVEKIT_URL="wss://your-project.livekit.cloud" -# Optional - For model providers -OPENAI_API_KEY="your-openai-key" -DEEPGRAM_API_KEY="your-deepgram-key" -CARTESIA_API_KEY="your-cartesia-key" ``` To get LiveKit credentials: From 44cbfb84724074d488ad42358a9f4a466f75a6a7 Mon Sep 17 00:00:00 2001 From: Riddhika Arora Date: Mon, 2 Feb 2026 20:38:19 +0530 Subject: [PATCH 8/8] Added video link to read me --- examples/voice_agents/INTERREUPT_HANDLER_README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/voice_agents/INTERREUPT_HANDLER_README.md b/examples/voice_agents/INTERREUPT_HANDLER_README.md index c2b729feff..f4856acf6e 100644 --- a/examples/voice_agents/INTERREUPT_HANDLER_README.md +++ b/examples/voice_agents/INTERREUPT_HANDLER_README.md @@ -152,7 +152,7 @@ We skip long sentences (over 3 words) to avoid false positives like "I have no i ## Demo -**[https://drive.google.com/file/d/1IHqKgqxAG2ZHRwWsDxhksyp3En5qv5iM/view?usp=sharing]** +**[https://drive.google.com/file/d/1IHqKgqxAG2ZHRwWsDxhksyp3En5qv5iM/view?usp=drive_link]** The demo shows: - Agent continuing to talk when hearing "yeah", "okay", "uh-huh"