Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions examples/voice_agents/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,16 @@ This directory contains a comprehensive collection of voice-based agent examples
- [LiveKit Agents Documentation](https://docs.livekit.io/agents/)
- [Agents Starter Example](https://github.com/livekit-examples/agent-starter-python)
- [More Agents Examples](https://github.com/livekit-examples/python-agents-examples)

## Intelligent Interrupt Handler (Assignment Addition)
The modified `basic_agent.py` adds the following:

- Semantic classification of user speech into three intents:
  - IGNORE (e.g. “yeah”, “ok”, “hmm”)
  - INTERRUPT (e.g. “wait”, “stop”)
  - NORMAL input
- Backchannel acknowledgements are ignored only while the agent is speaking.
- Explicit interruption commands immediately stop agent speech.
- No LiveKit core code or framework internals are modified.

This ensures uninterrupted speech continuity and correct state-aware behavior.
199 changes: 148 additions & 51 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import logging
import re
import asyncio
from enum import Enum

from dotenv import load_dotenv

Expand All @@ -16,52 +19,100 @@
)
from livekit.agents.llm import function_tool
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

# uncomment to enable Krisp background voice/noise cancellation
# from livekit.plugins import noise_cancellation
# -------------------------------------------------
# Setup
# -------------------------------------------------

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("basic-agent")

load_dotenv()

# -------------------------------------------------
# Semantic preprocessing / intent classification
# -------------------------------------------------

IGNORE_WORDS = {
"yeah",
"yes",
"ok",
"okay",
"hmm",
"uh",
"uh-huh",
"right",
"aha",
}

INTERRUPT_WORDS = {
"stop",
"wait",
"pause",
"cancel",
"hold on",
"no",
}


class UserIntent(Enum):
IGNORE = "ignore"
INTERRUPT = "interrupt"
NORMAL = "normal"


def _normalize(text: str) -> str:
return re.sub(r"[^\w\s]", "", text.lower()).strip()


def classify_input(transcript: str, agent_is_speaking: bool) -> UserIntent:
if not transcript:
return UserIntent.IGNORE

words = set(_normalize(transcript).split())

if agent_is_speaking:
if words & INTERRUPT_WORDS:
return UserIntent.INTERRUPT

if words and words.issubset(IGNORE_WORDS):
return UserIntent.IGNORE

return UserIntent.INTERRUPT

return UserIntent.NORMAL


# -------------------------------------------------
# Agent definition (model behavior unchanged)
# -------------------------------------------------

class MyAgent(Agent):
def __init__(self) -> None:
super().__init__(
instructions="Your name is Kelly. You would interact with users via voice."
"with that in mind keep your responses concise and to the point."
"do not use emojis, asterisks, markdown, or other special characters in your responses."
"You are curious and friendly, and have a sense of humor."
"you will speak english to the user",
instructions=(
"Your name is Kelly. You interact with users via voice. "
"Keep responses concise and to the point. "
"Do not use emojis, markdown, or special characters. "
"You are friendly, curious, and speak English."
)
)

async def on_enter(self):
# when the agent is added to the session, it'll generate a reply
# according to its instructions
# Initial model invocation preserved
self.session.generate_reply()

# all functions annotated with @function_tool will be passed to the LLM when this
# agent is active
@function_tool
async def lookup_weather(
self, context: RunContext, location: str, latitude: str, longitude: str
):
"""Called when the user asks for weather related information.
Ensure the user's location (city or region) is provided.
When given a location, please estimate the latitude and longitude of the location and
do not ask the user for them.

Args:
location: The location they are asking for
latitude: The latitude of the location, do not ask user for it
longitude: The longitude of the location, do not ask user for it
"""

logger.info(f"Looking up weather for {location}")
return "It is sunny with a temperature of 70 degrees."

return "sunny with a temperature of 70 degrees."

# -------------------------------------------------
# Server setup
# -------------------------------------------------

server = AgentServer()

Expand All @@ -72,37 +123,82 @@ def prewarm(proc: JobProcess):

server.setup_fnc = prewarm

agent_is_speaking = False
agent_is_generating = False


@server.rtc_session()
async def entrypoint(ctx: JobContext):
# each log entry will include these fields
ctx.log_context_fields = {
"room": ctx.room.name,
}
global agent_is_speaking, agent_is_generating

ctx.log_context_fields = {"room": ctx.room.name}

session = AgentSession(
# Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
# See all available models at https://docs.livekit.io/agents/models/stt/
stt="deepgram/nova-3",
# A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
# See all available models at https://docs.livekit.io/agents/models/llm/
llm="openai/gpt-4.1-mini",
# Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
# See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
# VAD and turn detection are used to determine when the user is speaking and when the agent should respond
# See more at https://docs.livekit.io/agents/build/turns
turn_detection=MultilingualModel(),
tts="cartesia/sonic-2",
vad=ctx.proc.userdata["vad"],
# allow the LLM to generate a response while waiting for the end of turn
# See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
preemptive_generation=True,
# sometimes background noise could interrupt the agent session, these are considered false positive interruptions
# when it's detected, you may resume the agent's speech
resume_false_interruption=True,
false_interruption_timeout=1.0,
)

# log metrics as they are emitted, and total usage after session is over
# -------------------------------------------------
# Track speaking + generation state
# -------------------------------------------------

@session.on("agent_state_changed")
def _on_agent_state_changed(ev):
    """Mirror the agent's state into the module-level speaking/generating flags.

    The previous handlers subscribed to "agent_speech_started",
    "agent_speech_finished", "llm_generation_started" and
    "llm_generation_finished". Those names are not among the events the
    LiveKit Agents documentation lists for AgentSession, so the flags were
    never updated. The documented "agent_state_changed" event carries a
    ``new_state`` field ("initializing", "idle", "listening", "thinking",
    "speaking").

    NOTE(review): "thinking" is used here as the closest documented signal
    for "LLM is generating" - confirm this matches the intended meaning.
    """
    global agent_is_speaking, agent_is_generating

    now_speaking = ev.new_state == "speaking"
    now_generating = ev.new_state == "thinking"

    # Log only on actual transitions so the messages keep their old meaning.
    if now_speaking != agent_is_speaking:
        logger.info("Agent speech started" if now_speaking else "Agent speech finished")
    if now_generating != agent_is_generating:
        logger.info(
            "LLM generation started" if now_generating else "LLM generation finished"
        )

    agent_is_speaking = now_speaking
    agent_is_generating = now_generating

# -------------------------------------------------
# Semantic preprocessing gate
# -------------------------------------------------

@session.on("user_input_transcribed")
def _on_user_transcript(transcript):
    """Classify each final user transcript and interrupt the agent if asked.

    Args:
        transcript: The payload delivered by the session. Per the LiveKit
            Agents events documentation, "user_input_transcribed" delivers
            an event object exposing ``transcript`` and ``is_final``; a plain
            string is still accepted so existing direct calls keep working.
    """
    if isinstance(transcript, str):
        text, is_final = transcript, True
    else:
        text, is_final = transcript.transcript, transcript.is_final

    # Interim hypotheses are still being revised; only act on final text.
    if not is_final:
        return

    intent = classify_input(text, agent_is_speaking)

    logger.info(
        f"User='{text}' | intent={intent.value} | "
        f"speaking={agent_is_speaking} | generating={agent_is_generating}"
    )

    # Explicit interrupt
    if intent == UserIntent.INTERRUPT:
        logger.info("Interrupt intent → stopping agent")
        # AgentSession.interrupt() is documented as returning a Future, which
        # asyncio.create_task() rejects with TypeError. ensure_future()
        # accepts both Futures and coroutines.
        asyncio.ensure_future(session.interrupt())
        return

    if intent == UserIntent.IGNORE:
        # NOTE(review): returning from an event handler only skips the code
        # below; nothing visible here prevents the session from reacting to
        # the utterance - confirm how suppression is meant to be enforced.
        logger.info("Backchannel detected → suppressing generation")
        return

    # NORMAL input → allow LiveKit to proceed naturally

# -------------------------------------------------
# Metrics (unchanged)
# -------------------------------------------------

usage_collector = metrics.UsageCollector()

@session.on("metrics_collected")
Expand All @@ -112,22 +208,23 @@ def _on_metrics_collected(ev: MetricsCollectedEvent):

async def log_usage():
summary = usage_collector.get_summary()
logger.info(f"Usage: {summary}")
logger.info(f"Usage summary: {summary}")

# shutdown callbacks are triggered when the session is over
ctx.add_shutdown_callback(log_usage)

# -------------------------------------------------
# Start agent
# -------------------------------------------------

await session.start(
agent=MyAgent(),
room=ctx.room,
room_options=room_io.RoomOptions(
audio_input=room_io.AudioInputOptions(
# uncomment to enable the Krisp BVC noise cancellation
# noise_cancellation=noise_cancellation.BVC(),
),
audio_input=room_io.AudioInputOptions(),
),
)


if __name__ == "__main__":
logger.info("RUNNING BASIC AGENT WITH GENERATION-SAFE BACKCHANNEL HANDLING")
cli.run_app(server)