Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 42 additions & 32 deletions quickstarts/Get_started_LiveAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,27 @@
# Live session configuration
# Context window compression starts once the context reaches `trigger_tokens`, so that the model does not hallucinate in long conversations
# The sliding window keeps the retained context within the context window limit
# Connection settings for the live session: audio-only responses spoken with
# the prebuilt "Zephyr" voice, plus context-window compression.
CONFIG = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr"),
        ),
    ),
    # Compression is configured with a 25600-token trigger and a sliding
    # window whose target size is 12800 tokens.
    context_window_compression=types.ContextWindowCompressionConfig(
        trigger_tokens=25600,
        sliding_window=types.SlidingWindow(target_tokens=12800),
    ),
)
# Connection settings for the live session: audio-only responses spoken with
# the prebuilt "Zephyr" voice, explicit realtime-input (voice activity)
# handling, and context-window compression.
CONFIG = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr"),
        ),
    ),
    realtime_input_config=types.RealtimeInputConfig(
        # NOTE(review): NO_INTERRUPTION presumably stops user activity from
        # cutting off the model's reply - confirm against the Live API docs.
        activity_handling=types.ActivityHandling.NO_INTERRUPTION,
        automatic_activity_detection=types.AutomaticActivityDetection(
            start_of_speech_sensitivity=types.StartSensitivity.START_SENSITIVITY_HIGH,
            end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_HIGH,
            # Both durations are in milliseconds (per the `_ms` suffix).
            silence_duration_ms=800,
            prefix_padding_ms=300,
        ),
    ),
    # Compression is configured with a 25600-token trigger and a sliding
    # window whose target size is 12800 tokens.
    context_window_compression=types.ContextWindowCompressionConfig(
        trigger_tokens=25600,
        sliding_window=types.SlidingWindow(target_tokens=12800),
    ),
)

pya = pyaudio.PyAudio()

Expand Down Expand Up @@ -235,16 +244,17 @@ async def capture_frames(self):
finally:
cap.release()

def _capture_screen(self):
sct = mss.mss()
monitor = sct.monitors[0]

i = sct.grab(monitor)

img = PIL.Image.frombytes("RGB", i.size, i.rgb)

image_io = io.BytesIO()
img.save(image_io, format="jpeg")
def _capture_screen(self):
sct = mss.mss()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Creating the mss.mss() object inside _capture_screen is inefficient as this method is called in a loop. It's recommended to initialize mss.mss() once and reuse the object.

A good approach would be to initialize it in the __init__ method of the AudioVideoLoop class and store it as an instance variable (e.g., self.sct). You could then use self.sct here.

Example of changes:

# In AudioVideoLoop.__init__
...
self.sct = None
if self.video_mode == "screen":
    self.sct = mss.mss()
...

# In _capture_screen
def _capture_screen(self):
    monitor = self.sct.monitors[0]
    i = self.sct.grab(monitor)
    ...

This would require modifying __init__ as well, but it would improve performance by avoiding repeated object creation.

monitor = sct.monitors[0]

i = sct.grab(monitor)

img = PIL.Image.frombytes("RGB", i.size, i.rgb)
img.thumbnail([1024, 1024])

image_io = io.BytesIO()
img.save(image_io, format="jpeg")
image_io.seek(0)

mime_type = "image/jpeg"
Expand Down Expand Up @@ -282,16 +292,16 @@ async def send_text(self):
except asyncio.CancelledError:
pass

async def send_realtime(self):
    """Forward queued outgoing messages to the live session until cancelled.

    Each message is taken from ``self.out_queue``. A message whose
    ``"mime_type"`` starts with ``"audio/"`` is sent through the ``audio``
    keyword of ``self.session.send_realtime_input``; every other message is
    sent through the ``media`` keyword. The loop never ends on its own;
    ``asyncio.CancelledError`` is caught so the coroutine returns quietly.
    """
    try:
        while True:
            chunk = await self.out_queue.get()
            if not chunk["mime_type"].startswith("audio/"):
                await self.session.send_realtime_input(media=chunk)
                continue
            await self.session.send_realtime_input(audio=chunk)
    except asyncio.CancelledError:
        # Cancellation is deliberately swallowed rather than re-raised.
        pass
async def send_realtime(self):
    """Forward queued outgoing messages to the live session until cancelled.

    Each message is taken from ``self.out_queue``. A message whose
    ``"mime_type"`` starts with ``"audio/"`` is sent through the ``audio``
    keyword of ``self.session.send_realtime_input``; every other message is
    sent through the ``video`` keyword. The loop never ends on its own;
    ``asyncio.CancelledError`` is caught so the coroutine returns quietly.
    """
    try:
        while True:
            chunk = await self.out_queue.get()
            if not chunk["mime_type"].startswith("audio/"):
                await self.session.send_realtime_input(video=chunk)
                continue
            await self.session.send_realtime_input(audio=chunk)
    except asyncio.CancelledError:
        # Cancellation is deliberately swallowed rather than re-raised.
        pass

async def run(self):
"""Run all tasks to handle audio/video/text interaction"""
Expand Down
Loading