Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions realtime_ai_character/audio/speech_to_text/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
'max_alternatives': 1,
'enable_automatic_punctuation': True,
},
'twilio': {
'encoding': speech.RecognitionConfig.AudioEncoding.MULAW,
'sample_rate_hertz': 8000,
'language_code': 'en-uS',
}
})


Expand Down
29 changes: 25 additions & 4 deletions realtime_ai_character/audio/speech_to_text/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def __init__(self, use="local"):
super().__init__()
if use == "local":
device = 'cuda' if is_cuda_available() else 'cpu'
logger.info(f"Loading [Local Whisper] model: [{config.model}]({device}) ...")
logger.info(
f"Loading [Local Whisper] model: [{config.model}]({device}) ...")
self.model = WhisperModel(
model_size_or_path=config.model,
device="auto",
Expand All @@ -63,8 +64,11 @@ def transcribe(self, audio_bytes, platform, prompt="", language="en-US", suppres
logger.info("Transcribing audio...")
if platform == "web":
audio = self._convert_webm_to_wav(audio_bytes, self.use == "local")
elif platform == "twilio":
audio = self._ulaw_to_wav(audio_bytes, self.use == "local")
else:
audio = self._convert_bytes_to_wav(audio_bytes, self.use == "local")
audio = self._convert_bytes_to_wav(
audio_bytes, self.use == "local")
if self.use == "local":
return self._transcribe(audio, prompt, suppress_tokens=suppress_tokens)
elif self.use == "api":
Expand All @@ -90,7 +94,8 @@ def _transcribe_api(self, audio, prompt=""):
return text

def _convert_webm_to_wav(self, webm_data, local=True):
webm_audio = AudioSegment.from_file(io.BytesIO(webm_data), format="webm")
webm_audio = AudioSegment.from_file(
io.BytesIO(webm_data), format="webm")
wav_data = io.BytesIO()
webm_audio.export(wav_data, format="wav")
if local:
Expand All @@ -101,6 +106,22 @@ def _convert_webm_to_wav(self, webm_data, local=True):

def _convert_bytes_to_wav(self, audio_bytes, local=True):
    """Wrap raw audio bytes for the transcriber.

    The bytes are handed to ``sr.AudioData`` with a sample rate of 44100 and
    a sample width of 2.

    Args:
        audio_bytes: Raw audio frames received from the client.
        local: When true, return an in-memory WAV file; otherwise return
            the ``sr.AudioData`` object itself.

    Returns:
        ``io.BytesIO`` containing WAV data when ``local`` is true, else
        ``sr.AudioData``.
    """
    wrapped = sr.AudioData(audio_bytes, 44100, 2)
    if not local:
        return wrapped
    # The local path wants a file-like WAV rather than the AudioData object.
    return io.BytesIO(wrapped.get_wav_data())

def _ulaw_to_wav(self, audio_bytes, local=True):
    """Convert 8 kHz mono mu-law audio into input for the transcriber.

    Mu-law is a companded 8-bit encoding, not linear PCM, so the bytes are
    first expanded to 16-bit linear samples. Feeding the raw mu-law bytes to
    ``AudioSegment`` or ``sr.AudioData`` as 1-byte PCM would yield heavily
    distorted audio.

    Args:
        audio_bytes: Raw mu-law samples, one byte per sample.
        local: When true, return an in-memory WAV file; otherwise return
            an ``sr.AudioData`` object.

    Returns:
        ``io.BytesIO`` holding a 16-bit PCM WAV file when ``local`` is true,
        else ``sr.AudioData`` wrapping the same 16-bit PCM samples.
    """
    # Imported locally so the file's top-level import block is untouched.
    # NOTE: audioop was removed from the standard library in Python 3.13;
    # newer interpreters need a backport that provides the same module.
    import audioop

    # Expand each mu-law byte to a 2-byte signed linear sample.
    pcm = audioop.ulaw2lin(audio_bytes, 2)

    if not local:
        # No WAV container is needed on this path, so skip building one.
        return sr.AudioData(pcm, 8000, 2)

    sound = AudioSegment(
        data=pcm,
        sample_width=2,
        frame_rate=8000,
        channels=1,
    )
    audio = io.BytesIO()
    sound.export(audio, format="wav")
    return audio
7 changes: 3 additions & 4 deletions realtime_ai_character/audio/text_to_speech/edge_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
logger = get_logger(__name__)
DEBUG = False


class EdgeTTS(Singleton, TextToSpeech):
def __init__(self):
super().__init__()
logger.info("Initializing [EdgeTTS] voices...")

@timed
async def stream(self, text, websocket, tts_event: asyncio.Event, voice_id="",
first_sentence=False, language='en-US') -> None:
first_sentence=False, language='en-US', *args, **kwargs) -> None:
if DEBUG:
return
voices = await VoicesManager.create()
Expand All @@ -30,8 +31,7 @@ async def stream(self, text, websocket, tts_event: asyncio.Event, voice_id="",
messages.extend(message["data"])
await websocket.send_bytes(bytes(messages))


async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
async def generate_audio(self, text, voice_id="", language='en-US') -> bytes:
voices = await VoicesManager.create()
voice = voices.find(Gender="Male", Language="en")[0]
communicate = edge_tts.Communicate(text, voice["Name"])
Expand All @@ -40,4 +40,3 @@ async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
if message["type"] == "audio":
messages.extend(message["data"])
return bytes(messages)

41 changes: 34 additions & 7 deletions realtime_ai_character/audio/text_to_speech/elevenlabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import os
import types
import httpx
import base64

from realtime_ai_character.logger import get_logger
from realtime_ai_character.utils import Singleton, timed
from realtime_ai_character.audio.text_to_speech.base import TextToSpeech
from realtime_ai_character.audio.text_to_speech.utils import MP3ToUlaw

logger = get_logger(__name__)

Expand Down Expand Up @@ -39,13 +41,15 @@ def __init__(self):
logger.info("Initializing [ElevenLabs Text To Speech] voices...")

@timed
async def stream(self, text, websocket, tts_event: asyncio.Event,
async def stream(self, text, websocket, tts_event: asyncio.Event,
voice_id="21m00Tcm4TlvDq8ikWAM",
first_sentence=False, language='en-US') -> None:
first_sentence=False, language='en-US', sid="",
platform="", *args, **kwargs) -> None:
if DEBUG:
return
if voice_id == "":
logger.info("voice_id is not found in .env file, using ElevenLabs default voice")
logger.info(
"voice_id is not found in .env file, using ElevenLabs default voice")
voice_id = "21m00Tcm4TlvDq8ikWAM"
headers = config.headers
if language != 'en-US':
Expand All @@ -67,13 +71,35 @@ async def stream(self, text, websocket, tts_event: asyncio.Event,
if tts_event.is_set():
# stop streaming audio
break
await websocket.send_bytes(chunk)
if platform != "twilio":
await websocket.send_bytes(chunk)
else:
audio_bytes = MP3ToUlaw(chunk)
audio_b64 = base64.b64encode(audio_bytes).decode()
media_response = {
"event": "media",
"streamSid": sid,
"media": {
"payload": audio_b64,
},
}
# "done" marker is sent to twilio to track if the audio has been completed.
await websocket.send_json(media_response)
mark = {
"event": "mark",
"streamSid": sid,
"mark": {
"name": "done",
},
}
await websocket.send_json(mark)

async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
async def generate_audio(self, text, voice_id="", language='en-US') -> bytes:
if DEBUG:
return
if voice_id == "":
logger.info("voice_id is not found in .env file, using ElevenLabs default voice")
logger.info(
"voice_id is not found in .env file, using ElevenLabs default voice")
voice_id = "21m00Tcm4TlvDq8ikWAM"
headers = config.headers
if language != 'en-US':
Expand All @@ -87,6 +113,7 @@ async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
async with httpx.AsyncClient() as client:
response = await client.post(url, json=data, headers=headers)
if response.status_code != 200:
logger.error(f"ElevenLabs returns response {response.status_code}")
logger.error(
f"ElevenLabs returns response {response.status_code}")
# Get audio/mpeg from the response and return it
return response.content
94 changes: 62 additions & 32 deletions realtime_ai_character/audio/text_to_speech/google_cloud_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import types
import httpx
import base64
import json
from google.oauth2 import service_account
import google.auth.transport.requests

Expand All @@ -14,23 +15,24 @@

DEBUG = False

config = types.SimpleNamespace(**{
'url': 'https://texttospeech.googleapis.com/v1/text:synthesize',
'headers': {
'Content-Type': 'application/json',
},
'data': {
'voice': {
'languageCode': 'en-US',
'name': 'en-US-Studio-M',
'ssmlGender': 'MALE'
config = types.SimpleNamespace(
**{
'url': 'https://texttospeech.googleapis.com/v1/text:synthesize',
'headers': {
'Content-Type': 'application/json',
},
'audioConfig': {
'audioEncoding': 'MP3'
}
},
'service_account_file': os.getenv('GOOGLE_APPLICATION_CREDENTIALS', 'default/path.json'),
})
'data': {
'voice': {
'languageCode': 'en-US',
'name': 'en-US-Studio-M',
'ssmlGender': 'MALE'
},
'audioConfig': {
'audioEncoding': 'MP3'
}
},
'service_account_file': os.getenv('GOOGLE_APPLICATION_CREDENTIALS', 'default/path.json'),
})


class GoogleCloudTTS(Singleton, TextToSpeech):
Expand All @@ -56,40 +58,67 @@ def __init__(self):

@timed
async def stream(self, text, websocket, tts_event: asyncio.Event, voice_id="",
first_sentence=False, language='en-US') -> None:
first_sentence=False, language='en-US', sid="", platform="",) -> None:
if DEBUG:
return
headers = config.headers
# For customized voices

# if language != 'en-US':
# config.data["voice"]["languageCode"] = language
# config.data["voice"]["name"] = voice_id
if language != 'en-US':
config.data["voice"]["languageCode"] = language
config.data["voice"]["name"] = voice_id
data = {
"input": {
"text": text
},
**config.data,
}
if voice_id:
logger.info("Override voice_id")
data["voice"]["name"] = voice_id
if voice_id == "en-US-Studio-O":
data["voice"]["ssmlGender"] = 'FEMALE'

# twilio expects g711 mulaw audio encoding
# https://www.twilio.com/docs/voice/twiml/stream#websocket-messages-to-twilio
if platform == "twilio":
data["audioConfig"]["audioEncoding"] = "MULAW"
data["audioConfig"]["sampleRateHertz"] = 8000

url = config.url
async with httpx.AsyncClient() as client:
response = await client.post(url, json=data, headers=headers)
# Google Cloud TTS API does not support streaming, we send the whole content at once
if response.status_code != 200:
logger.error(f"Google Cloud TTS returns response {response.status_code}")
logger.error(
f"Google Cloud TTS returns response {response.status_code}")
else:
audio_content = response.content
# Decode the base64-encoded audio content
audio_content = base64.b64decode(audio_content)
await websocket.send_bytes(audio_content)
content = json.loads(response.content)
audio_b64 = content["audioContent"] # base64 encoded string
if platform != "twilio":
audio_content = base64.b64decode(audio_b64)
await websocket.send_bytes(audio_content)
return

# audio_b64 includes a WAV header. After base64 decoding, the length of the header is
# 58 bytes. After encoding the WAV header, the length is 224. So we truncate the
# first 224 bytes of the response received from Google text to speech, as Twilio
# does not expect the audio bytes to include a WAV header:
# https://www.twilio.com/docs/voice/twiml/stream#message-media-to-twilio
media_response = {
"event": "media",
"streamSid": sid,
"media": {
"payload": audio_b64[224:],
},
}
# "done" marker is sent to twilio to track if the audio has been completed.
await websocket.send_json(media_response)
mark = {
"event": "mark",
"streamSid": sid,
"mark": {
"name": "done",
},
}
await websocket.send_json(mark)

async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
async def generate_audio(self, text, voice_id="", language='en-US') -> bytes:
headers = config.headers
# For customized voices

Expand All @@ -110,7 +139,8 @@ async def generate_audio(self, text, voice_id = "", language='en-US') -> bytes:
async with httpx.AsyncClient() as client:
response = await client.post(url, json=data, headers=headers)
if response.status_code != 200:
logger.error(f"Google Cloud TTS returns response {response.status_code}")
logger.error(
f"Google Cloud TTS returns response {response.status_code}")
else:
audio_content = response.content
# Decode the base64-encoded audio content
Expand Down
28 changes: 28 additions & 0 deletions realtime_ai_character/audio/text_to_speech/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import io

import torch
import torchaudio

from realtime_ai_character.logger import get_logger

logger = get_logger(__name__)


def MP3ToUlaw(src: bytes) -> bytes:
    """Transcode MP3 bytes into headerless mu-law bytes at 8000 Hz.

    Args:
        src: MP3-encoded audio.

    Returns:
        The bytes produced by a torchaudio ``StreamWriter`` using the
        ``mulaw`` format, configured for one channel at 8000 Hz.
    """
    decoder = torchaudio.io.StreamReader(io.BytesIO(src))
    logger.info(f"MP3ToUlaw stream: {decoder.get_src_stream_info(0)}")
    # Decode with the float MP3 decoder and resample to 8000 Hz.
    decoder.add_basic_audio_stream(-1, decoder="mp3float", sample_rate=8000)

    # Each item yielded by stream() holds one entry per output stream; only
    # one stream was added, so take index 0.
    pieces = []
    for chunk in decoder.stream():
        pieces.append(chunk[0])
    waveform = torch.concat(pieces)

    # Quantize with torchaudio's default mu-law settings, stored as uint8.
    quantizer = torchaudio.transforms.MuLawEncoding()
    encoded = quantizer(waveform).to(torch.uint8)

    sink = io.BytesIO()
    encoder = torchaudio.io.StreamWriter(dst=sink, format="mulaw")
    encoder.add_audio_stream(sample_rate=8000, num_channels=1, format="u8")
    with encoder.open():
        encoder.write_audio_chunk(0, encoded)

    return sink.getvalue()
Loading