Skip to content

Commit 801065e

Browse files
rootflo-hardikvishnurk6247vizsatiz
authored
voice agents refactoring (#207)
* add cust_number to sys prompt dynamically * changed voice_agent - tts_voice_id str to tts_voice_ids dict * call_processing - added azure llm service * updated pipecat version * resetting interruption strategies, adding smart turn analyzer - also set stop_secs=0.2 in VADParams as we are using smart turn detection. - removed deprecated interruption strategy setting * fix: generating gcs signed urls using workload identity credentials * fix: add email & token flow in gcs presigned url * fix: change request type * fix: change keyword argument to access_token * language and endconversation detection tool (#210) * added language and env_conversation detection tool * lang detection -> parallel pipeline instead of service switcher * little prompt fix for tool * resolved comments * resolved client review comments * fix: reuse gcs function in pdo service * Support for image in middleware proxy (#211) * fix for json enabled in inference api * Adding control for log levels --------- Co-authored-by: vishnu r kumar <rkumar.vishnu28@gmail.com> Co-authored-by: Vishnu Satis <vishnu@rootflo.ai> Co-authored-by: vizsatiz <satis.vishnu@gmail.com>
1 parent 88a525c commit 801065e

24 files changed

Lines changed: 1117 additions & 397 deletions

File tree

wavefront/client/src/pages/apps/[appId]/voice-agents/CreateVoiceAgentDialog.tsx

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,11 @@ const createVoiceAgentSchema = z.object({
5656
tts_config_id: z.string().min(1, 'TTS configuration is required'),
5757
stt_config_id: z.string().min(1, 'STT configuration is required'),
5858
telephony_config_id: z.string().min(1, 'Telephony configuration is required'),
59-
tts_voice_id: z.string().min(1, 'TTS Voice ID is required'),
59+
tts_voice_ids: z
60+
.record(z.string(), z.string().min(1, 'Voice ID must not be empty'))
61+
.refine((val) => Object.keys(val).length > 0, {
62+
message: 'At least one voice ID is required',
63+
}),
6064
system_prompt: z.string().min(1, 'System prompt is required'),
6165
welcome_message: z.string().min(1, 'Welcome message is required'),
6266
conversation_config: z.string().optional(),
@@ -83,6 +87,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
8387
const [creating, setCreating] = useState(false);
8488
const [ttsParameters, setTtsParameters] = useState<Record<string, unknown>>({});
8589
const [sttParameters, setSttParameters] = useState<Record<string, unknown>>({});
90+
const [voiceIdState, setVoiceIdState] = useState<Record<string, string>>({ en: '' });
8691

8792
// Fetch configs for dropdowns
8893
const { data: llmConfigs = [] } = useGetLLMConfigs(appId);
@@ -99,7 +104,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
99104
tts_config_id: '',
100105
stt_config_id: '',
101106
telephony_config_id: '',
102-
tts_voice_id: '',
107+
tts_voice_ids: { en: '' },
103108
system_prompt: '',
104109
welcome_message: '',
105110
conversation_config: '{}',
@@ -114,6 +119,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
114119
// Watch config selections to determine providers
115120
const watchedTtsConfigId = form.watch('tts_config_id');
116121
const watchedSttConfigId = form.watch('stt_config_id');
122+
const watchedSupportedLanguages = form.watch('supported_languages');
117123

118124
// Get selected providers
119125
const selectedTtsProvider = ttsConfigs.find((c) => c.id === watchedTtsConfigId)?.provider;
@@ -132,6 +138,21 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
132138
}
133139
}, [selectedSttProvider, isOpen]);
134140

141+
// Sync voice ID state with language changes
142+
useEffect(() => {
143+
if (isOpen && watchedSupportedLanguages) {
144+
setVoiceIdState((prev) => {
145+
const newState: Record<string, string> = {};
146+
// Preserve existing voice IDs for languages still selected
147+
watchedSupportedLanguages.forEach((lang) => {
148+
newState[lang] = prev[lang] || '';
149+
});
150+
form.setValue('tts_voice_ids', newState);
151+
return newState;
152+
});
153+
}
154+
}, [watchedSupportedLanguages, isOpen]);
155+
135156
// Reset form when dialog closes
136157
useEffect(() => {
137158
if (!isOpen) {
@@ -142,7 +163,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
142163
tts_config_id: '',
143164
stt_config_id: '',
144165
telephony_config_id: '',
145-
tts_voice_id: '',
166+
tts_voice_ids: { en: '' },
146167
system_prompt: '',
147168
welcome_message: '',
148169
conversation_config: '{}',
@@ -154,6 +175,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
154175
});
155176
setTtsParameters({});
156177
setSttParameters({});
178+
setVoiceIdState({ en: '' });
157179
}
158180
}, [isOpen, form]);
159181

@@ -232,7 +254,7 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
232254
tts_config_id: data.tts_config_id.trim(),
233255
stt_config_id: data.stt_config_id.trim(),
234256
telephony_config_id: data.telephony_config_id.trim(),
235-
tts_voice_id: data.tts_voice_id.trim(),
257+
tts_voice_ids: data.tts_voice_ids,
236258
tts_parameters: Object.keys(builtTtsParameters).length > 0 ? builtTtsParameters : null,
237259
stt_parameters: Object.keys(builtSttParameters).length > 0 ? builtSttParameters : null,
238260
system_prompt: data.system_prompt.trim(),
@@ -710,17 +732,31 @@ const CreateVoiceAgentDialog: React.FC<CreateVoiceAgentDialogProps> = ({ isOpen,
710732
<h4 className="text-sm font-medium">TTS Voice Settings</h4>
711733
<FormField
712734
control={form.control}
713-
name="tts_voice_id"
735+
name="tts_voice_ids"
714736
render={({ field }) => (
715737
<FormItem>
716738
<FormLabel>
717-
TTS Voice ID<span className="text-red-500">*</span>
739+
TTS Voice IDs<span className="text-red-500">*</span>
718740
</FormLabel>
719-
<FormControl>
720-
<Input placeholder="e.g., alloy, echo, fable (OpenAI) or voice ID (ElevenLabs)" {...field} />
721-
</FormControl>
741+
<div className="space-y-3">
742+
{watchedSupportedLanguages.map((langCode) => (
743+
<div key={langCode} className="flex items-center gap-3">
744+
<Label className="w-24 text-sm font-medium">{getLanguageDisplayName(langCode)}:</Label>
745+
<Input
746+
placeholder={`Voice ID for ${getLanguageDisplayName(langCode)}`}
747+
value={voiceIdState[langCode] || ''}
748+
onChange={(e) => {
749+
const newState = { ...voiceIdState, [langCode]: e.target.value };
750+
setVoiceIdState(newState);
751+
field.onChange(newState);
752+
}}
753+
className="flex-1"
754+
/>
755+
</div>
756+
))}
757+
</div>
722758
<FormDescription>
723-
Provider-specific voice identifier (e.g., for Deepgram: aura-2-helena-en)
759+
Provider-specific voice identifiers per language (e.g., "aura-2-helena-en" for Deepgram)
724760
</FormDescription>
725761
<FormMessage />
726762
</FormItem>

wavefront/client/src/pages/apps/[appId]/voice-agents/EditVoiceAgentDialog.tsx

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ const updateVoiceAgentSchema = z.object({
5151
tts_config_id: z.string().min(1, 'TTS configuration is required'),
5252
stt_config_id: z.string().min(1, 'STT configuration is required'),
5353
telephony_config_id: z.string().min(1, 'Telephony configuration is required'),
54-
tts_voice_id: z.string().min(1, 'TTS Voice ID is required'),
54+
tts_voice_ids: z.record(z.string(), z.string()).optional(),
5555
system_prompt: z.string().min(1, 'System prompt is required'),
5656
welcome_message: z.string().min(1, 'Welcome message is required'),
5757
conversation_config: z.string().optional(),
@@ -92,6 +92,7 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
9292
// State for TTS/STT parameters (managed separately from form)
9393
const [ttsParameters, setTtsParameters] = useState<Record<string, unknown>>({});
9494
const [sttParameters, setSttParameters] = useState<Record<string, unknown>>({});
95+
const [voiceIdState, setVoiceIdState] = useState<Record<string, string>>(agent.tts_voice_ids || { en: '' });
9596

9697
const form = useForm<UpdateVoiceAgentInput>({
9798
resolver: zodResolver(updateVoiceAgentSchema),
@@ -102,7 +103,7 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
102103
tts_config_id: agent.tts_config_id,
103104
stt_config_id: agent.stt_config_id,
104105
telephony_config_id: agent.telephony_config_id,
105-
tts_voice_id: agent.tts_voice_id,
106+
tts_voice_ids: agent.tts_voice_ids,
106107
system_prompt: agent.system_prompt,
107108
welcome_message: agent.welcome_message,
108109
conversation_config: agent.conversation_config ? JSON.stringify(agent.conversation_config, null, 2) : '{}',
@@ -117,6 +118,7 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
117118
// Watch for config changes to determine providers
118119
const watchedTtsConfigId = form.watch('tts_config_id');
119120
const watchedSttConfigId = form.watch('stt_config_id');
121+
const watchedSupportedLanguages = form.watch('supported_languages');
120122

121123
const selectedTtsProvider = ttsConfigs.find((c) => c.id === watchedTtsConfigId)?.provider;
122124
const selectedSttProvider = sttConfigs.find((c) => c.id === watchedSttConfigId)?.provider;
@@ -134,7 +136,7 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
134136
tts_config_id: agent.tts_config_id,
135137
stt_config_id: agent.stt_config_id,
136138
telephony_config_id: agent.telephony_config_id,
137-
tts_voice_id: agent.tts_voice_id,
139+
tts_voice_ids: agent.tts_voice_ids,
138140
system_prompt: agent.system_prompt,
139141
welcome_message: agent.welcome_message,
140142
conversation_config: agent.conversation_config ? JSON.stringify(agent.conversation_config, null, 2) : '{}',
@@ -144,6 +146,7 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
144146
supported_languages: agent.supported_languages || ['en'],
145147
default_language: agent.default_language || 'en',
146148
});
149+
setVoiceIdState(agent.tts_voice_ids || { en: '' });
147150
}
148151
}, [isOpen, agent, form]);
149152

@@ -161,6 +164,21 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
161164
}
162165
}, [isOpen, agent.stt_parameters]);
163166

167+
// Sync voice ID state with language changes
168+
useEffect(() => {
169+
if (isOpen && watchedSupportedLanguages) {
170+
setVoiceIdState((prev) => {
171+
const newState: Record<string, string> = {};
172+
// Preserve existing voice IDs for languages still selected
173+
watchedSupportedLanguages.forEach((lang) => {
174+
newState[lang] = prev[lang] || '';
175+
});
176+
form.setValue('tts_voice_ids', newState);
177+
return newState;
178+
});
179+
}
180+
}, [watchedSupportedLanguages, isOpen]);
181+
164182
// Helper functions to update parameters
165183
const setTtsParameter = (key: string, value: unknown) => {
166184
setTtsParameters((prev) => ({ ...prev, [key]: value }));
@@ -266,8 +284,8 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
266284
requestData.telephony_config_id = data.telephony_config_id;
267285
}
268286

269-
if (data.tts_voice_id.trim() !== agent.tts_voice_id) {
270-
requestData.tts_voice_id = data.tts_voice_id.trim();
287+
if (JSON.stringify(data.tts_voice_ids) !== JSON.stringify(agent.tts_voice_ids)) {
288+
requestData.tts_voice_ids = data.tts_voice_ids;
271289
}
272290

273291
// Check if TTS parameters changed
@@ -757,17 +775,31 @@ const EditVoiceAgentDialog: React.FC<EditVoiceAgentDialogProps> = ({
757775
<h4 className="text-sm font-medium">TTS Voice Settings</h4>
758776
<FormField
759777
control={form.control}
760-
name="tts_voice_id"
778+
name="tts_voice_ids"
761779
render={({ field }) => (
762780
<FormItem>
763781
<FormLabel>
764-
TTS Voice ID<span className="text-red-500">*</span>
782+
TTS Voice IDs<span className="text-red-500">*</span>
765783
</FormLabel>
766-
<FormControl>
767-
<Input placeholder="e.g., alloy, echo, fable (OpenAI) or voice ID (ElevenLabs)" {...field} />
768-
</FormControl>
784+
<div className="space-y-3">
785+
{watchedSupportedLanguages.map((langCode) => (
786+
<div key={langCode} className="flex items-center gap-3">
787+
<Label className="w-24 text-sm font-medium">{getLanguageDisplayName(langCode)}:</Label>
788+
<Input
789+
placeholder={`Voice ID for ${getLanguageDisplayName(langCode)}`}
790+
value={voiceIdState[langCode] || ''}
791+
onChange={(e) => {
792+
const newState = { ...voiceIdState, [langCode]: e.target.value };
793+
setVoiceIdState(newState);
794+
field.onChange(newState);
795+
}}
796+
className="flex-1"
797+
/>
798+
</div>
799+
))}
800+
</div>
769801
<FormDescription>
770-
Provider-specific voice identifier (e.g., for Deepgram: aura-2-helena-en)
802+
Provider-specific voice identifiers per language (e.g., "aura-2-helena-en" for Deepgram)
771803
</FormDescription>
772804
<FormMessage />
773805
</FormItem>

wavefront/client/src/types/voice-agent.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export interface VoiceAgent {
1313
telephony_config_id: string;
1414
system_prompt: string;
1515
welcome_message: string;
16-
tts_voice_id: string;
16+
tts_voice_ids: Record<string, string>;
1717
tts_parameters: Record<string, unknown> | null;
1818
stt_parameters: Record<string, unknown> | null;
1919
conversation_config: Record<string, unknown> | null;
@@ -39,7 +39,7 @@ export interface CreateVoiceAgentRequest {
3939
telephony_config_id: string;
4040
system_prompt: string;
4141
welcome_message: string;
42-
tts_voice_id: string;
42+
tts_voice_ids: Record<string, string>;
4343
tts_parameters?: Record<string, unknown> | null;
4444
stt_parameters?: Record<string, unknown> | null;
4545
conversation_config?: Record<string, unknown> | null;
@@ -63,7 +63,7 @@ export interface UpdateVoiceAgentRequest {
6363
telephony_config_id?: string;
6464
system_prompt?: string;
6565
welcome_message?: string;
66-
tts_voice_id?: string;
66+
tts_voice_ids?: Record<string, string>;
6767
tts_parameters?: Record<string, unknown> | null;
6868
stt_parameters?: Record<string, unknown> | null;
6969
conversation_config?: Record<string, unknown> | null;

wavefront/server/apps/call_processing/call_processing/controllers/webhook_controller.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
from pipecat.runner.types import WebSocketRunnerArguments
1717
from pipecat.runner.utils import parse_telephony_websocket
1818

19-
# from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
20-
# from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
2119
from pipecat.serializers.twilio import TwilioFrameSerializer
2220
from pipecat.audio.vad.silero import SileroVADAnalyzer
2321
from pipecat.audio.vad.vad_analyzer import VADParams
@@ -119,6 +117,8 @@ async def inbound_webhook(
119117

120118
# Pass parameters to WebSocket stream
121119
stream.parameter(name='voice_agent_id', value=agent_id)
120+
stream.parameter(name='customer_number', value=From)
121+
stream.parameter(name='agent_number', value=To)
122122

123123
connect.append(stream)
124124
response.append(connect)
@@ -134,13 +134,15 @@ async def inbound_webhook(
134134

135135
@webhook_router.post('/twiml')
136136
async def twiml_endpoint(
137+
From: str = Form(...),
138+
To: str = Form(...),
137139
voice_agent_id: str = Query(...),
138140
welcome_message_audio_url: str = Query(default=''),
139141
):
140142
"""
141143
Twilio TwiML endpoint
142144
143-
Called by Twilio when call connects (directly or via inbound webhook redirect).
145+
Called by Twilio when call connects (directly or via outbound webhook redirect).
144146
Returns TwiML XML with WebSocket connection instructions.
145147
146148
Query params:
@@ -181,6 +183,8 @@ async def twiml_endpoint(
181183

182184
# Pass parameters to WebSocket stream
183185
stream.parameter(name='voice_agent_id', value=voice_agent_id)
186+
stream.parameter(name='customer_number', value=To)
187+
stream.parameter(name='agent_number', value=From)
184188

185189
connect.append(stream)
186190
response.append(connect)
@@ -223,12 +227,20 @@ async def websocket_endpoint(
223227
# Extract parameters from stream
224228
body_data = call_data.get('body', {})
225229
voice_agent_id = body_data.get('voice_agent_id')
230+
customer_number = body_data.get('customer_number')
231+
# agent_number = body_data.get('agent_number')
226232

227233
if not voice_agent_id:
228234
logger.error('voice_agent_id not found in stream parameters')
229235
await websocket.close(code=1008, reason='Missing voice_agent_id')
230236
return
231237

238+
if not customer_number:
239+
logger.warning(
240+
'customer_number not found in stream parameters, using empty string'
241+
)
242+
customer_number = ''
243+
232244
logger.info(f'Voice agent ID: {voice_agent_id}')
233245

234246
# Convert voice_agent_id to UUID
@@ -263,13 +275,12 @@ async def websocket_endpoint(
263275
vad_analyzer=SileroVADAnalyzer(
264276
params=VADParams(
265277
confidence=0.7, # Default is 0.7, can lower to 0.4-0.5 for faster detection
266-
start_secs=0.15, # Default is 0.2, keep it
267-
stop_secs=0.8, # KEY: Lower from default 0.8 for faster cutoff (should be 0.2 for smart turn detection)
278+
start_secs=0.2, # Default is 0.2, keep it
279+
stop_secs=0.2, # KEY: Lower from default 0.8 for faster cutoff (should be 0.2 for smart turn detection)
268280
min_volume=0.6, # Default is 0.6, adjust based on your audio quality
269281
),
270282
), # Voice Activity Detection
271283
serializer=serializer,
272-
# turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
273284
),
274285
)
275286

@@ -282,6 +293,7 @@ async def websocket_endpoint(
282293
tts_config=configs['tts_config'],
283294
stt_config=configs['stt_config'],
284295
tools=configs['tools'],
296+
customer_number=customer_number,
285297
)
286298

287299
except Exception as e:

0 commit comments

Comments
 (0)