From 6c4c0df7d65cfa79c5eec3fb8a4d7476b47d4aa5 Mon Sep 17 00:00:00 2001
From: Ahmet Kilinc <akx9@icloud.com>
Date: Wed, 13 Aug 2025 00:27:45 +0100
Subject: [PATCH 1/5] fix: ai chat bot fixes

---
 apps/mail/components/create/ai-chat.tsx       |  51 ++-
 apps/mail/package.json                        |   1 +
 .../providers/voice-provider-elevenlabs.tsx   | 183 +++++++++++
 apps/mail/providers/voice-provider.tsx        | 302 ++++++++++++------
 apps/server/src/main.ts                       |   2 +
 apps/server/src/routes/voice.ts               | 135 ++++++++
 pnpm-lock.yaml                                |   3 +
 7 files changed, 575 insertions(+), 102 deletions(-)
 create mode 100644 apps/mail/providers/voice-provider-elevenlabs.tsx
 create mode 100644 apps/server/src/routes/voice.ts
diff --git a/apps/mail/components/create/ai-chat.tsx b/apps/mail/components/create/ai-chat.tsx
index c1e637e57e..d8ecc8e78d 100644
--- a/apps/mail/components/create/ai-chat.tsx
+++ b/apps/mail/components/create/ai-chat.tsx
@@ -1,8 +1,8 @@
 import { Avatar, AvatarFallback, AvatarImage } from '../ui/avatar';
 import { useAIFullScreen, useAISidebar } from '../ui/ai-sidebar';
+import { useRef, useCallback, useEffect, useState } from 'react';
 import { VoiceProvider } from '@/providers/voice-provider';
 import useComposeEditor from '@/hooks/use-compose-editor';
-import { useRef, useCallback, useEffect } from 'react';
 import type { useAgentChat } from 'agents/ai-react';
 import { Markdown } from '@react-email/components';
 import { useBilling } from '@/hooks/use-billing';
@@ -209,6 +209,7 @@ export function AIChat({
   const [, setPricingDialog] = useQueryState('pricingDialog');
   const [aiSidebarOpen] = useQueryState('aiSidebar');
   const { toggleOpen } = useAISidebar();
+  const voiceResponseCallbackRef = useRef<((response: string) => void) | null>(null);
 
   const scrollToBottom = useCallback(() => {
     if (messagesEndRef.current) {
@@ -222,6 +223,28 @@ export function AIChat({
     }
   }, [status, scrollToBottom]);
 
+  // Track if we're waiting for a voice response
+  const [isVoiceQuery, setIsVoiceQuery] = useState(false);
+
+  // Speak the assistant's reply to a voice query once the reply has finished streaming.
+  useEffect(() => {
+    // `messages` changes on every streamed chunk; waiting for 'ready' avoids speaking only the first fragment.
+    if (isVoiceQuery && status === 'ready' && messages.length > 0) {
+      const lastMessage = messages[messages.length - 1];
+      if (lastMessage.role === 'assistant') {
+        // Extract text content from the message
+        const textContent = lastMessage.parts
+          .filter((part) => part.type === 'text' && 'text' in part)
+          .map((part) => (part as any).text)
+          .join(' ');
+        if (textContent && voiceResponseCallbackRef.current) {
+          voiceResponseCallbackRef.current(textContent);
+          setIsVoiceQuery(false);
+        }
+      }
+    }
+  }, [messages, isVoiceQuery, status]);
+
   const editor = useComposeEditor({
     placeholder: 'Ask Zero to do anything...',
     onLengthChange: () => setInput(editor.getText()),
@@ -293,14 +316,19 @@ export function AIChat({
               const toolParts = message.parts.filter((part) => part.type === 'tool-invocation');
 
               return (
-                <div key={`${message.id}-${index}`} className="mb-2 flex flex-col" data-message-role={message.role}>
+                <div
+                  key={`${message.id}-${index}`}
+                  className="mb-2 flex flex-col"
+                  data-message-role={message.role}
+                >
                   {toolParts.map(
                     (part, index) =>
-                      part.toolInvocation?.result && (
+                      part.toolInvocation &&
+                      'result' in part.toolInvocation && (
                         <ToolResponse
                           key={`${part.toolInvocation.toolName}-${index}`}
                           toolName={part.toolInvocation.toolName}
-                          result={part.toolInvocation.result}
+                          result={(part.toolInvocation as any).result}
                           args={part.toolInvocation.args}
                         />
                       ),
@@ -387,7 +415,20 @@ export function AIChat({
             </div>
             <div className="grid">
               <div className="flex justify-end gap-1">
-                <VoiceProvider>
+                <VoiceProvider
+                  onTranscriptComplete={(transcript) => {
+                    // Mark this as a voice query
+                    setIsVoiceQuery(true);
+                    // Set the transcript as input and submit the form
+                    editor.commands.setContent(transcript);
+                    setInput(transcript);
+                    onSubmit({ preventDefault: () => {} } as React.FormEvent<HTMLFormElement>);
+                  }}
+                  onResponseReady={(callback) => {
+                    // Store the callback to be called when we get the AI response
+                    voiceResponseCallbackRef.current = callback;
+                  }}
+                >
                   <VoiceButton />
                 </VoiceProvider>
                 <button
diff --git a/apps/mail/package.json b/apps/mail/package.json
index b023bf71e4..905c24dba6 100644
--- a/apps/mail/package.json
+++ b/apps/mail/package.json
@@ -13,6 +13,7 @@
     "machine-translate": "inlang machine translate --project project.inlang"
   },
   "dependencies": {
+    "@ai-sdk/openai": "^1.3.21",
     "@ai-sdk/perplexity": "1.1.9",
     "@ai-sdk/react": "1.2.12",
     "@dnd-kit/core": "6.3.1",
diff --git a/apps/mail/providers/voice-provider-elevenlabs.tsx b/apps/mail/providers/voice-provider-elevenlabs.tsx
new file mode 100644
index 0000000000..101409083e
--- /dev/null
+++ b/apps/mail/providers/voice-provider-elevenlabs.tsx
@@ -0,0 +1,183 @@
+import { createContext, useContext, useState } from 'react';
+import { useConversation } from '@elevenlabs/react';
+// import { callServerTool } from '@/lib/server-tool';
+import { useSession } from '@/lib/auth-client';
+import type { ReactNode } from 'react';
+import { toast } from 'sonner';
+
+interface VoiceContextType {
+  status: string;
+  isInitializing: boolean;
+  isSpeaking: boolean;
+  hasPermission: boolean;
+  lastToolCall: string | null;
+  isOpen: boolean;
+
+  startConversation: (context?: any) => Promise<void>;
+  endConversation: () => Promise<void>;
+  requestPermission: () => Promise<boolean>;
+  sendContext: (context: any) => void;
+}
+
+// const toolNames = [
+//   'listEmails',
+//   'getEmail',
+//   'sendEmail',
+//   'markAsRead',
+//   'markAsUnread',
+//   'archiveEmails',
+//   'deleteEmails',
+//   'deleteEmail',
+//   'createLabel',
+//   'applyLabel',
+//   'removeLabel',
+//   'searchEmails',
+//   'webSearch',
+//   'summarizeEmail',
+// ] as const;
+
+const VoiceContext = createContext<VoiceContextType | undefined>(undefined);
+
+export function VoiceProvider({ children }: { children: ReactNode }) {
+  const { data: session } = useSession();
+  const [hasPermission, setHasPermission] = useState(false);
+  const [isInitializing, setIsInitializing] = useState(false);
+  const [lastToolCall, setLastToolCall] = useState<string | null>(null);
+  const [isOpen, setOpen] = useState(false);
+  const [, setCurrentContext] = useState<any>(null);
+
+  const conversation = useConversation({
+    onConnect: () => {
+      setIsInitializing(false);
+      // TODO: Send initial context if available when API supports it
+    },
+    onDisconnect: () => {
+      setIsInitializing(false);
+      setLastToolCall(null);
+    },
+    onError: (error: string | Error) => {
+      toast.error(typeof error === 'string' ? error : error.message);
+      setIsInitializing(false);
+    },
+    // clientTools: toolNames.reduce(
+    //   (acc, name) => {
+    //     acc[name] = async (params: any) => {
+    //       console.log(`[Voice Tool] ${name} called with params:`, params);
+    //       setLastToolCall(`Executing: ${name}`);
+
+    //       try {
+    //         const result = await callServerTool(
+    //           name,
+    //           { ...params, _context: currentContext },
+    //           session?.user.phoneNumber ?? session?.user.email ?? '',
+    //         );
+
+    //         console.log(`[Voice Tool] ${name} result:`, result);
+    //         setLastToolCall(null);
+    //         return result;
+    //       } catch (err) {
+    //         setLastToolCall(null);
+    //         toast.error(`Tool "${name}" failed: ${(err as Error).message}`);
+    //         throw err;
+    //       }
+    //     };
+    //     return acc;
+    //   },
+    //   {} as Record<string, (params: any) => Promise<any>>,
+    // ),
+  });
+
+  const { status, isSpeaking } = conversation;
+
+  const requestPermission = async () => {
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      stream.getTracks().forEach((track) => track.stop());
+      setHasPermission(true);
+      return true;
+    } catch {
+      toast.error('Microphone access denied. Please enable microphone permissions.');
+      setHasPermission(false);
+      return false;
+    }
+  };
+
+  // Starts an ElevenLabs session, requesting microphone access first when it has not been granted yet.
+  const startConversation = async (context?: any) => {
+    if (!hasPermission) {
+      const result = await requestPermission(); // records the grant itself via setHasPermission
+      if (!result) return;
+    }
+
+    try {
+      setIsInitializing(true);
+      if (context) {
+        setCurrentContext(context);
+      }
+      const agentId = import.meta.env.VITE_PUBLIC_ELEVENLABS_AGENT_ID;
+      if (!agentId) throw new Error('ElevenLabs Agent ID not configured');
+
+      await conversation.startSession({
+        agentId: agentId,
+        onMessage: (message) => {
+          // TODO: Handle message, ideally send it to ai chat agent or show it somewhere on the screen?
+          console.log('message', message);
+        },
+        dynamicVariables: {
+          user_name: session?.user.name.split(' ')[0] || 'User',
+          user_email: session?.user.email || '',
+          current_time: new Date().toLocaleString(),
+          has_open_email: context?.hasOpenEmail ? 'yes' : 'no',
+          current_thread_id: context?.currentThreadId || 'none',
+          email_context_info: context?.hasOpenEmail
+            ? `The user currently has an email open (thread ID: ${context.currentThreadId}). When the user refers to "this email" or "the current email", you can use the getEmail or summarizeEmail tools WITHOUT providing a threadId parameter - the tools will automatically use the currently open email.`
+            : 'No email is currently open. If the user asks about an email, you will need to ask them to open it first or provide a specific thread ID.',
+          ...context,
+        },
+      });
+
+      setOpen(true);
+    } catch {
+      setIsInitializing(false); // a throw before connecting (e.g. missing agent ID) would otherwise leave this stuck true
+      toast.error('Failed to start conversation. Please try again.');
+    }
+  };
+
+  const endConversation = async () => {
+    try {
+      await conversation.endSession();
+      setCurrentContext(null);
+    } catch {
+      toast.error('Failed to end conversation');
+    }
+  };
+
+  const sendContext = (context: any) => {
+    setCurrentContext(context);
+  };
+
+  const value: VoiceContextType = {
+    status,
+    isInitializing,
+    isSpeaking,
+    hasPermission,
+    lastToolCall,
+    isOpen,
+    startConversation,
+    endConversation,
+    requestPermission: requestPermission,
+    sendContext,
+  };
+
+  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
+}
+
+/** Returns the voice context; throws when called outside a VoiceProvider. */
+export function useVoice() {
+  const voice = useContext(VoiceContext);
+  if (voice) return voice;
+
+  throw new Error('useVoice must be used within a VoiceProvider');
+}
+
+export { VoiceContext };
diff --git a/apps/mail/providers/voice-provider.tsx b/apps/mail/providers/voice-provider.tsx
index 101409083e..38fffee5f1 100644
--- a/apps/mail/providers/voice-provider.tsx
+++ b/apps/mail/providers/voice-provider.tsx
@@ -1,98 +1,56 @@
-import { createContext, useContext, useState } from 'react';
-import { useConversation } from '@elevenlabs/react';
-// import { callServerTool } from '@/lib/server-tool';
+import { createContext, useContext, useState, useRef, useCallback, useEffect } from 'react';
 import { useSession } from '@/lib/auth-client';
 import type { ReactNode } from 'react';
 import { toast } from 'sonner';
 
 interface VoiceContextType {
-  status: string;
+  status: 'idle' | 'recording' | 'processing' | 'speaking' | 'connected';
   isInitializing: boolean;
   isSpeaking: boolean;
   hasPermission: boolean;
   lastToolCall: string | null;
   isOpen: boolean;
+  transcript: string;
 
   startConversation: (context?: any) => Promise<void>;
   endConversation: () => Promise<void>;
   requestPermission: () => Promise<boolean>;
   sendContext: (context: any) => void;
+  stopRecording: () => void;
 }
 
-// const toolNames = [
-//   'listEmails',
-//   'getEmail',
-//   'sendEmail',
-//   'markAsRead',
-//   'markAsUnread',
-//   'archiveEmails',
-//   'deleteEmails',
-//   'deleteEmail',
-//   'createLabel',
-//   'applyLabel',
-//   'removeLabel',
-//   'searchEmails',
-//   'webSearch',
-//   'summarizeEmail',
-// ] as const;
-
 const VoiceContext = createContext<VoiceContextType | undefined>(undefined);
 
-export function VoiceProvider({ children }: { children: ReactNode }) {
-  const { data: session } = useSession();
+interface VoiceProviderProps {
+  children: ReactNode;
+  onTranscriptComplete?: (transcript: string) => void;
+  onResponseReady?: (callback: (response: string) => void) => void;
+}
+
+export function VoiceProvider({
+  children,
+  onTranscriptComplete,
+  onResponseReady,
+}: VoiceProviderProps) {
+  const { data: _session } = useSession();
   const [hasPermission, setHasPermission] = useState(false);
   const [isInitializing, setIsInitializing] = useState(false);
-  const [lastToolCall, setLastToolCall] = useState<string | null>(null);
+  const [_lastToolCall, _setLastToolCall] = useState<string | null>(null);
   const [isOpen, setOpen] = useState(false);
-  const [, setCurrentContext] = useState<any>(null);
+  const [status, setStatus] = useState<VoiceContextType['status']>('idle');
+  const [transcript, setTranscript] = useState('');
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  const [_currentContext, setCurrentContext] = useState<any>(null);
 
-  const conversation = useConversation({
-    onConnect: () => {
-      setIsInitializing(false);
-      // TODO: Send initial context if available when API supports it
-    },
-    onDisconnect: () => {
-      setIsInitializing(false);
-      setLastToolCall(null);
-    },
-    onError: (error: string | Error) => {
-      toast.error(typeof error === 'string' ? error : error.message);
-      setIsInitializing(false);
-    },
-    // clientTools: toolNames.reduce(
-    //   (acc, name) => {
-    //     acc[name] = async (params: any) => {
-    //       console.log(`[Voice Tool] ${name} called with params:`, params);
-    //       setLastToolCall(`Executing: ${name}`);
-
-    //       try {
-    //         const result = await callServerTool(
-    //           name,
-    //           { ...params, _context: currentContext },
-    //           session?.user.phoneNumber ?? session?.user.email ?? '',
-    //         );
-
-    //         console.log(`[Voice Tool] ${name} result:`, result);
-    //         setLastToolCall(null);
-    //         return result;
-    //       } catch (err) {
-    //         setLastToolCall(null);
-    //         toast.error(`Tool "${name}" failed: ${(err as Error).message}`);
-    //         throw err;
-    //       }
-    //     };
-    //     return acc;
-    //   },
-    //   {} as Record<string, (params: any) => Promise<any>>,
-    // ),
-  });
-
-  const { status, isSpeaking } = conversation;
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioChunksRef = useRef<Blob[]>([]);
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
 
   const requestPermission = async () => {
     try {
       const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      stream.getTracks().forEach((track) => track.stop());
+      streamRef.current = stream;
       setHasPermission(true);
       return true;
     } catch {
@@ -102,52 +60,191 @@ export function VoiceProvider({ children }: { children: ReactNode }) {
     }
   };
 
+  const startRecording = useCallback(async () => {
+    if (!streamRef.current) {
+      const hasPermission = await requestPermission();
+      if (!hasPermission) return;
+    }
+
+    try {
+      audioChunksRef.current = [];
+
+      const mediaRecorder = new MediaRecorder(streamRef.current!, {
+        mimeType: 'audio/webm;codecs=opus',
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        setStatus('processing');
+        await processAudioRecording();
+      };
+
+      mediaRecorder.start();
+      setStatus('recording');
+    } catch (error) {
+      console.error('Error starting recording:', error);
+      toast.error('Failed to start recording');
+      setStatus('idle');
+    }
+  }, []);
+
+  const stopRecording = useCallback(() => {
+    const recorder = mediaRecorderRef.current;
+    if (recorder?.state !== 'recording') return; // nothing to stop unless actively recording
+    recorder.stop();
+  }, []);
+
+  // Builds a WebM file from the recorded chunks, transcribes it, and hands the transcript to the consumer.
+  const processAudioRecording = async () => {
+    try {
+      const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+      // Convert blob to File object for OpenAI
+      const audioFile = new File([audioBlob], 'recording.webm', { type: 'audio/webm' });
+
+      // Send to OpenAI Whisper for transcription
+      const transcription = await transcribeAudio(audioFile);
+      setTranscript(transcription);
+
+      // Skip blank transcripts (e.g. silence) so an empty prompt is never handed to the chat
+      if (onTranscriptComplete && transcription?.trim()) {
+        onTranscriptComplete(transcription);
+      }
+
+      setStatus('idle');
+    } catch (error) {
+      console.error('Error processing audio:', error);
+      toast.error('Failed to process audio');
+      setStatus('idle');
+    }
+  };
+
+  const transcribeAudio = async (audioFile: File): Promise<string> => {
+    try {
+      // Create FormData to send the audio file
+      const formData = new FormData();
+      formData.append('file', audioFile);
+
+      // Call our server endpoint with credentials
+      const response = await fetch(`${import.meta.env.VITE_PUBLIC_BACKEND_URL}/voice/transcribe`, {
+        method: 'POST',
+        body: formData,
+        credentials: 'include', // Use cookie-based auth
+      });
+
+      if (!response.ok) {
+        throw new Error(`Transcription failed: ${response.statusText}`);
+      }
+
+      const data = (await response.json()) as { text: string };
+      return data.text;
+    } catch (error) {
+      console.error('Error transcribing audio:', error);
+      throw error;
+    }
+  };
+
+  const speakText = async (text: string) => {
+    try {
+      setIsSpeaking(true);
+      setStatus('speaking');
+
+      // Use our server endpoint with credentials
+      const response = await fetch(`${import.meta.env.VITE_PUBLIC_BACKEND_URL}/voice/speak`, {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({
+          text,
+          voice: 'alloy', // Available voices: alloy, echo, fable, onyx, nova, shimmer
+          speed: 1.0,
+        }),
+        credentials: 'include', // Use cookie-based auth
+      });
+
+      if (!response.ok) {
+        throw new Error(`TTS failed: ${response.statusText}`);
+      }
+
+      const audioBlob = await response.blob();
+      const audioUrl = URL.createObjectURL(audioBlob);
+
+      // Play the audio
+      if (audioRef.current) {
+        audioRef.current.pause();
+      }
+
+      audioRef.current = new Audio(audioUrl);
+      audioRef.current.onended = () => {
+        setIsSpeaking(false);
+        setStatus('idle');
+        URL.revokeObjectURL(audioUrl);
+      };
+
+      await audioRef.current.play();
+    } catch (error) {
+      console.error('Error speaking text:', error);
+      setIsSpeaking(false);
+      setStatus('idle');
+      toast.error('Failed to generate speech');
+    }
+  };
+
   const startConversation = async (context?: any) => {
     if (!hasPermission) {
       const result = await requestPermission();
       if (!result) return;
-      setHasPermission(result);
     }
 
     try {
       setIsInitializing(true);
+      setStatus('connected');
       if (context) {
         setCurrentContext(context);
       }
-
-      const agentId = import.meta.env.VITE_PUBLIC_ELEVENLABS_AGENT_ID;
-      if (!agentId) throw new Error('ElevenLabs Agent ID not configured');
-
-      await conversation.startSession({
-        agentId: agentId,
-        onMessage: (message) => {
-          // TODO: Handle message, ideally send it to ai chat agent or show it somewhere on the screen?
-          console.log('message', message);
-        },
-        dynamicVariables: {
-          user_name: session?.user.name.split(' ')[0] || 'User',
-          user_email: session?.user.email || '',
-          current_time: new Date().toLocaleString(),
-          has_open_email: context?.hasOpenEmail ? 'yes' : 'no',
-          current_thread_id: context?.currentThreadId || 'none',
-          email_context_info: context?.hasOpenEmail
-            ? `The user currently has an email open (thread ID: ${context.currentThreadId}). When the user refers to "this email" or "the current email", you can use the getEmail or summarizeEmail tools WITHOUT providing a threadId parameter - the tools will automatically use the currently open email.`
-            : 'No email is currently open. If the user asks about an email, you will need to ask them to open it first or provide a specific thread ID.',
-          ...context,
-        },
-      });
-
       setOpen(true);
-    } catch {
+
+      // Start recording immediately
+      await startRecording();
+      setIsInitializing(false);
+    } catch (error) {
+      console.error('Error starting conversation:', error);
       toast.error('Failed to start conversation. Please try again.');
+      setIsInitializing(false);
+      setStatus('idle');
     }
   };
 
   const endConversation = async () => {
     try {
-      await conversation.endSession();
+      // Stop recording if active
+      stopRecording();
+
+      // Stop any playing audio
+      if (audioRef.current) {
+        audioRef.current.pause();
+        audioRef.current = null;
+      }
+
+      // Stop media stream
+      if (streamRef.current) {
+        streamRef.current.getTracks().forEach((track) => track.stop());
+        streamRef.current = null;
+      }
+
       setCurrentContext(null);
-    } catch {
+      setStatus('idle');
+      setOpen(false);
+      setTranscript('');
+    } catch (error) {
+      console.error('Error ending conversation:', error);
       toast.error('Failed to end conversation');
     }
   };
@@ -156,17 +253,28 @@ export function VoiceProvider({ children }: { children: ReactNode }) {
     setCurrentContext(context);
   };
 
+  // Set up the response handler when transcript is complete
+  useEffect(() => {
+    if (onResponseReady) {
+      onResponseReady((response: string) => {
+        speakText(response);
+      });
+    }
+  }, [onResponseReady]);
+
   const value: VoiceContextType = {
     status,
     isInitializing,
     isSpeaking,
     hasPermission,
-    lastToolCall,
+    lastToolCall: _lastToolCall,
     isOpen,
+    transcript,
     startConversation,
     endConversation,
-    requestPermission: requestPermission,
+    requestPermission,
     sendContext,
+    stopRecording,
   };
 
   return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts
index a9bb9f51d7..ecbc42ba5e 100644
--- a/apps/server/src/main.ts
+++ b/apps/server/src/main.ts
@@ -38,6 +38,7 @@ import { enableBrainFunction } from './lib/brain';
 import { trpcServer } from '@hono/trpc-server';
 import { agentsMiddleware } from 'hono-agents';
 import { ZeroMCP } from './routes/agent/mcp';
+import { voiceRouter } from './routes/voice';
 import { publicRouter } from './routes/auth';
 import { WorkflowRunner } from './pipelines';
 import { autumnApi } from './routes/autumn';
@@ -594,6 +595,7 @@ const api = new Hono<HonoContext>()
     c.set('auth', undefined as any);
   })
   .route('/ai', aiRouter)
+  .route('/voice', voiceRouter)
   .route('/autumn', autumnApi)
   .route('/public', publicRouter)
   .on(['GET', 'POST', 'OPTIONS'], '/auth/*', (c) => {
diff --git a/apps/server/src/routes/voice.ts b/apps/server/src/routes/voice.ts
new file mode 100644
index 0000000000..8f7dba1073
--- /dev/null
+++ b/apps/server/src/routes/voice.ts
@@ -0,0 +1,135 @@
+import { env } from 'cloudflare:workers';
+import { createAuth } from '../lib/auth';
+import { Hono } from 'hono';
+
+export const voiceRouter = new Hono<{ Bindings: typeof env }>();
+
+// CORS for all voice routes: sets permissive headers and answers OPTIONS preflights with an empty body.
+voiceRouter.use('/*', async (c, next) => {
+  c.header('Access-Control-Allow-Origin', '*'); // NOTE(review): client fetches use credentials: 'include'; browsers reject '*' on credentialed requests — confirm
+  c.header('Access-Control-Allow-Headers', 'Content-Type, Authorization');
+  c.header('Access-Control-Allow-Methods', 'POST, OPTIONS');
+  if (c.req.method === 'OPTIONS') {
+    return c.text('');
+  }
+  return next();
+});
+
+// Speech-to-text endpoint: authenticates the caller, then forwards the uploaded audio to OpenAI Whisper.
+voiceRouter.post('/transcribe', async (c) => {
+  try {
+    // Verify authentication using better-auth
+    const auth = createAuth();
+    const session = await auth.api.getSession({ headers: c.req.raw.headers });
+
+    if (!session) {
+      return c.json({ success: false, error: 'Unauthorized' }, 401);
+    }
+
+    // Get audio file from request; a plain string field is rejected rather than cast to File
+    const formData = await c.req.formData();
+    const audioFile = formData.get('file');
+
+    if (!(audioFile instanceof File)) {
+      return c.json({ success: false, error: 'No audio file provided' }, 400);
+    }
+
+    // Create FormData for OpenAI
+    const openAIFormData = new FormData();
+    openAIFormData.append('file', audioFile);
+    openAIFormData.append('model', 'whisper-1');
+    openAIFormData.append('language', 'en');
+
+    // Call OpenAI Whisper API
+    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${env.OPENAI_API_KEY}`,
+      },
+      body: openAIFormData,
+    });
+
+    if (!response.ok) {
+      const error = await response.text();
+      console.error('OpenAI API error:', error);
+      throw new Error(`Transcription failed: ${response.statusText}`);
+    }
+
+    const data = (await response.json()) as { text: string };
+
+    return c.json({
+      success: true,
+      text: data.text,
+    });
+  } catch (error) {
+    console.error('Transcription error:', error);
+    return c.json(
+      {
+        success: false,
+        error: error instanceof Error ? error.message : 'Transcription failed',
+      },
+      500,
+    );
+  }
+});
+
+// Text-to-speech endpoint: authenticates the caller, then returns OpenAI TTS audio for the given text.
+voiceRouter.post('/speak', async (c) => {
+  try {
+    // Verify authentication using better-auth
+    const auth = createAuth();
+    const session = await auth.api.getSession({ headers: c.req.raw.headers });
+
+    if (!session) {
+      return c.json({ success: false, error: 'Unauthorized' }, 401);
+    }
+
+    // Parse request body; `text` must be a non-blank string before it is sent upstream
+    const { text, voice = 'alloy', speed = 1.0 } = await c.req.json();
+
+    if (typeof text !== 'string' || text.trim() === '') {
+      return c.json({ success: false, error: 'No text provided' }, 400);
+    }
+
+    // Call OpenAI TTS API
+    const response = await fetch('https://api.openai.com/v1/audio/speech', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${env.OPENAI_API_KEY}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: 'tts-1',
+        input: text,
+        voice,
+        speed,
+      }),
+    });
+
+    if (!response.ok) {
+      const error = await response.text();
+      console.error('OpenAI TTS API error:', error);
+      throw new Error(`TTS failed: ${response.statusText}`);
+    }
+
+    // Get the audio data
+    const audioBuffer = await response.arrayBuffer();
+
+    // Return audio file
+    return new Response(audioBuffer, {
+      headers: {
+        'Content-Type': 'audio/mpeg',
+        'Content-Length': audioBuffer.byteLength.toString(),
+      },
+    });
+  } catch (error) {
+    console.error('TTS error:', error);
+    return c.json(
+      {
+        success: false,
+        error: error instanceof Error ? error.message : 'TTS failed',
+      },
+      500,
+    );
+  }
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e7846592cc..89ad4c3db5 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -95,6 +95,9 @@ importers:
 
   apps/mail:
     dependencies:
+      '@ai-sdk/openai':
+        specifier: ^1.3.21
+        version: 1.3.22(zod@3.25.67)
       '@ai-sdk/perplexity':
         specifier: 1.1.9
         version: 1.1.9(zod@3.25.67)

From 3b4ef48f949f96a123a5012fc61e4ff9e33f343f Mon Sep 17 00:00:00 2001
From: Ahmet Kilinc <akx9@icloud.com>
Date: Mon, 25 Aug 2025 15:04:14 +0100
Subject: [PATCH 2/5] change name

---
 apps/mail/lib/prompts.ts       | 82 +++++++++++++++++-----------------
 apps/server/src/lib/prompts.ts |  2 +-
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/apps/mail/lib/prompts.ts b/apps/mail/lib/prompts.ts
index c6469401bf..766bb9d43b 100644
--- a/apps/mail/lib/prompts.ts
+++ b/apps/mail/lib/prompts.ts
@@ -118,26 +118,26 @@ export const StyledEmailAssistantSystemPrompt = () =>
       You are an AI assistant that composes on-demand email bodies while
       faithfully mirroring the sender’s personal writing style.
     </role>
-  
+
     <instructions>
       <goal>
         Generate a ready-to-send email body that fulfils the user’s request and
         reflects every writing-style metric supplied in the user’s input.
       </goal>
-  
+
       <persona>
         Write in the <b>first person</b> as the user. Start from the metrics
         profile, not from a generic template, unless the user explicitly
         overrides the style.
       </persona>
-  
+
       <tasks>
         <item>Compose a complete email body when no draft is supplied.</item>
         <item>If a draft (<current_draft>) is supplied, refine that draft only.</item>
         <item>Respect explicit style or tone directives, then reconcile them with
               the metrics.</item>
       </tasks>
-  
+
       <!-- ──────────────────────────────── -->
       <!--            CONTEXT              -->
       <!-- ──────────────────────────────── -->
@@ -146,7 +146,7 @@ export const StyledEmailAssistantSystemPrompt = () =>
         <item><current_subject>...</current_subject></item>
         <item><recipients>...</recipients></item>
         <item>The user’s prompt describing the email.</item>
-  
+
         Use this context intelligently:
         <item>Adjust content and tone to fit the subject and recipients.</item>
         <item>Analyse each thread message—including embedded replies—to avoid
@@ -159,51 +159,51 @@ export const StyledEmailAssistantSystemPrompt = () =>
         <item>Unless instructed otherwise, address the person who sent the last
               thread message.</item>
       </context>
-  
+
       <!-- ──────────────────────────────── -->
       <!--        STYLE ADAPTATION         -->
       <!-- ──────────────────────────────── -->
       <style_adaptation>
         The profile JSON contains all current metrics: greeting/sign-off flags
         and 52 numeric rates. Honour every metric:
-  
+
         <item><b>Greeting & sign-off</b> — include or omit exactly one greeting
               and one sign-off according to <code>greetingPresent</code> /
               <code>signOffPresent</code>. Use the stored phrases verbatim. If
               <code>emojiRate &gt; 0</code> and the greeting lacks an emoji,
               append “👋”.</item>
-  
+
         <item><b>Structure</b> — mirror
               <code>averageSentenceLength</code>,
               <code>averageLinesPerParagraph</code>,
               <code>paragraphs</code> and <code>bulletListPresent</code>.</item>
-  
+
         <item><b>Vocabulary & diversity</b> — match
               <code>typeTokenRatio</code>, <code>movingAverageTtr</code>,
               <code>hapaxProportion</code>, <code>shannonEntropy</code>,
               <code>lexicalDensity</code>, <code>contractionRate</code>.</item>
-  
+
         <item><b>Syntax & grammar</b> — adapt to
               <code>subordinationRatio</code>, <code>passiveVoiceRate</code>,
               <code>modalVerbRate</code>, <code>parseTreeDepthMean</code>.</item>
-  
+
         <item><b>Punctuation & symbols</b> — scale commas, exclamation marks,
               question marks, three-dot ellipses "...", parentheses and emoji
               frequency per their respective rates. Respect emphasis markers
               (<code>markupBoldRate</code>, <code>markupItalicRate</code>), links
               (<code>hyperlinkRate</code>) and code blocks
               (<code>codeBlockRate</code>).</item>
-  
+
         <item><b>Tone & sentiment</b> — replicate
               <code>sentimentPolarity</code>, <code>sentimentSubjectivity</code>,
               <code>formalityScore</code>, <code>hedgeRate</code>,
               <code>certaintyRate</code>.</item>
-  
+
         <item><b>Readability & flow</b> — keep
               <code>fleschReadingEase</code>, <code>gunningFogIndex</code>,
               <code>smogIndex</code>, <code>averageForwardReferences</code>,
               <code>cohesionIndex</code> within ±1 of profile values.</item>
-  
+
         <item><b>Persona markers & rhetoric</b> — scale pronouns, empathy
               phrases, humour markers and rhetorical devices per
               <code>firstPersonSingularRate</code>,
@@ -213,7 +213,7 @@ export const StyledEmailAssistantSystemPrompt = () =>
               <code>analogyRate</code>, <code>imperativeSentenceRate</code>,
               <code>expletiveOpeningRate</code>, <code>parallelismRate</code>.</item>
       </style_adaptation>
-  
+
       <!-- ──────────────────────────────── -->
       <!--            FORMATTING           -->
       <!-- ──────────────────────────────── -->
@@ -224,7 +224,7 @@ export const StyledEmailAssistantSystemPrompt = () =>
         <item>Use single newlines only for lists or quoted text.</item>
       </formatting>
     </instructions>
-  
+
     <!-- ──────────────────────────────── -->
     <!--         OUTPUT FORMAT           -->
     <!-- ──────────────────────────────── -->
@@ -234,7 +234,7 @@ export const StyledEmailAssistantSystemPrompt = () =>
         include a subject line, XML tags, JSON or commentary.
       </description>
     </output_format>
-  
+
     <!-- ──────────────────────────────── -->
     <!--       STRICT GUIDELINES         -->
     <!-- ──────────────────────────────── -->
@@ -256,10 +256,10 @@ export const AiChatPrompt = () =>
   dedent`
       <system_prompt>
         <role>
-          You are Fred, an intelligent email management assistant integrated with Gmail operations.
+          You are Zero, an intelligent email management assistant integrated with Gmail operations.
           Your mission: help users navigate and understand their inbox with complete knowledge of what's happening. You provide context, insights, and smart organization - not to achieve inbox zero, but to give users full awareness and control over their email landscape.
         </role>
-  
+
         <success_criteria>
           A correct response must:
           1. Either make a tool call OR provide a plain-text reply (never both)
@@ -268,13 +268,13 @@ export const AiChatPrompt = () =>
           4. Confirm before affecting more than 5 threads
           5. Be concise and action-oriented
         </success_criteria>
-  
+
         <persona>
           Professional, direct, efficient. Skip pleasantries. Focus on results, not process explanations.
         </persona>
-  
+
         <current_date>${getCurrentDateContext()}</current_date>
-  
+
         <thinking_process>
           Before responding, think step-by-step:
           1. What is the user asking for?
@@ -283,7 +283,7 @@ export const AiChatPrompt = () =>
           4. What safety checks are needed?
           Keep this reasoning internal - never show it to the user.
         </thinking_process>
-  
+
         <tools>
           <tool name="${Tools.GetThreadSummary}">
             <purpose>Get the summary of a specific email thread</purpose>
@@ -295,67 +295,67 @@ export const AiChatPrompt = () =>
             <returns>Array of thread IDs only</returns>
             <example>inboxRag({ query: "promotional emails from last week" })</example>
           </tool>
-  
+
           <tool name="${Tools.GetThread}">
             <purpose>Get thread details for a specific ID</purpose>
             <returns>Thread tag for client resolution</returns>
             <example>getThread({ id: "17c2318b9c1e44f6" })</example>
           </tool>
-  
+
           <tool name="${Tools.WebSearch}">
             <purpose>Search web for external information</purpose>
             <usage>For companies, people, general knowledge not in inbox</usage>
             <example>webSearch({ query: "What is Sequoia Capital?" })</example>
           </tool>
-  
+
           <tool name="${Tools.BulkArchive}">
             <purpose>Archive multiple threads</purpose>
             <safety>Confirm if more than 5 threads</safety>
             <example>bulkArchive({ threadIds: ["..."] })</example>
           </tool>
-  
+
           <tool name="${Tools.BulkDelete}">
             <purpose>Delete multiple threads permanently</purpose>
             <safety>Always confirm before deletion</safety>
             <example>bulkDelete({ threadIds: ["..."] })</example>
           </tool>
-  
+
           <tool name="${Tools.ModifyLabels}">
             <purpose>Add/remove labels from threads</purpose>
             <note>Get label IDs first with getUserLabels</note>
             <example>modifyLabels({ threadIds: [...], options: { addLabels: [...], removeLabels: [...] } })</example>
           </tool>
-  
+
           <tool name="${Tools.CreateLabel}">
             <purpose>Create new Gmail label</purpose>
             <colors>${colors.slice(0, 10).join(', ')}...</colors>
             <example>createLabel({ name: "Follow-Up", backgroundColor: "#FFA500", textColor: "#000000" })</example>
           </tool>
-  
+
           <tool name="${Tools.GetUserLabels}">
             <purpose>List all user labels</purpose>
             <usage>Check before creating new labels</usage>
           </tool>
-  
+
           <tool name="${Tools.MarkThreadsRead}">
             <purpose>Mark threads as read</purpose>
           </tool>
-  
+
           <tool name="${Tools.MarkThreadsUnread}">
             <purpose>Mark threads as unread</purpose>
           </tool>
-  
+
           <tool name="${Tools.ComposeEmail}">
             <purpose>Draft email with AI assistance</purpose>
             <example>composeEmail({ prompt: "Follow-up email", to: ["email@example.com"] })</example>
           </tool>
-  
+
           <tool name="${Tools.SendEmail}">
             <purpose>Send new email</purpose>
             <example>sendEmail({ to: [{ email: "user@example.com" }], subject: "Hello", message: "Body" })</example>
           </tool>
         </tools>
-  
+
         <workflow_examples>
           <example name="simple_search">
             <user>Find newsletters from last week</user>
@@ -363,7 +363,7 @@ export const AiChatPrompt = () =>
             <action>inboxRag({ query: "newsletters from last week" })</action>
             <response>Found 3 newsletters from last week.</response>
           </example>
-  
+
           <example name="organize_emails">
             <user>Label my investment emails as "Investments"</user>
             <thinking>
@@ -380,7 +380,7 @@ export const AiChatPrompt = () =>
             </action_sequence>
             <response>Labeled 5 investment emails with "Investments".</response>
           </example>
-  
+
           <example name="bulk_cleanup">
             <user>Delete all promotional emails from cal.com</user>
             <thinking>
@@ -396,7 +396,7 @@ export const AiChatPrompt = () =>
             <response>Deleted 12 promotional emails from cal.com.</response>
           </example>
         </workflow_examples>
-  
+
         <safety_rules>
           <rule>Confirm before deleting any emails</rule>
           <rule>Confirm before affecting more than 5 threads</rule>
@@ -404,7 +404,7 @@ export const AiChatPrompt = () =>
           <rule>Check label existence before creating duplicates</rule>
           <rule>Use appropriate tools for each task</rule>
         </safety_rules>
-  
+
         <response_guidelines>
           <formatting>Plain text only - no markdown, bullets, or special characters</formatting>
           <tone>Professional and direct - skip "Here's what I found" phrases</tone>
@@ -412,7 +412,7 @@ export const AiChatPrompt = () =>
           <action>Take action when requested - don't just describe what you could do</action>
           <transparency>Never reveal tool outputs or internal reasoning</transparency>
         </response_guidelines>
-  
+
         <common_use_cases>
           <case name="search">When user asks to find emails, use inboxRag with descriptive query</case>
           <case name="organize">Search → check labels → create if needed → apply labels</case>
@@ -424,7 +424,7 @@ export const AiChatPrompt = () =>
           <case name="unread">Direct to on-screen filters</case>
           <case name="support">Direct to live chat button</case>
         </common_use_cases>
-  
+
         <self_check>
           Before sending each response:
           1. Does it follow the success criteria?
diff --git a/apps/server/src/lib/prompts.ts b/apps/server/src/lib/prompts.ts
index 182a3ce071..4540698ddb 100644
--- a/apps/server/src/lib/prompts.ts
+++ b/apps/server/src/lib/prompts.ts
@@ -328,7 +328,7 @@ export const AiChatPrompt = () =>
   dedent`
     <system_prompt>
       <role>
-        You are Fred, an intelligent email management assistant integrated with Gmail operations.
+        You are Zero, an intelligent email management assistant integrated with Gmail operations.
         Your mission: help users navigate and understand their inbox with complete knowledge of what's happening. You provide context, insights, and smart organization - not to achieve inbox zero, but to give users full awareness and control over their email landscape.
       </role>
 

From e907b9420bf47e93331f043168583a4dc8ee5972 Mon Sep 17 00:00:00 2001
From: Ahmet Kilinc <akx9@icloud.com>
Date: Mon, 25 Aug 2025 15:39:54 +0100
Subject: [PATCH 3/5] remove

---
 apps/mail/components/create/ai-chat.tsx | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/apps/mail/components/create/ai-chat.tsx b/apps/mail/components/create/ai-chat.tsx
index d8ecc8e78d..81c01d0f56 100644
--- a/apps/mail/components/create/ai-chat.tsx
+++ b/apps/mail/components/create/ai-chat.tsx
@@ -146,9 +146,7 @@ export interface AIChatProps {
   setMessages: (messages: AiMessage[]) => void;
 }
 
-// Subcomponents for ToolResponse
 const GetThreadToolResponse = ({ result, args }: { result: any; args: any }) => {
-  // Extract threadId from result or args
   let threadId: string | null = null;
   if (typeof result === 'string') {
     const match = result.match(/<thread id="([^"]+)" ?\/>/);
@@ -181,7 +179,6 @@ const ComposeEmailToolResponse = ({ result }: { result: any }) => {
   );
 };
 
-// Main ToolResponse switcher
 const ToolResponse = ({ toolName, result, args }: { toolName: string; result: any; args: any }) => {
   switch (toolName) {
     case Tools.GetThread:
@@ -223,15 +220,12 @@ export function AIChat({
     }
   }, [status, scrollToBottom]);
 
-  // Track if we're waiting for a voice response
   const [isVoiceQuery, setIsVoiceQuery] = useState(false);
 
-  // When a new assistant message comes in, check if it's a voice response
   useEffect(() => {
     if (isVoiceQuery && messages.length > 0) {
       const lastMessage = messages[messages.length - 1];
       if (lastMessage.role === 'assistant') {
-        // Extract text content from the message
         const textContent = lastMessage.parts
           .filter((part) => part.type === 'text' && 'text' in part)
           .map((part) => (part as any).text)
@@ -307,7 +301,6 @@ export function AIChat({
                 Ask to do or show anything using natural language
               </p>
 
-              {/* Example Thread */}
               <ExampleQueries onQueryClick={handleQueryClick} />
             </div>
           ) : (
@@ -395,7 +388,6 @@ export function AIChat({
         </div>
       </div>
 
-      {/* Fixed input at bottom */}
       <div className={cn('mb-4 shrink-0 px-4', isFullScreen ? 'px-0' : '')}>
         <div className="bg-offsetLight relative rounded-lg p-2 dark:bg-[#202020]">
           <div className="flex flex-col">
@@ -417,15 +409,12 @@ export function AIChat({
               <div className="flex justify-end gap-1">
                 <VoiceProvider
                   onTranscriptComplete={(transcript) => {
-                    // Mark this as a voice query
                     setIsVoiceQuery(true);
-                    // Set the transcript as input and submit the form
                     editor.commands.setContent(transcript);
                     setInput(transcript);
                     onSubmit({ preventDefault: () => {} } as React.FormEvent<HTMLFormElement>);
                   }}
                   onResponseReady={(callback) => {
-                    // Store the callback to be called when we get the AI response
                     voiceResponseCallbackRef.current = callback;
                   }}
                 >

From ad67464a2ca78da467e379c5b51a2d73cd4a3b4e Mon Sep 17 00:00:00 2001
From: Adam Abu Ghaida <44762129+AdamGhaida@users.noreply.github.com>
Date: Thu, 28 Aug 2025 17:18:32 +0300
Subject: [PATCH 4/5] Add comprehensive static eval cases for Zero Agent

---
 apps/server/evals/README.md             | 243 +++++++++++++
 apps/server/evals/ai-chat-basic.eval.ts | 131 +++++++
 apps/server/evals/ai-tool-usage.eval.ts | 454 ++++++++++++++++++++++++
 apps/server/evals/run-evals.sh          | 160 +++++++++
 evals/generate-detailed-report.sh       |   1 +
 5 files changed, 989 insertions(+)
 create mode 100644 apps/server/evals/README.md
 create mode 100644 apps/server/evals/ai-tool-usage.eval.ts
 create mode 100755 apps/server/evals/run-evals.sh
 create mode 100644 evals/generate-detailed-report.sh

diff --git a/apps/server/evals/README.md b/apps/server/evals/README.md
new file mode 100644
index 0000000000..1f8cc35329
--- /dev/null
+++ b/apps/server/evals/README.md
@@ -0,0 +1,243 @@
+# Email Assistant Evaluation Suite
+
+This directory contains comprehensive evaluation tests for the AI Email Assistant using [Evalite](https://github.com/mattpocock/evalite).
+
+## Overview
+
+The evaluation suite tests the AI assistant's capabilities across multiple dimensions:
+
+- **Basic Functionality**: Greetings, help requests, capability inquiries
+- **Search & Retrieval**: Email search, filtering, and retrieval operations
+- **Label Management**: Creating, modifying, and organizing email labels
+- **Bulk Operations**: Archive, delete, mark read/unread operations
+- **Email Composition**: Writing, replying, and drafting emails
+- **Gmail Search**: Natural language to Gmail search query conversion
+- **Web Search**: External information retrieval
+- **Summarization**: Email and thread summarization
+- **Organization**: Workflow automation and email organization
+
+## Files
+
+### `ai-chat-basic.eval.ts`
+Comprehensive evaluation of the AI chat assistant covering:
+- Static test cases for reliable, consistent testing
+- Dynamic test case generation for varied scenarios
+- Multiple scoring metrics (Factuality, EmbeddingSimilarity)
+- Categorized test cases by difficulty and functionality
+
+### `ai-tool-usage.eval.ts`
+Focused evaluation of tool usage and response quality:
+- Tool-specific test cases with expected behaviors
+- Edge case testing and error handling
+- Professional communication scenarios
+- Complex workflow testing
+
+## Running the Evals
+
+### Prerequisites
+
+1. **OpenAI API Key**: Set the `OPENAI_API_KEY` environment variable
+2. **Dependencies**: Ensure all packages are installed (`pnpm install`)
+3. **Working Directory**: Run the commands below from the `apps/server` directory
+
+### Commands
+
+```bash
+# Run all evals once
+pnpm eval
+
+# Run evals in watch mode (re-runs when files change)
+pnpm eval:dev
+
+# Run specific eval file
+pnpm eval -- --run evals/ai-chat-basic.eval.ts
+pnpm eval -- --run evals/ai-tool-usage.eval.ts
+```
+
+### Environment Setup
+
+```bash
+# Set OpenAI API key
+export OPENAI_API_KEY="your-api-key-here"
+
+# Or add it to a .env file in apps/server/ (>> appends, so an existing .env is not overwritten)
+echo "OPENAI_API_KEY=your-api-key-here" >> .env
+```
+
+## Test Case Structure
+
+### Static Test Cases
+Reliable, consistent test cases that don't change between runs:
+
+```typescript
+{
+  input: "Show me my unread emails",
+  expected: "getThread",
+  category: "search",
+  difficulty: "easy",
+  description: "Simple unread email request"
+}
+```
+
+### Dynamic Test Cases
+AI-generated test cases for varied scenarios:
+
+```typescript
+{
+  input: string,           // User request
+  expected: string,        // Expected tool or behavior
+  category: string,        // Test category
+  difficulty: 'easy' | 'medium' | 'hard',
+  description: string      // What this test validates
+}
+```
+
+## Scoring Metrics
+
+### Factuality
+Measures how factually accurate the AI's responses are compared to expected outputs.
+
+### EmbeddingSimilarity
+Uses semantic similarity to evaluate how well the AI's response matches the expected behavior.
+
+### Levenshtein
+Edit-distance string similarity, useful when the response should closely match an expected text (when applicable).
+
+## Test Categories
+
+### Basic Functionality
+- Greetings and help requests
+- Capability inquiries
+- User intent recognition
+
+### Search & Retrieval
+- Email search with filters
+- Date-based queries
+- Multi-criteria searches
+
+### Label Management
+- Label creation and modification
+- Label organization
+- Label application
+
+### Bulk Operations
+- Archive operations
+- Delete operations
+- Mark read/unread operations
+
+### Email Composition
+- Professional emails
+- Personal communication
+- Context-aware composition
+
+### Gmail Search
+- Natural language conversion
+- Search operator usage
+- Complex query building
+
+### Web Search
+- External information retrieval
+- Current events
+- Fact checking
+
+### Summarization
+- Email summarization
+- Thread summarization
+- Content extraction
+
+### Organization
+- Workflow automation
+- Email organization
+- Priority management
+
+## Difficulty Levels
+
+### Easy
+- Simple, single-action requests
+- Basic tool usage
+- Clear, unambiguous inputs
+
+### Medium
+- Multi-step operations
+- Combined tool usage
+- Moderate complexity
+
+### Hard
+- Complex workflows
+- Edge cases
+- Error handling scenarios
+
+## Customization
+
+### Adding New Test Cases
+
+1. **Static Cases**: Add to the appropriate `STATIC_TEST_CASES` array
+2. **Dynamic Cases**: Modify the test case builder functions
+3. **New Categories**: Update the category filtering and add new evalite blocks
+
+### Modifying Scoring
+
+Adjust the scorers array in each evalite block:
+
+```typescript
+scorers: [Factuality, EmbeddingSimilarity, Levenshtein]
+```
+
+### Adding New Prompts
+
+Import new system prompts and create corresponding evalite blocks:
+
+```typescript
+import { NewPrompt } from "../src/lib/prompts";
+
+evalite("New Prompt Evaluation", {
+  data: makeTestCaseBuilder("new functionality"),
+  task: async (input) => {
+    return safeStreamText({
+      model: model,
+      system: NewPrompt(),
+      prompt: input,
+    });
+  },
+  scorers: [Factuality, EmbeddingSimilarity],
+});
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **SQLite Binding Errors**: Run `pnpm rebuild better-sqlite3`
+2. **Missing API Key**: Ensure `OPENAI_API_KEY` is set
+3. **Import Errors**: Check that all prompt imports are correct
+
+### Performance Tips
+
+1. **Use Static Cases**: For consistent, reliable testing
+2. **Limit Dynamic Cases**: Balance coverage with execution time
+3. **Watch Mode**: Use `pnpm eval:dev` for development iterations
+
+## Expected Scores
+
+- **Easy Cases**: 70-90% (basic functionality should work well)
+- **Medium Cases**: 50-80% (moderate complexity may have variations)
+- **Hard Cases**: 30-70% (complex scenarios may be challenging)
+
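+As a rule of thumb, an overall score above 70% indicates excellent performance, while a score below the expected range for a case's difficulty level (for example, below 50% on medium cases) suggests an area for improvement.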
+
+## Contributing
+
+When adding new test cases:
+
+1. **Follow the existing structure** and naming conventions
+2. **Add comprehensive descriptions** for each test case
+3. **Use appropriate difficulty levels** based on complexity
+4. **Test edge cases** and error scenarios
+5. **Document any new categories** or scoring methods
+
+## Resources
+
+- [Evalite Documentation](https://www.evalite.dev/)
+- [Autoevals Library](https://github.com/braintrustdata/autoevals)
+- [AI SDK Documentation](https://sdk.vercel.ai/)
+- [OpenAI API Reference](https://platform.openai.com/docs/api-reference)
diff --git a/apps/server/evals/ai-chat-basic.eval.ts b/apps/server/evals/ai-chat-basic.eval.ts
index 140cfa926f..3f9f17f365 100644
--- a/apps/server/evals/ai-chat-basic.eval.ts
+++ b/apps/server/evals/ai-chat-basic.eval.ts
@@ -44,6 +44,137 @@ const safeStreamText = async (config: Parameters<typeof streamText>[0]) => {
 
 type TestCase = { input: string; expected: string };
 
+// Comprehensive static test cases for reliable, consistent testing
+const STATIC_TEST_CASES: TestCase[] = [
+  // Basic functionality tests
+  {
+    input: "Hello, what can you help me with?",
+    expected: "greeting"
+  },
+  {
+    input: "Show me my unread emails",
+    expected: "getThread"
+  },
+  {
+    input: "Find emails from john@example.com",
+    expected: "from:"
+  },
+  {
+    input: "Create a label called 'Important'",
+    expected: "createLabel"
+  },
+  {
+    input: "Archive all emails older than 30 days",
+    expected: "bulkArchive"
+  },
+  {
+    input: "Write a thank you email to sarah@company.com",
+    expected: "composeEmail"
+  },
+  {
+    input: "What's the weather like today?",
+    expected: "webSearch"
+  },
+  {
+    input: "Summarize my inbox",
+    expected: "inboxRag"
+  },
+  {
+    input: "Mark all emails from newsletters as read",
+    expected: "markThreadsRead"
+  },
+  {
+    input: "Delete all spam emails",
+    expected: "bulkDelete"
+  },
+  {
+    input: "Find emails with attachments from last week",
+    expected: "has:attachment"
+  },
+  {
+    input: "Organize my emails by priority",
+    expected: "modifyLabels"
+  },
+  {
+    input: "What emails do I have scheduled for tomorrow?",
+    expected: "getThread"
+  },
+  {
+    input: "Send a follow-up email to the meeting request",
+    expected: "composeEmail"
+  },
+  {
+    input: "Find all receipts from Amazon",
+    expected: "from:amazon"
+  },
+  // Additional comprehensive test cases
+  {
+    input: "Show me emails with large attachments (>5MB)",
+    expected: "larger:5M"
+  },
+  {
+    input: "Find emails sent between Monday and Friday last week",
+    expected: "after:2025/08/18"
+  },
+  {
+    input: "Create a nested label structure: Work > Projects > Beta",
+    expected: "createLabel"
+  },
+  {
+    input: "Rename the 'Old' label to 'Archived'",
+    expected: "modifyLabels"
+  },
+  {
+    input: "Apply 'Urgent' label to all emails from the CEO",
+    expected: "modifyLabels"
+  },
+  {
+    input: "Forward all emails from 'Support' to my manager",
+    expected: "composeEmail"
+  },
+  {
+    input: "Set up automatic archiving for emails older than 90 days",
+    expected: "bulkArchive"
+  },
+  {
+    input: "Find emails that are both important and starred",
+    expected: "is:important"
+  },
+  {
+    input: "Create email templates for common responses",
+    expected: "composeEmail"
+  },
+  {
+    input: "Analyze my email patterns and suggest improvements",
+    expected: "inboxRag"
+  },
+  {
+    input: "Set up email encryption for sensitive communications",
+    expected: "composeEmail"
+  },
+  {
+    input: "Create a backup of all my emails",
+    expected: "bulkArchive"
+  },
+  {
+    input: "Find emails with multiple recipients (more than 10 people)",
+    expected: "to:"
+  },
+  {
+    input: "Set up email forwarding rules for specific senders",
+    expected: "modifyLabels"
+  },
+  {
+    input: "Create a knowledge base from FAQ emails",
+    expected: "inboxRag"
+  }
+];
+
+// Helper function to convert static test cases to the format expected by evalite
+const makeStaticTestCaseProvider = (testCases: TestCase[]) => {
+  return async () => testCases;
+};
+
 const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
   return async () => {
     const { object } = await generateObject({
diff --git a/apps/server/evals/ai-tool-usage.eval.ts b/apps/server/evals/ai-tool-usage.eval.ts
new file mode 100644
index 0000000000..dfcd671466
--- /dev/null
+++ b/apps/server/evals/ai-tool-usage.eval.ts
@@ -0,0 +1,454 @@
+import { evalite } from "evalite";
+import { openai } from "@ai-sdk/openai";
+import { streamText } from "ai";
+import { traceAISDKModel } from "evalite/ai-sdk";
+import { Factuality, EmbeddingSimilarity, ExactMatch } from "autoevals";
+import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts";
+import { generateObject } from "ai";
+import { z } from "zod";
+
+// base model (untraced) for internal helpers to avoid trace errors
+const baseModel = openai("gpt-4o-mini");
+
+// traced model for the actual task under test
+const model = traceAISDKModel(baseModel);
+
+const safeStreamText = async (config: Parameters<typeof streamText>[0]) => {
+    try {
+        const res = await streamText(config);
+        return res.textStream;
+    } catch (err) {
+        console.error("LLM call failed", err);
+        return "ERROR";
+    }
+};
+
+// Test case type for tool usage evaluation
+type ToolTestCase = {
+    input: string;
+    expectedTool: string;
+    expectedBehavior: string;
+    category: string;
+    difficulty: 'easy' | 'medium' | 'hard';
+    description: string;
+};
+
+// Static test cases for tool usage evaluation
+const TOOL_USAGE_TEST_CASES: ToolTestCase[] = [
+    // Basic tool usage tests
+    {
+        input: "Show me my unread emails from the last 3 days",
+        expectedTool: "getThread",
+        expectedBehavior: "should call getThread with appropriate filters for unread emails and date range",
+        category: "search_retrieval",
+        difficulty: "easy",
+        description: "Basic search with date filter"
+    },
+    {
+        input: "Create a new label called 'Urgent' with red color",
+        expectedTool: "createLabel",
+        expectedBehavior: "should call createLabel with name 'Urgent' and red color",
+        category: "label_management",
+        difficulty: "easy",
+        description: "Label creation with color specification"
+    },
+    {
+        input: "Archive all emails older than 60 days that are not starred",
+        expectedTool: "bulkArchive",
+        expectedBehavior: "should call bulkArchive with filters for date and star status",
+        category: "bulk_operations",
+        difficulty: "medium",
+        description: "Complex bulk operation with multiple filters"
+    },
+    {
+        input: "Find emails from john@company.com with attachments sent this month",
+        expectedTool: "getThread",
+        expectedBehavior: "should call getThread with filters for sender, attachments, and date",
+        category: "search_retrieval",
+        difficulty: "medium",
+        description: "Multi-criteria search"
+    },
+    {
+        input: "Mark all emails from newsletters as read and apply 'Newsletter' label",
+        expectedTool: "modifyLabels",
+        expectedBehavior: "should call modifyLabels to apply 'Newsletter' label and mark as read",
+        category: "label_management",
+        difficulty: "medium",
+        description: "Combined label and status modification"
+    },
+    {
+        input: "Delete all spam emails and empty the trash",
+        expectedTool: "bulkDelete",
+        expectedBehavior: "should call bulkDelete for spam emails and handle trash cleanup",
+        category: "bulk_operations",
+        difficulty: "hard",
+        description: "Complex cleanup operation"
+    },
+    {
+        input: "Summarize the conversation thread about project Alpha",
+        expectedTool: "getThreadSummary",
+        expectedBehavior: "should call getThreadSummary for the specified thread",
+        category: "summarization",
+        difficulty: "medium",
+        description: "Thread summarization request"
+    },
+    {
+        input: "What's the current weather in San Francisco?",
+        expectedTool: "webSearch",
+        expectedBehavior: "should call webSearch for current weather information",
+        category: "web_search",
+        difficulty: "easy",
+        description: "Web search request"
+    },
+    {
+        input: "Compose a professional follow-up email to the meeting request from Sarah",
+        expectedTool: "composeEmail",
+        expectedBehavior: "should call composeEmail with professional tone and context awareness",
+        category: "email_composition",
+        difficulty: "medium",
+        description: "Context-aware email composition"
+    },
+    {
+        input: "Organize my inbox by creating priority levels: High, Medium, Low",
+        expectedTool: "createLabel",
+        expectedBehavior: "should call createLabel multiple times to create priority label hierarchy",
+        category: "organization",
+        difficulty: "hard",
+        description: "Complex organizational structure creation"
+    }
+];
+
+// Gmail search specific test cases
+const GMAIL_SEARCH_TEST_CASES: ToolTestCase[] = [
+    {
+        input: "Find emails from my boss that are unread and have attachments",
+        expectedTool: "from:",
+        expectedBehavior: "should generate search query with from:, is:unread, and has:attachment",
+        category: "gmail_search",
+        difficulty: "medium",
+        description: "Complex Gmail search with multiple operators"
+    },
+    {
+        input: "Show me emails from last week that contain the word 'invoice'",
+        expectedTool: "after:",
+        expectedBehavior: "should generate search query with date filter and text search",
+        category: "gmail_search",
+        difficulty: "easy",
+        description: "Date-based text search"
+    },
+    {
+        input: "Find emails from Gmail that are starred and in the 'Work' folder",
+        expectedTool: "is:starred",
+        expectedBehavior: "should generate search query combining star status and label",
+        category: "gmail_search",
+        difficulty: "medium",
+        description: "Status and label combination search"
+    }
+];
+
+// Email composition test cases
+const EMAIL_COMPOSITION_TEST_CASES: ToolTestCase[] = [
+    {
+        input: "Write a thank you email to the interviewer after my job interview",
+        expectedTool: "composeEmail",
+        expectedBehavior: "should compose professional thank you email with appropriate tone",
+        category: "email_composition",
+        difficulty: "medium",
+        description: "Professional thank you email"
+    },
+    {
+        input: "Draft a meeting cancellation email to the team",
+        expectedTool: "composeEmail",
+        expectedBehavior: "should compose clear cancellation notice with appropriate details",
+        category: "email_composition",
+        difficulty: "easy",
+        description: "Meeting cancellation notice"
+    },
+    {
+        input: "Create an apology email for missing the deadline",
+        expectedTool: "composeEmail",
+        expectedBehavior: "should compose sincere apology with explanation and next steps",
+        category: "email_composition",
+        difficulty: "medium",
+        description: "Apology email with accountability"
+    }
+];
+
+// Static edge case test cases (no dynamic generation to avoid EmbeddingSimilarity errors)
+const EDGE_CASE_TEST_CASES: ToolTestCase[] = [
+    {
+        input: "Archive emails from 10 years ago",
+        expectedTool: "bulkArchive",
+        expectedBehavior: "should handle very old date gracefully and suggest reasonable date range",
+        category: "boundary_conditions",
+        difficulty: "medium",
+        description: "Very old date handling"
+    },
+    {
+        input: "Create a label with special characters: !@#$%^&*()",
+        expectedTool: "createLabel",
+        expectedBehavior: "should sanitize special characters and create valid label name",
+        category: "invalid_inputs",
+        difficulty: "easy",
+        description: "Special character handling in label names"
+    },
+    {
+        input: "Delete all emails from the year 9999",
+        expectedTool: "bulkDelete",
+        expectedBehavior: "should reject invalid future date and suggest current date range",
+        category: "invalid_inputs",
+        difficulty: "easy",
+        description: "Invalid future date handling"
+    },
+    {
+        input: "Find emails with empty sender address",
+        expectedTool: "getThread",
+        expectedBehavior: "should handle empty sender gracefully and provide helpful error message",
+        category: "edge_cases",
+        difficulty: "medium",
+        description: "Empty sender address handling"
+    },
+    {
+        input: "Create a label with 1000 character name",
+        expectedTool: "createLabel",
+        expectedBehavior: "should truncate or reject overly long label names",
+        category: "boundary_conditions",
+        difficulty: "medium",
+        description: "Very long label name handling"
+    },
+    {
+        input: "Archive emails from sender: ''",
+        expectedTool: "bulkArchive",
+        expectedBehavior: "should reject empty sender and ask for valid sender information",
+        category: "error_handling",
+        difficulty: "easy",
+        description: "Empty sender validation"
+    }
+];
+
+// Advanced features test cases
+const ADVANCED_FEATURES_TEST_CASES: ToolTestCase[] = [
+    {
+        input: "Create a workflow to automatically label emails from 'noreply@' as 'Spam'",
+        expectedTool: "createWorkflow",
+        expectedBehavior: "should call createWorkflow to create a new workflow that labels 'noreply@' emails as 'Spam'",
+        category: "automation",
+        difficulty: "medium",
+        description: "Workflow creation for automated labeling"
+    },
+    {
+        input: "Integrate with a calendar application to automatically book meetings",
+        expectedTool: "integrateCalendar",
+        expectedBehavior: "should call integrateCalendar to establish a connection with a calendar app",
+        category: "integrations",
+        difficulty: "hard",
+        description: "Calendar integration for automatic booking"
+    },
+    {
+        input: "Encrypt all emails in the 'Work' folder using PGP",
+        expectedTool: "encryptEmails",
+        expectedBehavior: "should call encryptEmails to apply PGP encryption to all emails in the 'Work' folder",
+        category: "security",
+        difficulty: "hard",
+        description: "PGP encryption for email security"
+    },
+    {
+        input: "Create a template for 'Project Report' that includes a pre-filled table of contents",
+        expectedTool: "createTemplate",
+        expectedBehavior: "should call createTemplate to create a new template with pre-filled content",
+        category: "templates",
+        difficulty: "easy",
+        description: "Template creation with pre-filled content"
+    },
+    {
+        input: "Backup all emails to a cloud storage service",
+        expectedTool: "backupEmails",
+        expectedBehavior: "should call backupEmails to initiate an email backup process",
+        category: "backup",
+        difficulty: "hard",
+        description: "Email backup to cloud storage"
+    },
+    {
+        input: "Analyze email patterns to identify common issues and suggest improvements",
+        expectedTool: "analyzeEmails",
+        expectedBehavior: "should call analyzeEmails to generate a report on email usage patterns",
+        category: "analytics",
+        difficulty: "medium",
+        description: "Email analytics for performance monitoring"
+    }
+];
+
+// Helper function to convert test cases to evalite format
+const makeTestCaseProvider = (testCases: ToolTestCase[]) => {
+    return async () => testCases.map(tc => ({
+        input: tc.input,
+        expected: `${tc.expectedTool}: ${tc.expectedBehavior}`
+    }));
+};
+
+// Tool usage evaluation tests
+evalite("Tool Usage - Search & Retrieval", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'search_retrieval')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Label Management", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'label_management')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Bulk Operations", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'bulk_operations')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Summarization", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'summarization')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Web Search", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'web_search')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Email Composition", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'email_composition')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: StyledEmailAssistantSystemPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Tool Usage - Organization", {
+    data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'organization')),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+// Gmail search specific evaluation
+evalite("Gmail Search - Complex Queries", {
+    data: makeTestCaseProvider(GMAIL_SEARCH_TEST_CASES),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: GmailSearchAssistantSystemPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+// Email composition specific evaluation
+evalite("Email Composition - Professional Communication", {
+    data: makeTestCaseProvider(EMAIL_COMPOSITION_TEST_CASES),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: StyledEmailAssistantSystemPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+// Edge cases evaluation using static test cases only
+evalite("Tool Usage - Edge Cases & Error Handling", {
+    data: makeTestCaseProvider(EDGE_CASE_TEST_CASES),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+// Advanced features evaluation
+evalite("Advanced Features - Automation & Workflows", {
+    data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc =>
+        ['automation', 'workflow'].includes(tc.category)
+    )),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Advanced Features - Integrations & Security", {
+    data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc =>
+        ['integrations', 'security'].includes(tc.category)
+    )),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
+
+evalite("Advanced Features - Templates & Knowledge Management", {
+    data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc =>
+        ['templates', 'knowledge_management', 'analytics', 'backup'].includes(tc.category)
+    )),
+    task: async (input) => {
+        return safeStreamText({
+            model: model,
+            system: AiChatPrompt(),
+            prompt: input,
+        });
+    },
+    scorers: [Factuality, EmbeddingSimilarity],
+});
diff --git a/apps/server/evals/run-evals.sh b/apps/server/evals/run-evals.sh
new file mode 100755
index 0000000000..2fe292a515
--- /dev/null
+++ b/apps/server/evals/run-evals.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# Email Assistant Evaluation Runner
+# This script helps run specific evals with better output formatting
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Function to print colored output
+print_status() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2  # errors go to stderr, not stdout
+}
+
+# Check if we're in the right directory
+if [[ ! -f "package.json" ]] || [[ ! -d "evals" ]]; then
+    print_error "Please run this script from the apps/server directory"
+    exit 1
+fi
+
+# Ensure OPENAI_API_KEY is available, falling back to .dev.vars (exported for child processes)
+if [[ -z "$OPENAI_API_KEY" ]]; then
+    print_warning "OPENAI_API_KEY not set. Checking .dev.vars file..."
+    if [[ -f ".dev.vars" ]]; then
+        set -a; source .dev.vars; set +a  # auto-export so pnpm/evalite inherit the vars
+        print_success "Loaded environment variables from .dev.vars"
+    else
+        print_error "OPENAI_API_KEY not set and .dev.vars not found"
+        print_status "Please set OPENAI_API_KEY environment variable or create .dev.vars file"
+        exit 1
+    fi
+fi
+
+# Run a single eval file ($1 = path to a *.eval.ts file); returns 1 if the eval fails
+run_eval() {
+    local eval_file=$1 eval_name
+    eval_name=$(basename "$eval_file" .eval.ts)  # assigned separately so a failure is not masked by `local`
+
+    print_status "Running eval: $eval_name"
+    echo "=================================================="
+
+    # Use the correct evalite syntax with run-once command
+    if pnpm eval run-once "$eval_file"; then
+        print_success "Eval completed successfully: $eval_name"
+    else
+        print_error "Eval failed: $eval_name"
+        return 1
+    fi
+
+    echo "=================================================="
+    echo
+}
+
+# Function to run all evals
+run_all_evals() {
+    print_status "Running all evals..."
+    echo "=================================================="
+    
+    if pnpm eval run-once; then
+        print_success "All evals completed successfully"
+    else
+        print_error "Some evals failed"
+        return 1
+    fi
+    
+    echo "=================================================="
+}
+
+# Function to run evals in watch mode
+run_watch_mode() {
+    print_status "Starting evals in watch mode..."
+    print_status "Press Ctrl+C to stop"
+    
+    pnpm eval watch
+}
+
+# Function to show available evals
+show_available_evals() {
+    print_status "Available eval files:"
+    echo
+    for file in evals/*.eval.ts; do
+        if [[ -f "$file" ]]; then
+            local name=$(basename "$file" .eval.ts)
+            echo "  • $name"
+        fi
+    done
+    echo
+}
+
+# Function to show help
+show_help() {
+    echo "Email Assistant Evaluation Runner"
+    echo
+    echo "Usage: $0 [OPTION] [EVAL_FILE]"
+    echo
+    echo "Options:"
+    echo "  -a, --all           Run all evals"
+    echo "  -w, --watch         Run evals in watch mode"
+    echo "  -l, --list          List available eval files"
+    echo "  -h, --help          Show this help message"
+    echo
+    echo "Examples:"
+    echo "  $0                                    # Run all evals"
+    echo "  $0 -a                                # Run all evals"
+    echo "  $0 -w                                # Run in watch mode"
+    echo "  $0 evals/ai-chat-basic.eval.ts       # Run specific eval"
+    echo "  $0 -l                                # List available evals"
+    echo
+}
+
+# Main script logic
+case "${1:-}" in
+    -h|--help)
+        show_help
+        exit 0
+        ;;
+    -l|--list)
+        show_available_evals
+        exit 0
+        ;;
+    -a|--all)
+        run_all_evals
+        ;;
+    -w|--watch)
+        run_watch_mode
+        ;;
+    "")
+        print_status "No arguments provided, running all evals..."
+        run_all_evals
+        ;;
+    *)
+        # Check if the file exists
+        if [[ -f "$1" ]]; then
+            run_eval "$1"
+        else
+            print_error "Eval file not found: $1"
+            print_status "Use -l or --list to see available evals"
+            exit 1
+        fi
+        ;;
+esac
+
+print_success "Evaluation run completed!"
diff --git a/evals/generate-detailed-report.sh b/evals/generate-detailed-report.sh
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/evals/generate-detailed-report.sh
@@ -0,0 +1 @@
+

From df7ea71d6caa341b377ce91c9222e6d9b2a574f1 Mon Sep 17 00:00:00 2001
From: Adam Abu Ghaida <44762129+AdamGhaida@users.noreply.github.com>
Date: Thu, 28 Aug 2025 17:18:40 +0300
Subject: [PATCH 5/5] fix lints

---
 apps/server/evals/ai-chat-basic.eval.ts | 131 ------------------------
 apps/server/evals/ai-tool-usage.eval.ts |   4 +-
 2 files changed, 1 insertion(+), 134 deletions(-)

diff --git a/apps/server/evals/ai-chat-basic.eval.ts b/apps/server/evals/ai-chat-basic.eval.ts
index 3f9f17f365..140cfa926f 100644
--- a/apps/server/evals/ai-chat-basic.eval.ts
+++ b/apps/server/evals/ai-chat-basic.eval.ts
@@ -44,137 +44,6 @@ const safeStreamText = async (config: Parameters<typeof streamText>[0]) => {
 
 type TestCase = { input: string; expected: string };
 
-// Comprehensive static test cases for reliable, consistent testing
-const STATIC_TEST_CASES: TestCase[] = [
-  // Basic functionality tests
-  {
-    input: "Hello, what can you help me with?",
-    expected: "greeting"
-  },
-  {
-    input: "Show me my unread emails",
-    expected: "getThread"
-  },
-  {
-    input: "Find emails from john@example.com",
-    expected: "from:"
-  },
-  {
-    input: "Create a label called 'Important'",
-    expected: "createLabel"
-  },
-  {
-    input: "Archive all emails older than 30 days",
-    expected: "bulkArchive"
-  },
-  {
-    input: "Write a thank you email to sarah@company.com",
-    expected: "composeEmail"
-  },
-  {
-    input: "What's the weather like today?",
-    expected: "webSearch"
-  },
-  {
-    input: "Summarize my inbox",
-    expected: "inboxRag"
-  },
-  {
-    input: "Mark all emails from newsletters as read",
-    expected: "markThreadsRead"
-  },
-  {
-    input: "Delete all spam emails",
-    expected: "bulkDelete"
-  },
-  {
-    input: "Find emails with attachments from last week",
-    expected: "has:attachment"
-  },
-  {
-    input: "Organize my emails by priority",
-    expected: "modifyLabels"
-  },
-  {
-    input: "What emails do I have scheduled for tomorrow?",
-    expected: "getThread"
-  },
-  {
-    input: "Send a follow-up email to the meeting request",
-    expected: "composeEmail"
-  },
-  {
-    input: "Find all receipts from Amazon",
-    expected: "from:amazon"
-  },
-  // Additional comprehensive test cases
-  {
-    input: "Show me emails with large attachments (>5MB)",
-    expected: "larger:5M"
-  },
-  {
-    input: "Find emails sent between Monday and Friday last week",
-    expected: "after:2025/08/18"
-  },
-  {
-    input: "Create a nested label structure: Work > Projects > Beta",
-    expected: "createLabel"
-  },
-  {
-    input: "Rename the 'Old' label to 'Archived'",
-    expected: "modifyLabels"
-  },
-  {
-    input: "Apply 'Urgent' label to all emails from the CEO",
-    expected: "modifyLabels"
-  },
-  {
-    input: "Forward all emails from 'Support' to my manager",
-    expected: "composeEmail"
-  },
-  {
-    input: "Set up automatic archiving for emails older than 90 days",
-    expected: "bulkArchive"
-  },
-  {
-    input: "Find emails that are both important and starred",
-    expected: "is:important"
-  },
-  {
-    input: "Create email templates for common responses",
-    expected: "composeEmail"
-  },
-  {
-    input: "Analyze my email patterns and suggest improvements",
-    expected: "inboxRag"
-  },
-  {
-    input: "Set up email encryption for sensitive communications",
-    expected: "composeEmail"
-  },
-  {
-    input: "Create a backup of all my emails",
-    expected: "bulkArchive"
-  },
-  {
-    input: "Find emails with multiple recipients (more than 10 people)",
-    expected: "to:"
-  },
-  {
-    input: "Set up email forwarding rules for specific senders",
-    expected: "modifyLabels"
-  },
-  {
-    input: "Create a knowledge base from FAQ emails",
-    expected: "inboxRag"
-  }
-];
-
-// Helper function to convert static test cases to the format expected by evalite
-const makeStaticTestCaseProvider = (testCases: TestCase[]) => {
-  return async () => testCases;
-};
-
 const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
   return async () => {
     const { object } = await generateObject({
diff --git a/apps/server/evals/ai-tool-usage.eval.ts b/apps/server/evals/ai-tool-usage.eval.ts
index dfcd671466..e697199106 100644
--- a/apps/server/evals/ai-tool-usage.eval.ts
+++ b/apps/server/evals/ai-tool-usage.eval.ts
@@ -2,10 +2,8 @@ import { evalite } from "evalite";
 import { openai } from "@ai-sdk/openai";
 import { streamText } from "ai";
 import { traceAISDKModel } from "evalite/ai-sdk";
-import { Factuality, EmbeddingSimilarity, ExactMatch } from "autoevals";
+import { Factuality, EmbeddingSimilarity } from "autoevals";
 import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts";
-import { generateObject } from "ai";
-import { z } from "zod";
 
 // base model (untraced) for internal helpers to avoid trace errors
 const baseModel = openai("gpt-4o-mini");