From 6c4c0df7d65cfa79c5eec3fb8a4d7476b47d4aa5 Mon Sep 17 00:00:00 2001 From: Ahmet Kilinc Date: Wed, 13 Aug 2025 00:27:45 +0100 Subject: [PATCH 1/5] fix: ai chat bot fixes --- apps/mail/components/create/ai-chat.tsx | 51 ++- apps/mail/package.json | 1 + .../providers/voice-provider-elevenlabs.tsx | 183 +++++++++++ apps/mail/providers/voice-provider.tsx | 302 ++++++++++++------ apps/server/src/main.ts | 2 + apps/server/src/routes/voice.ts | 135 ++++++++ pnpm-lock.yaml | 3 + 7 files changed, 575 insertions(+), 102 deletions(-) create mode 100644 apps/mail/providers/voice-provider-elevenlabs.tsx create mode 100644 apps/server/src/routes/voice.ts diff --git a/apps/mail/components/create/ai-chat.tsx b/apps/mail/components/create/ai-chat.tsx index c1e637e57e..d8ecc8e78d 100644 --- a/apps/mail/components/create/ai-chat.tsx +++ b/apps/mail/components/create/ai-chat.tsx @@ -1,8 +1,8 @@ import { Avatar, AvatarFallback, AvatarImage } from '../ui/avatar'; import { useAIFullScreen, useAISidebar } from '../ui/ai-sidebar'; +import { useRef, useCallback, useEffect, useState } from 'react'; import { VoiceProvider } from '@/providers/voice-provider'; import useComposeEditor from '@/hooks/use-compose-editor'; -import { useRef, useCallback, useEffect } from 'react'; import type { useAgentChat } from 'agents/ai-react'; import { Markdown } from '@react-email/components'; import { useBilling } from '@/hooks/use-billing'; @@ -209,6 +209,7 @@ export function AIChat({ const [, setPricingDialog] = useQueryState('pricingDialog'); const [aiSidebarOpen] = useQueryState('aiSidebar'); const { toggleOpen } = useAISidebar(); + const voiceResponseCallbackRef = useRef<((response: string) => void) | null>(null); const scrollToBottom = useCallback(() => { if (messagesEndRef.current) { @@ -222,6 +223,28 @@ export function AIChat({ } }, [status, scrollToBottom]); + // Track if we're waiting for a voice response + const [isVoiceQuery, setIsVoiceQuery] = useState(false); + + // When a new assistant message comes in, check if it's a voice response + useEffect(() => { + if (isVoiceQuery && messages.length > 0) { + const lastMessage = messages[messages.length - 1]; + if (lastMessage.role === 'assistant') { + // Extract text content from the message + const textContent = lastMessage.parts + .filter((part) => part.type === 'text' && 'text' in part) + .map((part) => (part as any).text) + .join(' '); + + if (textContent && voiceResponseCallbackRef.current) { + voiceResponseCallbackRef.current(textContent); + setIsVoiceQuery(false); + } + } + } + }, [messages, isVoiceQuery]); + const editor = useComposeEditor({ placeholder: 'Ask Zero to do anything...', onLengthChange: () => setInput(editor.getText()), @@ -293,14 +316,19 @@ export function AIChat({ const toolParts = message.parts.filter((part) => part.type === 'tool-invocation'); return ( -
+
{toolParts.map( (part, index) => - part.toolInvocation?.result && ( + part.toolInvocation && + 'result' in part.toolInvocation && ( ), @@ -387,7 +415,20 @@ export function AIChat({
- + { + // Mark this as a voice query + setIsVoiceQuery(true); + // Set the transcript as input and submit the form + editor.commands.setContent(transcript); + setInput(transcript); + onSubmit({ preventDefault: () => {} } as React.FormEvent); + }} + onResponseReady={(callback) => { + // Store the callback to be called when we get the AI response + voiceResponseCallbackRef.current = callback; + }} + >
) : ( @@ -395,7 +388,6 @@ export function AIChat({
- {/* Fixed input at bottom */}
@@ -417,15 +409,12 @@ export function AIChat({
{ - // Mark this as a voice query setIsVoiceQuery(true); - // Set the transcript as input and submit the form editor.commands.setContent(transcript); setInput(transcript); onSubmit({ preventDefault: () => {} } as React.FormEvent); }} onResponseReady={(callback) => { - // Store the callback to be called when we get the AI response voiceResponseCallbackRef.current = callback; }} > From ad67464a2ca78da467e379c5b51a2d73cd4a3b4e Mon Sep 17 00:00:00 2001 From: Adam Abu Ghaida <44762129+AdamGhaida@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:18:32 +0300 Subject: [PATCH 4/5] Add comprehensive static eval cases for Zero Agent --- apps/server/evals/README.md | 243 +++++++++++++ apps/server/evals/ai-chat-basic.eval.ts | 131 +++++++ apps/server/evals/ai-tool-usage.eval.ts | 454 ++++++++++++++++++++++++ apps/server/evals/run-evals.sh | 160 +++++++++ evals/generate-detailed-report.sh | 1 + 5 files changed, 989 insertions(+) create mode 100644 apps/server/evals/README.md create mode 100644 apps/server/evals/ai-tool-usage.eval.ts create mode 100755 apps/server/evals/run-evals.sh create mode 100644 evals/generate-detailed-report.sh diff --git a/apps/server/evals/README.md b/apps/server/evals/README.md new file mode 100644 index 0000000000..1f8cc35329 --- /dev/null +++ b/apps/server/evals/README.md @@ -0,0 +1,243 @@ +# Email Assistant Evaluation Suite + +This directory contains comprehensive evaluation tests for the AI Email Assistant using [Evalite](https://github.com/evalite-ai/evalite). + +## Overview + +The evaluation suite tests the AI assistant's capabilities across multiple dimensions: + +- **Basic Functionality**: Greetings, help requests, capability inquiries +- **Search & Retrieval**: Email search, filtering, and retrieval operations +- **Label Management**: Creating, modifying, and organizing email labels +- **Bulk Operations**: Archive, delete, mark read/unread operations +- **Email Composition**: Writing, replying, and drafting emails +- **Gmail Search**: Natural language to Gmail search query conversion +- **Web Search**: External information retrieval +- **Summarization**: Email and thread summarization +- **Organization**: Workflow automation and email organization + +## Files + +### `ai-chat-basic.eval.ts` +Comprehensive evaluation of the AI chat assistant covering: +- Static test cases for reliable, consistent testing +- Dynamic test case generation for varied scenarios +- Multiple scoring metrics (Factuality, EmbeddingSimilarity) +- Categorized test cases by difficulty and functionality + +### `ai-tool-usage.eval.ts` +Focused evaluation of tool usage and response quality: +- Tool-specific test cases with expected behaviors +- Edge case testing and error handling +- Professional communication scenarios +- Complex workflow testing + +## Running the Evals + +### Prerequisites + +1. **OpenAI API Key**: Set the `OPENAI_API_KEY` environment variable +2. **Dependencies**: Ensure all packages are installed (`pnpm install`) +3. **Server Access**: Navigate to the `apps/server` directory + +### Commands + +```bash +# Run all evals once +pnpm eval + +# Run evals in watch mode (re-runs when files change) +pnpm eval:dev + +# Run specific eval file +pnpm eval -- --run evals/ai-chat-basic.eval.ts +pnpm eval -- --run evals/ai-tool-usage.eval.ts +``` + +### Environment Setup + +```bash +# Set OpenAI API key +export OPENAI_API_KEY="your-api-key-here" + +# Or create a .env file in apps/server/ +echo "OPENAI_API_KEY=your-api-key-here" > .env +``` + +## Test Case Structure + +### Static Test Cases +Reliable, consistent test cases that don't change between runs: + +```typescript +{ + input: "Show me my unread emails", + expected: "getThread", + category: "search", + difficulty: "easy", + description: "Simple unread email request" +} +``` + +### Dynamic Test Cases +AI-generated test cases for varied scenarios: + +```typescript +{ + input: string, // User request + expected: string, // Expected tool or behavior + category: string, // Test category + difficulty: 'easy' | 'medium' | 'hard', + description: string // What this test validates +} +``` + +## Scoring Metrics + +### Factuality +Measures how factually accurate the AI's responses are compared to expected outputs. + +### EmbeddingSimilarity +Uses semantic similarity to evaluate how well the AI's response matches the expected behavior. + +### Levenshtein +String similarity scoring for exact text matching (when applicable). + +## Test Categories + +### Basic Functionality +- Greetings and help requests +- Capability inquiries +- User intent recognition + +### Search & Retrieval +- Email search with filters +- Date-based queries +- Multi-criteria searches + +### Label Management +- Label creation and modification +- Label organization +- Label application + +### Bulk Operations +- Archive operations +- Delete operations +- Mark read/unread operations + +### Email Composition +- Professional emails +- Personal communication +- Context-aware composition + +### Gmail Search +- Natural language conversion +- Search operator usage +- Complex query building + +### Web Search +- External information retrieval +- Current events +- Fact checking + +### Summarization +- Email summarization +- Thread summarization +- Content extraction + +### Organization +- Workflow automation +- Email organization +- Priority management + +## Difficulty Levels + +### Easy +- Simple, single-action requests +- Basic tool usage +- Clear, unambiguous inputs + +### Medium +- Multi-step operations +- Combined tool usage +- Moderate complexity + +### Hard +- Complex workflows +- Edge cases +- Error handling scenarios + +## Customization + +### Adding New Test Cases + +1. **Static Cases**: Add to the appropriate `STATIC_TEST_CASES` array +2. **Dynamic Cases**: Modify the test case builder functions +3. **New Categories**: Update the category filtering and add new evalite blocks + +### Modifying Scoring + +Adjust the scorers array in each evalite block: + +```typescript +scorers: [Factuality, EmbeddingSimilarity, Levenshtein] +``` + +### Adding New Prompts + +Import new system prompts and create corresponding evalite blocks: + +```typescript +import { NewPrompt } from "../src/lib/prompts"; + +evalite("New Prompt Evaluation", { + data: makeTestCaseBuilder("new functionality"), + task: async (input) => { + return safeStreamText({ + model: model, + system: NewPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); +``` + +## Troubleshooting + +### Common Issues + +1. **SQLite Binding Errors**: Run `pnpm rebuild better-sqlite3` +2. **Missing API Key**: Ensure `OPENAI_API_KEY` is set +3. **Import Errors**: Check that all prompt imports are correct + +### Performance Tips + +1. **Use Static Cases**: For consistent, reliable testing +2. **Limit Dynamic Cases**: Balance coverage with execution time +3. **Watch Mode**: Use `pnpm eval:dev` for development iterations + +## Expected Scores + +- **Easy Cases**: 70-90% (basic functionality should work well) +- **Medium Cases**: 50-80% (moderate complexity may have variations) +- **Hard Cases**: 30-70% (complex scenarios may be challenging) + +Scores above 70% indicate excellent performance, while scores below 50% suggest areas for improvement. + +## Contributing + +When adding new test cases: + +1. **Follow the existing structure** and naming conventions +2. **Add comprehensive descriptions** for each test case +3. **Use appropriate difficulty levels** based on complexity +4. **Test edge cases** and error scenarios +5. **Document any new categories** or scoring methods + +## Resources + +- [Evalite Documentation](https://github.com/evalite-ai/evalite) +- [Autoevals Library](https://github.com/braintrustdata/autoevals) +- [AI SDK Documentation](https://sdk.vercel.ai/) +- [OpenAI API Reference](https://platform.openai.com/docs/api-reference) diff --git a/apps/server/evals/ai-chat-basic.eval.ts b/apps/server/evals/ai-chat-basic.eval.ts index 140cfa926f..3f9f17f365 100644 --- a/apps/server/evals/ai-chat-basic.eval.ts +++ b/apps/server/evals/ai-chat-basic.eval.ts @@ -44,6 +44,137 @@ const safeStreamText = async (config: Parameters[0]) => { type TestCase = { input: string; expected: string }; +// Comprehensive static test cases for reliable, consistent testing +const STATIC_TEST_CASES: TestCase[] = [ + // Basic functionality tests + { + input: "Hello, what can you help me with?", + expected: "greeting" + }, + { + input: "Show me my unread emails", + expected: "getThread" + }, + { + input: "Find emails from john@example.com", + expected: "from:" + }, + { + input: "Create a label called 'Important'", + expected: "createLabel" + }, + { + input: "Archive all emails older than 30 days", + expected: "bulkArchive" + }, + { + input: "Write a thank you email to sarah@company.com", + expected: "composeEmail" + }, + { + input: "What's the weather like today?", + expected: "webSearch" + }, + { + input: "Summarize my inbox", + expected: "inboxRag" + }, + { + input: "Mark all emails from newsletters as read", + expected: "markThreadsRead" + }, + { + input: "Delete all spam emails", + expected: "bulkDelete" + }, + { + input: "Find emails with attachments from last week", + expected: "has:attachment" + }, + { + input: "Organize my emails by priority", + expected: "modifyLabels" + }, + { + input: "What emails do I have scheduled for tomorrow?", + expected: "getThread" + }, + { + input: "Send a follow-up email to the meeting request", + expected: "composeEmail" + }, + { + input: "Find all receipts from Amazon", + expected: "from:amazon" + }, + // Additional comprehensive test cases + { + input: "Show me emails with large attachments (>5MB)", + expected: "larger:5M" + }, + { + input: "Find emails sent between Monday and Friday last week", + expected: "after:2025/08/18" + }, + { + input: "Create a nested label structure: Work > Projects > Beta", + expected: "createLabel" + }, + { + input: "Rename the 'Old' label to 'Archived'", + expected: "modifyLabels" + }, + { + input: "Apply 'Urgent' label to all emails from the CEO", + expected: "modifyLabels" + }, + { + input: "Forward all emails from 'Support' to my manager", + expected: "composeEmail" + }, + { + input: "Set up automatic archiving for emails older than 90 days", + expected: "bulkArchive" + }, + { + input: "Find emails that are both important and starred", + expected: "is:important" + }, + { + input: "Create email templates for common responses", + expected: "composeEmail" + }, + { + input: "Analyze my email patterns and suggest improvements", + expected: "inboxRag" + }, + { + input: "Set up email encryption for sensitive communications", + expected: "composeEmail" + }, + { + input: "Create a backup of all my emails", + expected: "bulkArchive" + }, + { + input: "Find emails with multiple recipients (more than 10 people)", + expected: "to:" + }, + { + input: "Set up email forwarding rules for specific senders", + expected: "modifyLabels" + }, + { + input: "Create a knowledge base from FAQ emails", + expected: "inboxRag" + } +]; + +// Helper function to convert static test cases to the format expected by evalite +const makeStaticTestCaseProvider = (testCases: TestCase[]) => { + return async () => testCases; +}; + const makeAiChatTestCaseBuilder = (topic: string): (() => Promise) => { return async () => { const { object } = await generateObject({ diff --git a/apps/server/evals/ai-tool-usage.eval.ts b/apps/server/evals/ai-tool-usage.eval.ts new file mode 100644 index 0000000000..dfcd671466 --- /dev/null +++ b/apps/server/evals/ai-tool-usage.eval.ts @@ -0,0 +1,454 @@ +import { evalite } from "evalite"; +import { openai } from "@ai-sdk/openai"; +import { streamText } from "ai"; +import { traceAISDKModel } from "evalite/ai-sdk"; +import { Factuality, EmbeddingSimilarity, ExactMatch } from "autoevals"; +import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts"; +import { generateObject } from "ai"; +import { z } from "zod"; + +// base model (untraced) for internal helpers to avoid trace errors +const baseModel = openai("gpt-4o-mini"); + +// traced model for the actual task under test +const model = traceAISDKModel(baseModel); + +const safeStreamText = async (config: Parameters[0]) => { + try { + const res = await streamText(config); + return res.textStream; + } catch (err) { + console.error("LLM call failed", err); + return "ERROR"; + } +}; + +// Test case type for tool usage evaluation +type ToolTestCase = { + input: string; + expectedTool: string; + expectedBehavior: string; + category: string; + difficulty: 'easy' | 'medium' | 'hard'; + description: string; +}; + +// Static test cases for tool usage evaluation +const TOOL_USAGE_TEST_CASES: ToolTestCase[] = [ + // Basic tool usage tests + { + input: "Show me my unread emails from the last 3 days", + expectedTool: "getThread", + expectedBehavior: "should call getThread with appropriate filters for unread emails and date range", + category: "search_retrieval", + difficulty: "easy", + description: "Basic search with date filter" + }, + { + input: "Create a new label called 'Urgent' with red color", + expectedTool: "createLabel", + expectedBehavior: "should call createLabel with name 'Urgent' and red color", + category: "label_management", + difficulty: "easy", + description: "Label creation with color specification" + }, + { + input: "Archive all emails older than 60 days that are not starred", + expectedTool: "bulkArchive", + expectedBehavior: "should call bulkArchive with filters for date and star status", + category: "bulk_operations", + difficulty: "medium", + description: "Complex bulk operation with multiple filters" + }, + { + input: "Find emails from john@company.com with attachments sent this month", + expectedTool: "getThread", + expectedBehavior: "should call getThread with filters for sender, attachments, and date", + category: "search_retrieval", + difficulty: "medium", + description: "Multi-criteria search" + }, + { + input: "Mark all emails from newsletters as read and apply 'Newsletter' label", + expectedTool: "modifyLabels", + expectedBehavior: "should call modifyLabels to apply 'Newsletter' label and mark as read", + category: "label_management", + difficulty: "medium", + description: "Combined label and status modification" + }, + { + input: "Delete all spam emails and empty the trash", + expectedTool: "bulkDelete", + expectedBehavior: "should call bulkDelete for spam emails and handle trash cleanup", + category: "bulk_operations", + difficulty: "hard", + description: "Complex cleanup operation" + }, + { + input: "Summarize the conversation thread about project Alpha", + expectedTool: "getThreadSummary", + expectedBehavior: "should call getThreadSummary for the specified thread", + category: "summarization", + difficulty: "medium", + description: "Thread summarization request" + }, + { + input: "What's the current weather in San Francisco?", + expectedTool: "webSearch", + expectedBehavior: "should call webSearch for current weather information", + category: "web_search", + difficulty: "easy", + description: "Web search request" + }, + { + input: "Compose a professional follow-up email to the meeting request from Sarah", + expectedTool: "composeEmail", + expectedBehavior: "should call composeEmail with professional tone and context awareness", + category: "email_composition", + difficulty: "medium", + description: "Context-aware email composition" + }, + { + input: "Organize my inbox by creating priority levels: High, Medium, Low", + expectedTool: "createLabel", + expectedBehavior: "should call createLabel multiple times to create priority label hierarchy", + category: "organization", + difficulty: "hard", + description: "Complex organizational structure creation" + } +]; + +// Gmail search specific test cases +const GMAIL_SEARCH_TEST_CASES: ToolTestCase[] = [ + { + input: "Find emails from my boss that are unread and have attachments", + expectedTool: "from:", + expectedBehavior: "should generate search query with from:, is:unread, and has:attachment", + category: "gmail_search", + difficulty: "medium", + description: "Complex Gmail search with multiple operators" + }, + { + input: "Show me emails from last week that contain the word 'invoice'", + expectedTool: "after:", + expectedBehavior: "should generate search query with date filter and text search", + category: "gmail_search", + difficulty: "easy", + description: "Date-based text search" + }, + { + input: "Find emails from Gmail that are starred and in the 'Work' folder", + expectedTool: "is:starred", + expectedBehavior: "should generate search query combining star status and label", + category: "gmail_search", + difficulty: "medium", + description: "Status and label combination search" + } +]; + +// Email composition test cases +const EMAIL_COMPOSITION_TEST_CASES: ToolTestCase[] = [ + { + input: "Write a thank you email to the interviewer after my job interview", + expectedTool: "composeEmail", + expectedBehavior: "should compose professional thank you email with appropriate tone", + category: "email_composition", + difficulty: "medium", + description: "Professional thank you email" + }, + { + input: "Draft a meeting cancellation email to the team", + expectedTool: "composeEmail", + expectedBehavior: "should compose clear cancellation notice with appropriate details", + category: "email_composition", + difficulty: "easy", + description: "Meeting cancellation notice" + }, + { + input: "Create an apology email for missing the deadline", + expectedTool: "composeEmail", + expectedBehavior: "should compose sincere apology with explanation and next steps", + category: "email_composition", + difficulty: "medium", + description: "Apology email with accountability" + } +]; + +// Static edge case test cases (no dynamic generation to avoid EmbeddingSimilarity errors) +const EDGE_CASE_TEST_CASES: ToolTestCase[] = [ + { + input: "Archive emails from 10 years ago", + expectedTool: "bulkArchive", + expectedBehavior: "should handle very old date gracefully and suggest reasonable date range", + category: "boundary_conditions", + difficulty: "medium", + description: "Very old date handling" + }, + { + input: "Create a label with special characters: !@#$%^&*()", + expectedTool: "createLabel", + expectedBehavior: "should sanitize special characters and create valid label name", + category: "invalid_inputs", + difficulty: "easy", + description: "Special character handling in label names" + }, + { + input: "Delete all emails from the year 9999", + expectedTool: "bulkDelete", + expectedBehavior: "should reject invalid future date and suggest current date range", + category: "invalid_inputs", + difficulty: "easy", + description: "Invalid future date handling" + }, + { + input: "Find emails with empty sender address", + expectedTool: "getThread", + expectedBehavior: "should handle empty sender gracefully and provide helpful error message", + category: "edge_cases", + difficulty: "medium", + description: "Empty sender address handling" + }, + { + input: "Create a label with 1000 character name", + expectedTool: "createLabel", + expectedBehavior: "should truncate or reject overly long label names", + category: "boundary_conditions", + difficulty: "medium", + description: "Very long label name handling" + }, + { + input: "Archive emails from sender: ''", + expectedTool: "bulkArchive", + expectedBehavior: "should reject empty sender and ask for valid sender information", + category: "error_handling", + difficulty: "easy", + description: "Empty sender validation" + } +]; + +// Advanced features test cases +const ADVANCED_FEATURES_TEST_CASES: ToolTestCase[] = [ + { + input: "Create a workflow to automatically label emails from 'noreply@' as 'Spam'", + expectedTool: "createWorkflow", + expectedBehavior: "should call createWorkflow to create a new workflow that labels 'noreply@' emails as 'Spam'", + category: "automation", + difficulty: "medium", + description: "Workflow creation for automated labeling" + }, + { + input: "Integrate with a calendar application to automatically book meetings", + expectedTool: "integrateCalendar", + expectedBehavior: "should call integrateCalendar to establish a connection with a calendar app", + category: "integrations", + difficulty: "hard", + description: "Calendar integration for automatic booking" + }, + { + input: "Encrypt all emails in the 'Work' folder using PGP", + expectedTool: "encryptEmails", + expectedBehavior: "should call encryptEmails to apply PGP encryption to all emails in the 'Work' folder", + category: "security", + difficulty: "hard", + description: "PGP encryption for email security" + }, + { + input: "Create a template for 'Project Report' that includes a pre-filled table of contents", + expectedTool: "createTemplate", + expectedBehavior: "should call createTemplate to create a new template with pre-filled content", + category: "templates", + difficulty: "easy", + description: "Template creation with pre-filled content" + }, + { + input: "Backup all emails to a cloud storage service", + expectedTool: "backupEmails", + expectedBehavior: "should call backupEmails to initiate an email backup process", + category: "backup", + difficulty: "hard", + description: "Email backup to cloud storage" + }, + { + input: "Analyze email patterns to identify common issues and suggest improvements", + expectedTool: "analyzeEmails", + expectedBehavior: "should call analyzeEmails to generate a report on email usage patterns", + category: "analytics", + difficulty: "medium", + description: "Email analytics for performance monitoring" + } +]; + +// Helper function to convert test cases to evalite format +const makeTestCaseProvider = (testCases: ToolTestCase[]) => { + return async () => testCases.map(tc => ({ + input: tc.input, + expected: `${tc.expectedTool}: ${tc.expectedBehavior}` + })); +}; + +// Tool usage evaluation tests +evalite("Tool Usage - Search & Retrieval", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'search_retrieval')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Label Management", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'label_management')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Bulk Operations", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'bulk_operations')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Summarization", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'summarization')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Web Search", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'web_search')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Email Composition", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'email_composition')), + task: async (input) => { + return safeStreamText({ + model: model, + system: StyledEmailAssistantSystemPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Tool Usage - Organization", { + data: makeTestCaseProvider(TOOL_USAGE_TEST_CASES.filter(tc => tc.category === 'organization')), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +// Gmail search specific evaluation +evalite("Gmail Search - Complex Queries", { + data: makeTestCaseProvider(GMAIL_SEARCH_TEST_CASES), + task: async (input) => { + return safeStreamText({ + model: model, + system: GmailSearchAssistantSystemPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +// Email composition specific evaluation +evalite("Email Composition - Professional Communication", { + data: makeTestCaseProvider(EMAIL_COMPOSITION_TEST_CASES), + task: async (input) => { + return safeStreamText({ + model: model, + system: StyledEmailAssistantSystemPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +// Edge cases evaluation using static test cases only +evalite("Tool Usage - Edge Cases & Error Handling", { + data: makeTestCaseProvider(EDGE_CASE_TEST_CASES), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +// Advanced features evaluation +evalite("Advanced Features - Automation & Workflows", { + data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc => + ['automation', 'workflow'].includes(tc.category) + )), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Advanced Features - Integrations & Security", { + data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc => + ['integrations', 'security'].includes(tc.category) + )), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); + +evalite("Advanced Features - Templates & Knowledge Management", { + data: makeTestCaseProvider(ADVANCED_FEATURES_TEST_CASES.filter(tc => + ['templates', 'knowledge_management', 'analytics', 'backup'].includes(tc.category) + )), + task: async (input) => { + return safeStreamText({ + model: model, + system: AiChatPrompt(), + prompt: input, + }); + }, + scorers: [Factuality, EmbeddingSimilarity], +}); diff --git a/apps/server/evals/run-evals.sh b/apps/server/evals/run-evals.sh new file mode 100755 index 0000000000..2fe292a515 --- /dev/null +++ b/apps/server/evals/run-evals.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +# Email Assistant Evaluation Runner +# This script helps run specific evals with better output formatting + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if we're in the right directory +if [[ ! -f "package.json" ]] || [[ ! -d "evals" ]]; then + print_error "Please run this script from the apps/server directory" + exit 1 +fi + +# Check if OpenAI API key is set +if [[ -z "$OPENAI_API_KEY" ]]; then + print_warning "OPENAI_API_KEY not set. Checking .dev.vars file..." + if [[ -f ".dev.vars" ]]; then + source .dev.vars + print_success "Loaded environment variables from .dev.vars" + else + print_error "OPENAI_API_KEY not set and .dev.vars not found" + print_status "Please set OPENAI_API_KEY environment variable or create .dev.vars file" + exit 1 + fi +fi + +# Function to run a specific eval +run_eval() { + local eval_file=$1 + local eval_name=$(basename "$eval_file" .eval.ts) + + print_status "Running eval: $eval_name" + echo "==================================================" + + # Use the correct evalite syntax with run-once command + if pnpm eval run-once "$eval_file"; then + print_success "Eval completed successfully: $eval_name" + else + print_error "Eval failed: $eval_name" + return 1 + fi + + echo "==================================================" + echo +} + +# Function to run all evals +run_all_evals() { + print_status "Running all evals..." + echo "==================================================" + + if pnpm eval run-once; then + print_success "All evals completed successfully" + else + print_error "Some evals failed" + return 1 + fi + + echo "==================================================" +} + +# Function to run evals in watch mode +run_watch_mode() { + print_status "Starting evals in watch mode..." + print_status "Press Ctrl+C to stop" + + pnpm eval watch +} + +# Function to show available evals +show_available_evals() { + print_status "Available eval files:" + echo + for file in evals/*.eval.ts; do + if [[ -f "$file" ]]; then + local name=$(basename "$file" .eval.ts) + echo " • $name" + fi + done + echo +} + +# Function to show help +show_help() { + echo "Email Assistant Evaluation Runner" + echo + echo "Usage: $0 [OPTION] [EVAL_FILE]" + echo + echo "Options:" + echo " -a, --all Run all evals" + echo " -w, --watch Run evals in watch mode" + echo " -l, --list List available eval files" + echo " -h, --help Show this help message" + echo + echo "Examples:" + echo " $0 # Run all evals" + echo " $0 -a # Run all evals" + echo " $0 -w # Run in watch mode" + echo " $0 evals/ai-chat-basic.eval.ts # Run specific eval" + echo " $0 -l # List available evals" + echo +} + +# Main script logic +case "${1:-}" in + -h|--help) + show_help + exit 0 + ;; + -l|--list) + show_available_evals + exit 0 + ;; + -a|--all) + run_all_evals + ;; + -w|--watch) + run_watch_mode + ;; + "") + print_status "No arguments provided, running all evals..." + run_all_evals + ;; + *) + # Check if the file exists + if [[ -f "$1" ]]; then + run_eval "$1" + else + print_error "Eval file not found: $1" + print_status "Use -l or --list to see available evals" + exit 1 + fi + ;; +esac + +print_success "Evaluation run completed!" diff --git a/evals/generate-detailed-report.sh b/evals/generate-detailed-report.sh new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/evals/generate-detailed-report.sh @@ -0,0 +1 @@ + From df7ea71d6caa341b377ce91c9222e6d9b2a574f1 Mon Sep 17 00:00:00 2001 From: Adam Abu Ghaida <44762129+AdamGhaida@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:18:40 +0300 Subject: [PATCH 5/5] fix lints --- apps/server/evals/ai-chat-basic.eval.ts | 131 ------------------------ apps/server/evals/ai-tool-usage.eval.ts | 4 +- 2 files changed, 1 insertion(+), 134 deletions(-) diff --git a/apps/server/evals/ai-chat-basic.eval.ts b/apps/server/evals/ai-chat-basic.eval.ts index 3f9f17f365..140cfa926f 100644 --- a/apps/server/evals/ai-chat-basic.eval.ts +++ b/apps/server/evals/ai-chat-basic.eval.ts @@ -44,137 +44,6 @@ const safeStreamText = async (config: Parameters[0]) => { type TestCase = { input: string; expected: string }; -// Comprehensive static test cases for reliable, consistent testing -const STATIC_TEST_CASES: TestCase[] = [ - // Basic functionality tests - { - input: "Hello, what can you help me with?", - expected: "greeting" - }, - { - input: "Show me my unread emails", - expected: "getThread" - }, - { - input: "Find emails from john@example.com", - expected: "from:" - }, - { - input: "Create a label called 'Important'", - expected: "createLabel" - }, - { - input: "Archive all emails older than 30 days", - expected: "bulkArchive" - }, - { - input: "Write a thank you email to sarah@company.com", - expected: "composeEmail" - }, - { - input: "What's the weather like today?", - expected: "webSearch" - }, - { - input: "Summarize my inbox", - expected: "inboxRag" - }, - { - input: "Mark all emails from newsletters as read", - expected: "markThreadsRead" - }, - { - input: "Delete all spam emails", - expected: "bulkDelete" - }, - { - input: "Find emails with attachments from last week", - expected: "has:attachment" - }, - { - input: "Organize my emails by priority", - expected: "modifyLabels" - }, - { - input: "What emails do I have scheduled for tomorrow?", - expected: "getThread" - }, - { - input: "Send a follow-up email to the meeting request", - expected: "composeEmail" - }, - { - input: "Find all receipts from Amazon", - expected: "from:amazon" - }, - // Additional comprehensive test cases - { - input: "Show me emails with large attachments (>5MB)", - expected: "larger:5M" - }, - { - input: "Find emails sent between Monday and Friday last week", - expected: "after:2025/08/18" - }, - { - input: "Create a nested label structure: Work > Projects > Beta", - expected: "createLabel" - }, - { - input: "Rename the 'Old' label to 'Archived'", - expected: "modifyLabels" - }, - { - input: "Apply 'Urgent' label to all emails from the CEO", - expected: "modifyLabels" - }, - { - input: "Forward all emails from 'Support' to my manager", - expected: "composeEmail" - }, - { - input: "Set up automatic archiving for emails older than 90 days", - expected: "bulkArchive" - }, - { - input: "Find emails that are both important and starred", - expected: "is:important" - }, - { - input: "Create email templates for common responses", - expected: "composeEmail" - }, - { - input: "Analyze my email patterns and suggest improvements", - expected: "inboxRag" - }, - { - input: "Set up email encryption for sensitive communications", - expected: "composeEmail" - }, - { - input: "Create a backup of all my emails", - expected: "bulkArchive" - }, - { - input: "Find emails with multiple recipients (more than 10 people)", - expected: "to:" - }, - { - input: "Set up email forwarding rules for specific senders", - expected: "modifyLabels" - }, - { - input: "Create a knowledge base from FAQ emails", - expected: "inboxRag" - } -]; - -// Helper function to convert static test cases to the format expected by evalite -const makeStaticTestCaseProvider = (testCases: TestCase[]) => { - return async () => testCases; -}; - const makeAiChatTestCaseBuilder = (topic: string): (() => Promise) => { return async () => { const { object } = await generateObject({ diff --git a/apps/server/evals/ai-tool-usage.eval.ts b/apps/server/evals/ai-tool-usage.eval.ts index dfcd671466..e697199106 100644 --- a/apps/server/evals/ai-tool-usage.eval.ts +++ b/apps/server/evals/ai-tool-usage.eval.ts @@ -2,10 +2,8 @@ import { evalite } from "evalite"; import { openai } from "@ai-sdk/openai"; import { streamText } from "ai"; import { traceAISDKModel } from "evalite/ai-sdk"; -import { Factuality, EmbeddingSimilarity, ExactMatch } from "autoevals"; +import { Factuality, EmbeddingSimilarity } from "autoevals"; import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts"; -import { generateObject } from "ai"; -import { z } from "zod"; // base model (untraced) for internal helpers to avoid trace errors const baseModel = openai("gpt-4o-mini");