eval.py
from pydantic import BaseModel
from pydantic_ai import Agent
import json
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
import pandas as pd

LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool


class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

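# Sketch of the structured output the judge returns (values are illustrative,
# not taken from a real run):
#   EvaluationChecklist(
#       checklist=[
#           EvaluationCheck(check_name='answer_relevant',
#                           justification='The answer addresses the question directly.',
#                           check_pass=True),
#           ...
#       ],
#       summary='The agent followed the instructions and cited its sources.',
#   )
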
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.
For each item, check whether the condition is met.

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do
- answer_relevant: The response directly addresses the user's question
- answer_clear: The answer is clear and correct
- answer_citations: The response includes proper citations or sources when required
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: The search tool was invoked

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

def create_eval_agent(model='gpt-5-nano'):
    # LLM-as-a-judge agent that scores a single log against the checklist above.
    eval_agent = Agent(
        name='eval_agent',
        model=model,
        instructions=evaluation_prompt,
        output_type=EvaluationChecklist,
    )
    return eval_agent

def load_log_file(log_file):
    with open(log_file, 'r') as f_in:
        log_data = json.load(f_in)

    # Remember which file the record came from so results can be traced back to it.
    log_data['log_file'] = log_file
    return log_data

user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()
def simplify_log_messages(messages):
    # Strip volatile or bulky fields (ids, timestamps, raw tool output) so the
    # log fits comfortably in the judge's context window.
    log_simplified = []

    for m in messages:
        parts = []

        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']

            if kind == 'user-prompt':
                del part['timestamp']

            if kind == 'tool-call':
                del part['tool_call_id']

            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with a placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'

            if kind == 'text':
                del part['id']

            parts.append(part)

        message = {
            'kind': m['kind'],
            'parts': parts,
        }
        log_simplified.append(message)

    return log_simplified

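# For example (field names other than those handled above are illustrative
# assumptions, not guaranteed to match the real log format):
#   before: {'part_kind': 'tool-return', 'tool_name': 'search',
#            'content': [...], 'tool_call_id': '...', 'metadata': {...},
#            'timestamp': '...'}
#   after:  {'part_kind': 'tool-return', 'tool_name': 'search',
#            'content': 'RETURN_RESULTS_REDACTED'}
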
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']

    instructions = log_record['system_prompt']
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']

    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)

    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log,
    )

    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output

def retreive_log():
    # Collect only the es_agent logs from the logs/ directory.
    eval_set = []

    for log_file in LOG_DIR.glob('*.json'):
        if 'es_agent' not in log_file.name and 'es_agent_v2' not in log_file.name:
            continue

        log_record = load_log_file(log_file)

        # if log_record['source'] != 'ai-generated':
        #     continue

        eval_set.append(log_record)

    return eval_set

async def evaluate_log():
    eval_results = []

    eval_agent = create_eval_agent()
    eval_set = retreive_log()

    for log_record in tqdm(eval_set):
        eval_result = await evaluate_log_record(eval_agent, log_record)
        eval_results.append((log_record, eval_result))

    return eval_results

async def get_eval_means():
    eval_results = await evaluate_log()

    rows = []

    for log_record, eval_result in eval_results:
        messages = log_record['messages']

        row = {
            'file': log_record['log_file'].name,
            'question': messages[0]['parts'][0]['content'],
            'answer': messages[-1]['parts'][0]['content'],
        }

        checks = {c.check_name: c.check_pass for c in eval_result.checklist}
        row.update(checks)
        rows.append(row)

    df_evals = pd.DataFrame(rows)

    change_to_bool_cols = ['instructions_follow', 'instructions_avoid', 'answer_relevant',
                           'answer_clear', 'answer_citations', 'completeness', 'tool_call_search']
    df_evals[change_to_bool_cols] = df_evals[change_to_bool_cols].astype(bool)

    # The mean of each boolean column is the pass rate for that check.
    metrics = df_evals.mean(numeric_only=True)
    return metrics
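
# Minimal usage sketch (an assumption about how this module is meant to be run;
# not part of the original file). It expects logs/*.json produced by the agent
# and whatever model credentials pydantic_ai needs for the judge model.
if __name__ == '__main__':
    import asyncio

    print(asyncio.run(get_eval_means()))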