-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluator.py
More file actions
381 lines (304 loc) · 11.6 KB
/
evaluator.py
File metadata and controls
381 lines (304 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
Evaluation Module for Memory-Augmented QA
Implements the reward functions from Mem-alpha:
- r1: Correctness Reward (QA accuracy)
- r2: Tool Call Format Reward
- r3: Compression Reward
- r4: Memory Content Reward
Also implements the QA evaluation pipeline using two-layer RAG retrieval.
"""
from typing import Dict, List, Optional, Callable, Tuple
from dataclasses import dataclass
import re
from .memory_structure import MemorySystem
from .rag_retriever import TwoLayerRAGRetriever
@dataclass
class Question:
    """A single QA item: the question text plus its ground-truth answer."""
    question_id: int  # Unique identifier within the evaluation set
    question: str  # Question text posed to the model
    answer: str  # Ground truth answer
    metadata: Optional[Dict] = None  # Optional extra info (source, tags, ...)
@dataclass
class QAResult:
    """Outcome of answering one Question through the RAG pipeline."""
    question: Question  # The question that was evaluated
    retrieved_context: str  # Memory context produced by the two-layer retriever
    predicted_answer: str  # Raw answer text returned by the LLM
    is_correct: bool  # Whether the prediction matched the ground truth
    score: float  # Match score in [0, 1]: 1.0 on substring match, else token F1
@dataclass
class EvaluationResult:
    """Aggregate metrics and reward components for one complete evaluation run."""
    total_questions: int  # Number of questions evaluated
    correct_answers: int  # How many were judged correct
    accuracy: float  # r1 reward
    tool_call_success_rate: float  # r2 reward
    compression_ratio: float  # r3 reward
    memory_content_score: float  # r4 reward
    final_reward: float  # r1 + r2 + beta * r3 + gamma * r4
    qa_results: List[QAResult]  # Per-question details
class QAEvaluator:
    """
    Question Answering Evaluator

    Uses two-layer RAG retrieval to answer questions based on constructed memory,
    then evaluates predictions against ground-truth answers.
    """
    QA_PROMPT_TEMPLATE = """You are a helpful assistant. Answer the question based on the provided memory context.
## Memory Context:
{context}
## Question:
{question}
## Instructions:
- Answer based ONLY on the information in the memory context
- If the answer is not in the context, say "I don't have that information"
- Keep your answer concise and direct
## Answer:"""

    def __init__(self,
                 memory_system: MemorySystem,
                 llm_callable: Optional[Callable[[str], str]] = None,
                 k_categories: int = 3,
                 n_entries_per_category: int = 5):
        """
        Initialize evaluator

        Args:
            memory_system: The memory system to evaluate
            llm_callable: Function to call LLM for answer generation
            k_categories: Number of categories for Layer 1 retrieval
            n_entries_per_category: Number of entries for Layer 2 retrieval
        """
        self.memory = memory_system
        self.llm_callable = llm_callable
        self.retriever = TwoLayerRAGRetriever(
            memory_system=memory_system,
            k_categories=k_categories,
            n_entries_per_category=n_entries_per_category
        )

    def set_llm(self, llm_callable: Callable[[str], str]) -> None:
        """Set the LLM callable used for answer generation."""
        self.llm_callable = llm_callable

    def answer_question(self, question: Question) -> QAResult:
        """
        Answer a single question using the RAG pipeline.

        Args:
            question: The question to answer

        Returns:
            QAResult with prediction and evaluation

        Raises:
            ValueError: If no LLM callable has been configured.
        """
        if self.llm_callable is None:
            raise ValueError("LLM callable not set. Use set_llm() first.")
        # Retrieve relevant context using two-layer RAG
        context = self.retriever.retrieve_for_qa(question.question)
        # Generate answer
        prompt = self.QA_PROMPT_TEMPLATE.format(
            context=context,
            question=question.question
        )
        predicted_answer = self.llm_callable(prompt)
        # Evaluate correctness
        is_correct, score = self._evaluate_answer(
            predicted=predicted_answer,
            ground_truth=question.answer
        )
        return QAResult(
            question=question,
            retrieved_context=context,
            predicted_answer=predicted_answer,
            is_correct=is_correct,
            score=score
        )

    def _evaluate_answer(self,
                         predicted: str,
                         ground_truth: str) -> Tuple[bool, float]:
        """
        Evaluate predicted answer against ground truth.

        Tries substring matching first, then token-level F1.

        Returns:
            (is_correct, score) where score is in [0, 1].
        """
        pred_clean = self._normalize_answer(predicted)
        truth_clean = self._normalize_answer(ground_truth)
        # BUGFIX: "" is a substring of every string, so an empty prediction
        # (or an empty ground truth) used to pass the substring check below
        # and score a perfect 1.0. Treat either side being empty as wrong.
        if not pred_clean or not truth_clean:
            return False, 0.0
        # Check for exact or substring match
        if truth_clean in pred_clean or pred_clean in truth_clean:
            return True, 1.0
        # Calculate token-level F1 score (both token sets are non-empty here)
        pred_tokens = set(pred_clean.split())
        truth_tokens = set(truth_clean.split())
        common = pred_tokens & truth_tokens
        precision = len(common) / len(pred_tokens) if pred_tokens else 0
        recall = len(common) / len(truth_tokens) if truth_tokens else 0
        if precision + recall == 0:
            return False, 0.0
        f1 = 2 * precision * recall / (precision + recall)
        # Consider correct if F1 > 0.5
        return f1 > 0.5, f1

    def _normalize_answer(self, answer: str) -> str:
        """Normalize answer for comparison: lowercase, strip punctuation,
        collapse runs of whitespace."""
        answer = answer.lower()
        answer = re.sub(r'[^\w\s]', '', answer)
        answer = ' '.join(answer.split())
        return answer

    def evaluate_questions(self, questions: List[Question]) -> List[QAResult]:
        """Evaluate a list of questions, returning one QAResult per question."""
        return [self.answer_question(question) for question in questions]
class RewardCalculator:
    """
    Calculate rewards for Mem-alpha training.

    Rewards:
    - r1: Correctness reward (QA accuracy)
    - r2: Tool call format reward
    - r3: Compression reward
    - r4: Memory content reward

    Total reward: r = r1 + r2 + beta * r3 + gamma * r4
    """

    def __init__(self,
                 beta: float = 0.05,
                 gamma: float = 0.1,
                 llm_judge: Optional[Callable[[str], str]] = None):
        """
        Initialize reward calculator

        Args:
            beta: Weight for compression reward
            gamma: Weight for memory content reward
            llm_judge: LLM callable for memory content validation
        """
        self.beta = beta
        self.gamma = gamma
        self.llm_judge = llm_judge

    def calculate_correctness_reward(self, qa_results: List[QAResult]) -> float:
        """
        r1: Correctness reward based on QA accuracy.

        r1 = (number of correct answers) / (total questions).
        Returns 0.0 for an empty result list.
        """
        if not qa_results:
            return 0.0
        correct = sum(1 for r in qa_results if r.is_correct)
        return correct / len(qa_results)

    def calculate_tool_call_reward(self,
                                   successful_calls: int,
                                   total_calls: int) -> float:
        """
        r2: Tool call format reward.

        r2 = (successful tool calls) / (total tool calls)
        """
        if total_calls == 0:
            return 1.0  # No calls means no errors
        return successful_calls / total_calls

    def calculate_compression_reward(self,
                                     memory_length: int,
                                     input_length: int) -> float:
        """
        r3: Compression reward.

        r3 = 1 - (memory_length / input_length), clamped at 0.
        Encourages efficient memory usage.
        """
        if input_length == 0:
            return 1.0
        ratio = memory_length / input_length
        return max(0.0, 1.0 - ratio)

    def calculate_memory_content_reward(self,
                                        memory_system: MemorySystem) -> float:
        """
        r4: Memory content quality reward.

        Uses the LLM judge to validate memory entries and returns the
        fraction of valid entries. Defaults to 1.0 when no judge is
        configured or the memory holds no entries.
        """
        if self.llm_judge is None:
            return 1.0  # Default to full score if no judge
        all_entries = memory_system.get_all_entries()
        if not all_entries:
            return 1.0
        valid_count = sum(
            1 for entry in all_entries if self._validate_memory_entry(entry)
        )
        return valid_count / len(all_entries)

    def _validate_memory_entry(self, entry) -> bool:
        """Validate a single memory entry using the LLM judge.

        Defaults to valid when no judge is configured or the judge raises.
        """
        prompt = f"""Analyze the quality of this memory entry:
Category: {entry.category.value}
Content: {entry.content}
Is this a valid, well-formed memory entry that:
1. Contains meaningful information
2. Is appropriately categorized
3. Is not a placeholder or generic text
Respond with ONLY "VALID" or "INVALID"."""
        try:
            response = self.llm_judge(prompt) if self.llm_judge else "VALID"
            verdict = response.upper()
            # BUGFIX: "INVALID" contains the substring "VALID", so the old
            # check ("VALID" in verdict) accepted INVALID verdicts too and
            # the judge could never reject an entry. Reject on INVALID first.
            return "INVALID" not in verdict and "VALID" in verdict
        except Exception:
            return True  # Default to valid on judge error (best-effort)

    def calculate_total_reward(self,
                               r1: float,
                               r2: float,
                               r3: float,
                               r4: float) -> float:
        """
        Calculate total reward: r = r1 + r2 + beta * r3 + gamma * r4
        """
        return r1 + r2 + self.beta * r3 + self.gamma * r4

    def evaluate(self,
                 memory_system: MemorySystem,
                 qa_results: List[QAResult],
                 successful_tool_calls: int,
                 total_tool_calls: int,
                 total_input_length: int) -> EvaluationResult:
        """
        Complete evaluation with all reward components.

        Args:
            memory_system: The constructed memory system
            qa_results: Results from QA evaluation
            successful_tool_calls: Number of successful tool calls
            total_tool_calls: Total number of tool calls
            total_input_length: Total length of input chunks

        Returns:
            EvaluationResult with all metrics
        """
        # Calculate individual rewards
        r1 = self.calculate_correctness_reward(qa_results)
        r2 = self.calculate_tool_call_reward(successful_tool_calls, total_tool_calls)
        r3 = self.calculate_compression_reward(
            memory_system.get_total_memory_length(),
            total_input_length
        )
        r4 = self.calculate_memory_content_reward(memory_system)
        # Calculate total reward
        total_reward = self.calculate_total_reward(r1, r2, r3, r4)
        return EvaluationResult(
            total_questions=len(qa_results),
            correct_answers=sum(1 for r in qa_results if r.is_correct),
            accuracy=r1,
            tool_call_success_rate=r2,
            compression_ratio=r3,
            memory_content_score=r4,
            final_reward=total_reward,
            qa_results=qa_results
        )
class MockQALLM:
    """Deterministic stand-in LLM for QA pipeline testing.

    Parses the QA prompt's question and context sections, then answers by
    echoing the first sufficiently long context sentence that shares a
    keyword (a word of 4+ characters) with the question.
    """

    # Pre-compiled section extractors for the QA prompt layout.
    _QUESTION_RE = re.compile(r'Question:\s*(.*?)\s*(?:##|$)', re.DOTALL)
    _CONTEXT_RE = re.compile(r'Memory Context:\s*(.*?)\s*##', re.DOTALL)

    def __call__(self, prompt: str) -> str:
        """Generate a mock answer for the given QA prompt."""
        question_match = self._QUESTION_RE.search(prompt)
        if question_match is None:
            return "I don't know."
        question_text = question_match.group(1).strip().lower()

        context_match = self._CONTEXT_RE.search(prompt)
        context_text = context_match.group(1).strip().lower() if context_match else ""

        # Any question word of 4+ characters counts as a keyword.
        for keyword in re.findall(r'\b\w{4,}\b', question_text):
            if keyword not in context_text:
                continue
            # Echo the first non-trivial sentence that mentions the keyword.
            for sentence in re.split(r'[.!?]', context_text):
                if keyword in sentence and len(sentence.strip()) > 20:
                    return sentence.strip().capitalize() + "."
        return "Based on the context, I cannot find a specific answer."