-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluator.py
More file actions
381 lines (304 loc) · 11.6 KB
/
evaluator.py
File metadata and controls
381 lines (304 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
Evaluation Module for Memory-Augmented QA
Implements the reward functions from Mem-alpha:
- r1: Correctness Reward (QA accuracy)
- r2: Tool Call Format Reward
- r3: Compression Reward
- r4: Memory Content Reward
Also implements the QA evaluation pipeline using two-layer RAG retrieval.
"""
from typing import Dict, List, Optional, Callable, Tuple
from dataclasses import dataclass
import re
from .memory_structure import MemorySystem
from .rag_retriever import TwoLayerRAGRetriever
@dataclass
class Question:
    """A single QA item: the question text plus its ground-truth answer."""
    question_id: int  # Unique identifier within the evaluation set
    question: str  # Question text posed to the model
    answer: str  # Ground truth answer
    metadata: Optional[Dict] = None  # Optional extra info (source, tags, ...)
@dataclass
class QAResult:
    """Outcome of answering one Question through the RAG pipeline."""
    question: Question  # The question that was evaluated
    retrieved_context: str  # Memory context produced by the two-layer retriever
    predicted_answer: str  # Raw answer text returned by the LLM
    is_correct: bool  # Whether the prediction matched the ground truth
    score: float  # Match score in [0, 1]: 1.0 on substring match, else token F1
@dataclass
class EvaluationResult:
    """Aggregate metrics and reward components for one complete evaluation run."""
    total_questions: int  # Number of questions evaluated
    correct_answers: int  # How many were judged correct
    accuracy: float  # r1 reward
    tool_call_success_rate: float  # r2 reward
    compression_ratio: float  # r3 reward
    memory_content_score: float  # r4 reward
    final_reward: float  # r1 + r2 + beta * r3 + gamma * r4
    qa_results: List[QAResult]  # Per-question details
class QAEvaluator:
    """
    Question Answering Evaluator

    Uses two-layer RAG retrieval to answer questions based on constructed memory,
    then evaluates predictions against ground-truth answers.
    """
    QA_PROMPT_TEMPLATE = """You are a helpful assistant. Answer the question based on the provided memory context.
## Memory Context:
{context}
## Question:
{question}
## Instructions:
- Answer based ONLY on the information in the memory context
- If the answer is not in the context, say "I don't have that information"
- Keep your answer concise and direct
## Answer:"""

    def __init__(self,
                 memory_system: MemorySystem,
                 llm_callable: Optional[Callable[[str], str]] = None,
                 k_categories: int = 3,
                 n_entries_per_category: int = 5):
        """
        Initialize evaluator

        Args:
            memory_system: The memory system to evaluate
            llm_callable: Function to call LLM for answer generation
            k_categories: Number of categories for Layer 1 retrieval
            n_entries_per_category: Number of entries for Layer 2 retrieval
        """
        self.memory = memory_system
        self.llm_callable = llm_callable
        self.retriever = TwoLayerRAGRetriever(
            memory_system=memory_system,
            k_categories=k_categories,
            n_entries_per_category=n_entries_per_category
        )

    def set_llm(self, llm_callable: Callable[[str], str]) -> None:
        """Set the LLM callable used for answer generation."""
        self.llm_callable = llm_callable

    def answer_question(self, question: Question) -> QAResult:
        """
        Answer a single question using the RAG pipeline.

        Args:
            question: The question to answer

        Returns:
            QAResult with prediction and evaluation

        Raises:
            ValueError: If no LLM callable has been configured.
        """
        if self.llm_callable is None:
            raise ValueError("LLM callable not set. Use set_llm() first.")
        # Retrieve relevant context using two-layer RAG
        context = self.retriever.retrieve_for_qa(question.question)
        # Generate answer
        prompt = self.QA_PROMPT_TEMPLATE.format(
            context=context,
            question=question.question
        )
        predicted_answer = self.llm_callable(prompt)
        # Evaluate correctness
        is_correct, score = self._evaluate_answer(
            predicted=predicted_answer,
            ground_truth=question.answer
        )
        return QAResult(
            question=question,
            retrieved_context=context,
            predicted_answer=predicted_answer,
            is_correct=is_correct,
            score=score
        )

    def _evaluate_answer(self,
                         predicted: str,
                         ground_truth: str) -> Tuple[bool, float]:
        """
        Evaluate predicted answer against ground truth.

        Tries substring matching first, then token-level F1.

        Returns:
            (is_correct, score) where score is in [0, 1].
        """
        pred_clean = self._normalize_answer(predicted)
        truth_clean = self._normalize_answer(ground_truth)
        # BUGFIX: "" is a substring of every string, so an empty prediction
        # (or an empty ground truth) used to pass the substring check below
        # and score a perfect 1.0. Treat either side being empty as wrong.
        if not pred_clean or not truth_clean:
            return False, 0.0
        # Check for exact or substring match
        if truth_clean in pred_clean or pred_clean in truth_clean:
            return True, 1.0
        # Calculate token-level F1 score (both token sets are non-empty here)
        pred_tokens = set(pred_clean.split())
        truth_tokens = set(truth_clean.split())
        common = pred_tokens & truth_tokens
        precision = len(common) / len(pred_tokens) if pred_tokens else 0
        recall = len(common) / len(truth_tokens) if truth_tokens else 0
        if precision + recall == 0:
            return False, 0.0
        f1 = 2 * precision * recall / (precision + recall)
        # Consider correct if F1 > 0.5
        return f1 > 0.5, f1

    def _normalize_answer(self, answer: str) -> str:
        """Normalize answer for comparison: lowercase, strip punctuation,
        collapse runs of whitespace."""
        answer = answer.lower()
        answer = re.sub(r'[^\w\s]', '', answer)
        answer = ' '.join(answer.split())
        return answer

    def evaluate_questions(self, questions: List[Question]) -> List[QAResult]:
        """Evaluate a list of questions, returning one QAResult per question."""
        return [self.answer_question(question) for question in questions]
class RewardCalculator:
    """
    Calculate rewards for Mem-alpha training.

    Rewards:
    - r1: Correctness reward (QA accuracy)
    - r2: Tool call format reward
    - r3: Compression reward
    - r4: Memory content reward

    Total reward: r = r1 + r2 + beta * r3 + gamma * r4
    """

    def __init__(self,
                 beta: float = 0.05,
                 gamma: float = 0.1,
                 llm_judge: Optional[Callable[[str], str]] = None):
        """
        Initialize reward calculator

        Args:
            beta: Weight for compression reward
            gamma: Weight for memory content reward
            llm_judge: LLM callable for memory content validation
        """
        self.beta = beta
        self.gamma = gamma
        self.llm_judge = llm_judge

    def calculate_correctness_reward(self, qa_results: List[QAResult]) -> float:
        """
        r1: Correctness reward based on QA accuracy.

        r1 = (number of correct answers) / (total questions).
        Returns 0.0 for an empty result list.
        """
        if not qa_results:
            return 0.0
        correct = sum(1 for r in qa_results if r.is_correct)
        return correct / len(qa_results)

    def calculate_tool_call_reward(self,
                                   successful_calls: int,
                                   total_calls: int) -> float:
        """
        r2: Tool call format reward.

        r2 = (successful tool calls) / (total tool calls)
        """
        if total_calls == 0:
            return 1.0  # No calls means no errors
        return successful_calls / total_calls

    def calculate_compression_reward(self,
                                     memory_length: int,
                                     input_length: int) -> float:
        """
        r3: Compression reward.

        r3 = 1 - (memory_length / input_length), clamped at 0.
        Encourages efficient memory usage.
        """
        if input_length == 0:
            return 1.0
        ratio = memory_length / input_length
        return max(0.0, 1.0 - ratio)

    def calculate_memory_content_reward(self,
                                        memory_system: MemorySystem) -> float:
        """
        r4: Memory content quality reward.

        Uses the LLM judge to validate memory entries and returns the
        fraction of valid entries. Defaults to 1.0 when no judge is
        configured or the memory holds no entries.
        """
        if self.llm_judge is None:
            return 1.0  # Default to full score if no judge
        all_entries = memory_system.get_all_entries()
        if not all_entries:
            return 1.0
        valid_count = sum(
            1 for entry in all_entries if self._validate_memory_entry(entry)
        )
        return valid_count / len(all_entries)

    def _validate_memory_entry(self, entry) -> bool:
        """Validate a single memory entry using the LLM judge.

        Defaults to valid when no judge is configured or the judge raises.
        """
        prompt = f"""Analyze the quality of this memory entry:
Category: {entry.category.value}
Content: {entry.content}
Is this a valid, well-formed memory entry that:
1. Contains meaningful information
2. Is appropriately categorized
3. Is not a placeholder or generic text
Respond with ONLY "VALID" or "INVALID"."""
        try:
            response = self.llm_judge(prompt) if self.llm_judge else "VALID"
            verdict = response.upper()
            # BUGFIX: "INVALID" contains the substring "VALID", so the old
            # check ("VALID" in verdict) accepted INVALID verdicts too and
            # the judge could never reject an entry. Reject on INVALID first.
            return "INVALID" not in verdict and "VALID" in verdict
        except Exception:
            return True  # Default to valid on judge error (best-effort)

    def calculate_total_reward(self,
                               r1: float,
                               r2: float,
                               r3: float,
                               r4: float) -> float:
        """
        Calculate total reward: r = r1 + r2 + beta * r3 + gamma * r4
        """
        return r1 + r2 + self.beta * r3 + self.gamma * r4

    def evaluate(self,
                 memory_system: MemorySystem,
                 qa_results: List[QAResult],
                 successful_tool_calls: int,
                 total_tool_calls: int,
                 total_input_length: int) -> EvaluationResult:
        """
        Complete evaluation with all reward components.

        Args:
            memory_system: The constructed memory system
            qa_results: Results from QA evaluation
            successful_tool_calls: Number of successful tool calls
            total_tool_calls: Total number of tool calls
            total_input_length: Total length of input chunks

        Returns:
            EvaluationResult with all metrics
        """
        # Calculate individual rewards
        r1 = self.calculate_correctness_reward(qa_results)
        r2 = self.calculate_tool_call_reward(successful_tool_calls, total_tool_calls)
        r3 = self.calculate_compression_reward(
            memory_system.get_total_memory_length(),
            total_input_length
        )
        r4 = self.calculate_memory_content_reward(memory_system)
        # Calculate total reward
        total_reward = self.calculate_total_reward(r1, r2, r3, r4)
        return EvaluationResult(
            total_questions=len(qa_results),
            correct_answers=sum(1 for r in qa_results if r.is_correct),
            accuracy=r1,
            tool_call_success_rate=r2,
            compression_ratio=r3,
            memory_content_score=r4,
            final_reward=total_reward,
            qa_results=qa_results
        )
class MockQALLM:
    """Deterministic stand-in LLM for QA pipeline testing.

    Parses the QA prompt's question and context sections, then answers by
    echoing the first sufficiently long context sentence that shares a
    keyword (a word of 4+ characters) with the question.
    """

    # Pre-compiled section extractors for the QA prompt layout.
    _QUESTION_RE = re.compile(r'Question:\s*(.*?)\s*(?:##|$)', re.DOTALL)
    _CONTEXT_RE = re.compile(r'Memory Context:\s*(.*?)\s*##', re.DOTALL)

    def __call__(self, prompt: str) -> str:
        """Generate a mock answer for the given QA prompt."""
        question_match = self._QUESTION_RE.search(prompt)
        if question_match is None:
            return "I don't know."
        question_text = question_match.group(1).strip().lower()

        context_match = self._CONTEXT_RE.search(prompt)
        context_text = context_match.group(1).strip().lower() if context_match else ""

        # Any question word of 4+ characters counts as a keyword.
        for keyword in re.findall(r'\b\w{4,}\b', question_text):
            if keyword not in context_text:
                continue
            # Echo the first non-trivial sentence that mentions the keyword.
            for sentence in re.split(r'[.!?]', context_text):
                if keyword in sentence and len(sentence.strip()) > 20:
                    return sentence.strip().capitalize() + "."
        return "Based on the context, I cannot find a specific answer."