-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.py
More file actions
121 lines (98 loc) · 3.89 KB
/
eval.py
File metadata and controls
121 lines (98 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import logging
from dotenv import load_dotenv
# Ragas & Datasets
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
# LangChain models
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
# App Logic (we import our own retriever and rag chain)
from app import build_retriever, build_rag_chain
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
load_dotenv()
QDRANT_PATH = "./qdrant_db"
COLLECTION_NAME = "apple_sec_filings"
# A golden dataset of typical questions we expect the Self-Querying RAG to answer
# 'ground_truth' represents the ideal answer we'd expect
GOLDEN_DATASET = [
{
"question": "What is Apple's primary form of revenue as reported in the 10-K?",
"ground_truth": "Apple's primary form of revenue is the sale of its hardware products (iPhone, Mac, iPad) and related services."
},
{
"question": "What are the common risk factors mentioned in Apple's filings?",
"ground_truth": "Common risk factors include global economic conditions, severe competition, supply chain disruptions, and dependence on third-party software/component providers."
}
]
def run_evaluation():
if not os.getenv("GROQ_API_KEY"):
logger.error("GROQ_API_KEY is not set.")
return
logger.info("Initializing components for Evaluation...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
try:
client = QdrantClient(path=QDRANT_PATH)
vector_store = QdrantVectorStore(
client=client,
collection_name=COLLECTION_NAME,
embedding=embeddings,
)
except Exception as e:
logger.error(f"Failed to connect to Qdrant: {e}")
return
# We will use Groq's 70b model for better reasoning in the 'judge' role.
eval_llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0)
retriever = build_retriever(vector_store, eval_llm)
rag_chain = build_rag_chain(retriever, eval_llm)
questions = []
answers = []
contexts = []
ground_truths = []
logger.info("Generating answers for the Golden Dataset...")
for item in GOLDEN_DATASET:
question = item["question"]
gt = item["ground_truth"]
# 1. Retrieve contexts
retrieved_docs = retriever.invoke(question)
doc_contents = [doc.page_content for doc in retrieved_docs]
# 2. Generate answer
answer = rag_chain.invoke(question)
questions.append(question)
answers.append(answer)
contexts.append(doc_contents)
ground_truths.append(gt)
# 3. Format as HuggingFace Dataset (required by Ragas)
eval_data = {
"question": questions,
"answer": answers,
"contexts": contexts,
"ground_truth": ground_truths
}
dataset = Dataset.from_dict(eval_data)
logger.info("Running Ragas Evaluation Metrics...")
# Ragas needs LLMs and Embeddings to evaluate metrics:
metrics = [
faithfulness, # Hallucination check
answer_relevancy, # Does it actually answer the question?
context_precision, # Did we retrieve relevant items first?
context_recall # Did we retrieve ALL the necessary context?
]
result = evaluate(
dataset=dataset,
metrics=metrics,
llm=eval_llm,
embeddings=embeddings
)
logger.info("=== EVALUATION RESULTS ===")
print(result)
# Save the dataframe for later analysis
df = result.to_pandas()
df.to_csv("ragas_evaluation_report.csv", index=False)
logger.info("Saved detailed report to ragas_evaluation_report.csv")
if __name__ == "__main__":
run_evaluation()