-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_content.py
More file actions
102 lines (81 loc) · 3.64 KB
/
analyze_content.py
File metadata and controls
102 lines (81 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
"""
Analyze the content of your processed RAG documents
This will help us understand what's actually in your PDF
"""
import os  # NOTE(review): `os` appears unused in this file — confirm before removing
import sys
from pathlib import Path
import pickle
# Make project-local modules importable when this script is run directly.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def analyze_rag_content():
    """Inspect the pickled embedding chunks produced by setup_rag.py.

    Loads models/embeddings.pkl, prints a sample of the stored chunks,
    page/size statistics, and a list of suggested queries based on the
    content.

    Returns:
        bool: True on success; False if the embeddings file is missing
        or an error occurred during analysis.
    """
    try:
        print("📊 Analyzing RAG Content")
        print("=" * 50)

        # Load the embeddings to see the actual content.
        embeddings_file = Path("models/embeddings.pkl")
        if not embeddings_file.exists():
            print("❌ No embeddings found. Run setup_rag.py first.")
            return False

        # NOTE: pickle is only safe because this file is produced by our
        # own setup_rag.py — never load pickles from untrusted sources.
        with open(embeddings_file, 'rb') as f:
            embedded_chunks = pickle.load(f)

        print(f"📚 Total chunks: {len(embedded_chunks)}")

        # Show the first 5 chunks to understand content.
        # (Original sliced [:10] but broke at i == 4 — slice directly instead.)
        print("\n🔍 Sample chunks from your PDF:")
        print("-" * 40)
        for i, embedded_chunk in enumerate(embedded_chunks[:5]):
            # Each element is a dict wrapping the chunk metadata/content.
            chunk_data = embedded_chunk['chunk_data']
            content = chunk_data.get('content', 'No content')
            page_num = chunk_data.get('page_number', 'Unknown')
            source_file = chunk_data.get('source_file', 'Unknown')
            print(f"\n📄 Chunk {i+1} (Page {page_num}):")
            print(f"Source: {source_file}")
            print(f"Content: {content[:300]}...")

        # Aggregate simple statistics over all chunks.
        print("\n📈 Content Statistics:")
        print(f"- Total chunks: {len(embedded_chunks)}")
        pages = set()
        chunk_sizes = []
        for chunk in embedded_chunks:
            chunk_data = chunk['chunk_data']
            pages.add(chunk_data.get('page_number', 0))
            chunk_sizes.append(len(chunk_data.get('content', '')))
        print(f"- Pages covered: {sorted(pages)}")
        if pages:
            print(f"- Page range: {min(pages)} to {max(pages)}")
        if chunk_sizes:
            print(f"- Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
            print(f"- Min chunk size: {min(chunk_sizes)} characters")
            print(f"- Max chunk size: {max(chunk_sizes)} characters")

        # Extract some text from the first chunks to help the user pick queries.
        print("\n🔑 Suggested queries based on content:")
        combined_content = " ".join(
            chunk['chunk_data'].get('content', '') for chunk in embedded_chunks[:20]
        )
        sample_text = combined_content[:1000]  # First 1000 characters
        if sample_text:
            print(f"- Sample text: {sample_text[:300]}...")

        # Canned query suggestions (the indexed PDF is Arabic content about "علي"/Ali).
        print("\n💡 Try these queries based on your content:")
        print("- 'من هو علي؟' (Who is Ali?)")
        print("- 'صف علي' (Describe Ali)")
        print("- 'ما هي صفات علي؟' (What are Ali's characteristics?)")
        print("- 'حدثني عن الطفل' (Tell me about the child)")
        print("- 'ما لون عيني علي؟' (What color are Ali's eyes?)")
        print("- Or search for any specific words you see in the sample text above")

        return True

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"❌ Error analyzing content: {e}")
        return False
# Script entry point: run the analysis when executed directly.
if __name__ == "__main__":
    analyze_rag_content()