-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path final_demo.py
More file actions
executable file
·72 lines (59 loc) · 2.33 KB
/
final_demo.py
File metadata and controls
executable file
·72 lines (59 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
"""
Final demonstration that everything works perfectly.

Walks through the full Pashto text-processing workflow: normalization,
tokenization, stopword removal, and simple corpus statistics.
"""
from pashto_pipeline import TextProcessingPipeline, PashtoNormalizer, PashtoTokenizer
from pashto_pipeline.preprocessing.stopwords import StopwordsRemover

# Horizontal rule reused for every banner in the demo output.
SEPARATOR = "=" * 70

print(SEPARATOR)
print("🎉 PASHTO PROCESSING PIPELINE - FINAL DEMONSTRATION")
print(SEPARATOR)

# Sample Pashto texts to push through the pipeline.
texts = [
    "سلام دنیا! دا د پښتو متن پروسس کولو یوه بېلګه ده.",
    "زه په کابل کې اوسېږم او پښتو ژبه زده کوم.",
    "پښتو یوه ښکلې او تاریخي ژبه ده.",
]

print("\n📝 Original Texts:")
for idx, sample in enumerate(texts, 1):
    print(f" {idx}. {sample}")

# Assemble a two-step pipeline: normalize first, then tokenize.
print("\n🔧 Creating Processing Pipeline...")
pipeline = TextProcessingPipeline()
normalizer = PashtoNormalizer(
    normalize_whitespace=True,
    normalize_digits='western',
)
tokenizer = PashtoTokenizer(preserve_punctuation=True)
pipeline.add_step('normalize', normalizer.normalize)
pipeline.add_step('tokenize', tokenizer.tokenize)
print(f" ✓ Pipeline created with {len(pipeline.get_steps())} steps")

# Run every sample text through the pipeline in one batch call.
print("\n⚙️ Processing Texts...")
results = pipeline.process_batch(texts, verbose=False)

print("\n📊 Results:")
for idx, pair in enumerate(zip(texts, results), 1):
    original, tokens = pair
    print(f"\n Text {idx}:")
    print(f" Original: {original}")
    # Long token lists are truncated to the first five for readability.
    if len(tokens) > 5:
        print(f" Tokens ({len(tokens)}): {tokens[:5]}...")
    else:
        print(f" Tokens: {tokens}")

# Show stopword filtering on a small hand-picked token list.
print("\n🔍 Stopword Removal Demo:")
remover = StopwordsRemover()
sample_tokens = ['زه', 'په', 'ښار', 'کې', 'یم']
filtered = remover.remove(sample_tokens)
print(f" Before: {sample_tokens}")
print(f" After: {filtered}")

# Aggregate simple corpus-level statistics over the batch results.
print("\n📈 Statistics:")
total_tokens = sum(map(len, results))
print(f" • Total texts processed: {len(texts)}")
print(f" • Total tokens extracted: {total_tokens}")
print(f" • Average tokens per text: {total_tokens/len(texts):.1f}")

print("\n✅ All Components Working Perfectly!")
print(SEPARATOR)
print("\n🎓 Next Steps:")
print(" 1. Read the documentation in docs/")
print(" 2. Try examples/basic_usage.py")
print(" 3. Check QUICKSTART.md for more details")
print(" 4. Explore code/pashto_dataset/ for advanced features")
print("\n🚀 Happy Processing!")
print(SEPARATOR)