"""benchmark.py: time FlashText keyword extraction over a randomly generated corpus."""
import time
import random
import string
import re
from flashtext import KeywordProcessor
def generate_random_corpus(num_words: int = 100000) -> str:
    """Build a space-separated string of random ASCII-letter words.

    Each word is 3 to 10 characters long (inclusive), drawn with replacement
    from ``string.ascii_letters`` using the module-level ``random`` generator,
    so seeding ``random`` beforehand makes the result reproducible.

    Args:
        num_words: Number of words to generate. Zero or a negative value
            yields an empty string.

    Returns:
        The generated words joined by single spaces.
    """
    alphabet = string.ascii_letters
    # For every word the length is drawn first (randint) and the letters
    # second (choices); that keeps the sequence of random draws stable.
    return ' '.join(
        ''.join(random.choices(alphabet, k=random.randint(3, 10)))
        for _ in range(num_words)
    )
def _time_call(func, *args):
    """Return the elapsed seconds for a single call of ``func(*args)``."""
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump if the system clock changes.
    started = time.perf_counter()
    func(*args)
    return time.perf_counter() - started


def benchmark(
    corpus_words: int = 500000,
    num_keywords: int = 1000,
    *,
    include_regex: bool = False,
) -> None:
    """Time FlashText keyword extraction on a random corpus and print results.

    A random corpus and a random keyword list are generated, then
    ``KeywordProcessor.extract_keywords`` is timed once in case-insensitive
    and once in case-sensitive mode. Only the extraction call itself is
    timed; building the processors (and compiling the regex) is excluded.

    Args:
        corpus_words: Number of random words in the corpus.
        num_keywords: Number of random keywords (4 to 8 ASCII letters each)
            to search for. Duplicates are possible and are not removed.
        include_regex: When true, also time a compiled, case-insensitive
            regex alternation over the same keywords and print the speedup
            of the case-insensitive FlashText run relative to it. Off by
            default.
    """
    # Setup
    print("Generating corpus...")
    corpus = generate_random_corpus(corpus_words)

    possible_chars = string.ascii_letters
    keywords = [
        ''.join(random.choices(possible_chars, k=random.randint(4, 8)))
        for _ in range(num_keywords)
    ]

    print(f"Corpus length: {len(corpus)} chars")
    print(f"Keywords count: {len(keywords)}")

    # 1. FlashText, case-insensitive matching
    kp = KeywordProcessor(case_sensitive=False)
    kp.add_keywords_from_list(keywords)
    flashtext_time = _time_call(kp.extract_keywords, corpus)
    print(f"FlashText (Case-Insensitive): {flashtext_time:.4f} seconds")

    # 2. FlashText, case-sensitive matching
    kp_strict = KeywordProcessor(case_sensitive=True)
    kp_strict.add_keywords_from_list(keywords)
    flashtext_strict_time = _time_call(kp_strict.extract_keywords, corpus)
    print(f"FlashText (Case-Sensitive): {flashtext_strict_time:.4f} seconds")

    # 3. Regex (optional baseline comparison)
    if include_regex:
        # One alternation over all escaped keywords, bounded by \b on both
        # sides; compiled before timing so only the scan is measured.
        escaped_keywords = [re.escape(k) for k in keywords]
        pattern_str = r'\b(' + '|'.join(escaped_keywords) + r')\b'
        pattern = re.compile(pattern_str, re.IGNORECASE)
        regex_time = _time_call(pattern.findall, corpus)
        print(f"Regex (Compiled): {regex_time:.4f} seconds")
        # Guard against a zero denominator on very small inputs.
        if flashtext_time > 0:
            print(f"Speedup vs Regex: {regex_time / flashtext_time:.2f}x")
# Run the benchmark only when this file is executed as a script, so that
# importing the module does not trigger the benchmark run.
if __name__ == "__main__":
    benchmark()