-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathutils.py
More file actions
32 lines (26 loc) · 1.21 KB
/
utils.py
File metadata and controls
32 lines (26 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import nltk
from nltk.corpus import stopwords
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Optional directory for NLTK data; None when NLTK_DOWNLOAD_DIR is unset,
# in which case NLTK falls back to its default locations.
DOWNLOAD_DIR = os.getenv("NLTK_DOWNLOAD_DIR")

# Only register the directory when one is configured: a None entry is not a
# valid search path and can break later NLTK resource lookups.
if DOWNLOAD_DIR is not None:
    nltk.data.path.append(DOWNLOAD_DIR)

# Download stopwords only if they are not already available locally, so that
# importing this module does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', download_dir=DOWNLOAD_DIR)

# Get English stopwords
stop_words = set(stopwords.words('english'))
# Natural Semantic Metalanguage (NSM) primes, lower-cased. Multi-word
# exponents (e.g. "the same", "a long time") are kept as single entries.
# The grouping below roughly follows the usual NSM categories.
NSM_PRIMES = {
    # substantives and relational substantives
    "i", "you", "someone", "people", "something", "thing", "body",
    "kind", "part",
    # determiners
    "this", "the same", "other", "else", "another",
    # quantifiers
    "one", "two", "some", "all", "much", "many", "little", "few",
    # evaluators and descriptors
    "good", "bad", "big", "small",
    # mental predicates
    "think", "know", "want", "don't want", "feel", "see", "hear",
    # speech ("said" is an inflected form of "say")
    "say", "said", "words", "true",
    # actions, events, movement
    "do", "happen", "move",
    # existence and possession
    "there", "is", "be", "mine",
    # life and death
    "live", "die",
    # time
    "when", "time", "now", "before", "after",
    "a long time", "a short time", "for some time", "moment",
    # space
    "where", "place", "here", "above", "below", "far", "near",
    "side", "inside", "touch",
    # logical concepts
    "not", "maybe", "can", "because", "if",
    # intensifier, augmentor, similarity
    "very", "more", "like", "as", "way",
}
# English stopwords with every NSM prime removed.
STOP_WORDS = stop_words.difference(NSM_PRIMES)
# Punctuation and whitespace characters treated as legal: ' . , : ! ? "
# newline, tab, parentheses and slash (one set member per character).
LEGAL_PUNCTUATION = set("'.,:!?\"\n\t()/")