-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathb2_preprocessing_function.py
More file actions
41 lines (27 loc) · 1.4 KB
/
b2_preprocessing_function.py
File metadata and controls
41 lines (27 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
import nltk
from nltk.corpus import stopwords
# English stop words from NLTK, stored as a set (duplicates collapsed).
# Built once, when this module is imported.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for the deployment environment.
stopwords_list = set(stopwords.words('english'))
# Matches one HTML-style tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    '''Strip HTML tags from ``text``.

    Every ``<...>`` span matched by ``TAG_RE`` is replaced with the empty
    string, so the text on either side of a tag ends up directly adjacent.
    A bare ``<>`` is not treated as a tag because the pattern requires at
    least one character between the angle brackets.
    '''
    stripped = re.sub(TAG_RE, '', text)
    return stripped
class CustomPreprocess():
    '''Cleans text data up, leaving only lowercase non-stopwords that are
    2 or more characters long and composed of the letters a-z only.'''

    def __init__(self):
        pass

    def preprocess_text(self, sen):
        '''Return a cleaned copy of ``sen``.

        Steps, in order: lowercase the text, strip HTML tags via
        ``remove_tags``, turn every non-letter character into a space,
        drop single-letter tokens, collapse whitespace runs into one
        space, and remove every word found in ``stopwords_list``.

        Args:
            sen: Raw input text (a ``str``); may contain HTML tags.

        Returns:
            The cleaned text. Surviving words are separated by single
            spaces; the result is not stripped, so a leading or trailing
            space may remain.
        '''
        sen = sen.lower()

        # Remove html tags
        sentence = remove_tags(sen)

        # Remove punctuations and numbers (each one becomes a space)
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)

        # Single character removal. Dropping the apostrophe from "mark's"
        # leaves a stray "s" behind; such leftovers are removed here.
        # Word boundaries are used instead of requiring whitespace on both
        # sides, so single letters at the very start or end of the text and
        # runs of consecutive single letters ("a b c") are removed as well.
        sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

        # Remove multiple spaces: the substitutions above leave runs of
        # whitespace behind, which are collapsed into one space each.
        sentence = re.sub(r'\s+', ' ', sentence)

        # Remove Stopwords, together with the whitespace that follows them.
        # Each word is escaped so it is always matched literally.
        pattern = re.compile(
            r'\b(' + r'|'.join(re.escape(word) for word in stopwords_list) + r')\b\s*'
        )
        sentence = pattern.sub('', sentence)

        return sentence