-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
195 lines (162 loc) · 5.87 KB
/
utils.py
File metadata and controls
195 lines (162 loc) · 5.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from emoji import UNICODE_EMOJI
from nltk.sentiment.util import mark_negation
from unicodedata import category
from nltk.stem import PorterStemmer
import re
import farasa
from contractions import contractions
# Global tweet counter; used by process() only for progress printing.
i = 1
# Supplemental Arabic stopwords (incl. Egyptian dialect forms such as
# 'ازاي'/'ليه'/'عشان') merged with NLTK's Arabic list in remove_stopwords().
arabic_stopword = ['في','من','علي' ,'على', 'أن', 'الى','التي', 'عن', 'لا','ما', 'او',
                   'هذا', 'هذه', 'الذي', 'كان', 'مع', 'و', 'ذلك', 'في', 'الله', 'بين', 'كل', 'هو',
                   'كما', 'لم', 'بعد', 'ان', 'ازاى', 'ليه', 'ازاي', 'عشان', 'علشان' ]
# Shared stemmer instance used by stem_words() for English tokens.
porter = PorterStemmer()
def process(tweet):
    """Run the full preprocessing pipeline on one tweet dict.

    Expects ``tweet['text']`` and ``tweet['lang']`` ('en' or 'ar'), plus
    optional ``tweet['mentions']`` / ``tweet['urls']`` entity lists.
    Returns a list of processed tokens with extracted emojis appended.
    """
    global i
    # Progress trace for bulk runs.
    print(i)
    i += 1

    text = tweet['text'].lower()
    lang = tweet['lang']

    # All-or-nothing entity handling: if either key is missing, both are
    # treated as empty (mirrors the original KeyError behavior).
    if 'mentions' in tweet and 'urls' in tweet:
        mentions, urls = tweet['mentions'], tweet['urls']
    else:
        mentions, urls = [], []

    # Strip tracked mentions/urls and hashtag markers.
    text = filter_tweet(text, mentions, urls)
    # Pull emojis out of the text; they are re-appended at the end.
    text, emoji_text = separate_emojis(text)

    # Language-specific surface normalization.
    if lang == 'en':
        text = expand_contractions(text)
    elif lang == 'ar':
        text = remove_arabic_variants(text)

    # From here on, text is a list of tokens.
    text = remove_stopwords(text, lang)

    if lang == 'en':
        text = mark_negation(text)
    text = remove_punct(text)
    text = normalize_repititions(text, lang)

    # Language-specific lemmatization/stemming.
    if lang == 'ar':
        text = farasa.lemmatize(text)
    elif lang == 'en':
        text = stem_words(text)

    text += emoji_text
    print(text)
    return text
def remove_arabic_variants(text):
    """Normalize common Arabic orthographic variants in *text*.

    Maps hamza-carrying alefs to bare alef, alef maqsura to yeh, and
    teh marbuta to heh. Non-Arabic characters pass through unchanged.
    """
    # One C-level pass; no mapped source char is itself a target, so this
    # is equivalent to the sequential per-character replacements.
    variant_table = str.maketrans({'أ': 'ا', 'آ': 'ا', 'إ': 'ا', 'ى': 'ي', 'ة': 'ه'})
    return text.translate(variant_table)
def separate_emojis(text):
    """Extract emojis from *text*.

    Returns ``[cleaned_text, emoji_list]`` where ``cleaned_text`` is *text*
    with every known emoji removed (spaces inserted at its former position)
    and ``emoji_list`` contains one entry per emoji occurrence found,
    grouped by emoji type.
    """
    emoji_text = []
    for emoji in UNICODE_EMOJI.keys():
        search_from = 0
        while True:
            try:
                hit = text.index(emoji, search_from)
            except ValueError:
                break
            # Pad the emoji with spaces so it forms its own token.
            text = text[:hit] + ' ' + emoji + ' ' + text[hit + len(emoji):]
            emoji_text.append(emoji)
            # BUG FIX: was `hit + len(emoji) + 3`, which skipped the first
            # character of the remaining text and so never recorded an
            # immediately adjacent duplicate emoji. After inserting two
            # spaces, the remainder starts at hit + len(emoji) + 2.
            search_from = hit + len(emoji) + 2
    # Drop the emojis themselves; the inserted spaces remain and are
    # collapsed later by tokenization.
    for emoji in emoji_text:
        text = text.replace(emoji, "")
    return [text, emoji_text]
def filter_tweet(text, mentions, urls):
    """Remove mentions, urls and hashtag markers from tweet text.

    *mentions* and *urls* are entity dicts with an 'indices' [start, end)
    pair; their spans are blanked first, then any leftover @-mentions and
    http(s) urls are dropped token-wise and '#' prefixes/suffixes stripped.
    """
    # Overwrite each entity span with spaces so later indices stay valid
    # (the replacement preserves string length).
    for entity in mentions + urls:
        start, end = entity['indices']
        text = text[:start] + ' ' * (end - start) + text[end:]
    kept = []
    for token in text.split():
        # Remove extra urls and mentions that were not tracked as entities.
        if token.startswith(('http://', 'https://')) or token.startswith('@'):
            continue
        # Un-prefix hashtags.
        kept.append(token.strip('#'))
    return ' '.join(kept)
def expand_contractions(text):
    """Expand English contractions in *text* using the `contractions` map.

    Tokens found in the map are replaced by their expansion; everything
    else passes through. The result is re-joined and lowercased.
    """
    # Avoids shadowing the module-level `words` import from nltk.corpus.
    expanded = [contractions.get(token, token) for token in text.split()]
    return ' '.join(expanded).lower()
def remove_stopwords(text, lang):
    """Tokenize *text* and drop stopwords for *lang*.

    'en' uses NLTK's English list (keeping 'not'/'no' for negation);
    anything else uses NLTK's Arabic list plus the module-level
    `arabic_stopword` extras. Returns the filtered token list.
    """
    if lang == 'en':
        stop_list = stopwords.words('english') + ['rt']
        # Negations carry sentiment — keep them.
        stop_list.remove('not')
        stop_list.remove('no')
    else:
        stop_list = stopwords.words('arabic') + ['rt'] + arabic_stopword
        # Also match each stopword prefixed with the conjunction 'و' ("and"),
        # which attaches directly to the following word in Arabic.
        stop_list += ['و' + w if not w.startswith('و') else w for w in stop_list]
    stop_set = set(stop_list)
    return [token for token in word_tokenize(text) if token not in stop_set]
def remove_punct(text):
    """Strip punctuation/symbol/mark/control characters from each token.

    *text* is a list of tokens. '?' and '!' are preserved (sentiment
    signals). If stripping interior characters would change the token
    beyond a plain prefix/suffix trim (the stripped form is no longer a
    substring of the original), the original token is kept instead.
    Empty tokens are dropped from the result.
    """
    cleaned = []
    for token in text:
        stripped = ''.join(
            ch for ch in token
            if ch in ('?', '!') or not category(ch).startswith(('P', 'C', 'M', 'S'))
        )
        # Interior punctuation (e.g. "wo,rld", "don't") — keep the original.
        cleaned.append(token if token.find(stripped) == -1 else stripped)
    return [t for t in cleaned if t != '']
def normalize_repititions(text, lang):
    """Normalize character repetitions in every token, per language.

    Dispatches each token of the list *text* to the Arabic or English
    normalizer; any other language is returned unchanged.
    """
    if lang == 'ar':
        return [normalize_token_ar(token) for token in text]
    if lang == 'en':
        return [normalize_token_en(token) for token in text]
    return text
def normalize_token_ar(text):
    """Collapse duplicated Arabic characters in a single token.

    Removes one of each pair of adjacent identical alef ('ا') or comma
    characters, and one of each pair of adjacent yeh ('ي') characters
    unless the pair belongs to a token-final 'يين' suffix.
    """
    i = 1
    while i < len(text):
        # Doubled alef or doubled comma: drop the second occurrence and
        # step back so further duplicates at this position are re-checked.
        if (text[i] == 'ا' or text[i] == ',') and text[i] == text[i-1]:
            text = text[0:i] + text[i+1:len(text)]
            i -= 1
        # Doubled yeh, except when it ends the token as the 'يين' suffix.
        # NOTE(review): if the branch above fired at i == 1, i is now 0 and
        # text[i-1] wraps to the LAST character — likely unintended; confirm.
        if text[i] == 'ي' and text[i] == text[i-1] and (i < len(text)-2 or not text.endswith('يين')):
            text = text[0:i] + text[i+1:len(text)]
            i -= 1
        i += 1
    return text
def normalize_token_en(text):
    """Collapse runs of 3+ repeated word-characters in an English token.

    Each run is reduced to a single or doubled character, preferring
    whichever form appears in the NLTK word corpus; if neither does,
    vowels are doubled and consonants singled (heuristic).
    """
    # Raw string (the original used '(\w)\\1{2,}', a non-raw escape mix)
    # and a pre-compiled pattern hoisted out of the loop.
    pattern = re.compile(r'(\w)\1{2,}')
    vocab = None  # built lazily only if a repetition is actually found
    while True:
        match = pattern.search(text)
        if match is None:
            break
        if vocab is None:
            # PERF FIX: the original called words.words() (a very large
            # list) and linearly scanned it up to twice per iteration;
            # build a set once per call instead. Membership results are
            # identical, so behavior is unchanged.
            vocab = set(words.words())
        start, end = match.span()
        char = match.group(1)
        single = text[:start] + char + text[end:]
        double = text[:start] + char * 2 + text[end:]
        if double in vocab:
            text = double
        elif single in vocab:
            text = single
        else:
            # Neither form is a known word: vowels are more often
            # legitimately doubled in English.
            text = double if char in ['a', 'e', 'i', 'o', 'u'] else single
    return text
def stem_words(text):
    """Apply Porter stemming to every token in the list *text*."""
    # Uses the shared module-level PorterStemmer instance.
    return [porter.stem(token) for token in text]