-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier1.py
More file actions
98 lines (74 loc) · 3.16 KB
/
classifier1.py
File metadata and controls
98 lines (74 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 4 12:23:33 2019
@author: lujine
"""
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import pickle
import utils
def create_word_features(words):
    """Build a bag-of-words feature dict from a sequence of tokens.

    Every token that does not appear in NLTK's English stopword list becomes
    a key mapped to ``True``; duplicate tokens collapse into a single key.

    Args:
        words: Iterable of token strings.

    Returns:
        dict mapping each retained token to ``True``.
    """
    # Build the stopword set once per call. Previously the stopword list was
    # re-requested from the corpus reader for every token and then searched
    # linearly, which dominated the cost of feature extraction.
    english_stopwords = set(stopwords.words("english"))
    return {word: True for word in words if word not in english_stopwords}
if __name__ == "__main__":
    # Train one Naive Bayes sentiment classifier for English (NLTK
    # twitter_samples corpus) and one for Arabic (tab-separated
    # 'arabic_training.txt'), print their held-out accuracy, and pickle both.

    # Deletes ':', '(' and ')' from each English tweet in a single pass
    # (presumably to strip emoticons such as ":)" / ":(" -- confirm).
    strip_table = str.maketrans("", "", ":()")

    # Running count of processed samples across all corpora; printed after
    # every sample as a progress indicator.
    i = 0

    # --- English samples -------------------------------------------------
    neg_reviews = []
    pos_reviews = []
    english_sources = (
        ('negative_tweets.json', "negative", neg_reviews),
        ('positive_tweets.json', "positive", pos_reviews),
    )
    for fileid, label, bucket in english_sources:
        for tweet in twitter_samples.strings(fileid):
            tweet = tweet.translate(strip_table)
            feature = create_word_features(
                utils.process({'text': tweet, 'lang': 'en'}))
            bucket.append((feature, label))
            print(i)
            i += 1

    # --- Arabic samples --------------------------------------------------
    # Each usable line is "<text>\t<tag>[...]" where the tag is POS or NEG;
    # lines with fewer than two fields or any other tag are skipped.
    arabic_neg = []
    arabic_pos = []
    arabic_buckets = {
        "POS": ('positive', arabic_pos),
        "NEG": ('negative', arabic_neg),
    }
    # The context manager guarantees the file handle is closed.
    with open('arabic_training.txt', 'r', encoding='utf-8') as training_file:
        lines = training_file.read().split('\n')
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2 or fields[1] not in arabic_buckets:
            continue
        label, bucket = arabic_buckets[fields[1]]
        feature = create_word_features(
            utils.process({'text': fields[0], 'lang': 'ar'}))
        bucket.append((feature, label))
        print(i)
        i += 1

    # --- Train / test split ----------------------------------------------
    # The test slices must not overlap the training slices. Previously the
    # test sets were taken from the *start* of each list, i.e. they were
    # subsets of the training data, so the reported accuracy was inflated.
    en_train_size = 4000
    en_test_size = 1000
    en_test_end = en_train_size + en_test_size
    train_set_en = neg_reviews[:en_train_size] + pos_reviews[:en_train_size]
    test_set_en = (neg_reviews[en_train_size:en_test_end]
                   + pos_reviews[en_train_size:en_test_end])

    # Arabic: first 80% of each class for training, remaining ~20% held out.
    neg_cut = int(len(arabic_neg) * 0.8)
    pos_cut = int(len(arabic_pos) * 0.8)
    train_set_ar = arabic_neg[:neg_cut] + arabic_pos[:pos_cut]
    test_set_ar = arabic_neg[neg_cut:] + arabic_pos[pos_cut:]

    # --- Train and evaluate ----------------------------------------------
    english_classifier = NaiveBayesClassifier.train(train_set_en)
    arabic_classifier = NaiveBayesClassifier.train(train_set_ar)

    accuracy_en = nltk.classify.util.accuracy(english_classifier, test_set_en)
    accuracy_ar = nltk.classify.util.accuracy(arabic_classifier, test_set_ar)
    print(accuracy_en)
    print(accuracy_ar)

    # --- Persist the trained models --------------------------------------
    with open('second_classifier_en.pkl', 'wb') as classifier_file:
        pickle.dump(english_classifier, classifier_file)
    with open('second_classifier_ar.pkl', 'wb') as classifier_file:
        pickle.dump(arabic_classifier, classifier_file)