-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimple_sentiment_analysis.py
More file actions
110 lines (84 loc) · 3.49 KB
/
simple_sentiment_analysis.py
File metadata and controls
110 lines (84 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import spacy
import pandas as pd
import pickle
import string
import os
import os.path
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import datasets
from joblib import dump, load
from os import path
# NOTE:
## Not updated to latest version, as intermediate replaces this with better training.
## Please use that script instead.

# Small English spaCy pipeline; used only to tokenize/normalize user input.
nlp = spacy.load('en_core_web_sm')

# Directory containing this script; model artifacts are stored alongside it.
dir_path = os.path.dirname(os.path.realpath(__file__))

# This should be changed to the local directories of the training dataset.
# (UCI "Sentiment Labelled Sentences" dataset: tab-separated sentence/label files.)
data_loc_dict = {'yelp': 'C:\\Users\\carlp\\Desktop\\sentiment labelled sentences\\sentiment labelled sentences\\yelp_labelled.txt',
                 'amazon': 'C:\\Users\\carlp\\Desktop\\sentiment labelled sentences\\sentiment labelled sentences\\amazon_cells_labelled.txt',
                 'imdb': 'C:\\Users\\carlp\\Desktop\\sentiment labelled sentences\\sentiment labelled sentences\\imdb_labelled.txt'}

# os.path.join keeps the artifact paths portable instead of hard-coding '\\'.
clf_file_path = os.path.join(dir_path, 'simple_sentiment_analysis_clf.joblib')
vect_file_path = os.path.join(dir_path, 'simple_sentiment_analysis_vect.joblib')

# Module-level model state, populated by CheckModelExistence / CreateModel.
clf = None
vectorizer = None
def CheckModelExistence():
    """Load the persisted classifier and vectorizer if both exist on disk.

    Falls back to CreateModel() when either artifact is missing; on a
    successful load, hands control to the interactive prediction loop.
    """
    global clf
    global vectorizer
    # Guard clause: no classifier on disk means we must train from scratch.
    if not path.exists(clf_file_path):
        print('Model doesn\'t exist. Creating new model...')
        CreateModel(vectorizer, clf)
        return
    print('Model exists! Checking vectorizer...')
    # The classifier is useless without its matching vocabulary, so the
    # vectorizer must exist too before we can load.
    if not path.exists(vect_file_path):
        print('Vectorizer doesn\'t exist. Creating new model...')
        CreateModel(vectorizer, clf)
        return
    print('Vectorizer exists! Loading...')
    clf = load(clf_file_path)
    vectorizer = load(vect_file_path)
    GetUserInput(vectorizer, clf)
def CreateModel(vectorizer, clf):
    """Train a sentiment classifier from the labelled-sentence datasets.

    Reads each tab-separated file in data_loc_dict, trains one
    CountVectorizer + LogisticRegression pair per source (printing its
    held-out accuracy), persists the pair via joblib, then starts the
    interactive loop.

    NOTE(review): because the dump happens after the per-source loop,
    only the model fitted on the LAST source iterated is saved — the
    earlier fits are discarded. Presumably intentional for this demo
    script (see the deprecation NOTE at the top); confirm before reuse.
    """
    # Load every dataset into one DataFrame, tagging rows with their source.
    df_list = []
    for source, filepath in data_loc_dict.items():
        df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
        df['source'] = source  # Add another column filled with the source name
        df_list.append(df)
    df = pd.concat(df_list)
    sentences = []
    y = []
    # Train and evaluate separately per source; each iteration overwrites
    # the previous vectorizer/clf (see NOTE in the docstring).
    for source in df['source'].unique():
        df_source = df[df['source'] == source]
        sentences = df_source['sentence'].values
        y = df_source['label'].values
        # Fixed random_state keeps the train/test split reproducible.
        sentences_train, sentences_test, y_train, y_test = train_test_split(
            sentences, y, test_size=0.25, random_state=1000)
        # Bag-of-words features: fit the vocabulary on the training split only.
        vectorizer = CountVectorizer()
        vectorizer.fit(sentences_train)
        X_train = vectorizer.transform(sentences_train)
        X_test = vectorizer.transform(sentences_test)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('Accuracy for {} data: {:.4f}'.format(source, score))
    # Persist the (last-fitted) model pair, then reload it from disk so the
    # interactive loop runs against exactly what was saved.
    dump(clf, clf_file_path)
    dump(vectorizer, vect_file_path)
    clf = load(clf_file_path)
    vectorizer = load(vect_file_path)
    GetUserInput(vectorizer, clf)
def GetUserInput(vectorizer, clf):
    """Interactive loop: read a sentence from stdin and print its predicted sentiment.

    Runs until the process is interrupted; label 0 is reported as negative,
    anything else as positive.
    """
    while True:
        # Tokenize via spaCy, then vectorize the raw text for the classifier.
        doc = nlp(input())
        features = vectorizer.transform([doc.text])
        verdict = (
            'Negative Sentiment Detected :('
            if clf.predict(features[0:1]) == 0
            else 'Positive Sentiment Detected :)'
        )
        print('##########')
        print(verdict)
        print('##########')
CheckModelExistence()