-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocessing.py
More file actions
68 lines (53 loc) · 2.48 KB
/
Preprocessing.py
File metadata and controls
68 lines (53 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import joblib
import spacy
import pandas
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
csv_path = "cleaned_data/trustpilot_reviews_with_ratings.csv"

# Collapse the 1-5 star ratings into 3 sentiment classes:
#   0 = negative (1-2 stars)
#   1 = neutral  (3 stars)
#   2 = positive (4-5 stars)
_RATING_TO_CLASS = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}


def read_csv_file(filename):
    """Load the reviews CSV and return parallel arrays of texts and labels.

    Parameters
    ----------
    filename : str
        Path to a CSV with at least the columns ``review_body`` and
        ``review_rating`` (star ratings 1-5).

    Returns
    -------
    tuple(numpy.ndarray, numpy.ndarray)
        ``(review_body, review_rating)`` where the ratings have been mapped
        to the 3 sentiment classes described by ``_RATING_TO_CLASS``.
    """
    # pandas opens and closes the file itself; no manual open() needed.
    df = pandas.read_csv(filename, encoding="utf-8")
    review_body = df["review_body"].values
    review_rating = df["review_rating"].replace(_RATING_TO_CLASS).values
    return review_body, review_rating
class Preprocessing:
    """End-to-end training pipeline for German Trustpilot review sentiment.

    On construction it loads the reviews CSV, lemmatizes the texts with
    spaCy, builds a TF-IDF representation, trains a LinearSVC classifier
    and persists both model and vectorizer to the ``Model`` folder.
    """

    def __init__(self):
        # German spaCy model -- the reviews are German Trustpilot texts.
        self.nlp = spacy.load('de_core_news_sm')
        reviews_bodies, rating_values = read_csv_file(csv_path)
        clean_tokens = self.tokenize(reviews_bodies)
        self.tfidf_transform(clean_tokens, rating_values)

    def tokenize(self, review_body):
        """Lemmatize and lowercase each review.

        Stop words, punctuation and non-alphabetic tokens (numbers, emojis)
        are dropped via ``token.is_alpha``.

        Parameters
        ----------
        review_body : iterable of str
            Raw review texts.

        Returns
        -------
        list[list[str]]
            One list of cleaned lemma tokens per review.
        """
        clean_tokens = []
        for review in review_body:
            doc = self.nlp(review)
            tokens = [
                token.lemma_.lower()
                for token in doc
                if not token.is_stop and not token.is_punct and token.is_alpha
            ]
            clean_tokens.append(tokens)
        return clean_tokens

    def tfidf_transform(self, reviews_bodies, rating_values):
        """Vectorize tokenized reviews, train a LinearSVC and save the artifacts.

        Parameters
        ----------
        reviews_bodies : list[list[str]]
            Tokenized reviews as produced by :meth:`tokenize`.
        rating_values : array-like of int
            Sentiment class labels (0/1/2), one per review.

        Raises
        ------
        ValueError
            If the number of reviews and labels differ.
        """
        # BUG FIX: the original tested `len(a) and len(b)` (both non-empty)
        # yet printed that the counts were "equal" -- it never compared them.
        if len(reviews_bodies) != len(rating_values):
            raise ValueError(
                f"Mismatch: {len(reviews_bodies)} review bodies vs "
                f"{len(rating_values)} rating values"
            )
        print("The number of review bodies and rating values is equal")

        y = rating_values
        tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
        # Re-join the token lists into space-separated strings for the vectorizer.
        x_tfidf = tfidf_vectorizer.fit_transform(
            [' '.join(review) for review in reviews_bodies]
        )
        x_train, x_test, y_train, y_test = train_test_split(
            x_tfidf, y, test_size=0.2, random_state=42, stratify=y
        )

        svc_model = LinearSVC()
        svc_model.fit(x_train, y_train)

        # Predict once and reuse (the original called predict() twice).
        y_pred = svc_model.predict(x_test)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        # BUG FIX: format spec was `:2f` (minimum field width 2, default
        # 6-digit precision) instead of the intended `:.2f`.
        print(f"Genauigkeit: {accuracy:.2f}")
        print(classification_report(y_test, y_pred))

        # Save model and vectorizer in the 'Model' folder.
        # makedirs(..., exist_ok=True) avoids the exists()/mkdir() race.
        os.makedirs("Model", exist_ok=True)
        joblib.dump(svc_model, "Model/reviews_tfidf_model.pkl")
        joblib.dump(tfidf_vectorizer, "Model/reviews_vectorizer.pkl")
        print("Model gespeichert")