-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
192 lines (165 loc) · 5.6 KB
/
preprocessing.py
File metadata and controls
192 lines (165 loc) · 5.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# ------------ Imports ------------
# Standard library
import re
from pathlib import Path
# Serialization
import joblib
# Data processing
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
# ------------ Load Spacy Model ------------
# Load the large German spacy pipeline once at import time. spacy raises
# OSError when the model package is not installed, so on the first run we
# download it via the spacy CLI and retry the load.
try:
    nlp = spacy.load("de_core_news_lg")  # loads german language model
except OSError:
    from spacy.cli import download  # download model if not present
    download("de_core_news_lg")
    nlp = spacy.load("de_core_news_lg")
# ------------ Preprocessing Pipeline ------------
def data_handling(file: str) -> pd.DataFrame:
    """
    Load a CSV of job descriptions and add a cleaned text column.

    in: file - path to a semicolon-separated csv with a 'description' column
    out: DataFrame with an added 'description_clean' column (lowercased,
         links removed, umlauts transliterated, stopwords filtered)
    """
    # open csv file
    df_data: pd.DataFrame = pd.read_csv(file, sep=";")
    # new column for cleaned description + convert to lowercase.
    # fillna("")/astype(str) guards against missing descriptions: NaN would
    # otherwise flow into the regex/spacy steps below and crash them.
    df_data["description_clean"] = (
        df_data["description"].fillna("").astype(str).str.lower()
    )
    # whitelist of German pronouns/possessives to keep even though spacy
    # classifies them as stopwords
    whitelist: set[str] = {
        "du",
        "er",
        "sie",
        "es",
        "wir",
        "ihr",
        "mich",
        "dich",
        "ihn",
        "uns",
        "euch",
        "dir",
        "ihm",
        "ihnen",
        "dein",
        "deine",
        "deiner",
        "deines",
        "deinem",
        "deinen",
        "sein",
        "seine",
        "seiner",
        "seines",
        "seinem",
        "seinen",
        "ihre",
        "ihrer",
        "ihres",
        "ihrem",
        "ihren",
        "unser",
        "unsere",
        "unserer",
        "unseres",
        "unserem",
        "unseren",
        "euer",
        "eure",
        "eurer",
        "eures",
        "eurem",
        "euren",
    }

    def remove_stopwords(description: str) -> str:
        """
        in: description text
        out: description text without stopwords (whitelisted pronouns kept)
        """
        doc = nlp(description)  # converts text to spacy-doc object with tokens
        cleaned_tokens: list[str] = [
            token.text
            for token in doc  # for every token in doc
            # keep token only if it's not a stopword or is whitelisted
            if not token.is_stop or token.text in whitelist
        ]
        return " ".join(cleaned_tokens)  # joins tokens back to string

    def remove_hyperlinks(description: str) -> str:
        """
        in: description text
        out: description text without hyperlinks
        """
        # regex for link structure, sub with empty string
        return re.sub(r"http[s]?://\S+", "", description)

    def remove_special_chars(description: str) -> str:
        """
        in: description text
        out: description text with umlauts transliterated and all characters
             other than a-z, whitespace and '@' removed
        """
        # map umlauts to their ASCII replacements
        umlaut_map: dict[str, str] = {
            "ä": "ae",
            "ö": "oe",
            "ü": "ue",
            "ß": "ss",
        }
        # replace all umlauts before the character filter strips them
        for umlaut, replacement in umlaut_map.items():
            description = description.replace(umlaut, replacement)
        description = re.sub(r",(?=\S)", ", ", description)  # space after commas
        description = re.sub(r"\.(?=\S)", ". ", description)  # space after dots
        # remove every character except a-z, whitespace and '@'
        # (NOTE: this also drops digits, not just punctuation)
        description = re.sub(r"[^a-z\s@]", "", description)
        # collapse runs of whitespace into single spaces
        description = re.sub(r"\s+", " ", description)
        return description.strip()  # remove leading/trailing spaces

    # apply preprocessing functions; links first so their characters never
    # leak into later steps, stopword removal last on the normalized text
    df_data["description_clean"] = (
        df_data["description_clean"]
        .apply(remove_hyperlinks)
        .apply(remove_special_chars)
        .apply(remove_stopwords)
    )
    return df_data
# ------------ TF-IDF Feature Engineering ------------
def tfidf_vectorizer(df: pd.DataFrame, filepath: str):
    """
    Fit a TF-IDF vectorizer on the cleaned descriptions and persist it.

    in: df - cleaned DataFrame (must contain 'description_clean')
        filepath - directory in which to save the fitted vectorizer
    out: X_train_tfidf - TF-IDF transformed train features (sparse matrix)
    """
    vectorizer: TfidfVectorizer = TfidfVectorizer(
        max_features=10000,  # limit feature space dimensionality
        min_df=1,  # minimal document frequency for a token to be included (could be increased to avoid overfitting)
        sublinear_tf=True,  # logarithmic tf scaling for better weighting
        norm="l2",  # L2 normalization
        stop_words=None,  # stopwords already handled during preprocessing
    )
    X_train_tfidf = vectorizer.fit_transform(
        df["description_clean"]
    )  # fit and transform train data
    print("Train shape:", X_train_tfidf.shape)  # print dimensions of tf-idf matrix
    # join directory and filename with a proper separator (plain string
    # concatenation produced e.g. './modeltfidf_vectorizer.pkl') and make
    # sure the target directory exists before dumping
    out_dir = Path(filepath)
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(vectorizer, out_dir / "tfidf_vectorizer.pkl")  # serialize vectorizer
    return X_train_tfidf
# ------------ Data to import for features ------------
def get_processed_dfs(csv_path: str, vec_path: str):
    """
    Convenience wrapper: preprocess the csv and build TF-IDF features.

    in: csv_path - path to the raw csv file
        vec_path - path where the fitted vectorizer is saved
    out: tuple of (preprocessed DataFrame, TF-IDF sparse matrix)
    """
    cleaned_df: pd.DataFrame = data_handling(csv_path)
    tfidf_matrix = tfidf_vectorizer(cleaned_df, vec_path)
    return cleaned_df, tfidf_matrix
# ------------ Test ------------
# Smoke test: preprocess the sample csv, fit the vectorizer, and print the
# results. Paths are relative to this file's directory — TODO confirm the
# expected working directory when running this module directly.
if __name__ == "__main__":
    df_test, X_train_tfidf = get_processed_dfs("../tar.csv", "./model")
    print(df_test.head())
    print(X_train_tfidf)