-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
159 lines (115 loc) · 5 KB
/
load_data.py
File metadata and controls
159 lines (115 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import re
from collections import Counter
import pandas as pd
from bs4 import BeautifulSoup as Soup
from bs4 import Tag
from sklearn.model_selection import StratifiedShuffleSplit
from tea import DATA_DIR, setup_logger
logger = setup_logger(__name__)
TRAIN_FILE = 'ABSA16_Laptops_Train_SB1_v2.xml'
TEST_FILE = 'EN_LAPT_SB1_TEST_.xml.gold'
def calculate_label_ratio(labels):
"""
:param labels:
:return:
"""
x = Counter(labels)
sum_counts = sum(x.values())
print()
for t in x.most_common():
ratio = round(t[1] / sum_counts * 100, 2)
print('Label: {}, Instances: {}, Ratio: {}%'.format(t[0], t[1], ratio))
def parse_reviews(file_type='train',
save_data=True,
load_data=True):
"""
:param file_type: str. the file type. Enum between 'train' and 'test'
:param save_data: bool. whether the extracted data will be saved
:param load_data: bool. whether the extracted data should be loaded
:return: pandas data-frame with 2 columns: polarity, text
"""
assert file_type in ['train', 'test']
file = TRAIN_FILE if file_type == 'train' else TEST_FILE
path = "{}{}".format(DATA_DIR, file)
if load_data:
try:
x = path.split('.')[-1]
infile = re.sub(x, 'csv', path)
logger.info('Loading file: {}'.format(infile))
return pd.read_csv(infile)
except FileNotFoundError:
logger.warning('File Not Found on Data Directory. Creating a new one from scratch')
data = list()
handler = open(path).read()
soup = Soup(handler, "lxml")
reviews = soup.body.reviews
for body_child in reviews:
if isinstance(body_child, Tag):
for body_child_2 in body_child.sentences:
if isinstance(body_child_2, Tag):
if body_child_2.opinions:
opinion = body_child_2.opinions.opinion
# keeping only reviews that have a polarity
if opinion:
sentence = body_child_2.text.strip()
polarity = opinion.attrs.get('polarity')
data.append({'text': sentence, 'polarity': polarity})
extracted_data = pd.DataFrame(data)
extracted_data = extracted_data[extracted_data['polarity'] != 'neutral']
if save_data:
logger.info('Saving etracted reviews metadata from file: {}'.format(file_type))
x = path.split('.')[-1]
outfile = re.sub(x, 'csv', path)
extracted_data.to_csv(outfile, encoding='utf-8', index=False)
return extracted_data
def get_df_stratified_split_in_train_validation(data,
label,
validation_size,
random_state=5):
"""
:param data: pandas dataframe with the data
:param label: str. the label of the stratified split
:param validation_size: float. the size of validation
:param random_state: int. seed for reproducibility
:return: dictionary
"""
data.reset_index(drop=True, inplace=True)
X = data.drop(label, axis=1).copy()
y = data[label].copy()
train_validation_sss = StratifiedShuffleSplit(test_size=validation_size,
random_state=random_state)
x_train, x_val, y_train, y_val = None, None, None, None
for train_index, validation_index in train_validation_sss.split(X, y):
x_train, x_val = X.iloc[train_index], X.iloc[validation_index]
y_train, y_val = y.iloc[train_index], y.iloc[validation_index]
proportions = lambda x: sorted([(i, round(Counter(x)[i] / float(len(x)) * 100.0, 3)) for i in Counter(x)],
key=lambda x: x[1])
data_size = lambda x: round(len(x) / len(data), 3)
logger.info(60 * '*')
logger.info('Train Dataset real Size: {}'.format(round(len(x_train) / len(data), 3)))
logger.info('Validation Dataset Requested Size: {}'.format(validation_size))
logger.info('Validation Dataset Real Size: {}'.format(data_size(x_val)))
logger.info('Train Dataset shape: {}'.format(x_train.shape))
logger.info('Train Dataset Label Proportion: ')
for t in proportions(y_train):
logger.info(t)
logger.info('Validation Dataset shape: {}'.format(x_val.shape))
logger.info('Validation Dataset Label Proportion: ')
for t in proportions(y_val):
logger.info(t)
df_train = x_train.copy()
df_train[label] = y_train
df_validation = x_val.copy()
df_validation[label] = y_val
result = {
'x_train': x_train,
'x_validation': x_val,
'y_train': y_train,
'y_validation': y_val,
'df_train': df_train,
'df_validation': df_validation}
return result
if __name__ == "__main__":
train_data = parse_reviews(load_data=False, save_data=False, file_type='train')
print(train_data.head())
calculate_label_ratio(train_data['polarity'])