Customer_Sentiment_Analysis/load_data.py at main · atharvapathak/Customer_Sentiment_Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import re
from collections import Counter

import pandas as pd
from bs4 import BeautifulSoup as Soup
from bs4 import Tag
from sklearn.model_selection import StratifiedShuffleSplit

from tea import DATA_DIR, setup_logger

# Module-level logger, created through the project's own `setup_logger` helper.
logger = setup_logger(__name__)

# Names of the raw XML review files. `parse_reviews` appends one of them to
# DATA_DIR to build the path it reads.
# NOTE(review): judging by the names these are presumably the SemEval-2016
# ABSA laptop train / gold-test sets -- confirm against the data directory.
TRAIN_FILE = 'ABSA16_Laptops_Train_SB1_v2.xml'
TEST_FILE = 'EN_LAPT_SB1_TEST_.xml.gold'


def calculate_label_ratio(labels):
    """
    Print how often every distinct label occurs, most frequent first.

    One blank line is printed first, then one line per label holding the
    label, its number of instances and its share of all labels as a
    percentage rounded to two decimals.

    :param labels: iterable of hashable label values
    :return: None. the report is written to stdout
    """

    label_counts = Counter(labels)
    total_instances = sum(label_counts.values())

    print()
    for label, instances in label_counts.most_common():
        percentage = round(instances / total_instances * 100, 2)

        print('Label: {}, Instances: {}, Ratio: {}%'.format(label, instances, percentage))


def _csv_path_for(path):
    """Return ``path`` with only its final extension replaced by ``.csv``."""
    import os  # local import so this block does not depend on file-level imports

    return os.path.splitext(path)[0] + '.csv'


def parse_reviews(file_type='train',
                  save_data=True,
                  load_data=True):
    """
    Extract (text, polarity) pairs from one of the XML review files.

    The CSV cache lives next to the XML file, with the XML file's last
    extension swapped for ``.csv``. When ``load_data`` is set and that CSV
    exists, it is returned as-is and the XML is not parsed at all.

    Otherwise the XML is parsed: every sentence that carries at least one
    opinion contributes one row, using the polarity of its *first* opinion.
    Rows whose polarity is 'neutral' are dropped.

    :param file_type: str. the file type. Enum between 'train' and 'test'
    :param save_data: bool. whether the extracted data will be saved as CSV
    :param load_data: bool. whether a previously saved CSV should be loaded
    :return: pandas data-frame with 2 columns: text, polarity
    :raises AssertionError: if file_type is neither 'train' nor 'test'
    """
    assert file_type in ['train', 'test']

    file_name = TRAIN_FILE if file_type == 'train' else TEST_FILE

    path = "{}{}".format(DATA_DIR, file_name)
    # Only the trailing extension is swapped. A regex substitution over the
    # whole path would also rewrite e.g. an 'xml' inside the directory name.
    csv_path = _csv_path_for(path)

    if load_data:
        logger.info('Loading file: {}'.format(csv_path))
        try:
            return pd.read_csv(csv_path)
        except FileNotFoundError:
            logger.warning('File Not Found on Data Directory. Creating a new one from scratch')

    # Context manager so the file handle is closed even if parsing fails.
    with open(path) as handler:
        soup = Soup(handler.read(), "lxml")

    data = list()

    for review in soup.body.reviews:
        # Children also include bare strings (whitespace between tags).
        if not isinstance(review, Tag):
            continue

        sentences = review.sentences
        if sentences is None:
            # A review without a <sentences> element has nothing to extract.
            continue

        for sentence in sentences:
            if not isinstance(sentence, Tag):
                continue

            opinions = sentence.opinions
            if opinions is None:
                continue

            # Only the first opinion of a sentence is considered.
            opinion = opinions.opinion
            if opinion is None:
                continue

            data.append({'text': sentence.text.strip(),
                         'polarity': opinion.attrs.get('polarity')})

    # Explicit columns keep the 'polarity' filter below from failing with a
    # KeyError when no sentence produced a row.
    extracted_data = pd.DataFrame(data, columns=['text', 'polarity'])
    extracted_data = extracted_data[extracted_data['polarity'] != 'neutral']

    if save_data:
        logger.info('Saving extracted reviews metadata from file: {}'.format(file_type))
        extracted_data.to_csv(csv_path, encoding='utf-8', index=False)

    return extracted_data


def get_df_stratified_split_in_train_validation(data,
                                                label,
                                                validation_size,
                                                random_state=5):
    """
    Split a data-frame into a train and a validation part, stratified on
    the ``label`` column, and log the resulting sizes and label proportions.

    The caller's data-frame is not modified: the split works on a copy
    whose index has been reset to 0..n-1.

    :param data: pandas dataframe with the data
    :param label: str. name of the column used for the stratified split
    :param validation_size: float. the size of validation (passed to
        StratifiedShuffleSplit as ``test_size``)
    :param random_state: int. seed for reproducibility
    :return: dictionary with the keys 'x_train', 'x_validation', 'y_train',
        'y_validation' (features / labels of each part) and 'df_train',
        'df_validation' (features with the label column appended)
    """

    # Non in-place reset: the argument stays untouched, and no
    # chained-assignment warning is raised when `data` is a filtered slice.
    data = data.reset_index(drop=True)
    X = data.drop(label, axis=1)
    y = data[label]

    train_validation_sss = StratifiedShuffleSplit(test_size=validation_size,
                                                  random_state=random_state)

    # n_splits is left at the library default, so several shuffles are
    # generated. The last one is used, which keeps the result for a given
    # seed identical to earlier runs. Slicing happens once, after the loop.
    train_index, validation_index = list(train_validation_sss.split(X, y))[-1]

    x_train, x_val = X.iloc[train_index], X.iloc[validation_index]
    y_train, y_val = y.iloc[train_index], y.iloc[validation_index]

    def proportions(values):
        """Return (label, percentage) pairs sorted by ascending percentage."""
        counts = Counter(values)
        total = float(len(values))
        pairs = [(name, round(count / total * 100.0, 3)) for name, count in counts.items()]
        return sorted(pairs, key=lambda pair: pair[1])

    def data_size(part):
        """Return the fraction of all rows that ended up in ``part``."""
        return round(len(part) / len(data), 3)

    logger.info(60 * '*')

    logger.info('Train Dataset real Size: {}'.format(data_size(x_train)))

    logger.info('Validation Dataset Requested Size: {}'.format(validation_size))
    logger.info('Validation Dataset Real Size: {}'.format(data_size(x_val)))

    logger.info('Train Dataset shape: {}'.format(x_train.shape))
    logger.info('Train Dataset Label Proportion: ')
    for t in proportions(y_train):
        logger.info(t)

    logger.info('Validation Dataset shape: {}'.format(x_val.shape))
    logger.info('Validation Dataset Label Proportion: ')
    for t in proportions(y_val):
        logger.info(t)

    # Full frames: the label is re-attached as the last column.
    df_train = x_train.copy()
    df_train[label] = y_train

    df_validation = x_val.copy()
    df_validation[label] = y_val

    result = {
        'x_train': x_train,
        'x_validation': x_val,
        'y_train': y_train,
        'y_validation': y_val,
        'df_train': df_train,
        'df_validation': df_validation}

    return result


if __name__ == "__main__":
    # Manual smoke run: parse the training XML from scratch (no CSV is read
    # or written), show the first rows and print the polarity distribution.
    train_reviews = parse_reviews(file_type='train',
                                  save_data=False,
                                  load_data=False)
    print(train_reviews.head())

    calculate_label_ratio(train_reviews['polarity'])