SkillHuntingClassification/eda.py at main · gokcegok/SkillHuntingClassification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def check_df(dataframe, n_head=10, n_tail=10):
    """
    Print a quick overview of the given dataframe.

    The report is printed section by section, in this order: shape,
    column data types, first rows, last rows, null counts per column and
    descriptive statistics at the 0/5/50/95/99/100 percent quantiles.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    n_head: integer
            number of leading rows shown in the "Head" section

    n_tail: integer
            number of trailing rows shown in the "Tail" section

    Returns
    -------
    None
    """

    percentiles = [0, 0.05, 0.50, 0.95, 0.99, 1]

    # Each section is (title, producer). Producers are evaluated lazily so
    # every header is printed right before its content is computed.
    sections = (
        ("Shape", lambda: dataframe.shape),
        ("Data Types", lambda: dataframe.dtypes),
        ("Head", lambda: dataframe.head(n_head)),
        ("Tail", lambda: dataframe.tail(n_tail)),
        ("NA", lambda: dataframe.isnull().sum()),
        ("Percentages", lambda: dataframe.describe(percentiles).T),
    )

    for title, produce in sections:
        print(f"\n------------- {title} -------------\n")
        print(produce())


def cat_summary(dataframe, col_name, plot=False):
    """
    Print the class frequencies of a categorical variable together with
    the share (in percent of all rows) of each class.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    col_name: string
              the name of the categorical variable
    plot: boolean
          If True, additionally draw a count plot of the variable
          and block until the plot window is closed.

    Returns
    -------
    None
    """

    class_counts = dataframe[col_name].value_counts()
    # Ratio is relative to the total row count, so missing values lower the sum.
    class_ratios = 100 * class_counts / len(dataframe)
    summary = pd.DataFrame({col_name: class_counts, "Ratio": class_ratios})
    print(summary, "\n")

    if not plot:
        return

    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)


def num_summary(dataframe, num_col, plot=False):
    """
    Print descriptive statistics of a numeric variable, including the
    1, 5, 25, 50, 75, 95 and 99 percent quantiles.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    num_col: string
             the name of the numeric column
    plot: boolean
          If True, additionally draw a histogram of the variable
          and block until the plot window is closed.

    Returns
    -------
    None
    """

    column = dataframe[num_col]
    report = column.describe([0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99])
    print(report.T)

    if not plot:
        return

    column.hist()
    plt.xlabel(num_col)
    plt.ylabel("freq")
    plt.title("hist of " + num_col)
    plt.show(block=True)


def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the columns of a dataframe into categorical, numeric and
    categorical-looking cardinal variables, and print a brief report.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    cat_th: int, float
            a numeric column with fewer than `cat_th` unique values is
            treated as categorical
    car_th: int, float
            a text/category column with more than `car_th` unique values
            is treated as cardinal

    Return
    ------
    cat_cols: list
        list of categorical variables
    num_cols: list
        list of numeric variables
    cat_but_car: list
        list of categorical looking cardinal variables

    Notes
    -----
    The three lists never overlap. Their lengths add up to the number of
    columns only when every column is text/category/boolean or numeric;
    columns of any other dtype (e.g. datetime) appear in none of them.
    Numeric looking categorical variables are part of cat_cols.
    """

    # dtype names treated as categorical. "string"/"str" are pandas' dedicated
    # text dtypes and "boolean" is the nullable boolean dtype.
    cat_dtype_names = {"category", "object", "bool", "boolean", "string", "str"}
    car_dtype_names = {"category", "object", "string", "str"}

    columns = list(dataframe.columns)
    dtype_names = {col: str(dataframe[col].dtype) for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # Comparing a dtype against the strings "int"/"float" only matches the
    # platform default integer and float64, so int8/int32/float32/unsigned
    # and nullable numeric columns were missed. Ask pandas instead, and keep
    # booleans out because they are handled as categorical.
    numeric = {
        col for col in columns
        if pd.api.types.is_numeric_dtype(dataframe[col].dtype)
        and not pd.api.types.is_bool_dtype(dataframe[col].dtype)
    }

    num_but_cat = [col for col in columns
                   if col in numeric and n_unique[col] < cat_th]

    cat_but_car = [col for col in columns
                   if dtype_names[col] in car_dtype_names and n_unique[col] > car_th]

    cat_cols = [col for col in columns if dtype_names[col] in cat_dtype_names]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in columns if col in numeric and col not in cat_cols]

    print("Observations: ", dataframe.shape[0])
    print("Variables: ", dataframe.shape[1])
    print("Categorical Columns: ", len(cat_cols))
    print("Numeric columns: : ", len(num_cols))
    print("Cardinal Columns: ", len(cat_but_car))
    print("Num but categorical cols ", len(num_but_cat))

    return cat_cols, num_cols, cat_but_car


def target_summary_with_categorical(dataframe, target, cat_col):
    """
    Print the mean of the target variable for every class of the
    given categorical variable.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    target: str
            the name of the target/dependent variable
    cat_col: str
            the name of the categorical variable to group by

    Returns
    -------
    None
    """

    class_means = dataframe.groupby(cat_col)[target].mean()
    summary = pd.DataFrame({"TARGET MEAN": class_means})
    print(summary)


def target_summary_with_numeric(dataframe, target, num_col):
    """
    Print the mean of the given numeric variable for every class of
    the target variable.

    Parameters
    ----------
    dataframe: pandas.DataFrame

    target: str
            the name of the target/dependent variable to group by
    num_col: str
            the name of the numeric variable that is averaged

    Returns
    -------
    None
    """

    per_target = dataframe.groupby(target)
    mean_table = per_target.agg({num_col: "mean"})
    print(pd.DataFrame(mean_table))


def high_correlated_cols(dataframe, plot=False, corr_th=0.9):
    """
    Return the columns that are highly correlated with an earlier column.

    The absolute pairwise correlation of the numeric (and boolean) columns
    is computed. A column is reported when its correlation with any column
    positioned before it exceeds `corr_th`, so of every highly correlated
    pair only the later column is listed.

    Parameters
    ----------
    dataframe: pandas.DataFrame
               non-numeric columns are ignored

    plot: boolean
          If True, additionally draw a heatmap of the (signed)
          correlation matrix.

    corr_th: int, float
             threshold above which an absolute correlation counts as high

    Returns
    -------
    drop_list: list
               the list of high correlated variables
    """

    # DataFrame.corr() raises on text/category columns in pandas >= 2.0, so
    # restrict the input to numeric and boolean columns up front.
    numeric_df = dataframe.select_dtypes(include=["number", "bool"])

    corr_ = numeric_df.corr()
    abs_corr = corr_.abs()

    # Keep only the strict upper triangle: each pair is inspected once and
    # the diagonal (self-correlation of 1) is excluded.
    upper_mask = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper_corr = abs_corr.where(upper_mask)

    drop_list = [col for col in upper_corr.columns
                 if (upper_corr[col] > corr_th).any()]

    if plot:

        sns.set(rc={"figure.figsize": (15, 25)})
        sns.heatmap(corr_, cmap="RdBu")
        plt.show()

    return drop_list


def get_outlier_thresholds(dataframe, column, th_quantile=0.95):
    """
    Compute and print the lower and upper limits beyond which values of a
    variable are treated as outliers.

    The limits are placed 1.5 times the inter-quantile range below the
    (1 - th_quantile) quantile and above the th_quantile quantile.

    Params
    ------
    dataframe: pd.DataFrame
    column: str
            column name
    th_quantile: float
                 upper quantile of the range; the lower one is its
                 complement 1 - th_quantile
    Returns
    -------
    lower_limit
    upper_limit
    """
    values = dataframe[column]
    low_quantile = values.quantile(1-th_quantile)
    high_quantile = values.quantile(th_quantile)

    # Tolerance added on both sides of the quantile range.
    margin = (high_quantile - low_quantile) * 1.5

    lower_limit = low_quantile - margin
    upper_limit = high_quantile + margin
    print("lower limit: ", lower_limit, ", upper limit: ", upper_limit)

    return lower_limit, upper_limit


def check_outlier(dataframe, col_name, th_quantile=0.95):
    """
    Report whether the given variable contains any outliers.

    A value is an outlier when it lies outside the limits returned by
    get_outlier_thresholds for the same column and quantile.

    Parameters
    ----------
    dataframe: pandas.DataFrame
    col_name: string
              column name
    th_quantile: float, optional
                 quantile forwarded to get_outlier_thresholds.
                 The default is 0.95.
    Return
    -------
    True/False
    """
    low_limit, up_limit = get_outlier_thresholds(dataframe, col_name, th_quantile)

    values = dataframe[col_name]
    outlier_mask = (values > up_limit) | (values < low_limit)

    # Test the mask itself. Testing the truthiness of the selected rows would
    # miss outlier rows whose cells are all 0/False/NaN.
    return bool(outlier_mask.any())


def any_outliers(dataframe, column, th_quantile=0.95):
    """
    Report whether the given variable contains any value outside the
    limits returned by get_outlier_thresholds.

    Params
    ------
    dataframe: pd.DataFrame
    column: str
            column name
    th_quantile: float
                 quantile forwarded to get_outlier_thresholds

    Returns
    -------
    bool
        True if at least one value is below the lower or above the
        upper limit, otherwise False
    """
    low_limit, up_limit = get_outlier_thresholds(dataframe, column, th_quantile)

    values = dataframe[column]
    outlier_mask = (values < low_limit) | (values > up_limit)

    # Test the mask itself. Testing the truthiness of the selected rows would
    # miss outlier rows whose cells are all 0/False/NaN.
    return bool(outlier_mask.any())


def replace_with_thresholds(dataframe, column, inplace=True, th_quantile=0.95):
    """
    Suppress outliers by replacing them with the nearest threshold.

    Values below the lower limit are set to the lower limit and values
    above the upper limit are set to the upper limit. The limits come
    from get_outlier_thresholds.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    column : string
        column name which is searched for outlier values
    inplace : boolean, optional
        If True the given dataframe is modified, otherwise a copy is
        modified and returned. The default is True.
    th_quantile : float, optional
        Quantile forwarded to get_outlier_thresholds. The default is 0.95.

    Returns
    -------
    dataframe_ : pandas.DataFrame or None
        outliers suppressed copy when inplace is False, otherwise None

    """

    lower_limit, upper_limit = get_outlier_thresholds(dataframe, column, th_quantile)

    # Work on the caller's frame directly or on a private copy.
    target = dataframe if inplace else dataframe.copy()

    too_low = target[column] < lower_limit
    target.loc[too_low, column] = lower_limit

    too_high = target[column] > upper_limit
    target.loc[too_high, column] = upper_limit

    return None if inplace else target


def missing_values_table(dataframe, na_name=False):
    """
    Print, for every column that has missing values, the number of
    missing values and their share of all rows in percent (rounded to
    two decimals), sorted in descending order.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    na_name : Boolean, optional
        If it is True, function returns name of null columns. The default is False.

    Returns
    -------
    na_columns : list
        list of null column names, in dataframe column order
        (only returned when na_name is True, otherwise None)

    """
    na_columns = [name for name in dataframe.columns
                  if dataframe[name].isnull().any()]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None


def plot_importance(model, features, num, save=False):
    """
    Draw a horizontal bar plot of the most important features of a model.

    Parameters
    ----------
    model : object
        fitted estimator exposing a `feature_importances_` attribute
    features : pandas.DataFrame
        its column names are used as feature labels, matched to the
        importances by position
    num : int
        number of top features (by importance) to plot
    save : boolean, optional
        If True, the figure is also written to 'importances.png' in the
        current working directory. The default is False.

    Returns
    -------
    None
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 2))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()

    # Save before showing: once plt.show() returns the figure may already be
    # released, and savefig would then write an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()