-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatautils.py
More file actions
91 lines (64 loc) · 2.76 KB
/
datautils.py
File metadata and controls
91 lines (64 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
def hold_out(df, train_proportion, seed=0):
    """Shuffle ``df`` and split it into a training part and a test part.

    Args:
        df: pandas DataFrame to split.
        train_proportion: fraction of the rows assigned to the training
            set; must satisfy ``0 <= train_proportion <= 1``.
        seed: random state used for the shuffle, so the split is
            reproducible.

    Returns:
        A ``(df_train, df_test)`` tuple. Both are slices of the shuffled
        frame, whose index was reset to ``0..len(df) - 1`` before slicing.

    Raises:
        ValueError: if ``train_proportion`` lies outside ``[0, 1]``.
    """
    # Without this check a negative proportion would silently slice from the
    # end of the frame and a value above 1 would silently empty the test set.
    if not 0 <= train_proportion <= 1:
        raise ValueError(
            f"train_proportion must be in [0, 1], got {train_proportion!r}"
        )

    shuffled_df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    # Truncate towards zero: the training set never gets more rows than asked.
    train_size = int(train_proportion * df.shape[0])
    df_train = shuffled_df.iloc[:train_size, :]
    df_test = shuffled_df.iloc[train_size:, :]
    return df_train, df_test
def kfold(df_design, fold_num=5, normalize=True, seed=0, return_params=False):
    """Build ``fold_num`` (training, validation) pairs for cross-validation.

    The design set is shuffled, cut into ``fold_num`` equally sized folds and,
    for each fold, paired with the concatenation of all the other folds.
    When ``normalize`` is true the feature columns ``x1`` .. ``x10`` of both
    frames in a pair are standardized with the mean and standard deviation of
    that pair's training set.

    Args:
        df_design: pandas DataFrame holding an ``ID`` column and the feature
            columns ``x1`` .. ``x10``.
        fold_num: number of folds; must divide the number of rows exactly.
        normalize: whether to standardize the feature columns.
        seed: random state used for the shuffle.
        return_params: when true, also return the per-fold standardization
            parameters. Defaults to false, which keeps the original return
            value.

    Returns:
        A list of ``(training_set, validation_fold)`` tuples. If
        ``return_params`` is true, a ``(fold_sets, df_normal_parameters)``
        tuple instead, where ``df_normal_parameters`` is indexed by feature
        name and has columns ``mean0, std0, mean1, std1, ...`` (it is empty
        when ``normalize`` is false).

    Raises:
        AssertionError: if the rows cannot be split into equally sized folds,
            or if an ``ID`` appears in both a training set and its
            validation fold.
    """
    shuffled_df = df_design.sample(frac=1, random_state=seed).reset_index(
        drop=True
    )

    n_rows = df_design.shape[0]
    assert n_rows % fold_num == 0  # all folds are of equal size
    fold_len = n_rows // fold_num

    # Copy every slice: the folds are written to during standardization, and
    # assigning into a bare slice of shuffled_df is chained assignment
    # (SettingWithCopyWarning, view/copy-dependent behaviour).
    fold_list = [
        shuffled_df.iloc[i * fold_len : (i + 1) * fold_len, :].copy()  # noqa
        for i in range(fold_num)
    ]

    fold_sets = []
    for fold in fold_list:
        other_folds = [f for f in fold_list if f is not fold]
        assert len(other_folds) == len(fold_list) - 1
        training_set = pd.concat(other_folds).reset_index(drop=True)
        # checks any data leak between training and validation rows
        assert set(training_set["ID"]).isdisjoint(fold["ID"])
        fold_sets.append((training_set, fold))

    feature_cols = [f"x{i}" for i in range(1, 11)]  # feature column names
    param_columns = []  # mean/std Series of every fold, in fold order

    if normalize:
        for idx, (df_training, df_val) in enumerate(fold_sets):
            # Statistics come from the training set only, so nothing from
            # the validation fold leaks into the scaling.
            curr_means = df_training[feature_cols].mean()
            curr_std = df_training[feature_cols].std()

            # standardize training set
            df_training.loc[:, feature_cols] = (
                df_training[feature_cols] - curr_means
            ) / curr_std
            # standardize validation set w.r.t. the training mean and std
            df_val.loc[:, feature_cols] = (
                df_val[feature_cols] - curr_means
            ) / curr_std

            # keep the parameters: they are needed again at assessment time
            param_columns.append(curr_means.rename(f"mean{idx}"))
            param_columns.append(curr_std.rename(f"std{idx}"))

    if return_params:
        # One concat at the end instead of growing a frame per iteration.
        if param_columns:
            df_normal_parameters = pd.concat(param_columns, axis=1)
        else:
            df_normal_parameters = pd.DataFrame()
        return fold_sets, df_normal_parameters
    return fold_sets
def obtain_features_targets(df):
    """Extract the feature matrix and the target matrix from ``df``.

    Columns ``x1`` .. ``x10`` are taken as features and ``y1`` .. ``y3`` as
    targets; both are returned as NumPy arrays via ``.values``.

    Returns:
        ``(X, y)`` with one row per row of ``df``: ``X`` has 10 columns and
        ``y`` has 3.

    Raises:
        AssertionError: if either array is not of dtype ``float64``.
    """
    feature_names = [f"x{i}" for i in range(1, 11)]
    target_names = [f"y{i}" for i in range(1, 4)]

    X, y = (df[names].values for names in (feature_names, target_names))

    # Both matrices are required to be float64.
    for matrix in (X, y):
        assert matrix.dtype == "float64"
    return X, y