GWAS_ELASTIC/load_dataset.py at master · laoconeth/GWAS_ELASTIC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pickle
import numpy as np
from sklearn.utils import shuffle
import os

def load_dataset(dic, path) :
    dataset = pickle.load(open(os.path.join(path, '[Allergy]_classification_data_seed_2_linearitypreserved_withnames_withAAAD.p'), 'rb'))

    # load genome data
    n_train = dataset['train']['size']
    X_train = dataset['train']['genotype']
    y_train = dataset['train']['label']
    IgE_train = dataset['train']['IgE']
    AA_train = dataset['train']['AA']
    AD_train = dataset['train']['AD']


    n_test = dataset['test']['size']
    X_test = dataset['test']['data']
    y_test = dataset['test']['label']
    IgE_test = dataset['test']['IgE']
    AA_test = dataset['test']['AA']
    AD_test = dataset['test']['AD']

    X_train = X_train[:, 66:243591]
    X_test = X_test[:, 66:243591]


    n_whole = n_train + n_test
    X_whole = np.concatenate((X_train, X_test), axis=0)
    y_whole = np.concatenate((y_train, y_test))
    AA_whole = np.concatenate((AA_train, AA_test))
    AD_whole = np.concatenate((AD_train, AD_test))
    X_whole, y_whole, AA_whole, AD_whole = shuffle(X_whole, y_whole, AA_whole, AD_whole, \
                                                   random_state = dic['random_seed'])

    # split to folds
    fold_boundaries = np.linspace(1, dic['n_folds'], dic['n_folds'] - 1, endpoint = False) / \
                      float(dic['n_folds'])
    splits_X = np.split(X_whole, [int(i * X_whole.shape[0]) for i in fold_boundaries])
    splits_y = np.split(y_whole, [int(i * y_whole.shape[0]) for i in fold_boundaries])
    splits_AA = np.split(AA_whole, [int(i * AA_whole.shape[0]) for i in fold_boundaries])
    splits_AD = np.split(AD_whole, [int(i * AD_whole.shape[0]) for i in fold_boundaries])
    splits_n = [a.shape[0] for a in splits_X]
    folds = [{"X": splits_X[i], "y": splits_y[i], "AA": splits_AA[i], "AD": splits_AD[i], "n": splits_n[i]} \
             for i in range(dic['n_folds'])]

    # get p values
    for fold_idx in range(dic['n_folds']) :
        savename = path + "cochran_result_dataset_seed={0}_fold={1}_autosome=yes".\
            format(dic['random_seed'], str(fold_idx))
        with np.load(savename + ".npz") as f :
            folds[fold_idx]['p_value'] = f['p_value']
            folds[fold_idx]['sorted_snp_idx'] = f['sorted_snp_idx']
        # filter p_values
        if dic['pv_or_n'] == 'n' :
            threshold_idx = dic['n_threshold']
        elif dic['pv_or_n'] == 'pv' :
            threshold_idx = (folds[fold_idx]['p_value'] < dic['n_threshold']).sum()
        folds[fold_idx]['filtered_snp_list'] = (folds[fold_idx]['sorted_snp_idx'])[0:threshold_idx]

    dataset = {}
    dataset['dim_X'] = dic['n_threshold']
    dataset['dim_y'] = 1
    dataset['dim_AA'] = 1
    dataset['dim_AD'] = 1
    dataset['samples'] = []
    for cur_fold in range(dic['n_folds']):
        # split to train, val, test
        X_train_fold = np.concatenate([folds[(cur_fold + j) % dic['n_folds']]["X"] for j in range(3)],
                                      axis=0)
        X_val_fold = folds[(cur_fold + (dic['n_folds'] + 3)) % dic['n_folds']]["X"]
        X_test_fold = folds[(cur_fold + (dic['n_folds'] + 4)) % dic['n_folds']]["X"]

        y_train_fold = np.concatenate([folds[(cur_fold + j) % dic['n_folds']]["y"] for j in range(3)])
        y_val_fold = folds[(cur_fold + (dic['n_folds'] + 3)) % dic['n_folds']]["y"]
        y_test_fold = folds[(cur_fold + (dic['n_folds'] + 4)) % dic['n_folds']]["y"]

        AA_train_fold = np.concatenate([folds[(cur_fold + j) % dic['n_folds']]["AA"] for j in range(3)])
        AA_val_fold = folds[(cur_fold + (dic['n_folds'] + 3)) % dic['n_folds']]["AA"]
        AA_test_fold = folds[(cur_fold + (dic['n_folds'] + 4)) % dic['n_folds']]["AA"]

        AD_train_fold = np.concatenate([folds[(cur_fold + j) % dic['n_folds']]["AD"] for j in range(3)])
        AD_val_fold = folds[(cur_fold + (dic['n_folds'] + 3)) % dic['n_folds']]["AD"]
        AD_test_fold = folds[(cur_fold + (dic['n_folds'] + 4)) % dic['n_folds']]["AD"]

        # filter features
        X_train_fold = X_train_fold[:, folds[cur_fold]['filtered_snp_list']]
        X_val_fold = X_val_fold[:, folds[cur_fold]['filtered_snp_list']]
        X_test_fold = X_test_fold[:, folds[cur_fold]['filtered_snp_list']]

        dataset['samples'].append({'train': {'X':X_train_fold, 'y':y_train_fold, 'AA':AA_train_fold, 'AD':AD_train_fold},
                                   'val': {'X':X_val_fold, 'y':y_val_fold, 'AA':AA_val_fold, 'AD':AD_val_fold},
                                   'test': {'X':X_test_fold, 'y':y_test_fold, 'AA':AA_test_fold, 'AD':AD_test_fold}})

    return dataset


if __name__ == "__main__":

    dic = {}

    dic['gpu'] = 0
    dic['random_seed'] = 2
    dic['n_epochs'] = 500
    dic['n_batches'] = 1
    dic['find_hyps'] = False

    dic['use_IgE'] = False
    dic['n_folds'] = 5
    dic['pv_or_n'] = 'n'
    dic['n_threshold'] = 1000

    dic['reg_group'] = 0.0015
    dic['g_norm_threshold'] = pow(10, -3)

    dic['lr'] = 0.001

    path_data = ''

    load_dataset(dic, path_data)