# OperatorActionPrediction/XGBoostModel.py at master · doas3140/OperatorActionPrediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
'''
    XGBoost model class

     - Main variables:
        self.model - trained xgboost model (the object returned by xgb.train), None until trained or loaded
        self.num_classes - number of outputs/classes

     - Main functions:
        self.train - trains self.model w/ selected parameters
        self.predict - predicts Y from X with self.model
        self.change_inputs - function that changes inputs (flattens X and wraps it in a DMatrix) before feeding to model
        self.load_model - loads xgboost.model.pickle file from directory (dirpath)
        self.save_model - saves xgboost.model.pickle to directory (dirpath)
        self.evaluation_function - returns function to evaluate when model improves. used for early stopping
'''

import os
import json
import keras
import pickle
import sklearn
import imblearn
import numpy as np
from tqdm import tqdm
import xgboost as xgb

from utils.model_utils import DataPlaceholder, get_class_weights, \
                              imblearn_sample
from utils.model_utils import MultipleMetricsEarlyStopping as EarlyStopping
from utils.metrics import normalized_confusion_matrix_and_identity_mse as confusion_mse
from utils.metrics import accuracy, weighted_accuracy, recall, precision

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold, StratifiedKFold


class XGBoostModel():
    ''' Multi-class classifier trained through xgboost's native API (xgb.train).

     - Main variables:
        self.model - object returned by xgb.train (None until train() w/ testing=False or load_model() is called)
        self.num_classes - number of outputs/classes
    '''

    def __init__(self, num_classes):
        '''
        @param num_classes - number of outputs/classes
        '''
        self.model = None
        self.num_classes = num_classes


    def change_inputs(self, X, Y=None):
        ''' change inputs before feeding to model
        @param X w/ shape (num_examples, num_error, num_sites) - where each number is integer of number of errors happened at that site
        @param Y w/ shape (num_examples,) - where each number is integer that represents one class
        return DMatrix of X,Y dataset or X dataset
        '''
        # xgboost expects a 2D feature matrix: flatten everything after the example axis
        X = X.reshape((len(X), -1))
        if Y is None:
            return xgb.DMatrix(X)
        return xgb.DMatrix(X, label=Y)


    def train( self, X, Y, max_depth=4, max_epochs=100, seed=42, verbose=0, # training vars
               use_imblearn=False, imblearn_class=None, # imblearn vars
               early_stopping_patience=10, test_split=0.2, # early stopping vars
               testing=False, kfold_function=KFold, kfold_splits=5 ): # testing vars (returns predictions from kfold)
        '''
        @param X w/ shape (num_examples, num_error, num_sites) - where each number is integer of number of errors happened at that site
        @param Y w/ shape (num_examples,) - where each number is integer that represents one class
        @param max_depth - maximum tree depth
        @param max_epochs - maximum number of boosting rounds (early stopping may end training sooner)
        @param seed - random seed used for the kfold split / train-validation split
        @param verbose - if 0: no print output, else: print output
        @param use_imblearn - boolean that decides to use resampling for training or not (from imblearn library)
        @param imblearn_class - sampler instance from imblearn library used for resampling
                                (ignored if use_imblearn = False). if None: SMOTE(random_state=42, ratio=1.0) is created on demand
        @param early_stopping_patience - when training doesn't improve for n epochs, then stop. where n is this variable number
        @param test_split - test split used for early stopping (only used if testing == False)
        @param testing - if True: only runs cross validation and returns predictions of every fold (self.model is not set)
        @param kfold_function - cross validation class from sklearn library (doesn't do anything if testing == False)
        @param kfold_splits - number of cross validation splits, used in kfold_function (doesn't do anything if testing == False)
        return if testing: list w/ one dict per fold w/ keys 'pred', 'labels', 'val_pred', 'val_labels'
               else: dict w/ keys 'loss', 'val_loss' (merror per round) and 'main_score' (validation confusion_mse per round)
        '''
        verb_eval = verbose != 0

        # The sampler is built lazily instead of in the signature: a default created at
        # definition time is shared between all calls and runs even when resampling is off.
        # NOTE(review): later imbalanced-learn releases appear to have renamed `ratio` to
        # `sampling_strategy` - confirm against the installed version.
        if use_imblearn and imblearn_class is None:
            imblearn_class = SMOTE(random_state=42, ratio=1.0)

        train_param = {
            'max_depth': max_depth,  # the maximum depth of each tree
            'eta': 0.3,  # the training step for each iteration
            'silent': 1,  # logging mode - quiet
            'objective': 'multi:softprob',  # error evaluation for multiclass training
            'num_class': self.num_classes, # the number of classes that exist in this datset
            # pinned explicitly because the 'merror' history is read back below; relying on
            # the library's default metric would make those keys depend on the xgboost version
            'eval_metric': 'merror',
        }

        fit_kwargs = dict( max_epochs=max_epochs, early_stopping_patience=early_stopping_patience,
                           verb_eval=verb_eval, use_imblearn=use_imblearn,
                           imblearn_class=imblearn_class, verbose=verbose )

        if testing:
            splitter = kfold_function(n_splits=kfold_splits, shuffle=True, random_state=seed)
            folds = splitter.split(X, Y)
            if verbose != 0:
                folds = tqdm(folds, total=kfold_splits, desc='kfold', leave=False, initial=0)
            histories = []
            for index_train, index_valid in folds:
                model, _, dmatrix_train, dmatrix_val, y_train = self._fit_booster(
                    train_param, X[ index_train ], Y[ index_train ],
                    X[ index_valid ], Y[ index_valid ], **fit_kwargs )
                y_val = Y[ index_valid ]
                # num_classes is passed so the one-hot width always matches the model's
                # num_class, even when a fold does not contain the highest class index
                histories.append({
                    'pred': model.predict( dmatrix_train ),
                    'labels': keras.utils.to_categorical( y_train, num_classes=self.num_classes ),
                    'val_pred': model.predict( dmatrix_val ),
                    'val_labels': keras.utils.to_categorical( y_val, num_classes=self.num_classes ),
                })
            return histories

        X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=test_split, random_state=seed)
        self.model, results, _, _, _ = self._fit_booster(
            train_param, X_train, y_train, X_val, y_val, **fit_kwargs )
        return { 'loss': results['train']['merror'], 'val_loss': results['val']['merror'],
                 'main_score': results['val']['confusion_mse'] }


    def _fit_booster( self, train_param, X_train, y_train, X_val, y_val,
                      max_epochs, early_stopping_patience, verb_eval,
                      use_imblearn, imblearn_class, verbose ):
        ''' optionally resamples the training split, then runs one xgb.train call w/ early stopping
        return (model, evals_result dict, train DMatrix, validation DMatrix, y_train actually used for training)
        '''
        if use_imblearn:
            # only the training split is resampled; validation data is left untouched
            X_train, y_train = imblearn_sample( X_train, y_train, imblearn_class, verbose=verbose )
        dmatrix_train = self.change_inputs(X_train, y_train)
        dmatrix_val = self.change_inputs(X_val, y_val)
        watchlist = [(dmatrix_train, 'train'), (dmatrix_val, 'val')]
        results = {}
        model = xgb.train( train_param, dmatrix_train, max_epochs,
                           watchlist, feval = self.evaluation_function(),
                           early_stopping_rounds = early_stopping_patience,
                           evals_result = results, verbose_eval = verb_eval )
        return model, results, dmatrix_train, dmatrix_val, y_train


    def evaluation_function(self):
        ''' returns function to evaluate when model improves. used for early stopping
        return function(y_pred, y_true) that returns [('confusion_mse', value)]
        '''
        num_classes = self.num_classes
        # both are invariant across calls, so build them once instead of every round
        class_ids = np.arange(num_classes)
        identity = np.identity(num_classes).flatten()

        def evaluation(y_pred, y_true):
            '''
            @param y_pred w/ shape (num_examples, num_classes) - probabilities of each class
            @param y_true - DMatrix whose labels (get_label()) hold the class index of each example
            return [('confusion_mse', mse)] - mse between row-normalized confusion matrix and identity
            '''
            # labels are already class indices; only the integer cast is needed
            true_ids = np.asarray(y_true.get_label()).astype(int)
            pred_ids = np.argmax(y_pred, axis=1)
            cm = confusion_matrix(true_ids, pred_ids, labels=class_ids) # shape=(num_classes,num_classes)
            row_totals = cm.sum(axis=1, keepdims=True)
            # normalize each row; rows of classes that never occur stay all-zero
            # (avoids the 0/0 -> nan round trip and its RuntimeWarning)
            normalized = np.divide( cm, row_totals,
                                    out=np.zeros(cm.shape, dtype=float),
                                    where=row_totals != 0 )
            mse = mean_squared_error(normalized.flatten(), identity)
            return [('confusion_mse', mse)]

        return evaluation


    def predict(self, X, argmax=True):
        ''' predicts Y from X with self.model
        @param X w/ shape (num_examples, num_error, num_sites)
        @param argmax - if True: return class indices, else: return raw output of self.model.predict
        return y_argmax w/ shape (num_examples,) - each number represents class index (if argmax == True)
        '''
        dmatrix = self.change_inputs(X)
        y_pred = self.model.predict(dmatrix) # (num_examples, num_outputs)
        if argmax:
            y_pred = np.argmax(y_pred, axis=-1) # (num_examples,)
        return y_pred


    def load_model(self, dirpath):
        ''' loads xgboost.model.pickle file from directory (dirpath)
        @param dirpath - path of directory, from where to load
        '''
        modelpath = os.path.join(dirpath, 'xgboost.model.pickle')
        # SECURITY: unpickling can execute arbitrary code - only load files from trusted sources
        with open(modelpath, "rb") as model_file:
            self.model = pickle.load(model_file)


    def save_model(self, dirpath):
        ''' saves xgboost.model.pickle to directory (dirpath)
        @param dirpath - path of directory, where to save
        '''
        modelpath = os.path.join(dirpath, 'xgboost.model.pickle')
        # context manager guarantees the file is flushed and closed even if pickling fails
        with open(modelpath, "wb") as model_file:
            pickle.dump(self.model, model_file)


#