import time
import packages
import random
import numpy as np
import tensorflow as tf
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier
random.seed(15)
np.random.seed(15)
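# TensorFlow keeps its own RNG, so the seeds above do not cover Keras weight
# initialisation; seeding it as well makes training runs reproducible.
tf.random.set_seed(15)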

def syntactic_dataset():
    '''
    Function to test the syntactic dataset used by one of our references.
    We need to replicate a similar number of features and labels for the semantic dataset.
    '''
    data = packages.Preprocess("C:/Users/Nicko/Documents/Cyber INT/Phishing-AutoEncoder/All.csv")
    combined = data.download()
    full_dataset = combined.copy()  # Work on a copy of the dataset.
    full_dataset = full_dataset.dropna()

    features = full_dataset.drop(['URL_Type_obf_Type', 'label'], axis=1)
    labels = packages.pd.DataFrame(full_dataset['label'])

    print("Positive samples:", (labels == 1).sum().sum(), "Percentage:", ((labels == 1).sum().sum()) / len(labels) * 100)
    print("Negative samples:", (labels == 0).sum().sum(), "Percentage:", ((labels == 0).sum().sum()) / len(labels) * 100)
    print("Total:", len(labels))

    features_scaled = data.mmx_scale(features)

    # Pass the feature and label dataframes to straify_shuffle_split to obtain training and testing sets for both.
    train_feature, train_label, test_feature, test_label = data.straify_shuffle_split(features_scaled, labels)
    # Separate our training data into a training and validation set.
    x_train, x_valid, y_train, y_valid = data.train_test_split(train_feature, train_label)

    hidden = 3
    epoch = 1
    # Hidden-layer widths are multiples of the feature count n, decreasing toward the
    # innermost layer, e.g. [3n, 2n, n] for 3 hidden layers.
    nodes = [x_train.shape[1] * (hidden - i) for i in range(hidden)]
    # active = "relu"
    active = tf.keras.layers.LeakyReLU(alpha=0.3)

    # Initialize the autoencoder model.
    autoencoder = packages.Autoencoder(input_shape=(x_train.shape[1],),  # Based on the number of features.
                                       num_hidden_layers=hidden,
                                       num_nodes=nodes,                  # List of how many neurons per layer.
                                       active=active)
    # Train the autoencoder model.
    history = autoencoder.train(x_train, x_valid, epochs=epoch, batch_size=256)

    # Use the trained encoder to produce compressed representations for the classifiers.
    encoder = autoencoder.get_encoder()
    x_train_pred = encoder.predict(x_train)
    x_valid_pred = encoder.predict(x_valid)

    classifier = packages.Classifier(x_train_pred, x_valid_pred, y_train, y_valid, test_feature, test_label)
    classifier.decision_tree(depth=5)
    classifier.logistic_regression(max_iter=1000)
    classifier.random_forest(estimators=100)
    classifier.support_vector_machine(kernel="linear", regularization=1)
    print(f"{epoch} epochs with {hidden} hidden layers for the {active} activation function. End.")

def semantic_dataset():
    '''
    Function to run our semantic dataset approach through our created AE and classifiers.
    Experiments still to run:
        - Try running with only one hidden layer
        - Try running with more epochs (100-500)
        - Try running with only LeakyReLU
    Always consider the possibility of overtraining; if results are similar, don't push it much further.
    '''
    data = packages.Preprocess("C:/Users/Nicko/Documents/Cyber INT/Phishing-AutoEncoder/test.csv")
    combined = data.download()
    full_dataset = combined.copy()  # Work on a copy of the dataset.

    # Randomly drop up to 6428 positive-class rows and up to 11297 negative-class rows to reduce the dataset size.
    full_dataset = full_dataset.drop(full_dataset[full_dataset['label'] == 1].sample(min((full_dataset['label'] == 1).sum(), 6428)).index)
    full_dataset = full_dataset.drop(full_dataset[full_dataset['label'] == 0].sample(min((full_dataset['label'] == 0).sum(), 11297)).index)

    features = packages.pd.DataFrame(full_dataset['url'])
    labels = packages.pd.DataFrame(full_dataset['label'])

    print("Positive samples:", (labels == 1).sum().sum(), "Percentage:", ((labels == 1).sum().sum()) / len(labels) * 100)
    print("Negative samples:", (labels == 0).sum().sum(), "Percentage:", ((labels == 0).sum().sum()) / len(labels) * 100)
    print("Total:", len(labels))

    # Embed the raw URL strings, then scale the resulting features.
    encoded_features = data.word_2_vec(features, "url")
    features_scaled = data.mmx_scale(encoded_features)

    # Pass the feature and label dataframes to straify_shuffle_split to obtain training and testing sets for both.
    train_feature, train_label, test_feature, test_label = data.straify_shuffle_split(features_scaled, labels)
    # Separate our training data into a training and validation set.
    x_train, x_valid, y_train, y_valid = data.train_test_split(train_feature, train_label)

    hidden = 1
    epoch = 500
    nodes = [x_train.shape[1] * (hidden - i) for i in range(hidden)]
    # active = "relu"
    active = tf.keras.layers.LeakyReLU(alpha=0.3)

    # Initialize the autoencoder model.
    autoencoder = packages.Autoencoder(input_shape=(x_train.shape[1],),  # Based on the number of features.
                                       num_hidden_layers=hidden,
                                       num_nodes=nodes,                  # List of how many neurons per layer.
                                       active=active)
    # Train the autoencoder model.
    history = autoencoder.train(x_train, x_valid, epochs=epoch, batch_size=256)

    # Use the trained encoder to produce compressed representations for the classifiers.
    encoder = autoencoder.get_encoder()
    x_train_pred = encoder.predict(x_train)
    x_valid_pred = encoder.predict(x_valid)

    classifier = packages.Classifier(x_train_pred, x_valid_pred, y_train, y_valid, test_feature, test_label)
    classifier.decision_tree(depth=5)
    classifier.logistic_regression(max_iter=1000)
    classifier.random_forest(estimators=100)
    classifier.support_vector_machine(kernel="linear", regularization=1)
    print(f"{epoch} epochs with {hidden} hidden layers for the {active} activation function. End.")

if __name__ == '__main__':
    start_time = time.time()
    syntactic_dataset()
    # semantic_dataset()
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:.2f} seconds")