-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
159 lines (118 loc) · 5.49 KB
/
main.py
File metadata and controls
159 lines (118 loc) · 5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import math
import random
import Performance as perf
from NaiveBayes.NaiveBayes import NaiveBayesModel
from LogisticRegression.LogisticRegression import LogisticRegression
from KNearestNeighbor.KNearestNeighbor import KNearestNeighbor
K = 5
def importDataset(fileName):
    """Read a comma-separated dataset file into a list of rows.

    Each line becomes a list of string fields. The trailing newline is
    stripped before splitting so the last field of every row is clean
    (the original kept the "\n", leaving dirty header labels and class
    labels like "1\n" that only worked because int() tolerates
    surrounding whitespace).

    Args:
        fileName: path to the CSV file to load.

    Returns:
        list[list[str]]: one inner list of fields per line of the file.
    """
    with open(fileName) as file:
        return [line.rstrip("\n").split(",") for line in file]
def printPerformance(performanceTuple):
    """Print the four performance metrics in order.

    Args:
        performanceTuple: (accuracy, false positives, true positives,
            area under curve) as produced by perf.performance.
    """
    metricLabels = (
        "Accuracy: ",
        "False Positives: ",
        "True Positives: ",
        "Area Under Curve: ",
    )
    # print(label, value) keeps the original two-space separation
    # (trailing space in the label plus print's default sep).
    for label, value in zip(metricLabels, performanceTuple):
        print(label, value)
def main(fileName, trainingPct):
    """Train and evaluate three spam classifiers, then print averaged metrics.

    Runs Naive Bayes, Logistic Regression, and K Nearest Neighbor with
    K-fold cross validation on the training split, evaluates each fold's
    model on a held-out evaluation split, and prints the fold and
    eval-set averages for every model.

    Args:
        fileName: path to the CSV dataset. First row is the header;
            the last column of each data row is the 0/1 spam label.
        trainingPct: fraction (0..1) of the shuffled data used for training.
    """
    # Load from the requested file. (Bug fix: the path was hard-coded to
    # "spambase.csv", silently ignoring the fileName argument.)
    dataset = importDataset(fileName)
    # Initialize model classes with the header row of feature labels.
    NBAlgorithm = NaiveBayesModel(dataset[0])
    LRClass = LogisticRegression(dataset[0])
    KNNClass = KNearestNeighbor(dataset[0])
    # Drop the header and shuffle the remaining examples once.
    dataset = dataset[1:]
    random.shuffle(dataset)
    # Disjoint train/eval split. (Bug fix: the original sliced
    # dataset[:trainingSize + 1] so the row at index trainingSize
    # appeared in both the training and evaluation sets.)
    trainingSize = math.floor(trainingPct * len(dataset))
    trainingSet = dataset[:trainingSize]
    evalSet = dataset[trainingSize:]
    evalActualResults = [int(email[-1]) for email in evalSet]
    # Exactly K fold (start, end) boundaries. (Bug fix: the original
    # range(0, len - len//K, len//K) could yield only K-1 folds while
    # perf.average(..., K) still divided by K, skewing the averages.)
    foldSize = len(trainingSet) // K
    foldBounds = [(i * foldSize, (i + 1) * foldSize) for i in range(K)]

    allModelResults = []

    ### NAIVE BAYES ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        # Train on everything outside the fold, test on the fold itself.
        predictedResults, spamPrior, hamPrior, spamLikelihoods, hamLikelihoods = NBAlgorithm.naiveBayes(start, end, trainingSet)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        # Score this fold's model on the held-out evaluation set.
        result = NBAlgorithm.useModel(spamPrior, hamPrior, spamLikelihoods, hamLikelihoods, evalSet)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    ### LOGISTIC REGRESSION ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        predictedResults, model = LRClass.LRAlgorithm(start, end, trainingSet)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        result = LRClass.useModel(model, evalSet)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    ### K NEAREST NEIGHBOR ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        # KNN has no training phase: compare each fold email against
        # every training email outside the fold.
        compareEmails = trainingSet[:start] + trainingSet[end:]
        predictedResults = KNNClass.classifySet(trainingSet[start:end], compareEmails)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        result = KNNClass.classifySet(evalSet, compareEmails)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    # Report: labels are positionally paired with allModelResults.
    # (Bug fix: the middle model is LogisticRegression but was printed
    # as "Linear Regression", and several labels had a stray "::".)
    reportLabels = (
        "Naive Bayes Fold Average:",
        "Naive Bayes EvalSet Average:",
        "Logistic Regression Fold Average:",
        "Logistic Regression EvalSet Average:",
        "K Nearest Neighbor Fold Average:",
        "K Nearest Neighbor EvalSet Average:",
    )
    print()
    for label, modelResult in zip(reportLabels, allModelResults):
        print(label)
        printPerformance(modelResult)
        print()
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main("spambase.csv", 0.8)