-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
159 lines (118 loc) · 5.49 KB
/
main.py
File metadata and controls
159 lines (118 loc) · 5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import math
import random
import Performance as perf
from NaiveBayes.NaiveBayes import NaiveBayesModel
from LogisticRegression.LogisticRegression import LogisticRegression
from KNearestNeighbor.KNearestNeighbor import KNearestNeighbor
K = 5
def importDataset(fileName):
    """Read a comma-separated dataset file into a list of rows.

    Each line becomes a list of string fields. The trailing newline is
    stripped before splitting so the last field of every row is clean
    (the original kept the "\n", leaving dirty header labels and class
    labels like "1\n" that only worked because int() tolerates
    surrounding whitespace).

    Args:
        fileName: path to the CSV file to load.

    Returns:
        list[list[str]]: one inner list of fields per line of the file.
    """
    with open(fileName) as file:
        return [line.rstrip("\n").split(",") for line in file]
def printPerformance(performanceTuple):
    """Print the four performance metrics in order.

    Args:
        performanceTuple: (accuracy, false positives, true positives,
            area under curve) as produced by perf.performance.
    """
    metricLabels = (
        "Accuracy: ",
        "False Positives: ",
        "True Positives: ",
        "Area Under Curve: ",
    )
    # print(label, value) keeps the original two-space separation
    # (trailing space in the label plus print's default sep).
    for label, value in zip(metricLabels, performanceTuple):
        print(label, value)
def main(fileName, trainingPct):
    """Train and evaluate three spam classifiers, then print averaged metrics.

    Runs Naive Bayes, Logistic Regression, and K Nearest Neighbor with
    K-fold cross validation on the training split, evaluates each fold's
    model on a held-out evaluation split, and prints the fold and
    eval-set averages for every model.

    Args:
        fileName: path to the CSV dataset. First row is the header;
            the last column of each data row is the 0/1 spam label.
        trainingPct: fraction (0..1) of the shuffled data used for training.
    """
    # Load from the requested file. (Bug fix: the path was hard-coded to
    # "spambase.csv", silently ignoring the fileName argument.)
    dataset = importDataset(fileName)
    # Initialize model classes with the header row of feature labels.
    NBAlgorithm = NaiveBayesModel(dataset[0])
    LRClass = LogisticRegression(dataset[0])
    KNNClass = KNearestNeighbor(dataset[0])
    # Drop the header and shuffle the remaining examples once.
    dataset = dataset[1:]
    random.shuffle(dataset)
    # Disjoint train/eval split. (Bug fix: the original sliced
    # dataset[:trainingSize + 1] so the row at index trainingSize
    # appeared in both the training and evaluation sets.)
    trainingSize = math.floor(trainingPct * len(dataset))
    trainingSet = dataset[:trainingSize]
    evalSet = dataset[trainingSize:]
    evalActualResults = [int(email[-1]) for email in evalSet]
    # Exactly K fold (start, end) boundaries. (Bug fix: the original
    # range(0, len - len//K, len//K) could yield only K-1 folds while
    # perf.average(..., K) still divided by K, skewing the averages.)
    foldSize = len(trainingSet) // K
    foldBounds = [(i * foldSize, (i + 1) * foldSize) for i in range(K)]

    allModelResults = []

    ### NAIVE BAYES ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        # Train on everything outside the fold, test on the fold itself.
        predictedResults, spamPrior, hamPrior, spamLikelihoods, hamLikelihoods = NBAlgorithm.naiveBayes(start, end, trainingSet)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        # Score this fold's model on the held-out evaluation set.
        result = NBAlgorithm.useModel(spamPrior, hamPrior, spamLikelihoods, hamLikelihoods, evalSet)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    ### LOGISTIC REGRESSION ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        predictedResults, model = LRClass.LRAlgorithm(start, end, trainingSet)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        result = LRClass.useModel(model, evalSet)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    ### K NEAREST NEIGHBOR ###
    foldResults = []
    evalResults = []
    for start, end in foldBounds:
        # KNN has no training phase: compare each fold email against
        # every training email outside the fold.
        compareEmails = trainingSet[:start] + trainingSet[end:]
        predictedResults = KNNClass.classifySet(trainingSet[start:end], compareEmails)
        actualResults = [int(email[-1]) for email in trainingSet[start:end]]
        foldResults.append(perf.performance(predictedResults, actualResults))
        result = KNNClass.classifySet(evalSet, compareEmails)
        evalResults.append(perf.performance(result, evalActualResults))
    allModelResults.append(perf.average(foldResults, K))
    allModelResults.append(perf.average(evalResults, K))

    # Report: labels are positionally paired with allModelResults.
    # (Bug fix: the middle model is LogisticRegression but was printed
    # as "Linear Regression", and several labels had a stray "::".)
    reportLabels = (
        "Naive Bayes Fold Average:",
        "Naive Bayes EvalSet Average:",
        "Logistic Regression Fold Average:",
        "Logistic Regression EvalSet Average:",
        "K Nearest Neighbor Fold Average:",
        "K Nearest Neighbor EvalSet Average:",
    )
    print()
    for label, modelResult in zip(reportLabels, allModelResults):
        print(label)
        printPerformance(modelResult)
        print()
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main("spambase.csv", 0.8)