-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlogRegIris.py
More file actions
128 lines (112 loc) · 4.39 KB
/
logRegIris.py
File metadata and controls
128 lines (112 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf-8 -*-
"""
Machine Learning lab 1
Comparison of two cross-validation methods on the Iris dataset.
Two of the three classes are chosen for the computation,
using 100 samples in total.
Each of the two validation methods validates on 100 data samples.
@author: 10152510119 徐紫琦
"""
from numpy import *
import scipy
import matplotlib.pyplot as plt
# Path of the dataset file opened by loadFile().
# NOTE(review): appendData() parses Iris-style rows (four numeric attributes
# followed by a class name such as "Iris-setosa"), but this points at the Pima
# Indians diabetes file -- confirm which dataset file is actually intended.
filename='pima-indians-diabetes.data' #file directory
def loadFile():
    """Read the dataset file named by the module-level ``filename``.

    Returns:
        list of str: every line of the file, in order, with the trailing
        newline characters still attached.
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open.
    with open(filename) as fr:
        return fr.readlines()
def appendData(linestr, dataMat, labelMat, i):
    """Parse sample ``i`` and append it to the data and label lists in place.

    Args:
        linestr: sequence of raw comma-separated lines; each line holds four
            numeric attributes followed by the class name.
        dataMat: list that receives the row ``[1.0, a1, a2, a3, a4]``; the
            leading 1.0 is the constant input paired with the first weight.
        labelMat: list that receives ``[0]`` when the class name is
            "Iris-setosa" and ``[1]`` for any other class name.
        i: index into ``linestr`` of the line to parse.

    Raises:
        IndexError: if the line has fewer than five comma-separated fields.
        ValueError: if one of the first four fields is not a number.
    """
    lineArr = linestr[i].strip().split(",")
    # Index each field explicitly so a short line raises IndexError before
    # either output list is modified.
    attributes = [float(lineArr[k]) for k in range(4)]
    label = [0] if lineArr[4] == "Iris-setosa" else [1]
    # Every attribute is used exactly once: the earlier version stored the
    # first attribute twice and silently dropped the second one.
    dataMat.append([1.0] + attributes)
    labelMat.append(label)
def loadDataSet10(linestr, j):
    """Build the training lists for fold ``j`` of 10-fold cross-validation.

    For ``j`` in 0..9, the ten samples at indices ``10*j`` .. ``10*j + 9`` of
    the first 100 samples are held out; the remaining samples are appended in
    index order through ``appendData``.

    Args:
        linestr: sequence of raw dataset lines.
        j: index of the fold that is held out.

    Returns:
        tuple: ``(dataMat, labelMat)`` lists filled by ``appendData``.
    """
    features, labels = [], []
    held_out_start = 10 * j
    held_out_stop = 10 * (j + 1)
    # Indices before the held-out fold, then indices after it (up to 100).
    training_indices = list(range(0, held_out_start)) + list(range(held_out_stop, 100))
    for idx in training_indices:
        appendData(linestr, features, labels, idx)
    return features, labels
def loadDataSet1(linestr, j):
    """Build the training lists for round ``j`` of leave-one-out validation.

    Exactly one sample, the one at index ``j``, is left out of the first 100
    samples; the other 99 are appended in index order through ``appendData``.

    Args:
        linestr: sequence of raw dataset lines.
        j: index of the single sample that is held out.

    Returns:
        tuple: ``(dataMat, labelMat)`` lists filled by ``appendData``.
    """
    dataMat = []
    labelMat = []
    for i in range(100):
        # Skip only sample j. The earlier ``range(0, j-1)`` bound also dropped
        # sample j-1, so two samples were left out instead of one.
        if i != j:
            appendData(linestr, dataMat, labelMat, i)
    return dataMat, labelMat
def sigmoid(inX):
    """Return the logistic sigmoid ``1 / (1 + exp(-inX))``, element-wise.

    Args:
        inX: a scalar or a NumPy array/matrix of real values.

    Returns:
        Values in [0, 1] with the same shape as ``inX``.
    """
    # Imported here so the function does not depend on ``import scipy`` having
    # loaded the ``scipy.special`` submodule.
    from scipy.special import expit

    # expit IS the sigmoid and is evaluated without overflow. The earlier
    # ``1.0/(1+expit(-inX))`` applied the logistic transform twice and could
    # only produce values between 0.5 and 1.
    return expit(inX)
def gradAscent(dataMat, labelMat, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights by batch gradient ascent.

    Args:
        dataMat: list of sample rows (one row per sample, one column per
            input, including the constant 1.0 column).
        labelMat: list of one-element label rows, e.g. ``[[0], [1], ...]``.
        alpha: learning rate (step size of each update).
        maxCycles: number of full-batch update iterations.

    Returns:
        numpy matrix of shape (number of columns in ``dataMat``, 1) holding
        the fitted weights.
    """
    dataMatrix = mat(dataMat)
    # Size the weight vector from the data instead of hard-coding 5 inputs;
    # wrapping in mat() keeps the return type a matrix even for maxCycles=0.
    weights = mat(ones((dataMatrix.shape[1], 1)))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)  # predicted probabilities
        error = (labelMat - h)  # residual between labels and predictions
        # Move the weights along the gradient of the log-likelihood.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
def countErr10(weights, linestr, j):
    """Count misclassified samples in fold ``j`` of 10-fold cross-validation.

    The validation fold consists of the samples at indices ``10*j`` ..
    ``10*j + 9``. A predicted probability of 0.5 or more is classified as
    label 1, anything lower as label 0.

    Args:
        weights: fitted weight column vector.
        linestr: sequence of raw dataset lines.
        j: index of the validation fold.

    Returns:
        int: number of validation samples whose prediction differs from
        their label.
    """
    dataMat = []
    labelMat = []
    for i in range(10 * j, 10 * (j + 1)):  # construct the validation data
        appendData(linestr, dataMat, labelMat, i)
    y = sigmoid(mat(dataMat) * weights)
    err = 0
    for i, label_row in enumerate(labelMat):
        # An explicit threshold avoids round(0.5) == 0, which previously made
        # a probability of exactly 0.5 count as an error for BOTH labels.
        predicted = 1 if y[i, 0] >= 0.5 else 0
        if predicted != label_row[0]:
            err += 1
    return err
def countErr1(weights, linestr, j):
    """Report whether the held-out sample ``j`` is misclassified.

    A predicted probability of 0.5 or more is classified as label 1,
    anything lower as label 0.

    Args:
        weights: fitted weight column vector.
        linestr: sequence of raw dataset lines.
        j: index of the single validation sample.

    Returns:
        int: 1 if the prediction differs from the sample's label, else 0.
    """
    dataMat = []
    labelMat = []
    appendData(linestr, dataMat, labelMat, j)
    y = sigmoid(mat(dataMat) * weights)
    # An explicit threshold avoids round(0.5) == 0, which previously made a
    # probability of exactly 0.5 count as an error for BOTH labels.
    predicted = 1 if y[0, 0] >= 0.5 else 0
    return 1 if predicted != labelMat[0][0] else 0
def printFigure(err10, err1):
    """Show a bar chart comparing the two cross-validation error rates.

    Args:
        err10: error rate of the 10-fold cross-validation (0..1).
        err1: error rate of the leave-one-out cross-validation (0..1).
    """
    # The file only does ``from numpy import *``, which does not bind the
    # name ``np``; import it here so ``np.arange`` no longer raises NameError.
    import numpy as np

    # Named ``error_rates`` rather than ``list`` to avoid shadowing the builtin.
    error_rates = [err10, err1]
    n_groups = len(error_rates)
    plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.4
    plt.bar(index, error_rates, bar_width, alpha=opacity, color='b')
    plt.ylabel('error rate')
    plt.title('Error rate comparison between 10-fold cross-validation\n and leave-one-out cross-validation')
    plt.xticks(index, ('10-fold', 'leave-one-out'))
    plt.ylim(0, 1)  # error rates are fractions, so fix the axis to [0, 1]
    plt.legend()
    plt.tight_layout()
    plt.show()
def main():
    """Run both cross-validation schemes and plot their error rates.

    Trains on the 10-fold splits (10 rounds) and on the leave-one-out splits
    (100 rounds), accumulates the validation errors of each scheme, and
    passes the two error rates to ``printFigure``.
    """
    # Read the dataset once and reuse it; the earlier version re-opened and
    # re-read the file for every one of the 110 validation rounds.
    linestr = loadFile()

    totalErr10 = 0
    for i in range(10):  # one round per fold of the 10-fold cross-validation
        dataMat, labelMat = loadDataSet10(linestr, i)
        weights = gradAscent(dataMat, labelMat).getA()
        totalErr10 += countErr10(weights, linestr, i)

    totalErr1 = 0
    for i in range(100):  # one round per sample of the leave-one-out scheme
        dataMat, labelMat = loadDataSet1(linestr, i)
        weights = gradAscent(dataMat, labelMat).getA()
        totalErr1 += countErr1(weights, linestr, i)

    # Each scheme validates 100 samples in total; dividing by a float keeps
    # the rate fractional even where ``/`` is integer division.
    printFigure(totalErr10 / 100.0, totalErr1 / 100.0)


# Run the experiment only when the file is executed as a script.
if __name__ == '__main__':
    main()