-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhistogram.py
More file actions
107 lines (90 loc) · 2.83 KB
/
histogram.py
File metadata and controls
107 lines (90 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import csv
import sys
import ml_functions as ml
import matplotlib.pyplot as plt
# Load the whole training set into memory as a list of rows (lists of strings).
# The context manager releases the file handle as soon as the rows are read,
# and newline='' is what the csv module asks for so that quoted fields
# containing line breaks are parsed correctly.
with open('assets/dataset_train.csv', newline='') as csvfile:
    rawdata = list(csv.reader(csvfile))
# Features: modify indexFeatures according to the dataset
# (index of the first feature column; the columns before it are not features).
indexFeatures = 6
lenFeatures = len(rawdata[0]) - indexFeatures
# Feature names are the header cells from indexFeatures onwards.
features = rawdata[0][indexFeatures:]
# Per-house mark storage: <house>[j] is the list that will collect the values
# of feature j for that house. Every slot starts out as an empty list.
gryf = [[] for _ in range(lenFeatures)]
raven = [[] for _ in range(lenFeatures)]
slyth = [[] for _ in range(lenFeatures)]
huffle = [[] for _ in range(lenFeatures)]
# All houses, in a fixed order shared with housesStd below
houses = [gryf, raven, slyth, huffle]
lenHouses = len(houses)
# Per-house standard deviations: <house>Std[j] is filled in later for feature
# j; until then each slot holds an empty-list placeholder.
gryfStd = [[] for _ in range(lenFeatures)]
ravenStd = [[] for _ in range(lenFeatures)]
slythStd = [[] for _ in range(lenFeatures)]
huffleStd = [[] for _ in range(lenFeatures)]
# Same house order as `houses`
housesStd = [gryfStd, ravenStd, slythStd, huffleStd]
# Per-feature accumulators, zero-initialised:
# featuresStd[j] -> standard deviation of the per-house deviations
# stdMean[j]     -> mean of the per-house deviations
featuresStd = [0.0 for _ in range(lenFeatures)]
stdMean = [0.0 for _ in range(lenFeatures)]
# main
# Drop the header row so that only data rows remain.
del rawdata[0]
# The house name is read from column 1. Any value other than the three keys
# below falls back to the Gryffindor lists.
houseByName = {'Ravenclaw': raven, 'Slytherin': slyth, 'Hufflepuff': huffle}
for record in rawdata:
    # Rows rejected by ml.isFormatted (defined in ml_functions) are skipped.
    if not ml.isFormatted(record):
        continue
    bucket = houseByName.get(record[1], gryf)
    # Copy each feature cell of the row into the matching per-feature list.
    for col in range(lenFeatures):
        bucket[col].append(record[indexFeatures + col])
# Get all stats
# For every (house, feature) column: pass it through ml.formatData, rescale it
# with ml.normalizeData using the column's own min/max, store the processed
# column back in place, and record its standard deviation. The ml_* helpers are
# defined in ml_functions.
for h in range(lenHouses):
    marks = houses[h]
    for f in range(len(marks)):
        column = ml.formatData(marks[f])
        low, high = ml.getMinMax(column)
        column = ml.normalizeData(column, low, high)
        marks[f] = column
        spread = ml.getStd(column, ml.getMean(column))
        housesStd[h][f] = spread
        # Running sum of the per-house deviations; turned into a mean below.
        stdMean[f] += spread
for f in range(len(stdMean)):
    stdMean[f] /= lenHouses
# Get Std
# Standard deviation (n - 1 denominator) of the per-house deviations of each
# feature: first accumulate squared distances to the mean ...
for houseSpreads in housesStd:
    for f, spread in enumerate(houseSpreads):
        delta = spread - stdMean[f]
        featuresStd[f] += delta * delta
# ... then divide and take the square root.
for f in range(len(featuresStd)):
    featuresStd[f] = (featuresStd[f] / (lenHouses - 1)) ** 0.5
# Get index of min std (the first one wins on ties)
minIndex = 0
for f in range(lenFeatures):
    if featuresStd[f] < featuresStd[minIndex]:
        minIndex = f
print('The most homogeneous feature between the four houses is:',
      features[minIndex])
# Plot
# Overlaid, semi-transparent histograms of the selected feature, one per
# house, plus a dashed vertical line at the mean of all four houses' values.
kwargs = dict(histtype='stepfilled', ec='black', alpha=0.3)
# Each artist carries its own label. Passing a bare list of labels to
# plt.legend() pairs labels with artists purely by order, and that order
# depends on the Matplotlib version, so the names could end up on the wrong
# histogram.
labelledHouses = (
    ('Gryffindor', gryf),
    ('Ravenclaw', raven),
    ('Slytherin', slyth),
    ('Hufflepuff', huffle),
)
for houseName, houseMarks in labelledHouses:
    plt.hist(houseMarks[minIndex], label=houseName, **kwargs)
plt.title(features[minIndex])
allMarks = (gryf[minIndex] + raven[minIndex] + slyth[minIndex] +
            huffle[minIndex])
plt.axvline(ml.getMean(allMarks), color='k', linestyle='dashed', linewidth=1,
            label='Std mean')
# With no arguments, the legend is built from the labels set above.
plt.legend()
plt.xlabel('Marks', fontsize=16)
plt.ylabel('Students', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()