# DSLR/logreg_predict.py at master · ljoly/DSLR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import sys
import csv
import numpy as np
import ml_functions as ml
import matplotlib.pyplot as plt


def sigmoid(z):
    """Return the logistic function 1 / (1 + e^-z) of *z*.

    *z* is negated and handed to ``np.exp``, so it may be a scalar or a
    NumPy array; arrays are processed element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator


def showDistribution(distribution):
    """Draw one histogram per entry of *distribution* and show the figure.

    Every element of *distribution* is passed to ``plt.hist`` on the same
    axes with a shared translucent style.  The legend labels are fixed to
    Gryffindor, Ravenclaw, Slytherin, Hufflepuff, in that order, so the
    entries of *distribution* are expected to follow the same order.
    """
    histStyle = {
        'histtype': 'stepfilled',
        'ec': 'black',
        'alpha': 0.3,
        'bins': 2,
    }
    for houseStudents in distribution:
        plt.hist(houseStudents, **histStyle)

    plt.title('Distribution')
    plt.legend(['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff'])

    # Axis captions and tick sizes.
    plt.xlabel('Houses', fontsize=16)
    plt.xticks(fontsize=14)
    plt.ylabel('Students', fontsize=16)
    plt.yticks(fontsize=14)

    plt.show()


def predict():
    """Predict a house for every student and write assets/houses.csv.

    Reads the module-level globals ``X`` (one feature row per student),
    ``housesWeights`` (one weight vector per house) and ``houses`` (house
    names, indexed like ``housesWeights``).  For each student the sigmoid
    of the dot product with every weight vector is computed and the house
    with the highest score is chosen; if several houses share the highest
    score, the one that comes last in ``houses`` wins.

    The output file starts with the header ``Index,Hogwarts House`` and
    then holds one ``<row index>,<house>`` line per student.

    Returns:
        A list of four lists (Gryffindor, Ravenclaw, Slytherin,
        Hufflepuff, in that order), each holding the house name once per
        student predicted to belong to it.

    Raises:
        ValueError: if there is a student to classify but
            ``housesWeights`` is empty.
    """
    # Position of each house inside the returned distribution.
    slots = {'Gryffindor': 0, 'Ravenclaw': 1, 'Slytherin': 2, 'Hufflepuff': 3}
    distribution = [[], [], [], []]

    # The context manager guarantees the predictions are flushed and the
    # handle is closed, even if classification raises part-way through.
    with open('assets/houses.csv', 'w') as f:
        f.write('Index,Hogwarts House\n')
        for i, studMarks in enumerate(X):
            probs = [sigmoid(np.dot(studMarks, weights))
                     for weights in housesWeights]
            if not probs:
                raise ValueError('no house weights available for prediction')

            # Look for the best score once, after all scores are known.
            # NOTE(review): ml.getMinMax is assumed to return (min, max).
            _, maxV = ml.getMinMax(probs)
            house = ''
            for j, prob in enumerate(probs):
                if prob == maxV:
                    house = houses[j]

            # Houses outside the four known names are written to the file
            # but not counted in the distribution.
            if house in slots:
                distribution[slots[house]].append(house)
            f.write(str(i) + ',' + house + '\n')
    return distribution


def formatFeatures():
    """Build the normalized, column-oriented feature table.

    Reads the module-level globals ``rawdata`` (CSV rows), ``indexFeatures``
    (index of the first feature column) and ``lenFeatures`` (number of
    feature columns).  Each cell is converted to ``float``; an empty cell
    becomes ``0.0``.  Every column is then rescaled with
    ``ml.normalizeData`` using the bounds returned by ``ml.getMinMax``.

    Returns:
        A list with one normalized column per feature.
    """
    columns = [[] for _ in range(lenFeatures)]
    for row in rawdata:
        for position, column in enumerate(columns, start=indexFeatures):
            cell = row[position]
            # Missing marks are stored as empty strings in the CSV.
            column.append(float(cell) if cell != '' else 0.0)

    # Normalize
    normalized = []
    for column in columns:
        low, high = ml.getMinMax(column)
        normalized.append(ml.normalizeData(column, low, high))

    return normalized


if __name__ == '__main__':
    # Usage: logreg_predict.py <dataset.csv>
    if len(sys.argv) < 2:
        print("Argument missing")
        # sys.exit is always available, whereas the bare exit() helper is
        # only injected by the site module; a non-zero status reports the
        # failure to the caller.
        sys.exit(1)

    # Load the whole dataset, then release the file handle right away.
    with open(sys.argv[1]) as csvfile:
        rawdata = list(csv.reader(csvfile))

    # Features: modify indexFeatures according to the dataset.
    # In this case the first two features are skipped as well: the second
    # one has a clone and the first has no impact.
    indexFeatures = 8
    # The column count is taken from the first row, which is then dropped
    # so that only student rows remain (presumably it is the CSV header).
    lenFeatures = len(rawdata[0]) - indexFeatures
    del rawdata[0]

    houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
    data = formatFeatures()

    # formatFeatures returns one list per feature; transpose to one row per
    # student and prepend a column of ones for the bias term.
    X = np.transpose(data)
    ones = np.ones((len(X), 1))
    X = np.concatenate((ones, X), axis=1)

    with open('assets/weights.csv') as csvWeights:
        rawWeights = list(csv.reader(csvWeights))

    # The first cell of every weights row is skipped (presumably a label);
    # the remaining cells are the weights themselves.
    housesWeights = [[float(value) for value in row[1:]]
                     for row in rawWeights]

    distribution = predict()
    showDistribution(distribution)