ML-KNN-Algorithm/concrete.py at master · eylultuncel/ML-KNN-Algorithm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import math
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


def k_fold(x, is_weighted, n_folds=5):
    """Cross-validate the KNN regressor and plot the MAE of every fold.

    The rows of ``x`` are split into ``n_folds`` contiguous folds of (almost)
    equal size. Each fold serves once as the test set while all remaining rows
    form the training set. For every fold the mean absolute error for
    k = 1, 3, 5, 7, 9 is printed and added as one line to the current
    matplotlib figure; showing the figure is left to the caller.

    Args:
        x: 2-D numpy array with one sample per row. The rows are used in the
            order given, so shuffle beforehand if a random split is wanted.
        is_weighted: forwarded to ``knn_classification``; ``True`` selects the
            distance-weighted predictor, ``False`` the plain mean.
        n_folds: number of folds (defaults to 5).

    Returns:
        None.
    """
    n_rows = len(x)
    # Derive the fold boundaries from the data size instead of hard-coding
    # them; for 1030 rows and 5 folds this yields 0, 206, 412, 618, 824, 1030.
    bounds = [n_rows * fold // n_folds for fold in range(n_folds + 1)]

    for i in range(n_folds):
        start, stop = bounds[i], bounds[i + 1]

        # one fold of the data set as test data
        x_test = x[start:stop]
        if len(x_test) == 0:
            # fewer rows than folds: nothing to evaluate, and averaging over
            # an empty fold would divide by zero
            continue

        # rest of the data set as train data
        x_train = np.concatenate((x[:start], x[stop:]), axis=0)

        print()
        print("--------------------------FOLD", i+1, "--------------------------------------------")

        # summed absolute errors; index e holds the total for k = e + 1
        knn_maes = knn_classification(x_train, x_test, is_weighted)

        list_mae = []
        for e in range(0, 9, 2):
            # turn the summed absolute error into a mean over the test fold
            mae = knn_maes[e] / len(x_test)
            print("MAE for (KNN) k=", (e + 1), " : ", mae)
            list_mae.append(mae)

        plt.plot([1, 3, 5, 7, 9], list_mae)
        plt.axis([0, 9, 0, 10])
    return


def normalize(x):
    """Min-max scale every column of ``x`` except the last one into [0, 1].

    Each scaled column becomes ``(value - min) / (max - min)``. The last
    column is left untouched.

    Args:
        x: 2-D numpy array, one sample per row.

    Returns:
        The scaled array. Floating-point input is modified in place and the
        same object is returned. Integer input cannot hold the scaled values,
        so a float copy is scaled and returned instead.
    """
    # nothing to scale (and no min/max to compute) without rows
    if x.shape[0] == 0:
        return x

    # assigning fractions into an integer array would truncate them to 0 or 1
    if np.issubdtype(x.dtype, np.integer):
        x = x.astype(float)

    for i in range(x.shape[1] - 1):
        column = x[:, i]
        min_of_col = column.min()
        value_range = column.max() - min_of_col
        if value_range == 0:
            # a constant column carries no information; map it to 0 instead
            # of dividing 0 by 0 and filling the column with NaN
            x[:, i] = 0.0
        else:
            x[:, i] = (column - min_of_col) / value_range
    return x


# for unweighted KNN cases, we use this function to predict test data's values and accumulate the absolute errors
def calculate_predictions(x_train, sorted_keys, test, maes):
    """Add one test row's absolute errors for unweighted KNN to ``maes``.

    The prediction for k neighbours is the plain mean of column 8 of the k
    nearest training rows. It is evaluated for k = 1, 3, 5, 7, 9 and the
    absolute difference to ``test[8]`` is added to ``maes[k - 1]``.

    Args:
        x_train: training rows, indexable as ``x_train[row][column]``.
        sorted_keys: training row indices ordered from nearest to farthest;
            the first nine entries are used.
        test: the test row; ``test[8]`` is its true target value.
        maes: list of nine running error sums, updated in place.

    Returns:
        The same ``maes`` list.
    """
    running_total = 0
    for rank in range(9):
        # fold the next-nearest neighbour's target into the running sum
        running_total += x_train[sorted_keys[rank]][8]
        neighbours_seen = rank + 1

        # only odd neighbour counts (k = 1, 3, 5, 7, 9) are evaluated
        if neighbours_seen % 2 == 1:
            prediction = running_total / neighbours_seen
            maes[rank] += abs(test[8] - prediction)

    return maes


# for weighted KNN cases, we use this function to predict test data's csMPa values
def calculate_weighted_predictions(x_train, sorted_keys, test, maes, euclidean_distances):
    """Add one test row's absolute errors for distance-weighted KNN to ``maes``.

    Every neighbour votes with weight ``1 / distance``. The prediction for k
    neighbours is the weighted mean of column 8 of the k nearest training
    rows. It is evaluated for k = 1, 3, 5, 7, 9 and the absolute difference
    to ``test[8]`` is added to ``maes[k - 1]``.

    Neighbours at distance 0 would get an infinite weight, so whenever at
    least one of the k neighbours is an exact match the prediction is the
    plain mean of the exact matches only. The true target ``test[8]`` is never
    used as a prediction, so duplicated rows with different targets are
    scored honestly.

    Args:
        x_train: training rows, indexable as ``x_train[row][column]``.
        sorted_keys: training row indices ordered from nearest to farthest;
            the first nine entries are used.
        test: the test row; ``test[8]`` is its true target value.
        maes: list of nine running error sums, updated in place.
        euclidean_distances: mapping from training row index to its distance
            from ``test``.

    Returns:
        The same ``maes`` list.
    """
    weighted_sum = 0.0   # sum of target * weight over non-exact neighbours
    total_weight = 0.0   # sum of weights over non-exact neighbours
    exact_sum = 0.0      # sum of targets of zero-distance neighbours
    exact_count = 0      # number of zero-distance neighbours

    for i in range(9):
        neighbor = sorted_keys[i]
        target = x_train[neighbor][8]
        distance = euclidean_distances[neighbor]

        if distance == 0:
            exact_sum += target
            exact_count += 1
        else:
            weight = 1 / distance
            weighted_sum += target * weight
            total_weight += weight

        # when k=1 -> i=0, k=3 -> i=2, k=5 -> i=4, k=7 -> i=6, k=9 -> i=8
        if i % 2 == 0:
            if exact_count:
                # exact matches dominate every finite-weight neighbour
                estimated = exact_sum / exact_count
            else:
                estimated = weighted_sum / total_weight

            # accumulate the absolute error; the caller turns it into a mean
            maes[i] += abs(test[8] - estimated)

    return maes


def knn_classification(x_train, x_test, is_weighted):
    """Sum the absolute KNN prediction errors over all rows of ``x_test``.

    For each test row the Euclidean distance to every training row is computed
    over the first eight columns, the nine nearest training rows are selected
    and handed to ``calculate_predictions`` or, if ``is_weighted`` is true,
    to ``calculate_weighted_predictions``.

    Args:
        x_train: 2-D numpy array of training rows (features in columns 0-7,
            target in column 8).
        x_test: 2-D numpy array of test rows with the same layout.
        is_weighted: ``True`` for distance-weighted predictions, ``False``
            for the plain mean of the neighbours.

    Returns:
        A list of nine error sums; index ``k - 1`` holds the summed absolute
        error for k neighbours (only k = 1, 3, 5, 7, 9 are filled). With an
        empty test set the list contains only zeros.
    """
    n_features = 8

    # indices of the array represents k in kNN (we only use 1-3-5-7-9)
    maes = [0, 0, 0, 0, 0, 0, 0, 0, 0]

    for k in range(x_test.shape[0]):
        test = x_test[k]

        # distance from this test row to every training row, keyed by row index
        euclidean_distances = {}
        for j in range(x_train.shape[0]):
            train_row = x_train[j]
            squared = sum((train_row[f] - test[f]) ** 2 for f in range(n_features))
            euclidean_distances[j] = math.sqrt(squared)

        # nearest first; only nine neighbours are ever needed for prediction
        sorted_keys = sorted(euclidean_distances, key=euclidean_distances.get)[:9]

        if not is_weighted:
            maes = calculate_predictions(x_train, sorted_keys, test, maes)
        else:
            maes = calculate_weighted_predictions(x_train, sorted_keys, test, maes, euclidean_distances)

    # return the accumulator itself so an empty test set yields zeros
    # instead of failing on a name that was only bound inside the loop
    return maes


def knn(x):
    """Cross-validate unweighted KNN on ``x`` as given and show the MAE plot."""
    label = "KNN"
    print(label)
    k_fold(x, is_weighted=False)
    plt.ylabel(label)
    plt.show()


def knn_with_normalization(x):
    """Cross-validate unweighted KNN on the normalized data and show the MAE plot."""
    print("KNN WITH NORMALIZATION")
    # scale the columns first, then evaluate on the scaled data
    k_fold(normalize(x), is_weighted=False)
    plt.ylabel("KNN-normalization")
    plt.show()


def weighted_knn(x):
    """Cross-validate distance-weighted KNN on ``x`` as given and show the MAE plot."""
    print("WEIGHTED KNN")
    k_fold(x, is_weighted=True)
    plt.ylabel("Weighted KNN")
    plt.show()


def weighted_knn_with_normalization(x):
    """Cross-validate distance-weighted KNN on the normalized data and show the MAE plot."""
    print("WEIGHTED KNN WITH NORMALIZATION")
    # scale the columns first, then evaluate on the scaled data
    k_fold(normalize(x), is_weighted=True)
    plt.ylabel("Weighted KNN-normalization")
    plt.show()


def main():
    """Load ``./concrete.csv``, shuffle it and run all four KNN experiments."""
    # read the whole csv file into a numpy array
    frame = pd.read_csv('./concrete.csv')
    x = np.array(frame.iloc[:, :])

    # fixed seed so every run shuffles the rows the same way
    np.random.seed(101)
    np.random.shuffle(x)

    # each experiment gets its own copy, so one experiment's normalization
    # cannot leak into the next
    experiments = (
        knn,
        knn_with_normalization,
        weighted_knn,
        weighted_knn_with_normalization,
    )
    for run_experiment in experiments:
        run_experiment(x.copy())


# Run the experiments only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()