Machine-Learning/spam_linear_reg.py at master · Hitesh1912/Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#this code runs linear regression on spamdata set

import numpy as np
import pandas as pd
from numpy.linalg import inv
import time
from random import randrange

# Function importing Dataset
def importdata():
    data = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',
        sep=',', header=None)
    # print("Dataset Shape: ", data.shape)
    # #for missing values remove row or column with (atleast 1) any Nan Null present
    data = data.dropna(axis=0, how='any')
    return data.values


# Function to split the dataset
def splitdataset(data):
    # Seperating the target variable
    x = [row[0:len(row) - 1] for row in data]
    y = [row[-1] for row in data]
    return x, y


#remove test set from training set
def removearray(train,arr):
    ind = 0
    size = len(train)
    while ind != size and not np.array_equal(train[ind],arr):
        ind += 1
    if ind != size:
        train.pop(ind)
    else:
        raise ValueError('not found')


# Calculate accuracy
def accuracy_val(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


def get_best_param(X, y):
    X_transpose = X.T
    # normal equation
    # theta_best = (X.T * X)^(-1) * X.T * y
    best_params = inv(X_transpose.dot(X)).dot(X_transpose).dot(y)
    return np.array(best_params)  # returns a list


# test prediction
def predict(X_test,params):
    # 920 #size of xtest
    X_test_b = np.c_[np.ones((len(X_test), 1)), X_test]
    # y = h_Theta_X(Theta) = Theta.T * X
    prediction = X_test_b.dot(params)
    prediction = classify_prediction(prediction)
    return prediction


def classify_prediction(prediction):
    theta = np.mean(prediction)
    new_prediction = list()
    for score in prediction:
        if score < theta:
            new_prediction.append(0)
        else:
            new_prediction.append(1)
    return new_prediction


def linear_reg(train_set,test_set):
    X, y = splitdataset(train_set)
    X_test, y_test = splitdataset(test_set)
    X_b = np.c_[np.ones((len(X), 1)), X]  # set bias term to 1 for each sample
    params = get_best_param(X_b, y)
    prediction = predict(X_test, params)
    return prediction

def k_fold_split(dataset, k_folds):
    dataset_split = list()
    dataset_1 = list(dataset)
    fold_size = int(len(dataset) / k_folds)
    for i in range(k_folds):
        fold = list()
        while len(fold) < fold_size:
            idx = randrange(len(dataset_1))
            fold.append(dataset_1.pop(idx))
        dataset_split.append(fold)
    return dataset_split


def evaluate_model(dataset,k_folds):
    folds = k_fold_split(dataset, k_folds)
    scores = list()
    for fold in folds:
        test_set = list()
        train_set = list(folds)
        removearray(train_set, fold)
        train_set = sum(train_set, [])
        for r in fold:
            row = list(r)
            test_set.append(row)
            row[-1] = None
        predicted = linear_reg(train_set,test_set)
        actual = [r[-1] for r in fold]
        acc = accuracy_val(actual, predicted)
        scores.append(acc)
    return scores


if __name__ == '__main__':
    s = time.time()
    data_set = importdata()
    accuracy = evaluate_model(data_set, 5)
    print('Accuracy: %s' % accuracy)
    print('Mean Accuracy: %.3f%%' % (round(sum(accuracy) / float(len(accuracy)))))
    e = time.time()
    print("time",e-s)