Skip to content

Issue #18

@sheikhsarmad

Description

@sheikhsarmad

import kagglehub
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Data cleaning:
#   1. replace question marks with a proper missing-value marker,
#   2. remove any rows with missing information,
#   3. convert all values to whole numbers.
dataset = dataset.replace("?", np.nan).dropna().astype(int)

# Remove impossible medical values using known per-column limits.
max_allowed = {
    "sex": 1,    # 0 = female, 1 = male
    "cp": 4,     # 4 types of chest pain maximum
    "fbs": 1,    # 0 = no, 1 = yes for high blood sugar
    "exang": 1,  # 0 = no, 1 = yes for exercise pain
    "ca": 3,     # maximum 3 major vessels
    "thal": 3,   # thalassemia types: 0-3 only
    "num": 1,    # diagnosis: 0 = healthy, 1 = sick
}

# A row is kept only when every bounded column is within its limit;
# the DataFrame/Series comparison aligns on column names.
within_limits = (dataset[list(max_allowed)] <= pd.Series(max_allowed)).all(axis=1)
dataset = dataset[within_limits]

dataset

# Separate the features into two groups: categorical codes and numeric values.
categoricalColumns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
numericalColumns = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]

# Views of the dataset restricted to each column group.
numericalDataset = dataset[numericalColumns]
categoricalDataset = dataset[categoricalColumns]

numericalDataset

# Mean, median and standard deviation for every numerical column.
StatisticalSummary = numericalDataset.agg(["mean", "median", "std"])
StatisticalSummary

# Heatmap of pairwise column correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(dataset.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Check class balance of the diagnosis label.
plt.figure(figsize=(6, 4))
sns.countplot(x=dataset["num"])
plt.title("Diagnosis Distribution")
plt.xlabel("Diagnosis(0,1)")
plt.ylabel("Count")
plt.show()

# Histogram of the age column.
plt.figure(figsize=(6, 4))
plt.hist(dataset["age"], bins=50, edgecolor="black")
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

# Histogram of the cholesterol column.
plt.figure(figsize=(6, 4))
plt.hist(dataset["chol"], bins=20, edgecolor="black")
plt.title("Cholestrol Distribution")
plt.xlabel("Cholestrol")
plt.ylabel("Count")
plt.show()

# Min-max scale the numeric columns to [0, 1].
scaler = MinMaxScaler()
dataset[numericalColumns] = scaler.fit_transform(dataset[numericalColumns])
dataset.head()  # sanity check

# One-hot encode the categorical columns (drop_first avoids redundancy).
dataset = pd.get_dummies(dataset, columns=categoricalColumns, drop_first=True)
dataset

# Model prep (non-PCA): separate features from the target, then split 80/20.
Y = dataset["num"]
X = dataset.drop("num", axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# One classifier object per algorithm, all with default hyperparameters.
LogisticRegressionModel = LogisticRegression()
KNNModel = KNeighborsClassifier()
DecisionTreeModel = DecisionTreeClassifier()
RandomForestModel = RandomForestClassifier()

# --- Logistic regression: time the fit, predict, report metrics ---
LRModelStartTime = time.time()
LogisticRegressionModel.fit(X_train, Y_train)
LRModelTrainingTime = time.time() - LRModelStartTime

Y_pred = LogisticRegressionModel.predict(X_test)

# Confusion matrix and the false-positive rate derived from it.
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)

print("=== LOGISTIC REGRESSION ===")
print("Confusion Matrix:")
print(confusionMatrix)
print("Accuracy:", round(accuracy, 4))
print("Training Time:", round(LRModelTrainingTime, 4))
print("Precision:", round(precision, 4))
print("Recall:", round(recall, 4))
print("False Positive Rate:", round(fpr, 4))

# --- KNN: time the fit, predict, report metrics ---
KNNModelStartTime = time.time()
KNNModel.fit(X_train, Y_train)
KNNModelEndTime = time.time()
KNNModelTrainingTime = KNNModelEndTime - KNNModelStartTime

Y_pred = KNNModel.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)

print("KNN:")
print(confusionMatrix)
print("Time:", KNNModelTrainingTime)
print("Acc:", accuracy, "Prec:", precision)
print("Rec:", recall, "FPR:", fpr)

# --- Decision tree: time the fit, predict, report metrics ---
DecisionTreeStartTime = time.time()
DecisionTreeModel.fit(X_train, Y_train)
DecisionTreeEndTime = time.time()
DecisionTreeTrainingTime = DecisionTreeEndTime - DecisionTreeStartTime

Y_pred = DecisionTreeModel.predict(X_test)

confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)

print("=== DECISION TREE ===")
print(confusionMatrix)
print("Time:", DecisionTreeTrainingTime)
print(f"Acc:{accuracy:.4f} Prec:{precision:.4f} Rec:{recall:.4f} FPR:{fpr:.4f}")

# --- Random forest: time the fit, predict, report metrics ---
RandomForestStartTime = time.time()
RandomForestModel.fit(X_train, Y_train)
RandomForestEndTime = time.time()
RandomForestTrainingTime = RandomForestEndTime - RandomForestStartTime

Y_pred = RandomForestModel.predict(X_test)

confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)

print("=== RANDOM FOREST ===")
print("Matrix:", confusionMatrix.tolist())
print("Time:", round(RandomForestTrainingTime, 4))
print("Acc:", round(accuracy, 4), "Prec:", round(precision, 4))
print("Rec:", round(recall, 4), "FPR:", round(fpr, 4))

# --- Neural network (32 -> 16 -> 1), trained on the same split ---
# BUG FIX: the output layer used softmax on a single unit, which always
# outputs 1.0 (the model could only ever predict class 1). A one-unit
# binary classifier needs a sigmoid output paired with binary_crossentropy.
NeuralNetworkModel = Sequential()
NeuralNetworkModel.add(Dense(units=32, activation='sigmoid', input_dim=X_train.shape[1]))
NeuralNetworkModel.add(Dense(units=16, activation='sigmoid'))  # hidden layer
NeuralNetworkModel.add(Dense(1, activation='sigmoid'))  # output layer

NeuralNetworkModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train and time it.
NeuralNetworkStartTime = time.time()
history = NeuralNetworkModel.fit(X_train, Y_train, epochs=10, batch_size=16, validation_split=0.2)
NeuralNetworkEndTime = time.time()
NeuralNetworkTrainingTime = NeuralNetworkEndTime - NeuralNetworkStartTime

# Evaluate on the held-out test set.
test_loss, test_accuracy = NeuralNetworkModel.evaluate(X_test, Y_test)

# BUG FIX: the original never called predict here, so precision/recall and
# the confusion matrix were computed from the random forest's stale Y_pred.
# Predict with the network and threshold probabilities at 0.5.
Y_pred = (NeuralNetworkModel.predict(X_test) > 0.5).astype(int)

precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)  # BUG FIX: was precision_score
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)

print("Neural Network Report:")
print("-" * 30)
print("Confusion Matrix:\n", confusionMatrix)
print(f"Acc: {test_accuracy:.4f} | Time: {NeuralNetworkTrainingTime:.4f}s")
print(f"Prec: {precision:.4f} | Rec: {recall:.4f} | FPR: {fpr:.4f}")

# PCA: keep enough components to explain 95% of the variance.
pcaY = dataset["num"]
pca = PCA(n_components=0.95)
pcaX = pca.fit_transform(dataset.drop("num", axis=1))

# Wrap the components in a DataFrame named PC1..PCn, then re-attach the target.
pc_names = [f"PC{i+1}" for i in range(pcaX.shape[1])]
pcaDataset = pd.DataFrame(pcaX, columns=pc_names)
pcaDataset["num"] = pcaY.values
pcaDataset.head()

# --- KNN on the PCA features ---
# BUG FIX: the original refit KNNModel on the ORIGINAL X_train/X_test, so the
# PCA features computed above were never used. Split the PCA dataset (same
# ratio/seed as the non-PCA split) and train/evaluate on that instead.
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42
)

KNNModelStartTime = time.time()
KNNModel.fit(pcaX_train, pcaY_train)
KNNModelEndTime = time.time()

# Predictions and metrics.
Y_pred = KNNModel.predict(pcaX_test)
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)

accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)

print("KNN MODEL")
print(f"Time: {KNNModelEndTime-KNNModelStartTime:.4f}s")
print(f"Matrix:\n{confusionMatrix}")
print(f"Acc:{accuracy:.4f}|Prec:{precision:.4f}|Rec:{recall:.4f}|FPR:{fpr:.4f}")

# --- Decision tree on the PCA features ---
# BUG FIX: the original fit on the original X_train, not the PCA features.
# The PCA split is re-derived here (same ratio/seed) so this cell stands alone.
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42
)

DecisionTreeStartTime = time.time()
DecisionTreeModel.fit(pcaX_train, pcaY_train)
DecisionTreeEndTime = time.time()
DecisionTreeTrainingTime = DecisionTreeEndTime - DecisionTreeStartTime

# Predictions and metrics.
Y_pred = DecisionTreeModel.predict(pcaX_test)

accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)

print("Confusion Matrix: ")
print(confusionMatrix)
print("Decision Tree Accuracy: ", accuracy)
print("Decision Tree Training Time: ", DecisionTreeTrainingTime)
print("Decision Tree Precision: ", precision)
print("Decision Tree Recall: ", recall)
print("Decision Tree False Positive Rate: ", fpr)

# --- Random forest on the PCA features ---
# BUG FIX: the original fit on the original X_train, not the PCA features.
# The PCA split is re-derived here (same ratio/seed) so this cell stands alone.
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42
)

RandomForestStartTime = time.time()
RandomForestModel.fit(pcaX_train, pcaY_train)
RandomForestEndTime = time.time()
RandomForestTrainingTime = RandomForestEndTime - RandomForestStartTime

# Predictions and metrics.
Y_pred = RandomForestModel.predict(pcaX_test)

confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)
accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)

print("Random Forest Test Results")

print("Training Time : ", RandomForestTrainingTime)
print("Confusion Matrix :")
print(confusionMatrix)
print("Performance:")
print("Accuracy = ", accuracy)
print("Precision = ", precision)
print("Recall = ", recall)
print("FPR = ", fpr)

# --- Neural network (second run) ---
# BUG FIX: softmax on a single output unit always returns 1.0; use sigmoid
# so the network emits a real probability for the positive class.
# NOTE(review): this run still trains on the original X_train; if it was
# meant to use the PCA features like the other models in this section,
# switch it to the PCA split — confirm intent.
NeuralNetworkModel = Sequential()
NeuralNetworkModel.add(Dense(units=32, activation='sigmoid', input_dim=X_train.shape[1]))
NeuralNetworkModel.add(Dense(units=16, activation='sigmoid'))  # hidden layer
NeuralNetworkModel.add(Dense(1, activation='sigmoid'))  # output layer

NeuralNetworkModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train and time it.
NeuralNetworkStartTime = time.time()
history = NeuralNetworkModel.fit(X_train, Y_train, epochs=10, batch_size=16, validation_split=0.2)
NeuralNetworkEndTime = time.time()
NeuralNetworkTrainingTime = NeuralNetworkEndTime - NeuralNetworkStartTime

# Evaluate on the held-out test set.
test_loss, test_accuracy = NeuralNetworkModel.evaluate(X_test, Y_test)

# Predict probabilities and threshold at 0.5 to get class labels.
Y_pred = NeuralNetworkModel.predict(X_test)
Y_pred = (Y_pred > 0.5).astype(int)

confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)

results = [
    ("Confusion Matrix", confusionMatrix),
    ("Accuracy", test_accuracy),
    ("Training Time", NeuralNetworkTrainingTime),
    ("Precision", precision),
    ("Recall", recall),
    ("FPR", fpr),
]

# BUG FIX: the loop body had lost its indentation (IndentationError).
for name, val in results:
    print(f"{name}: {val}")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions