# Heart-disease classification notebook: data cleaning, exploratory analysis,
# and model comparison (logistic regression, KNN, decision tree, random forest,
# neural network), repeated on PCA-reduced features.
import kagglehub
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
# --- Data cleaning ---
# Step 1: replace question-mark placeholders with a proper missing-value marker.
dataset.replace("?", np.nan, inplace=True)
# Step 2: remove any rows with missing information.
dataset = dataset.dropna()
# Step 3: convert all values to whole numbers.
dataset = dataset.astype(int)

# Remove impossible medical values: upper limit allowed per column.
max_allowed = {
    "sex": 1,    # 0=female, 1=male
    "cp": 4,     # 4 types of chest pain maximum
    "fbs": 1,    # 0=no, 1=yes for high blood sugar
    "exang": 1,  # 0=no, 1=yes for exercise pain
    "ca": 3,     # maximum 3 major vessels
    "thal": 3,   # thalassemia types: 0-3 only
    "num": 1,    # diagnosis: 0=healthy, 1=sick
}
# Column-wise comparison: True where a value is within its allowed maximum.
valid_data = dataset[list(max_allowed)] <= pd.Series(max_allowed)
all_values_valid = valid_data.all(axis=1)
# Keep only valid patient records. BUG FIX: take an explicit copy so later
# column assignments do not trigger pandas' SettingWithCopy warning.
dataset = dataset[all_values_valid].copy()
dataset
# Separate the features into two groups: categorical and numerical columns.
categoricalColumns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
numericalColumns = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# Numeric-only view of the original data.
numericalDataset = dataset[numericalColumns]
# Categorical-only view of the original data.
categoricalDataset = dataset[categoricalColumns]
numericalDataset
# Calculate mean, median, and standard deviation for each numeric column.
StatisticalSummary = numericalDataset.agg(["mean", "median", "std"])
# Show the summary.
StatisticalSummary
# Heatmap showing how the columns correlate with each other.
plt.figure(figsize=(10, 8))  # set the figure size
sns.heatmap(dataset.corr(), annot=True, cmap="coolwarm")  # draw heatmap
plt.title("Correlation Heatmap")  # add title
plt.show()  # display plot

# Check class balance of the diagnosis target.
plt.figure(figsize=(6, 4))  # set figure size
sns.countplot(x=dataset["num"])  # count each class
plt.title("Diagnosis Distribution")  # add title
plt.xlabel("Diagnosis(0,1)")  # x-axis label
plt.ylabel("Count")  # y-axis label
plt.show()  # show plot

# Age column histogram.
plt.figure(figsize=(6, 4))  # figure size
plt.hist(dataset["age"], bins=50, edgecolor="black")  # plot age
plt.title("Age Distribution")  # title
plt.xlabel("Age")  # x label
plt.ylabel("Count")  # y label
plt.show()  # display

# Cholesterol histogram plot.
plt.figure(figsize=(6, 4))  # set plot size
plt.hist(dataset["chol"], bins=20, edgecolor="black")  # make histogram
plt.title("Cholestrol Distribution")  # title
plt.xlabel("Cholestrol")  # x axis
plt.ylabel("Count")  # y axis
plt.show()  # show it
# Min-max scale the numeric columns into [0, 1].
scaler = MinMaxScaler()
dataset[numericalColumns] = scaler.fit_transform(dataset[numericalColumns])
dataset.head()  # check data
# One-hot encode the categorical columns (drop the first level of each to
# avoid redundant, collinear indicator columns).
dataset = pd.get_dummies(dataset, columns=categoricalColumns, drop_first=True)
dataset  # check
# Model prep (non-PCA): separate the target from the features.
Y = dataset["num"]
X = dataset.drop("num", axis=1)  # remove target column
# 80/20 train/test split; fixed random_state for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Create the model objects used for the baseline (non-PCA) comparison.
LogisticRegressionModel = LogisticRegression()
KNNModel = KNeighborsClassifier()
DecisionTreeModel = DecisionTreeClassifier()
RandomForestModel = RandomForestClassifier()
# Time logistic-regression training.
LRModelStartTime = time.time()
LogisticRegressionModel.fit(X_train, Y_train)
LRModelEndTime = time.time()
LRModelTrainingTime = LRModelEndTime - LRModelStartTime
# Predict on the held-out test set.
Y_pred = LogisticRegressionModel.predict(X_test)
# Confusion matrix and derived false-positive rate.
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp / (fp + tn)
# Remaining scores.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
print("=== LOGISTIC REGRESSION ===")
print("Confusion Matrix:")
print(confusionMatrix)
print("Accuracy:", round(accuracy, 4))
print("Training Time:", round(LRModelTrainingTime, 4))
print("Precision:", round(precision, 4))
print("Recall:", round(recall, 4))
print("False Positive Rate:", round(fpr, 4))
# KNN model: train and time it.
KNNModelStartTime = time.time()
KNNModel.fit(X_train, Y_train)
KNNModelEndTime = time.time()
KNNModelTrainingTime = KNNModelEndTime - KNNModelStartTime
# Predict on the test set.
Y_pred = KNNModel.predict(X_test)
# Metrics.
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
fpr = fp/(fp+tn)
print("KNN:")
print(confusionMatrix)
print("Time:", KNNModelTrainingTime)
print("Acc:", accuracy, "Prec:", precision)
print("Rec:", recall, "FPR:", fpr)
# Decision-tree model: train and time it.
DecisionTreeStartTime = time.time()
DecisionTreeModel.fit(X_train, Y_train)
DecisionTreeEndTime = time.time()
DecisionTreeTrainingTime = DecisionTreeEndTime - DecisionTreeStartTime
# Predictions.
Y_pred = DecisionTreeModel.predict(X_test)
# Calculate all metrics.
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
# Print everything together.
print("=== DECISION TREE ===")
print(confusionMatrix)
print("Time:", DecisionTreeTrainingTime)
print(f"Acc:{accuracy:.4f} Prec:{precision:.4f} Rec:{recall:.4f} FPR:{fpr:.4f}")
# Random-forest model: train and time it.
RandomForestStartTime = time.time()
RandomForestModel.fit(X_train, Y_train)
RandomForestEndTime = time.time()
RandomForestTrainingTime = RandomForestEndTime - RandomForestStartTime
# Predictions.
Y_pred = RandomForestModel.predict(X_test)
# Metrics.
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
# Results.
print("=== RANDOM FOREST ===")
print("Matrix:", confusionMatrix.tolist())
print("Time:", round(RandomForestTrainingTime, 4))
print("Acc:", round(accuracy, 4), "Prec:", round(precision, 4))
print("Rec:", round(recall, 4), "FPR:", round(fpr, 4))
# --- Neural network on the baseline features ---
NeuralNetworkModel = Sequential()
NeuralNetworkModel.add(Dense(units=32, activation='sigmoid', input_dim=X_train.shape[1]))
NeuralNetworkModel.add(Dense(units=16, activation='sigmoid'))  # middle layer
# BUG FIX: softmax over a single unit always outputs 1.0, so the model could
# only ever predict the positive class; sigmoid is correct for binary output.
NeuralNetworkModel.add(Dense(1, activation='sigmoid'))  # final layer
# Set optimizer and loss for binary classification.
NeuralNetworkModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Start timing and train.
NeuralNetworkStartTime = time.time()
history = NeuralNetworkModel.fit(X_train, Y_train, epochs=10, batch_size=16, validation_split=0.2)
NeuralNetworkEndTime = time.time()
NeuralNetworkTrainingTime = NeuralNetworkEndTime - NeuralNetworkStartTime
# Evaluate on the test set.
test_loss, test_accuracy = NeuralNetworkModel.evaluate(X_test, Y_test)
# BUG FIX: predictions were never generated here, so the metrics below silently
# reused Y_pred left over from the random-forest cell. Predict and threshold at 0.5.
Y_pred = (NeuralNetworkModel.predict(X_test) > 0.5).astype(int)
# BUG FIX: recall was previously computed with precision_score.
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
confusionMatrix = confusion_matrix(Y_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
# Show results.
print("Neural Network Report:")
print("-" * 30)
print("Confusion Matrix:\n", confusionMatrix)
print(f"Acc: {test_accuracy:.4f} | Time: {NeuralNetworkTrainingTime:.4f}s")
print(f"Prec: {precision:.4f} | Rec: {recall:.4f} | FPR: {fpr:.4f}")
# PCA: keep enough components to explain 95% of the variance.
pcaY = dataset["num"]
pca = PCA(n_components=0.95)
pcaX = pca.fit_transform(dataset.drop("num", axis=1))
# Make a DataFrame with PC1..PCk column names.
pc_names = [f"PC{i+1}" for i in range(pcaX.shape[1])]
pcaDataset = pd.DataFrame(pcaX, columns=pc_names)
# Re-attach the target. NOTE(review): .values relies on positional alignment;
# dataset's index was filtered earlier, so row order must match — confirm.
pcaDataset["num"] = pcaY.values
pcaDataset.head()  # show
# --- KNN on the PCA-reduced features ---
# BUG FIX: this section previously refit on the original X_train/X_test, so the
# "PCA" comparison never used the PCA features. Split pcaDataset here; the
# fixed random_state keeps the split reproducible.
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42)
KNNModelStartTime = time.time()
KNNModel.fit(pcaX_train, pcaY_train)
KNNModelEndTime = time.time()
# Predictions + metric calculation.
Y_pred = KNNModel.predict(pcaX_test)
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
# Score collection.
accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)
# Results.
print("KNN MODEL")
print(f"Time: {KNNModelEndTime-KNNModelStartTime:.4f}s")
print(f"Matrix:\n{confusionMatrix}")
print(f"Acc:{accuracy:.4f}|Prec:{precision:.4f}|Rec:{recall:.4f}|FPR:{fpr:.4f}")
# --- Decision tree on the PCA-reduced features ---
# BUG FIX: previously refit on the original X_train, so PCA had no effect on
# this comparison. The split is recomputed here so the cell is self-contained
# (same random_state -> identical split each time).
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42)
DecisionTreeStartTime = time.time()
DecisionTreeModel.fit(pcaX_train, pcaY_train)
DecisionTreeEndTime = time.time()
DecisionTreeTrainingTime = DecisionTreeEndTime - DecisionTreeStartTime
# Prediction.
Y_pred = DecisionTreeModel.predict(pcaX_test)
# Confusion-matrix-derived metrics.
accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
# Results.
print("Confusion Matrix: ")
print(confusionMatrix)
print("Decision Tree Accuracy: ", accuracy)
print("Decision Tree Training Time: ", DecisionTreeTrainingTime)
print("Decision Tree Precision: ", precision)
print("Decision Tree Recall: ", recall)
print("Decision Tree False Positive Rate: ", fpr)
# --- Random forest on the PCA-reduced features ---
# BUG FIX: previously refit on the original X_train instead of the PCA data.
# Split recomputed here so the cell stands alone (same random_state -> same split).
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42)
RandomForestStartTime = time.time()
RandomForestModel.fit(pcaX_train, pcaY_train)
RandomForestEndTime = time.time()
RandomForestTrainingTime = RandomForestEndTime - RandomForestStartTime
# Get predictions.
Y_pred = RandomForestModel.predict(pcaX_test)
# Calculate all metrics we need.
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
accuracy = accuracy_score(pcaY_test, Y_pred)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)
# Print results.
print("Random Forest Test Results")
print("Training Time : ", RandomForestTrainingTime)
print("Confusion Matrix :")
print(confusionMatrix)
print("Performance:")
print("Accuracy = ", accuracy)
print("Precision = ", precision)
print("Recall = ", recall)
print("FPR = ", fpr)
# --- Neural network on the PCA-reduced features ---
# BUG FIX: previously trained on the original X_train; use the PCA split so
# this run actually measures PCA's effect (same random_state -> same split).
pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(
    pcaDataset.drop("num", axis=1), pcaDataset["num"], test_size=0.2, random_state=42)
NeuralNetworkModel = Sequential()
NeuralNetworkModel.add(Dense(units=32, activation='sigmoid', input_dim=pcaX_train.shape[1]))
NeuralNetworkModel.add(Dense(units=16, activation='sigmoid'))  # hidden layer
# BUG FIX: softmax on a single unit is constantly 1.0; sigmoid for binary output.
NeuralNetworkModel.add(Dense(1, activation='sigmoid'))  # output layer
# Compile model.
NeuralNetworkModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train and time it.
NeuralNetworkStartTime = time.time()
history = NeuralNetworkModel.fit(pcaX_train, pcaY_train, epochs=10, batch_size=16, validation_split=0.2)
NeuralNetworkEndTime = time.time()
NeuralNetworkTrainingTime = NeuralNetworkEndTime - NeuralNetworkStartTime
# Evaluate on the test split.
test_loss, test_accuracy = NeuralNetworkModel.evaluate(pcaX_test, pcaY_test)
# Predict probabilities and threshold at 0.5 for class labels.
Y_pred = NeuralNetworkModel.predict(pcaX_test)
Y_pred = (Y_pred > 0.5).astype(int)
# Calculate everything.
confusionMatrix = confusion_matrix(pcaY_test, Y_pred)
tn, fp, fn, tp = confusionMatrix.ravel()
fpr = fp/(fp+tn)
precision = precision_score(pcaY_test, Y_pred)
recall = recall_score(pcaY_test, Y_pred)
# Store and print the report.
results = [
    ("Confusion Matrix", confusionMatrix),
    ("Accuracy", test_accuracy),
    ("Training Time", NeuralNetworkTrainingTime),
    ("Precision", precision),
    ("Recall", recall),
    ("FPR", fpr)
]
for name, val in results:
    print(f"{name}: {val}")