# milestone1.py -- exploratory analysis of the stroke dataset.
# Source: ShafiMohammad09/Predicting-Stroke-Risk-Using-Machine-Learning (GitHub, branch main).

import pandas as pd

# Load the dataset. The path is relative, so the script must be run from the
# directory that contains the `dataset/` folder.
df = pd.read_csv('dataset/data.csv')

# Basic exploration: summary statistics, schema and size of the frame.
print("Basic statistical description of numerical data:")
print(df.describe())

print("\nDataset information (columns, types, and non-null counts):")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nShape of the dataset (rows, columns):")
print(df.shape)
print("\nBasic statistical description of categorical data:")
print(df.describe(include='object'))

# Distinct categories found in the two categorical columns inspected here.
# The resulting arrays are kept in module-level names because the numbered
# observations further down print them again.
print("\nUnique values in 'gender' column:")
gender_unique = df['gender'].unique()
print(gender_unique)

print("\nUnique values in 'smoking_status' column:")
smoking_status_unique = df['smoking_status'].unique()
print(smoking_status_unique)

# Boolean mask of missing cells, computed once and reused for both the
# per-column counts and the per-column percentages.
missing_mask = df.isna()

# Number of missing cells per column.
null_values = missing_mask.sum()
print("\nNull values in each column:")
print(null_values)

# Share of missing cells per column, expressed in percent.
null_percentage = missing_mask.mean().mul(100)
print("\nPercentage of null values in each column:")
print(null_percentage)

# Numbered observations derived from the exploration results computed above
# (`null_values`, `null_percentage`, `gender_unique`, `smoking_status_unique`).
print("\nObservations:")

# Display the data type of each column so it can be compared with the
# expected types.
print("\nData types of each column:")
print(df.dtypes)

# **observation 1:**
# Size of the dataset (the original author recorded 5110 rows and 12 columns).
print(f"1. The dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

# **observation 2:**
# Missing values in 'bmi' before any cleaning (the original author recorded
# 201). The count comes from `null_values`, which was taken before imputation.
missing_bmi_count = null_values['bmi']
if missing_bmi_count > 0:
    print(f"2. The 'bmi' column originally contained {missing_bmi_count} missing values.")
else:
    print("2. The 'bmi' column contains no missing values.")

# **observation 3:**
# Distinct values of 'gender' (recorded by the original author as
# ['Male', 'Female', 'Other']).
print(f"3. The 'gender' column contains the following unique values: {gender_unique}")

# **observation 4:**
# Distinct values of 'smoking_status' (recorded by the original author as
# ['formerly smoked', 'never smoked', 'smokes', 'Unknown']).
print(f"4. The 'smoking_status' column contains the following unique values: {smoking_status_unique}")

# **observation 5:**
# Columns that have any missing data, with the percentage missing (the
# original author recorded 3.93% for 'bmi').
# Plain string literal: the message has no placeholders, so no f-prefix.
print("5. The dataset contains missing data in the following columns (with percentages):")
print(null_percentage[null_percentage > 0])

# **observation 6:**
# Two alternative strategies for the missing 'bmi' values are demonstrated.
# Option 1: drop the rows whose 'bmi' is missing. dropna() returns a new
# frame, so `df` keeps its missing values and is still available for option 2.
df_dropped = df.dropna(subset=['bmi'])  # Option 1
print(f"6. After dropping rows with missing 'bmi' values, the dataset contains {df_dropped.shape[0]} rows.")

# Option 2: impute missing 'bmi' values with the column mean.
mean_bmi = df['bmi'].mean()
# Assign the filled column back instead of calling
# `df['bmi'].fillna(..., inplace=True)`: that chained in-place form operates
# on an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `df` unchanged when Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(mean_bmi)  # Option 2
print(f"\nImputing missing 'bmi' values with mean value: {mean_bmi}")

# Re-count missing values per column after imputation.
null_values_after = df.isnull().sum()
print("\nNull values after imputing missing 'bmi' values:")
print(null_values_after)

# **observation 7:**
# Confirm that imputation left no missing values in 'bmi'. Nothing is printed
# if some remain.
if null_values_after['bmi'] == 0:
    print("7. After imputing, the 'bmi' column has no missing values.")

# Count fully duplicated rows (measured on the imputed frame).
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# **observation 8:**
# Stroke rate by gender: the mean of the 'stroke' column within each gender
# group. The stroke == 1 filter below indicates a 0/1 flag, in which case the
# mean is the fraction of rows with a stroke.
print("\nStroke rate by gender:")
print(df.groupby('gender')['stroke'].mean())
# Output recorded by the original author:
# gender
# Female    0.047094
# Male      0.051064
# Other     0.000000

# Share of all stroke cases contributed by each gender, in percent.
# The computation is done once; the original repeated these three statements
# verbatim.
total = df['stroke'].sum()
# Only rows with stroke == 1 are grouped, so a gender with no stroke cases
# does not appear in the result.
strokes_gender = df[df['stroke'] == 1].groupby('gender')['stroke'].count()
stroke_per = (strokes_gender / total) * 100
print("\nStroke percentage by gender (relative to all stroke cases):")
print(stroke_per)
# Output recorded by the original author:
# gender
# Female    56.626506
# Male      43.373494