# data-mining-covid19/question1.py at main · tsaperlein/data-mining-covid19 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# --- Load the CSV file ---
# Read the raw dataset, then echo its schema and its summary statistics.
df = pd.read_csv("data.csv")
df.info()
stats = df.describe()
print("\n \n", stats, sep="\n")

# Count the missing values of every column and list the emptiest columns first
NaNs_data_df = df.isna().sum().sort_values(ascending=False)
print("\n \n", NaNs_data_df, sep="\n")

# Filter out inaccurate values in "Daily tests"
# Only strictly negative counts are removed; rows whose count is missing stay.
negative_tests = df['Daily tests'] < 0
df.drop(index=df.loc[negative_tests].index, inplace=True)
# --------------------------------------------


# Drop countries with too many missing values in "Daily tests" column
threshold = 75      # 75% missing values
# Share of missing "Daily tests" entries per country, expressed in percent
missing_share = df["Daily tests"].isna().groupby(df["Entity"]).mean()
country_nan_percentage = missing_share * 100
too_sparse = country_nan_percentage > threshold
countries_to_drop = country_nan_percentage[too_sparse].index

# print the countries to drop
print("\n \n", countries_to_drop, sep="\n")
# Keep every other country and renumber the rows from 0
keep_rows = ~df["Entity"].isin(countries_to_drop)
df = df.loc[keep_rows].reset_index(drop=True)

# --- Fill missing values in columns ---
# Fill missing values in "Daily tests" column, one country at a time: carry the
# last known value forward, then back-fill whatever is still missing at the
# start of each country's rows.
# GroupBy.ffill()/bfill() return a result aligned to df's own index, so the
# assignment does not rely on the rows being sorted by "Entity" nor on the
# index layout of groupby().apply() (which differs between pandas versions).
# It also avoids the deprecated fillna(method=...) spelling.
df['Daily tests'] = df.groupby('Entity')['Daily tests'].ffill()
df['Daily tests'] = df.groupby('Entity')['Daily tests'].bfill()
# Fill missing values in "Cases" column with 0
df['Cases'] = df['Cases'].fillna(0)
# Fill missing values in "Deaths" column with 0
df['Deaths'] = df['Deaths'].fillna(0)
# --------------------------------------------

# Save df to a new csv file
df.to_csv('modified_dataframe.csv', index=False)

# --- TIMELINE ---
# -- Plot the number of cases and deaths passing through time
# Convert date column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Total cases and deaths per date over all entities. Only the two numeric
# columns are aggregated, so the text column "Entity" is never summed.
data = df.groupby('Date')[['Cases', 'Deaths']].sum().reset_index()

# Create figure and axes
fig, ax = plt.subplots()

# Plot cases and deaths for all entities
ax.plot(data['Date'], data['Cases'], label='Cases')
ax.plot(data['Date'], data['Deaths'], label='Deaths')

# Set x and y labels
ax.set_xlabel('Month')
ax.set_ylabel('Number')

# Format x-axis labels: abbreviated month names on the major ticks (one per
# month) and a minor tick on the 15th of every month
month_format = mpl.dates.DateFormatter('%b')
ax.xaxis.set_major_formatter(month_format)
ax.xaxis.set_major_locator(mpl.dates.MonthLocator())
ax.xaxis.set_minor_locator(mpl.dates.MonthLocator(bymonthday=15))

# Set y-axis limits: start at 1 and use a logarithmic scale
ax.set_ylim(bottom=1)
ax.set_yscale('log')

# Add legend
ax.legend()

# Save plot, then release the figure like the other sections of this script do
plt.savefig('q1/cases_deaths_diagram.png', dpi=300, bbox_inches='tight')
plt.close(fig)
# --------------------------------------------

# --- Plot/save boxplots for each column ---
# Drop the identifier and time-series columns, then de-duplicate the remaining rows
columns = ['Entity', 'Continent', 'Date', 'Daily tests', 'Cases', 'Deaths']
data = df.drop(columns, axis=1).drop_duplicates()
# Three plots per row. The original 3x3 grid is kept as the minimum; extra rows
# (with proportionally more height) are added when there are more than nine
# columns, instead of failing with an IndexError.
plots_per_row = 3
grid_rows = max(3, (len(data.columns) + plots_per_row - 1) // plots_per_row)
fig, axs = plt.subplots(grid_rows, plots_per_row, figsize=(12, 8 * grid_rows / 3))
for box_ax, col in zip(axs.flatten(), data.columns):
    # boxplot() does not skip NaN; a column with missing values would
    # otherwise be drawn as an empty plot.
    box_ax.boxplot(data[col].dropna())
    box_ax.set_title(col)
plt.tight_layout()
plt.savefig("q1/boxplots.png", bbox_inches='tight')
plt.close()
# --------------------------------------------

# --- Plot histograms for each column ---
columns = ['Date', 'Daily tests', 'Cases', 'Deaths']
# DataFrame.hist() creates its own figure (sized by figsize=) when no ax= is
# passed, so no plt.figure() call is made beforehand: it would only leave an
# extra, empty figure open.
df.drop(columns, axis=1).drop_duplicates().hist(bins=15, figsize=(16, 9), rwidth=0.8)
plt.savefig("q1/histograms.png", bbox_inches='tight')
plt.close()
# --------------------------------------------

# --- Plot heatmap for correlation between columns ---
# One row per country: the last row of each "Entity" group
# (presumably the most recent date; this relies on the row order of df).
last_rows = df.groupby('Entity').tail(1)
# Identifier columns are left out of the correlation matrix
df_last = last_rows.drop(columns=['Entity', 'Date', 'Continent'])
correlations = df_last.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, cmap=plt.cm.Reds, annot=True)
plt.savefig("q1/correlation-heatmap.png", bbox_inches='tight')
plt.close()
# --------------------------------------------

# --- Plot feature-output-variable distributions for each column ---
# Scatter plots readability: remove outliers in all columns except in the column 'Continent'
# Start from the last row of each country, without the identifier columns.
latest = df.groupby('Entity').tail(1).drop(columns=['Entity', 'Date'])
continent_only = latest[['Continent']]
numeric_part = latest.drop(columns='Continent')
# A row is kept only when every one of its values lies within 3 standard
# deviations of its column mean.
# NOTE(review): zscore propagates NaN by default, so a missing value anywhere
# in a column would make every row fail this test. Confirm these columns
# contain no NaN.
within_three_sigma = (np.abs(zscore(numeric_part)) < 3).all(axis=1)
# join() keeps the Continent of every country; rows rejected as outliers come
# back with NaN in the numeric columns.
df_last = continent_only.join(numeric_part[within_three_sigma])

# Plot feature-output-variable distributions for each column
feature_columns = df_last.columns.drop(['Cases', 'Deaths'])
for column in feature_columns:
    fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
    # Left panel: feature against cases, right panel: feature against deaths
    for target, panel in zip(('Cases', 'Deaths'), axes):
        df_last.plot.scatter(x=column, y=target, ax=panel)
    if column == 'Continent':
        # Rotate the continent labels on the x axis
        fig.autofmt_xdate(rotation=90)
    # "/" cannot appear inside a file name
    file_name = column.replace("/", "-")
    plt.savefig(f"q1/Scatter/{file_name}-scatter.png", bbox_inches='tight')
    plt.close()
# --------------------------------------------

# --- Scatter deaths-cases ---
# Deaths against cases, using the per-country rows held in df_last
fig, ax = plt.subplots(figsize=(10, 6))
df_last.plot.scatter(x='Cases', y='Deaths', ax=ax)
# Plain string literal: the path has no placeholders, so no f-string is needed
fig.savefig("q1/Scatter/deaths-cases-scatter.png", bbox_inches='tight')
plt.close(fig)