-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcorrelations.py
More file actions
106 lines (90 loc) · 3.63 KB
/
correlations.py
File metadata and controls
106 lines (90 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Calculate Pearson, Spearman, and Kendall correlations with heatmap
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, kendalltau
import os
# Input CSV and the directory that receives every generated artifact
file_path = "PT-all-scores-all-approaches.csv"
output_dir = "correlations"
os.makedirs(output_dir, exist_ok=True)

# Raw CSV column name -> short label used in the matrices and on the heatmap axes
score_labels = {
    "CVSSBaseScore": "CVSS-B",
    "ExploitabilityIndex": "Expl. Index",
    "EPSSScore": "EPSS",
    "SSVCScoreLow": "SSVC-L",
    "SSVCScoreMedium": "SSVC-M",
    "SSVCScoreHigh": "SSVC-H",
}
score_columns = [*score_labels.values()]

# Load the data and switch to the short labels in one step
df = pd.read_csv(file_path).rename(columns=score_labels)

# Coerce the score columns to numbers (unparseable cells become NaN),
# then keep only the rows where every score is present
df[score_columns] = df[score_columns].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=score_columns)
# Function to calculate correlations + p-values
def correlation_with_pvalues(data, method_name, stat_func):
    """Compute, save and plot pairwise correlations between the score columns.

    For every unordered pair of columns listed in the module-level
    ``score_columns``, ``stat_func`` is called on the two columns of ``data``
    and its result is unpacked as ``(correlation, p_value)``. The resulting
    symmetric matrices are written as CSV files into the module-level
    ``output_dir``, and the correlation matrix is rendered there as an
    upper-triangle heatmap PNG.

    Args:
        data: DataFrame containing every column named in ``score_columns``.
        method_name: Label used in the output file names and, capitalized,
            in the plot title and the progress message.
        stat_func: Callable taking two columns and returning something that
            unpacks into ``(correlation, p_value)``.

    Returns:
        Tuple ``(corr_matrix, pval_matrix)`` of DataFrames indexed and
        labelled by ``score_columns``.
    """
    corr_matrix = pd.DataFrame(index=score_columns, columns=score_columns, dtype=float)
    pval_matrix = pd.DataFrame(index=score_columns, columns=score_columns, dtype=float)

    for position, first in enumerate(score_columns):
        # Diagonal entries are fixed rather than computed.
        corr_matrix.loc[first, first] = 1.0
        pval_matrix.loc[first, first] = 0.0
        # Visit each unordered pair exactly once and mirror the result, so a
        # genuinely-NaN statistic is never mistaken for "not computed yet".
        for second in score_columns[position + 1:]:
            corr, pval = stat_func(data[first], data[second])
            corr_matrix.loc[first, second] = corr
            corr_matrix.loc[second, first] = corr
            pval_matrix.loc[first, second] = pval
            pval_matrix.loc[second, first] = pval

    # Save full matrices
    corr_matrix.to_csv(f"{output_dir}/{method_name}_correlation_values.csv")
    pval_matrix.to_csv(f"{output_dir}/{method_name}_pvalues.csv")

    # Hide the lower triangle and the diagonal.
    mask = np.tril(np.ones(corr_matrix.shape, dtype=bool))
    # With that mask the last row and the first column are entirely hidden;
    # dropping them removes the blank strip from the figure.
    trimmed_corr = corr_matrix.iloc[:-1, 1:]
    trimmed_mask = mask[:-1, 1:]

    heatmap_path = f"{output_dir}/{method_name}_correlation_heatmap_upper_triangle.png"

    # Plot on an explicit figure/axes so nothing depends on pyplot's
    # "current figure" state, and so the figure is released even on error.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            trimmed_corr.astype(float),
            mask=trimmed_mask,
            annot=True,
            annot_kws={"size": 26},  # Larger numbers inside cells
            fmt=".2f",
            cmap="coolwarm",
            vmin=-1,
            vmax=1,
            square=False,
            linewidths=0.5,
            cbar_kws={
                "shrink": 0.95,
                "label": "Correlation",
                "format": "%.2f",
            },
            xticklabels=trimmed_corr.columns,
            yticklabels=trimmed_corr.index,
            ax=ax,
        )

        # Colorbar fonts
        cbar = ax.collections[0].colorbar
        cbar.ax.tick_params(labelsize=16)
        cbar.set_label("Correlation", fontsize=18)

        # Axis tick-label fonts and title
        ax.tick_params(axis="both", labelsize=18)
        ax.set_title(f"{method_name.capitalize()} Correlation", fontsize=25)

        # Save figure
        fig.tight_layout()
        fig.savefig(heatmap_path, dpi=300)
    finally:
        plt.close(fig)

    print(f"{method_name.capitalize()} correlation heatmap saved to {heatmap_path}.")
    return corr_matrix, pval_matrix
# Compute, save and plot each correlation type in turn
for method_name, stat_func in (
    ("pearson", pearsonr),
    ("spearman", spearmanr),
    ("kendall", kendalltau),
):
    correlation_with_pvalues(df, method_name, stat_func)

# Reload the saved correlation matrices; the first CSV column holds the row labels
pearson_corr = pd.read_csv(
    f"{output_dir}/pearson_correlation_values.csv", index_col=0
)
spearman_corr = pd.read_csv(
    f"{output_dir}/spearman_correlation_values.csv", index_col=0
)
kendall_corr = pd.read_csv(
    f"{output_dir}/kendall_correlation_values.csv", index_col=0
)