-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
96 lines (84 loc) · 3.7 KB
/
utils.py
File metadata and controls
96 lines (84 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
def create_dirs(paths):
    """
    Ensure that every directory in `paths` exists, creating the missing ones.
    Used to ensure 'results/', 'final_model/', and 'plot/' folders exist.

    Args:
        paths: Iterable of directory paths. A single path passed as a
            str, bytes or os.PathLike object is treated as a one-element list.

    Raises:
        FileExistsError: If a path exists but is not a directory.

    Empty entries are skipped: os.path.dirname() returns '' for a bare file
    name, and os.makedirs('') would raise FileNotFoundError.
    """
    if isinstance(paths, (str, bytes, os.PathLike)):
        # Iterating over a string would create one directory per character.
        paths = [paths]
    for p in paths:
        if not p:
            continue
        # exist_ok=True already makes the call idempotent, so no separate
        # (race-prone) os.path.exists() check is needed.
        os.makedirs(p, exist_ok=True)
def save_config(config, folder, filename):
    """
    Saves a dictionary (config) as a JSON file to the specified folder.

    Args:
        config: JSON-serializable object (typically a dict).
        folder: Destination directory, created if it does not exist.
            An empty string writes into the current working directory.
        filename: Name of the JSON file inside `folder`.

    Raises:
        TypeError, ValueError: Propagated from json.dumps when `config`
            cannot be serialized; nothing is written in that case.
    """
    # Serialize before touching the filesystem so that a failing
    # serialization does not leave a truncated file behind.
    payload = json.dumps(config, indent=4)
    if folder:
        # An empty folder means "current directory": nothing to create.
        create_dirs([folder])
    with open(os.path.join(folder, filename), 'w') as f:
        f.write(payload)
def load_cup_data(tr_path, ts_path, n_inputs=12, n_targets=4):
    """
    Parses the ML-CUP CSV files.
    - Reads Training data (inputs and targets).
    - Reads Blind Test data (inputs and IDs).

    Both files are read without a header row; '#' starts a comment.
    Column 0 is the pattern id, followed by `n_inputs` input columns and,
    in the training file, `n_targets` target columns.

    Args:
        tr_path: Path of the training CSV file.
        ts_path: Path of the blind test CSV file.
        n_inputs: Number of input columns following the id column.
        n_targets: Number of target columns following the inputs.

    Returns: X_all, Y_all, X_blind, ids_blind.
        X_all, Y_all and X_blind are float64 arrays; ids_blind is an
        integer array taken from column 0 of the blind test file.
    """
    # Column layout: [id | inputs ... | targets ...]
    first_input = 1
    first_target = first_input + n_inputs
    end_target = first_target + n_targets

    df_tr = pd.read_csv(tr_path, comment='#', header=None)
    X_all = df_tr.iloc[:, first_input:first_target].values.astype(np.float64)
    Y_all = df_tr.iloc[:, first_target:end_target].values.astype(np.float64)

    df_ts = pd.read_csv(ts_path, comment='#', header=None)
    X_blind = df_ts.iloc[:, first_input:first_target].values.astype(np.float64)
    ids_blind = df_ts.iloc[:, 0].values.astype(int)
    return X_all, Y_all, X_blind, ids_blind
def load_monk_data(train_path, test_path):
    """
    Loads a MONK train/test file pair.
    - Each file is read as whitespace-separated values with no header row.
    - Column 0 is taken as the label, columns 1-6 as the categorical features.
    - The six features are One-Hot Encoded into 17 columns (3+3+2+3+4+2).
    Returns: X_train, y_train, X_test, y_test (labels as column vectors).
    """
    # Number of distinct values of each of the six features.
    cardinalities = (3, 3, 2, 3, 4, 2)

    def _read_split(path):
        frame = pd.read_csv(path, sep=r'\s+', header=None)
        attributes = frame.iloc[:, 1:7].values
        # Feature values are 1-based: row (v - 1) of an identity matrix is
        # the one-hot vector for value v.
        blocks = [
            np.eye(card)[attributes[:, col] - 1]
            for col, card in enumerate(cardinalities)
        ]
        features = np.hstack(blocks)
        labels = frame.iloc[:, 0].values.reshape(-1, 1)
        return features, labels

    X_train, y_train = _read_split(train_path)
    X_test, y_test = _read_split(test_path)
    return X_train, y_train, X_test, y_test
def plot_monk_combined(history, dataset_name, save_path):
    """
    Generates and saves a side-by-side plot for MONK tasks:
    - Left: Mean Squared Error (MSE) vs Epochs.
    - Right: Accuracy vs Epochs.

    Args:
        history: Mapping with the series 'mse_train' and 'acc_train';
            'mse_test' and 'acc_test' are plotted too when present.
        dataset_name: Text appended to both subplot titles.
        save_path: Output image path. Its parent directory is created when
            missing; a bare file name saves into the current directory.

    Raises:
        KeyError: If 'mse_train' or 'acc_train' is missing from `history`.
    """
    out_dir = os.path.dirname(save_path)
    if out_dir:
        # dirname is '' for a bare file name; there is nothing to create then.
        create_dirs([out_dir])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    try:
        ax1.plot(history['mse_train'], label='Train MSE', color='blue')
        if 'mse_test' in history:
            ax1.plot(history['mse_test'], label='Test MSE', color='orange')
        ax1.set_title(f"Learning Curve (MSE) - {dataset_name}")
        ax1.legend()
        ax1.grid(True)

        ax2.plot(history['acc_train'], label='Train Accuracy', color='blue')
        if 'acc_test' in history:
            ax2.plot(history['acc_test'], label='Test Accuracy', color='orange')
        ax2.set_title(f"Learning Curve (Accuracy) - {dataset_name}")
        ax2.legend()
        ax2.grid(True)

        fig.tight_layout()
        fig.savefig(save_path)
        plt.show()
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
def plot_cup_curve(history, metric_tr, metric_val, title, save_path):
    """
    Generates and saves a learning curve plot for the CUP task.
    Plots Training metric vs Validation metric (usually MEE) over epochs.

    Args:
        history: Mapping from metric name to its per-epoch series.
        metric_tr: Key of the training series in `history` (required).
        metric_val: Key of the validation series; plotted only when present
            in `history`.
        title: Figure title.
        save_path: Output image path. Its parent directory is created when
            missing; a bare file name saves into the current directory.

    Raises:
        KeyError: If `metric_tr` is missing from `history`.
    """
    out_dir = os.path.dirname(save_path)
    if out_dir:
        # dirname is '' for a bare file name; there is nothing to create then.
        create_dirs([out_dir])

    # Any metric key containing "mee" gets the short "MEE" legend label.
    label_tr = "Train MEE" if "mee" in metric_tr else f"Train {metric_tr}"
    label_val = "Val MEE" if "mee" in metric_val else f"Val {metric_val}"

    fig = plt.figure(figsize=(10, 6))
    try:
        plt.plot(history[metric_tr], label=label_tr, color='blue')
        if metric_val in history:
            plt.plot(history[metric_val], label=label_val, color='orange')
        plt.title(title)
        plt.legend()
        plt.grid(True)
        fig.savefig(save_path)
        plt.show()
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)