Machine-Learing-Course-2025-Project--Unipi/utils.py at main · dev-beluga/Machine-Learing-Course-2025-Project--Unipi · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json

def create_dirs(paths):
    """
    Ensure that every directory in ``paths`` exists, creating missing ones.

    Used to make sure output folders such as 'results/', 'final_model/' and
    'plot/' exist before files are written into them.

    Args:
        paths: Iterable of directory paths. A single path passed directly as
            a str/bytes/os.PathLike is treated as a one-element list instead
            of being iterated character by character.

    Notes:
        Empty entries ('' or None) are skipped. ``os.path.dirname`` returns ''
        for a bare filename, and ``os.makedirs('')`` would raise
        FileNotFoundError even though the current directory already exists.
    """

    if isinstance(paths, (str, bytes, os.PathLike)):
        paths = [paths]

    for p in paths:
        # Nothing to do for "current directory" placeholders or paths that
        # are already present on disk.
        if not p or os.path.exists(p):
            continue
        # exist_ok guards against another process creating the directory
        # between the existence check above and this call.
        os.makedirs(p, exist_ok=True)

def save_config(config, folder, filename):
    """
    Save a dictionary (config) as a JSON file inside the specified folder.

    Args:
        config: JSON-serialisable object (typically a dict of
            hyper-parameters). NumPy scalars and arrays nested inside it are
            converted to plain Python numbers / lists.
        folder: Destination directory; created if missing. An empty string
            means the current working directory.
        filename: Name of the JSON file to write inside ``folder``.

    Raises:
        TypeError: If ``config`` contains a value that cannot be converted to
            JSON. Raised before the target file is opened, so an existing
            file is not truncated by a failed save.
    """

    def _jsonable(obj):
        # Called by json only for objects it cannot encode natively.
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    # Serialise first: if this fails, nothing on disk has been touched yet.
    payload = json.dumps(config, indent=4, default=_jsonable)

    # os.makedirs('') raises, so only create the folder when one was given.
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
        f.write(payload)

def load_cup_data(tr_path, ts_path):
    """
    Load the ML-CUP training and blind-test CSV files.

    Both files are read without a header row, and lines starting with '#'
    are treated as comments.

    Args:
        tr_path: Path of the training CSV. Columns 1-12 are taken as inputs
            and columns 13-16 as targets (column 0 is not used).
        ts_path: Path of the blind-test CSV. Column 0 is taken as the sample
            id and columns 1-12 as inputs.

    Returns:
        Tuple ``(X_all, Y_all, X_blind, ids_blind)``: float64 training inputs,
        float64 training targets, float64 blind-test inputs and integer
        blind-test ids.
    """

    csv_options = {'comment': '#', 'header': None}
    input_cols = slice(1, 13)
    target_cols = slice(13, 17)

    # Training split: inputs and targets live in the same file.
    train_frame = pd.read_csv(tr_path, **csv_options)
    train_inputs = train_frame.iloc[:, input_cols].values.astype(np.float64)
    train_targets = train_frame.iloc[:, target_cols].values.astype(np.float64)

    # Blind split: only ids and inputs are read.
    blind_frame = pd.read_csv(ts_path, **csv_options)
    blind_inputs = blind_frame.iloc[:, input_cols].values.astype(np.float64)
    blind_ids = blind_frame.iloc[:, 0].values.astype(int)

    return train_inputs, train_targets, blind_inputs, blind_ids

def load_monk_data(train_path, test_path):
    """
    Load a MONK train/test file pair and one-hot encode its features.

    Each file is read as whitespace-separated text without a header row.
    Column 0 is taken as the label and columns 1-6 as categorical features
    whose values are used as 1-based category indices.

    Args:
        train_path: Path of the MONK training file.
        test_path: Path of the MONK test file.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Each ``X`` has
        3 + 3 + 2 + 3 + 4 + 2 = 17 one-hot columns; each ``y`` is a column
        vector of shape (n_samples, 1).
    """

    # Number of categories of each of the six features, in column order.
    cardinalities = (3, 3, 2, 3, 4, 2)

    def _load_split(path):
        frame = pd.read_csv(path, sep=r'\s+', header=None)
        attributes = frame.iloc[:, 1:7].values
        # Row k of the identity matrix is the one-hot vector for category
        # k + 1, so indexing it with (value - 1) encodes a whole column.
        one_hot_blocks = [
            np.eye(n_values)[attributes[:, col] - 1]
            for col, n_values in enumerate(cardinalities)
        ]
        features = np.hstack(one_hot_blocks)
        labels = frame.iloc[:, 0].values.reshape(-1, 1)
        return features, labels

    X_train, y_train = _load_split(train_path)
    X_test, y_test = _load_split(test_path)
    return X_train, y_train, X_test, y_test

def plot_monk_combined(history, dataset_name, save_path):
    """
    Generate, save and show a side-by-side learning-curve plot for a MONK task.

    - Left panel: Mean Squared Error (MSE) per epoch.
    - Right panel: Accuracy per epoch.

    Args:
        history: Mapping that must contain the sequences 'mse_train' and
            'acc_train'. The sequences 'mse_test' and 'acc_test' are plotted
            as well when present.
        dataset_name: Label appended to both panel titles.
        save_path: File path the figure is written to. Its parent directory
            is created if missing; a bare filename saves into the current
            working directory.

    Raises:
        KeyError: If 'mse_train' or 'acc_train' is missing from ``history``.
    """

    # dirname() is '' for a bare filename; there is no directory to create
    # in that case and os.makedirs('') would raise.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        create_dirs([out_dir])

    fig, (ax_mse, ax_acc) = plt.subplots(1, 2, figsize=(14, 6))

    ax_mse.plot(history['mse_train'], label='Train MSE', color='blue')
    if 'mse_test' in history:
        ax_mse.plot(history['mse_test'], label='Test MSE', color='orange')
    ax_mse.set_title(f"Learning Curve (MSE) - {dataset_name}")
    ax_mse.legend()
    ax_mse.grid(True)

    ax_acc.plot(history['acc_train'], label='Train Accuracy', color='blue')
    if 'acc_test' in history:
        ax_acc.plot(history['acc_test'], label='Test Accuracy', color='orange')
    ax_acc.set_title(f"Learning Curve (Accuracy) - {dataset_name}")
    ax_acc.legend()
    ax_acc.grid(True)

    fig.tight_layout()
    # Save before show(): some backends discard the figure once its window
    # is closed.
    fig.savefig(save_path)
    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

def plot_cup_curve(history, metric_tr, metric_val, title, save_path):
    """
    Generate, save and show a learning-curve plot for the CUP task.

    Plots the training metric and, when available, the validation metric
    (usually MEE) over epochs.

    Args:
        history: Mapping from metric name to a sequence of per-epoch values.
            Must contain ``metric_tr``; ``metric_val`` is plotted only if it
            is present.
        metric_tr: Key of the training metric in ``history``. If the key
            contains "mee" the curve is labelled "Train MEE", otherwise
            "Train <metric_tr>".
        metric_val: Key of the validation metric in ``history``; labelled
            "Val MEE" or "Val <metric_val>" by the same rule.
        title: Title of the plot.
        save_path: File path the figure is written to. Its parent directory
            is created if missing; a bare filename saves into the current
            working directory.

    Raises:
        KeyError: If ``metric_tr`` is missing from ``history``.
    """

    # dirname() is '' for a bare filename; there is no directory to create
    # in that case and os.makedirs('') would raise.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        create_dirs([out_dir])

    label_tr = "Train MEE" if "mee" in metric_tr else f"Train {metric_tr}"
    label_val = "Val MEE" if "mee" in metric_val else f"Val {metric_val}"

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history[metric_tr], label=label_tr, color='blue')
    if metric_val in history:
        ax.plot(history[metric_val], label=label_val, color='orange')
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    # Save before show(): some backends discard the figure once its window
    # is closed.
    fig.savefig(save_path)
    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)