Auto_Optimizer/TLDR.py at main · JaneM443/Auto_Optimizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import optuna
import subprocess
import pickle
import sys
import os
import shutil
import logging
from typing import Any, Dict, Tuple, List
import time

###-------------------------------------------------------------------------###
### Runs multiple trials and logs results                                   ###
###-------------------------------------------------------------------------###

def load_logger(output_file_path):
    ###---------------------------------------------------------------------###
    ### Set up logging so messages go to 'TLDR_output.log' inside the       ###
    ### given directory, then record that the logger is ready               ###
    ###                                                                     ###
    ### output_file_path : directory that will hold the log file            ###
    ###---------------------------------------------------------------------###

    # When enabled the log file is overwritten on start, otherwise appended to
    AUTO_CLEAR = True
    log_file_mode = 'w' if AUTO_CLEAR else 'a'

    logging.basicConfig(
        filename = f"{output_file_path}/TLDR_output.log",
        level = logging.DEBUG,
        filemode = log_file_mode,
        format = "%(levelname)s | %(asctime)s | '%(message)s'"
        )

    logging.info("Logger Loaded")

    #-----------------------------------------------------------------------###

def main(data) -> None:
    ###---------------------------------------------------------------------###
    ### Sets up, runs and concludes the full trial
    ###
    ### data : indexable container; data[0] holds the hyperparameter ranges
    ###        and data[2] the runtime parameters (data[1] is not used here)
    ###
    ### Raises subprocess.CalledProcessError if the HPL setup script fails
    ###---------------------------------------------------------------------###

                            # Dict[Param Name, Tuple[Min, Max]] (Ns, Ps)
    hyperparameters         : Dict[str, Tuple[Any, Any]] = data[0]

                            # Dict[Parameter Name, Value] (SLURM Parameters)
    runtimeparameters       : Dict[str, Any] = data[2]

    # HPL file paths

    folder_path = "hpl-2.3/"
    file_path = "hpl-2.3.tar.gz"

    # Remove HPL and run setup from scratch

    logging.info("Running 'SLURM/setup_hpl.sh'")

    # Best-effort clean-up: each artefact is removed on its own so that a
    # missing folder does not prevent the stale tarball from being deleted
    try:
        shutil.rmtree(folder_path)
    except OSError as e:
        logging.warning(f"Error: {folder_path} could not be removed - {e}")

    try:
        os.remove(file_path)
    except OSError as e:
        logging.warning(f"Error: {file_path} could not be removed - {e}")

    slurm_script_path = 'SLURM/setup_hpl.sh'
    try:
        subprocess.run(['bash', slurm_script_path], check=True)
        logging.debug("SLURM script executed successfully.")
    except subprocess.CalledProcessError as e:
        logging.error(f"Error executing SLURM script: {e}")
        raise

    # Create optuna study and runs the full trial

    study = optuna.create_study(direction = "maximize",
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(lambda trial : objective(trial,
                                            hyperparameters,
                                            runtimeparameters),
                   n_trials = runtimeparameters["Number Of Trials"][0])

    # Logs the best parameters, value and trial found by the study

    logging.info("Best Parameters: "+str(study.best_params))
    logging.info("Best Value: "+str(study.best_value))
    logging.info("Best Study: "+str(study.best_trial))

    #-----------------------------------------------------------------------###

def edit_HPL_dat(limits):
    ###---------------------------------------------------------------------###
    ### Fills in the HPL.dat scaffold with the chosen parameter values
    ###
    ### limits : mapping of parameter name -> value; every '{name}' marker
    ###          in 'Extra/HPL.dat.scaffold' is swapped for its value and
    ###          the result is written to 'hpl-2.3/testing/HPL.dat'
    ###---------------------------------------------------------------------###

    scaffold_path = 'Extra/HPL.dat.scaffold'
    target_path = "hpl-2.3/testing/HPL.dat"

    with open(scaffold_path, 'r') as scaffold:
        rendered = scaffold.read()

    # Markers with no matching key in limits are left untouched
    for name, value in limits.items():
        marker = f"{{{name}}}"
        rendered = rendered.replace(marker, f"{value}")

    with open(target_path, 'w') as target:
        target.write(rendered)

    #-----------------------------------------------------------------------###

def run_hpl_benchmark():
    ###---------------------------------------------------------------------###
    ### Runs one HPL trial by executing 'SLURM/run_hpl.sh' with bash
    ###
    ### Raises subprocess.CalledProcessError when the script exits non-zero
    ###---------------------------------------------------------------------###

    command = ['bash', 'SLURM/run_hpl.sh']

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as failure:
        logging.error(f"Error executing SLURM script: {failure}")
        raise
    else:
        logging.debug("SLURM script executed successfully.")

    #-----------------------------------------------------------------------###

def retrieve_latest_gflops():
    ###---------------------------------------------------------------------###
    ### Extracts GFLOPS data from HPL log file
    ###
    ### Reads 'hpl-2.3/testing/hpl.log', skips the first line mentioning
    ### "Gflops", and for every later mention takes the line two below it;
    ### the last whitespace separated field of that line is the result.
    ###
    ### Returns the single value found as a float
    ### Raises Exception unless exactly one value is found
    ###---------------------------------------------------------------------###

    with open('hpl-2.3/testing/hpl.log','r') as log_file:
        log_lines = log_file.readlines()

    # Positions of all lines mentioning "Gflops"; the first one is discarded
    mention_positions = [position
                         for position, text in enumerate(log_lines)
                         if "Gflops" in text]
    result_positions = {position + 2 for position in mention_positions[1:]}

    gflops_fields = []
    for position, text in enumerate(log_lines):
        if position not in result_positions:
            continue
        fields = [field for field in text.strip('\n').split(' ') if field != '']
        gflops_fields.append(fields[-1])

    found = len(gflops_fields)
    if found != 1:
        problem = f"{found} is an invalid number of lines returned from data search. Expecting 1"
        logging.critical(problem)
        raise Exception(problem)

    return float(gflops_fields[0])

    #-----------------------------------------------------------------------###

def _rank_divisors(lower, upper, number_of_ranks):
    ### Returns every P with lower <= P <= upper that divides number_of_ranks
    # Start at 1 at the lowest: zero would raise ZeroDivisionError below
    first_candidate = max(lower, 1)
    return [candidate
            for candidate in range(first_candidate, upper + 1)
            if number_of_ranks % candidate == 0]

def objective(trial, hyperparameters, runtimeparameters):
    ###---------------------------------------------------------------------###
    ### The study, lays out method for each trial
    ###
    ### trial             : optuna trial used to suggest parameter values
    ### hyperparameters   : Dict[Param Name, Tuple[Min, Max]], must hold "Ps"
    ### runtimeparameters : must hold "Number Of Nodes" and "Cores Per Node",
    ###                     each indexable with the value at position 0
    ###
    ### Returns the Gflops figure read back from the HPL log
    ### Raises ValueError if no P in the requested range divides the ranks
    ###---------------------------------------------------------------------###

    # Logging information

    current_time = time.perf_counter()
    logging.info("Trial Started")

    # Choosing hyperparameter values

    nodes = runtimeparameters["Number Of Nodes"][0]
    cores = runtimeparameters["Cores Per Node"][0]
    number_of_ranks = nodes*cores

    # Optuna picks a value within the user specified range for hyperparameters
    # (both ends of the range are inclusive for suggest_int)
    limits = {key: trial.suggest_int(key,
                                     hyperparameters[key][0],
                                     hyperparameters[key][1])
                for key in hyperparameters if key not in ("Ps", "Qs")}

    # Selects possible P values in the user specified range that divide ranks.
    # The upper bound is inclusive so that Max is treated the same way as it
    # is for the suggest_int parameters above.
    ps_min, ps_max = hyperparameters["Ps"][0], hyperparameters["Ps"][1]
    divisors = _rank_divisors(ps_min, ps_max, number_of_ranks)
    if not divisors:
        message = (f"No value of Ps in [{ps_min}, {ps_max}] divides "
                   f"{number_of_ranks} ranks")
        logging.critical(message)
        raise ValueError(message)

    # Optuna selects one of these divisors
    Ps = trial.suggest_categorical("Ps", divisors)
    # Q is then fixed by this choice
    Qs = number_of_ranks // Ps

    #! Temporary remove once latency is gone
    limits.update({"Ps":Ps, "Qs":Qs})

    logging.info(f"Limits : {str(limits)}")

    # Run Benchmark with these values and extract GFLOPS

    # Update HPL dat file
    edit_HPL_dat(limits)
    # Run the HPL benchmark while time stamping
    os.system("echo `date -u` > hpl_submission.tstamps")
    run_hpl_benchmark()
    os.system("echo `date -u` >> hpl_submission.tstamps")
    # Retrieve result of trial
    gflops = retrieve_latest_gflops()
    logging.info(f"Gflops : {gflops}")


    delta_time = time.perf_counter() - current_time
    logging.info(f"Trial Ended : Elapsed time |{delta_time}|")

    return gflops

    #-----------------------------------------------------------------------###

if __name__ == "__main__":

    ###----------------------------------------------------------------------###
    ### User interacts here to change run parameters                         ###
    ###                                                                      ###
    ### Usage: the first command line argument is the path of the pickled    ###
    ### data file; logs go to the highest numbered directory in 'Outputs'    ###
    ###----------------------------------------------------------------------###

    # File path for current logging: the numerically largest entry of
    # 'Outputs'. Entries whose names are not plain numbers are ignored so a
    # stray file cannot crash the int() conversion.
    run_ids = [int(entry) for entry in os.listdir("Outputs")
               if entry.isdecimal()]
    if not run_ids:
        sys.exit("No numbered run directory found in 'Outputs'")
    current_id = max(run_ids)

    output_file_path = f"Outputs/{current_id}"

    load_logger(output_file_path) # Points logger to directory


    # Loads data from ServersideDougal

    if len(sys.argv) < 2:
        logging.critical("No data file path was given on the command line")
        sys.exit(f"Usage: {sys.argv[0]} <path to pickled data file>")

    FILE_PATH = sys.argv[1]

    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only pass files produced by a trusted source.
    try:
        with open(FILE_PATH, "rb") as file:
            data = pickle.load(file)
    except Exception as exception:
        logging.critical(f"Error with loading Dougal data: {type(exception).__name__} - {exception}")
        # Re-raise the original error so its type and traceback are kept
        raise

    logging.info(f"Loaded Data : {data}")

    main(data)

    #-----------------------------------------------------------------------###