From c80857124d4a436aff12904f2f427b36160bce17 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 4 Jul 2017 20:22:29 -0500 Subject: [PATCH 1/5] o Add a general workflow that is capable of running any benchmark and perform a basic grid or random search. --- workflows/rnd_or_grid/README.md | 13 ++ workflows/rnd_or_grid/data/nt3_settings.json | 12 ++ workflows/rnd_or_grid/data/p1b1_settings.json | 13 ++ workflows/rnd_or_grid/data/p1b3_settings.json | 14 ++ workflows/rnd_or_grid/data/p2b1_settings.json | 13 ++ workflows/rnd_or_grid/data/p3b1_settings.json | 14 ++ workflows/rnd_or_grid/python/computeStats.py | 40 +++++ .../rnd_or_grid/python/determineParameters.py | 149 ++++++++++++++++++ workflows/rnd_or_grid/python/evaluateOne.py | 93 +++++++++++ .../rnd_or_grid/python/nt3_tc1_runner.py | 71 +++++++++ workflows/rnd_or_grid/python/p1b1_runner.py | 49 ++++++ workflows/rnd_or_grid/python/p1b3_runner.py | 60 +++++++ workflows/rnd_or_grid/python/p2b1_runner.py | 56 +++++++ workflows/rnd_or_grid/python/p3b1_runner.py | 56 +++++++ .../python/test/run_test_runners.sh | 7 + .../rnd_or_grid/python/test/test_runners.py | 50 ++++++ workflows/rnd_or_grid/swift/computeStats.sh | 2 + .../rnd_or_grid/swift/determineParameters.sh | 3 + workflows/rnd_or_grid/swift/evaluateOne.sh | 3 + workflows/rnd_or_grid/swift/rnd_or_grid.swift | 68 ++++++++ workflows/rnd_or_grid/swift/run | 54 +++++++ 21 files changed, 840 insertions(+) create mode 100644 workflows/rnd_or_grid/README.md create mode 100644 workflows/rnd_or_grid/data/nt3_settings.json create mode 100644 workflows/rnd_or_grid/data/p1b1_settings.json create mode 100644 workflows/rnd_or_grid/data/p1b3_settings.json create mode 100644 workflows/rnd_or_grid/data/p2b1_settings.json create mode 100644 workflows/rnd_or_grid/data/p3b1_settings.json create mode 100644 workflows/rnd_or_grid/python/computeStats.py create mode 100644 workflows/rnd_or_grid/python/determineParameters.py create mode 100644 
workflows/rnd_or_grid/python/evaluateOne.py create mode 100644 workflows/rnd_or_grid/python/nt3_tc1_runner.py create mode 100644 workflows/rnd_or_grid/python/p1b1_runner.py create mode 100644 workflows/rnd_or_grid/python/p1b3_runner.py create mode 100644 workflows/rnd_or_grid/python/p2b1_runner.py create mode 100644 workflows/rnd_or_grid/python/p3b1_runner.py create mode 100755 workflows/rnd_or_grid/python/test/run_test_runners.sh create mode 100644 workflows/rnd_or_grid/python/test/test_runners.py create mode 100755 workflows/rnd_or_grid/swift/computeStats.sh create mode 100755 workflows/rnd_or_grid/swift/determineParameters.sh create mode 100755 workflows/rnd_or_grid/swift/evaluateOne.sh create mode 100644 workflows/rnd_or_grid/swift/rnd_or_grid.swift create mode 100755 workflows/rnd_or_grid/swift/run diff --git a/workflows/rnd_or_grid/README.md b/workflows/rnd_or_grid/README.md new file mode 100644 index 00000000..792551f5 --- /dev/null +++ b/workflows/rnd_or_grid/README.md @@ -0,0 +1,13 @@ +# Simple grid or random parameter sweep with Swift for all the benchmarks, run from the command line. It performs a parameter sweep by calling command-line programs as follows: +- determineParameters.{sh,py}: Reads data/**settings.json** for sweep parameters, and returns them as a string for use by the Swift program (sweep-parameters.txt) +- evaluateOne.{sh,py}: Runs a single experiment. (Calls the specified benchmark). +- computeStats.{sh,py}: Ingests data from all of the experiments and computes simple stats. + +Usage: ./run EXPERIMENT_ID BENCHMARK_NAME SEARCH_TYPE, e.g.: ./run ex3_p1b1_grid p1b1 grid + +Notes: +**settings.json**: sweep parameter variations +1. A json file must be present in the data folder and named as: <benchmark>_settings.json; sample files are available and must be modified as needed. +2. Run directory will be created in the experiments folder +3. New variables can be introduced in the determineParameters.py and evaluateOne.py. +4. 
Variations of parameters must be specified in data/*.json files diff --git a/workflows/rnd_or_grid/data/nt3_settings.json b/workflows/rnd_or_grid/data/nt3_settings.json new file mode 100644 index 00000000..d505cd22 --- /dev/null +++ b/workflows/rnd_or_grid/data/nt3_settings.json @@ -0,0 +1,12 @@ +{ + "parameters": + { + "epochs": [1, 2 ], + "batch_size": [10, 20], + "classes": [2, 3] + }, + "samples": + { + "num": [2] + } +} diff --git a/workflows/rnd_or_grid/data/p1b1_settings.json b/workflows/rnd_or_grid/data/p1b1_settings.json new file mode 100644 index 00000000..f8f7a7a8 --- /dev/null +++ b/workflows/rnd_or_grid/data/p1b1_settings.json @@ -0,0 +1,13 @@ +{ + "parameters": + { + "epochs": [1, 2, 8], + "batch_size": [20, 40], + "N1": [1000, 2000], + "NE": [500, 600] + }, + "samples": + { + "num": [2] + } +} diff --git a/workflows/rnd_or_grid/data/p1b3_settings.json b/workflows/rnd_or_grid/data/p1b3_settings.json new file mode 100644 index 00000000..12a32b60 --- /dev/null +++ b/workflows/rnd_or_grid/data/p1b3_settings.json @@ -0,0 +1,14 @@ +{ + "parameters": + { + "epochs": [1, 2], + "batch_size": [50, 60], + "test_cell_split": [0.15, 0.25], + "drop": [0.1, 0.15] + }, + "samples": + { + "num": [2] + } + +} diff --git a/workflows/rnd_or_grid/data/p2b1_settings.json b/workflows/rnd_or_grid/data/p2b1_settings.json new file mode 100644 index 00000000..ce5a0af3 --- /dev/null +++ b/workflows/rnd_or_grid/data/p2b1_settings.json @@ -0,0 +1,13 @@ +{ + "parameters": + { + "epochs": [1, 2], + "batch_size": [32, 40], + "molecular_epochs": [1, 3], + "weight_decay": [0.0005, 0.0006] + }, + "samples": + { + "num": [2] + } +} diff --git a/workflows/rnd_or_grid/data/p3b1_settings.json b/workflows/rnd_or_grid/data/p3b1_settings.json new file mode 100644 index 00000000..ee13ec76 --- /dev/null +++ b/workflows/rnd_or_grid/data/p3b1_settings.json @@ -0,0 +1,14 @@ +{ + "parameters": + { + "epochs": [1 , 2 ], + "batch_size": [20, 40], + "shared_nnet_spec": [1200, 1400], + "n_fold": [1, 2] 
+ }, + "samples": + { + "num": [2] + } +} + \ No newline at end of file diff --git a/workflows/rnd_or_grid/python/computeStats.py b/workflows/rnd_or_grid/python/computeStats.py new file mode 100644 index 00000000..f414c378 --- /dev/null +++ b/workflows/rnd_or_grid/python/computeStats.py @@ -0,0 +1,40 @@ +import sys +from collections import defaultdict +import json, os + +def extractVals(A): + B = defaultdict(dict) + A1 = A.split() + for n, val in zip(A1[0::2], A1[1::2]): + B[n] = float(val) + return(B) + +def computeStats(swiftArrayAsString): + A = extractVals(swiftArrayAsString) + vals = [] + for a in A: + vals += [A[a]] + print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + + filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt" + # writing the val loss to the output file + with open(filename, 'w') as the_file: + the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + + + +if (len(sys.argv) < 2): + print('requires arg=dataFilename') + sys.exit(1) + +dataFilename = sys.argv[1] + +try: + with open(dataFilename, 'r') as the_file: + data = the_file.read() +except IOError as e: + print("Could not open: %s" % dataFilename) + print("PWD is: '%s'" % os.getcwd()) + +computeStats(data) + diff --git a/workflows/rnd_or_grid/python/determineParameters.py b/workflows/rnd_or_grid/python/determineParameters.py new file mode 100644 index 00000000..aa573157 --- /dev/null +++ b/workflows/rnd_or_grid/python/determineParameters.py @@ -0,0 +1,149 @@ +import sys, json, os +import random + +# ===== Definitions ========================================================= +def expand(Vs, fr, to, soFar): + soFarNew = [] + for s in soFar: + print Vs[fr] + if (Vs[fr] == None): + print ("ERROR: The order of json inputs and values must be preserved") + sys.exit(1) + for v in Vs[fr]: + if s == '': + soFarNew += [str(v)] + else: + soFarNew += [s+','+str(v)] + if fr==to: + 
return(soFarNew) + else: + return expand(Vs, fr+1, to, soFarNew) + +def generate_random(values, n_samples, benchmarkName): + # select '#samples' random numbers between the range provided in settings.json file + result = "" + for s in range(samples[0]): + if(benchmarkName=="p1b1"): + # values = {1:epochs, 2: batch_size, 3: N1, 4: NE} + t_epoch= random.randint(values[1][0], values[1][1]) + t_batch_size= random.randint(values[2][0], values[2][1]) + t_N1= random.randint(values[3][0], values[3][1]) + t_NE= random.randint(values[4][0], values[4][1]) + result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_N1) + ',' + str(t_NE) + elif(benchmarkName=="p1b3"): + # values = {1:epochs, 2: batch_size, 3: test_cell_split, 4: drop} + t_epoch= random.randint(values[1][0], values[1][1]) + t_batch_size= random.randint(values[2][0], values[2][1]) + t_tcs= random.uniform(values[3][0], values[3][1]) + t_drop= random.uniform(values[4][0], values[4][1]) + result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_tcs) + ',' + str(t_drop) + elif(benchmarkName=="nt3"): + # values = {1:epochs, 2: batch_size, 3: classes} + t_epoch= random.randint(values[1][0], values[1][1]) + t_batch_size= random.randint(values[2][0], values[2][1]) + t_classes= random.randint(values[3][0], values[3][1]) + result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_classes) + elif(benchmarkName=="p2b1"): + # values = {1:epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay} + t_epoch= random.randint(values[1][0], values[1][1]) + t_batch_size= random.randint(values[2][0], values[2][1]) + t_me= random.randint(values[3][0], values[3][1]) + t_wd= random.uniform(values[4][0], values[4][1]) + result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_me) + ',' + str(t_wd) + elif(benchmarkName=="p3b1"): + # values = {1:epochs, 2: batch_size, 3: shared_nnet_spec, 4: n_fold} + t_epoch= random.randint(values[1][0], values[1][1]) + t_batch_size= random.randint(values[2][0], values[2][1]) + t_sns= 
random.randint(values[3][0], values[3][1]) + t_nf= random.randint(values[4][0], values[4][1]) + result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_sns) + ',' + str(t_nf) + else: + print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') + sys.exit(1) + # Populate the result string for writing sweep-parameters file + if(s < (samples[0]-1)): + result+=":" + return result + +# ===== Main program ======================================================== +if (len(sys.argv) < 3): + print('requires arg1=settingsFilename and arg2=paramsFilename') + sys.exit(1) + +settingsFilename = sys.argv[1] +paramsFilename = sys.argv[2] +benchmarkName = sys.argv[3] +searchType = sys.argv[4] + +#Trying to open the settings file +print("Reading settings: %s" % settingsFilename) +try: + with open(settingsFilename) as fp: + settings = json.load(fp) +except IOError as e: + print("Could not open: %s" % settingsFilename) + print("PWD is: '%s'" % os.getcwd()) + sys.exit(1) + +# Read in the variables from json file +# Register new variables for any benchmark here +#Common variables +epochs = settings.get('parameters').get('epochs') +batch_size = settings.get('parameters').get('batch_size') +# P1B1 +N1 = settings.get('parameters').get('N1') +NE = settings.get('parameters').get('NE') +#NT3 +classes = settings.get('parameters').get('classes') +#P2B1 +molecular_epochs = settings.get('parameters').get('molecular_epochs') +weight_decay = settings.get('parameters').get('weight_decay') +#P3B1 +shared_nnet_spec = settings.get('parameters').get('shared_nnet_spec') +n_fold = settings.get('parameters').get('n_fold') +#P1B3 +test_cell_split = settings.get('parameters').get('test_cell_split') +drop = settings.get('parameters').get('drop') + +# For random scheme determine number of samples +samples = settings.get('samples', {}).get('num', None) + + +# Make values for computing grid sweep parameters +values = {} +if(benchmarkName=="p1b1"): + values = {1:epochs, 2: batch_size, 
3: N1, 4: NE} + print values +elif(benchmarkName=="p1b3"): + values = {1:epochs, 2: batch_size, 3: test_cell_split, 4: drop} + print values +elif(benchmarkName=="nt3"): + values = {1:epochs, 2: batch_size, 3: classes} + print values +elif(benchmarkName=="p2b1"): + values = {1:epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay} + print values +elif(benchmarkName=="p3b1"): + values = {1:epochs, 2: batch_size, 3: shared_nnet_spec, 4: n_fold} + print values +else: + print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') + sys.exit(1) + +result = {} +if(searchType == "grid"): + results = expand(values, 1, len(values), ['']) + result = ':'.join(results) +elif(searchType =="random"): + if(samples == None): + print ("ERROR: Provide number of samples in json file") + sys.exit(1) + result = generate_random(values, samples, benchmarkName) +else: + print ("ERROR: Invalid search type, specify either - grid or random") + sys.exit(1) + + +with open(paramsFilename, 'w') as the_file: + the_file.write(result) + diff --git a/workflows/rnd_or_grid/python/evaluateOne.py b/workflows/rnd_or_grid/python/evaluateOne.py new file mode 100644 index 00000000..247be3ee --- /dev/null +++ b/workflows/rnd_or_grid/python/evaluateOne.py @@ -0,0 +1,93 @@ +import sys +import json, os +import socket + + +if (len(sys.argv) < 3): + print('requires arg1=param and arg2=filename') + sys.exit(1) + +parameterString = sys.argv[1] +filename = sys.argv[2] +benchmarkName = sys.argv[3] + +integs = [float(x) for x in parameterString.split(',')] +print (integs) + +if (benchmarkName == "p1b1"): + import p1b1_runner + hyper_parameter_map = {'epochs' : int(integs[0])} + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['batch_size'] = int(integs[1]) + hyper_parameter_map['dense'] = [int(integs[2]), int(integs[3])] + hyper_parameter_map['run_id'] = parameterString + # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] + hyper_parameter_map['save'] 
= os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + sys.argv = ['p1b1_runner'] + val_loss = p1b1_runner.run(hyper_parameter_map) +elif (benchmarkName == "p1b3"): + import p1b3_runner + hyper_parameter_map = {'epochs' : int(integs[0])} + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['batch_size'] = int(integs[1]) + hyper_parameter_map['test_cell_split'] = int(integs[2]) + hyper_parameter_map['drop'] = int(integs[3]) + hyper_parameter_map['run_id'] = parameterString + # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] + hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + sys.argv = ['p1b3_runner'] + val_loss = p1b3_runner.run(hyper_parameter_map) +elif (benchmarkName == "p2b1"): + import p2b1_runner + hyper_parameter_map = {'epochs' : int(integs[0])} + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['batch_size'] = int(integs[1]) + hyper_parameter_map['molecular_epochs'] = int(integs[2]) + hyper_parameter_map['weight_decay'] = integs[3] + hyper_parameter_map['run_id'] = parameterString + hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + sys.argv = ['p2b1_runner'] + val_loss = p2b1_runner.run(hyper_parameter_map) +elif (benchmarkName == "nt3"): + import nt3_tc1_runner + hyper_parameter_map = {'epochs' : int(integs[0])} + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['batch_size'] = int(integs[1]) + hyper_parameter_map['classes'] = int(integs[2]) + hyper_parameter_map['model_name'] = 'nt3' + hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + sys.argv = ['nt3_runner'] + val_loss = nt3_tc1_runner.run(hyper_parameter_map) +elif (benchmarkName == "p3b1"): + import p3b1_runner + hyper_parameter_map = {'epochs' : int(integs[0])} + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['batch_size'] = int(integs[1]) + hyper_parameter_map['shared_nnet_spec'] = 
int(integs[2]) + hyper_parameter_map['n_fold'] = int(integs[3]) + hyper_parameter_map['run_id'] = parameterString + # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] + hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + sys.argv = ['p3b1_runner'] + val_loss = p3b1_runner.run(hyper_parameter_map) + +# print (parameterString) +# print ("filename is " + filename) +# print (str(os.getpid())) +print (val_loss) + +# sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + "/procname-" + parameterString +# with open(sfn, 'w') as sfile: +# sfile.write(socket.getfqdn()) +# proc_id = "-"+ str(os.getpid()) +# sfile.write(proc_id) + +# works around this error: +# https://github.com/tensorflow/tensorflow/issues/3388 +from keras import backend as K +K.clear_session() + +# writing the val loss to the output file (result-*) +with open(filename, 'w') as the_file: + the_file.write(repr(val_loss)) + diff --git a/workflows/rnd_or_grid/python/nt3_tc1_runner.py b/workflows/rnd_or_grid/python/nt3_tc1_runner.py new file mode 100644 index 00000000..e447c075 --- /dev/null +++ b/workflows/rnd_or_grid/python/nt3_tc1_runner.py @@ -0,0 +1,71 @@ +# tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) +# so we need to create a synthetic argv. 
+import sys +if not hasattr(sys, 'argv'): + sys.argv = ['nt3_tc1'] + +import json +import os +import numpy as np +import importlib +import runner_utils + +def import_pkg(framework, model_name): + if framework == 'keras': + module_name = "{}_baseline_keras2".format(model_name) + pkg = importlib.import_module(module_name) + # elif framework is 'mxnet': + # import nt3_baseline_mxnet + # pkg = nt3_baseline_keras_baseline_mxnet + # elif framework is 'neon': + # import nt3_baseline_neon + # pkg = nt3_baseline_neon + else: + raise ValueError("Invalid framework: {}".format(framework)) + return pkg + +def run(hyper_parameter_map): + framework = hyper_parameter_map['framework'] + model_name = hyper_parameter_map['model_name'] + pkg = import_pkg(framework, model_name) + + runner_utils.format_params(hyper_parameter_map) + + # params is python dictionary + params = pkg.initialize_parameters() + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + runner_utils.write_params(params, hyper_parameter_map) + history = pkg.run(params) + + if framework == 'keras': + # works around this error: + # https://github.com/tensorflow/tensorflow/issues/3388 + try: + from keras import backend as K + K.clear_session() + except AttributeError: # theano does not have this function + pass + + # use the last validation_loss as the value to minimize + val_loss = history.history['val_loss'] + return val_loss[-1] + +if __name__ == '__main__': + param_string = sys.argv[1] + instance_directory = sys.argv[2] + model_name = sys.argv[3] + framework = sys.argv[4] + exp_id = sys.argv[5] + run_id = sys.argv[6] + hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save') + hyper_parameter_map['model_name'] = model_name + hyper_parameter_map['experiment_id'] = exp_id + hyper_parameter_map['run_id'] = run_id + # clear sys.argv so that argparse doesn't object + sys.argv 
= ['nt3_tc1_runner'] + result = run(hyper_parameter_map) + runner_utils.write_output(result, instance_directory) diff --git a/workflows/rnd_or_grid/python/p1b1_runner.py b/workflows/rnd_or_grid/python/p1b1_runner.py new file mode 100644 index 00000000..7ceb0c59 --- /dev/null +++ b/workflows/rnd_or_grid/python/p1b1_runner.py @@ -0,0 +1,49 @@ +# tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) +# so we need to create a synthetic argv. +import sys +if not hasattr(sys, 'argv'): + sys.argv = ['p1b1'] + +import json +import os +import p1b1 +import runner_utils + +def run(hyper_parameter_map): + framework = hyper_parameter_map['framework'] + if framework is 'keras': + import p1b1_baseline_keras2 + pkg = p1b1_baseline_keras2 + elif framework is 'mxnet': + import p1b1_baseline_mxnet + pkg = p1b1_baseline_mxnet + elif framework is 'neon': + import p1b1_baseline_neon + pkg = p1b1_baseline_neon + else: + raise ValueError("Invalid framework: {}".format(framework)) + + # params is python dictionary + params = pkg.initialize_parameters() + runner_utils.format_params(hyper_parameter_map) + + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + print(params) + history = pkg.run(params) + + if framework is 'keras': + # works around this error: + # https://github.com/tensorflow/tensorflow/issues/3388 + try: + from keras import backend as K + K.clear_session() + except AttributeError: # theano does not have this function + pass + + # use the last validation_loss as the value to minimize + val_loss = history.history['val_loss'] + return val_loss[-1] diff --git a/workflows/rnd_or_grid/python/p1b3_runner.py b/workflows/rnd_or_grid/python/p1b3_runner.py new file mode 100644 index 00000000..f330ed18 --- /dev/null +++ b/workflows/rnd_or_grid/python/p1b3_runner.py @@ -0,0 +1,60 @@ +# tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) +# so we need to create a 
synthetic argv. +import sys +if not hasattr(sys, 'argv'): + sys.argv = ['p1b3'] + +import json +import os +import p1b3 +import runner_utils + +def run(hyper_parameter_map): + framework = hyper_parameter_map['framework'] + if framework == 'keras': + import p1b3_baseline_keras2 + pkg = p1b3_baseline_keras2 + elif framework == 'mxnet': + import p1b3_baseline_mxnet + pkg = p1b3_baseline_mxnet + elif framework == 'neon': + import p1b3_baseline_neon + pkg = p1b3_baseline_neon + else: + raise ValueError("Invalid framework: {}".format(framework)) + + # params is python dictionary + params = pkg.initialize_parameters() + runner_utils.format_params(hyper_parameter_map) + + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + runner_utils.write_params(params, hyper_parameter_map) + history = pkg.run(params) + + if framework == 'keras': + # works around this error: + # https://github.com/tensorflow/tensorflow/issues/3388 + try: + from keras import backend as K + K.clear_session() + except AttributeError: # theano does not have this function + pass + + # use the last validation_loss as the value to minimize + val_loss = history.history['val_loss'] + return val_loss[-1] + +if __name__ == '__main__': + param_file = sys.argv[1] + instance_directory = sys.argv[2] + framework = sys.argv[3] + hyper_parameter_map = runner_utils.init(param_file, instance_directory, framework, + 'save') + # clear sys.argv so that argparse doesn't object + sys.argv = ['p1b3_runner'] + result = run(hyper_parameter_map) + runner_utils.write_output(result, instance_directory) diff --git a/workflows/rnd_or_grid/python/p2b1_runner.py b/workflows/rnd_or_grid/python/p2b1_runner.py new file mode 100644 index 00000000..30de6b58 --- /dev/null +++ b/workflows/rnd_or_grid/python/p2b1_runner.py @@ -0,0 +1,56 @@ +# tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) +# so we need to create a synthetic 
argv. +import sys +if not hasattr(sys, 'argv'): + sys.argv = ['p2b1'] + +import json +import os +import p2b1 +import runner_utils + +def run(hyper_parameter_map): + framework = hyper_parameter_map['framework'] + if framework == 'keras': + import p2b1_baseline_keras2 + pkg = p2b1_baseline_keras2 + else: + raise ValueError("Invalid framework: {}".format(framework)) + + # params is python dictionary + params = pkg.initialize_parameters() + runner_utils.format_params(hyper_parameter_map) + + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + runner_utils.write_params(params, hyper_parameter_map) + loss_history = pkg.run(params) + + if framework == 'keras': + # works around this error: + # https://github.com/tensorflow/tensorflow/issues/3388 + try: + from keras import backend as K + K.clear_session() + except AttributeError: # theano does not have this function + pass + + return loss_history[-1] + +if __name__ == '__main__': + param_string = sys.argv[1] + instance_directory = sys.argv[2] + framework = sys.argv[3] + exp_id = sys.argv[4] + run_id = sys.argv[5] + hyper_parameter_map = runner_utils.init(param_string, instance_directory, + framework, 'save_path') + hyper_parameter_map['experiment_id'] = exp_id + hyper_parameter_map['run_id'] = run_id + # clear sys.argv so that argparse doesn't object + sys.argv = ['p2b1_runner'] + result = run(hyper_parameter_map) + runner_utils.write_output(result, instance_directory) diff --git a/workflows/rnd_or_grid/python/p3b1_runner.py b/workflows/rnd_or_grid/python/p3b1_runner.py new file mode 100644 index 00000000..f5002e93 --- /dev/null +++ b/workflows/rnd_or_grid/python/p3b1_runner.py @@ -0,0 +1,56 @@ +# tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) +# so we need to create a synthetic argv. 
+import sys +if not hasattr(sys, 'argv'): + sys.argv = ['p3b1'] + +import json +import os +import p3b1 +import runner_utils + +def run(hyper_parameter_map): + framework = hyper_parameter_map['framework'] + if framework == 'keras': + import p3b1_baseline_keras2 + pkg = p3b1_baseline_keras2 + else: + raise ValueError("Unsupported framework: {}".format(framework)) + + # params is python dictionary + params = pkg.initialize_parameters() + runner_utils.format_params(hyper_parameter_map) + + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + runner_utils.write_params(params, hyper_parameter_map) + avg_loss = pkg.do_n_fold(params) + + if framework == 'keras': + # works around this error: + # https://github.com/tensorflow/tensorflow/issues/3388 + try: + from keras import backend as K + K.clear_session() + except AttributeError: # theano does not have this function + pass + + return avg_loss + +if __name__ == '__main__': + param_string = sys.argv[1] + instance_directory = sys.argv[2] + framework = sys.argv[3] + exp_id = sys.argv[4] + run_id = sys.argv[5] + hyper_parameter_map = runner_utils.init(param_string, instance_directory, + framework, 'save_path') + hyper_parameter_map['experiment_id'] = exp_id + hyper_parameter_map['run_id'] = run_id + # clear sys.argv so that argparse doesn't object + sys.argv = ['p3b1_runner'] + result = run(hyper_parameter_map) + runner_utils.write_output(result, instance_directory) diff --git a/workflows/rnd_or_grid/python/test/run_test_runners.sh b/workflows/rnd_or_grid/python/test/run_test_runners.sh new file mode 100755 index 00000000..019b4824 --- /dev/null +++ b/workflows/rnd_or_grid/python/test/run_test_runners.sh @@ -0,0 +1,7 @@ +#! 
/usr/bin/env bash + +RUNNER_DIR=../../../../../Benchmarks/Pilot1/P1B1:../../../../../Benchmarks/Pilot2/P2B1:../../../../../Benchmarks/Pilot3/P3B1:../../../../../Benchmarks/Pilot1/NT3:../../../../../Benchmarks/Pilot1/P1B3 +export PYTHONPATH="$PWD/..:$RUNNER_DIR:../../../common/python" +echo $PYTHONPATH + +python test_runners.py \ No newline at end of file diff --git a/workflows/rnd_or_grid/python/test/test_runners.py b/workflows/rnd_or_grid/python/test/test_runners.py new file mode 100644 index 00000000..c3f0cb8e --- /dev/null +++ b/workflows/rnd_or_grid/python/test/test_runners.py @@ -0,0 +1,50 @@ +import p1b1_runner +import p2b1_runner +import p1b3_runner +import p3b1_runner +import nt3_tc1_runner + +def main(): + + hyper_parameter_map = {'epochs' : 1} + hyper_parameter_map['framework'] = 'keras' + # hyper_parameter_map['save_path'] = save_path +# hyper_parameter_map = {'epochs' : 1} +# hyper_parameter_map['batch_size'] = 40 +# hyper_parameter_map['dense'] = [1219, 536] +# hyper_parameter_map['framework'] = 'keras' + + +#1 # p1b1 - works +# hyper_parameter_map['save'] = './p1bl1_testing_failure' + print("STARTING#####P1B1##########") + p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map) + print("DONE##########P1B1#####") + + +#2 # p1b3 - works too big + print("STARTING#####P1B3##########") + # p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) + print("DONE######P1B3#########") + +#3 # p2b1 - works + print("STARTING#####P2B1##########") + # p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) + print("DONE#####P2B1##########") + +#4 # p3b1 - fails - ValueError: invalid literal for int() with base 10: '1200;1200' + print("STARTING#####P3B1##########") + # p3b1_validation_loss = p3b1_runner.run(hyper_parameter_map) + print("DONE#####P3B1##########") + +#5 # NT3 - works - too big + print("STARTING#####NT3##########") + hyper_parameter_map['model_name'] = 'nt3' + nt3tc1_validation_losss = nt3_tc1_runner.run(hyper_parameter_map) + 
print("DONE#####NT3##########") + + + +# # print("Validation Loss: ", p1b1_validation_loss) +if __name__ == '__main__': + main() diff --git a/workflows/rnd_or_grid/swift/computeStats.sh b/workflows/rnd_or_grid/swift/computeStats.sh new file mode 100755 index 00000000..9d2b0e25 --- /dev/null +++ b/workflows/rnd_or_grid/swift/computeStats.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python $APP_HOME/../python/computeStats.py $1 diff --git a/workflows/rnd_or_grid/swift/determineParameters.sh b/workflows/rnd_or_grid/swift/determineParameters.sh new file mode 100755 index 00000000..6aa2bab8 --- /dev/null +++ b/workflows/rnd_or_grid/swift/determineParameters.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo $APP_HOME +python $APP_HOME/../python/determineParameters.py $1 $2 $3 $4 diff --git a/workflows/rnd_or_grid/swift/evaluateOne.sh b/workflows/rnd_or_grid/swift/evaluateOne.sh new file mode 100755 index 00000000..08a72341 --- /dev/null +++ b/workflows/rnd_or_grid/swift/evaluateOne.sh @@ -0,0 +1,3 @@ +#!/bin/bash +filename=$TURBINE_OUTPUT/result-$1.txt +python -u $APP_HOME/../python/evaluateOne.py $1 $filename $3 diff --git a/workflows/rnd_or_grid/swift/rnd_or_grid.swift b/workflows/rnd_or_grid/swift/rnd_or_grid.swift new file mode 100644 index 00000000..b4fb48f3 --- /dev/null +++ b/workflows/rnd_or_grid/swift/rnd_or_grid.swift @@ -0,0 +1,68 @@ +import string; +import files; +import io; +import sys; + +// ===== Interface definitions for the programs that we call ====== +// Random values are created from bounds specified in data/settings.json file +app (file f) +determineParameters(string settingsFilename, string benchmark, string searchType) +{ + (getenv("APP_HOME")+"/determineParameters.sh") settingsFilename f benchmark searchType; +} + +// This is where the p1b1 runner is called +app (file f) +evaluateOne(string params, string benchmark) +{ + (getenv("APP_HOME")+"/evaluateOne.sh") params f benchmark; +} + +// call this to read all the resultsFiles and compute stats +app () 
+computeStats(string resultsFile) +{ + (getenv("APP_HOME")+"/computeStats.sh") resultsFile; +} + +// call this to create any required directories +app (void o) make_dir(string dirname) { + "mkdir" "-p" dirname; +} + + +// ===== The program proper ============================================== +string turbine_output = getenv("TURBINE_OUTPUT"); +string app_home = getenv("APP_HOME"); +float results[string]; + +//make the experiments dir +make_dir(turbine_output); + +// Get parameters +benchmark = argv("benchmark_name"); +searchType = argv("search_type"); +settingsFilename = app_home+"/../data/"+benchmark+"_settings.json"; +string sweepParamFile = turbine_output+"/sweep-parameters.txt"; +file parametersFile = determineParameters(settingsFilename, benchmark, searchType); +parametersString = read(parametersFile); +parameters = split(parametersString, ":"); + +// Run experiments in parallel, passing each a different parameter set +foreach param in parameters +{ + string rName = turbine_output+"/result-"+param+".txt"; + printf(rName); + file resultFile = evaluateOne(param, benchmark); + results[param] = string2float(read(resultFile)); +} + +// Compute stats of this array of results +// Write directly to a file with write +file tmp = write(repr(results)); + +// Find the name of a file with filename +//trace("Temporary filename is: " + filename(tmp)); + +computeStats(filename(tmp)); + diff --git a/workflows/rnd_or_grid/swift/run b/workflows/rnd_or_grid/swift/run new file mode 100755 index 00000000..7ed387cd --- /dev/null +++ b/workflows/rnd_or_grid/swift/run @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Usage: ./run +# + +if [ "$#" -ne 3 ]; then + script_name=$(basename $0) + echo "Usage: ${script_name} EXPERIMENT_ID (run1_p1b1) BENCHMARKS_NAME (eg. p1b1) SEARCH_TYPE (eg. 
grid or random" + exit 1 +fi + +#### set this variable to add new benchmarks directory +RUNNERS_DIR=../../../../Benchmarks/Pilot1/P1B1:../../../../Benchmarks/Pilot2/P2B1:../../../../Benchmarks/Pilot3/P3B1:../../../../Benchmarks/Pilot1/NT3:../../../../Benchmarks/Pilot1/P1B3 +### + +THIS=$( cd $( dirname $0 ); /bin/pwd ) +export APP_HOME=$THIS + +PROJECT_ROOT=$APP_HOME/.. + +export PYTHONPATH=$PYTHONPATH:$PROJECT_ROOT/python:$RUNNERS_DIR:$PROJECT_ROOT/../common/python:$PYTHONPATH + +export EXPID=$1 +B_NAME=$2 +S_NAME=$3 + +export TURBINE_OUTPUT=$APP_HOME/../experiments/$EXPID + + # prefix=$PWD/../data/ + # suffix="_settings.json" + # export SETTINGS_FILE=$prefix$BENCHMARK_NAME$suffix + +# TODO edit QUEUE, WALLTIME, PPN, AND TURNBINE_JOBNAME +# as required. Note that QUEUE, WALLTIME, PPN, AND TURNBINE_JOBNAME will +# be ignored if MACHINE flag (see below) is not set +export QUEUE=batch +export WALLTIME=00:10:00 +export PPN=1 +export TURBINE_JOBNAME="${EXPID}_job" + +### set the desired number of processors +PROCS=3 +### + +# Resident task workers and ranks +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +echo $PYTHONPATH + +# remove -l option for removing printing processors ranks +# settings.json file has all the parameter combinations to be tested +echo swift-t -n $PROCS $APP_HOME/grid-sweep.swift $* +swift-t -l -n $PROCS $APP_HOME/rnd_or_grid.swift $* --benchmark_name=$B_NAME --search_type=$S_NAME From dc51d427097aeb4eb4b016c01c519ef8f14b90d6 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 5 Jul 2017 03:30:59 -0500 Subject: [PATCH 2/5] o Add code to detect repeated random/grid parameters before evaluation in parallel --- .../rnd_or_grid/python/determineParameters.py | 33 +++++++++++-------- workflows/rnd_or_grid/python/evaluateOne.py | 1 - .../rnd_or_grid/python/test/test_runners.py | 12 +++---- .../rnd_or_grid/swift/determineParameters.sh | 1 - workflows/rnd_or_grid/swift/rnd_or_grid.swift | 5 --- 5 files changed, 25 
insertions(+), 27 deletions(-) diff --git a/workflows/rnd_or_grid/python/determineParameters.py b/workflows/rnd_or_grid/python/determineParameters.py index aa573157..f772f5e4 100644 --- a/workflows/rnd_or_grid/python/determineParameters.py +++ b/workflows/rnd_or_grid/python/determineParameters.py @@ -1,11 +1,11 @@ import sys, json, os import random +import itertools # ===== Definitions ========================================================= def expand(Vs, fr, to, soFar): soFarNew = [] for s in soFar: - print Vs[fr] if (Vs[fr] == None): print ("ERROR: The order of json inputs and values must be preserved") sys.exit(1) @@ -22,7 +22,8 @@ def expand(Vs, fr, to, soFar): def generate_random(values, n_samples, benchmarkName): # select '#samples' random numbers between the range provided in settings.json file result = "" - for s in range(samples[0]): + param_listed = [] + for s in range(n_samples): if(benchmarkName=="p1b1"): # values = {1:epochs, 2: batch_size, 3: N1, 4: NE} t_epoch= random.randint(values[1][0], values[1][1]) @@ -61,9 +62,9 @@ def generate_random(values, n_samples, benchmarkName): print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') sys.exit(1) # Populate the result string for writing sweep-parameters file - if(s < (samples[0]-1)): - result+=":" - return result + param_listed += [str(result)] + result="" + return (param_listed) # ===== Main program ======================================================== if (len(sys.argv) < 3): @@ -75,6 +76,7 @@ def generate_random(values, n_samples, benchmarkName): benchmarkName = sys.argv[3] searchType = sys.argv[4] +## Read in the variables from json file #Trying to open the settings file print("Reading settings: %s" % settingsFilename) try: @@ -85,7 +87,6 @@ def generate_random(values, n_samples, benchmarkName): print("PWD is: '%s'" % os.getcwd()) sys.exit(1) -# Read in the variables from json file # Register new variables for any benchmark here #Common variables epochs = 
settings.get('parameters').get('epochs') @@ -108,41 +109,47 @@ def generate_random(values, n_samples, benchmarkName): # For random scheme determine number of samples samples = settings.get('samples', {}).get('num', None) +## Done reading from file # Make values for computing grid sweep parameters values = {} if(benchmarkName=="p1b1"): values = {1:epochs, 2: batch_size, 3: N1, 4: NE} - print values elif(benchmarkName=="p1b3"): values = {1:epochs, 2: batch_size, 3: test_cell_split, 4: drop} - print values elif(benchmarkName=="nt3"): values = {1:epochs, 2: batch_size, 3: classes} - print values elif(benchmarkName=="p2b1"): values = {1:epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay} - print values elif(benchmarkName=="p3b1"): values = {1:epochs, 2: batch_size, 3: shared_nnet_spec, 4: n_fold} - print values else: print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') sys.exit(1) +# this (result) is : seperated string with all params result = {} +# Determine parameter space based of search type if(searchType == "grid"): results = expand(values, 1, len(values), ['']) - result = ':'.join(results) elif(searchType =="random"): if(samples == None): print ("ERROR: Provide number of samples in json file") sys.exit(1) - result = generate_random(values, samples, benchmarkName) + # result, results = generate_random(values, samples, benchmarkName) + results = generate_random(values, samples[0], benchmarkName) else: print ("ERROR: Invalid search type, specify either - grid or random") sys.exit(1) +counter=0 +for a, b in itertools.combinations(results, 2): + if(a == b): + print ("Warning: skipping -identical parameters found", counter) + results.remove(a) + +#These are final : seperated parameters for evaluation +result = ':'.join(results) with open(paramsFilename, 'w') as the_file: the_file.write(result) diff --git a/workflows/rnd_or_grid/python/evaluateOne.py b/workflows/rnd_or_grid/python/evaluateOne.py index 247be3ee..c99b5083 100644 --- 
a/workflows/rnd_or_grid/python/evaluateOne.py +++ b/workflows/rnd_or_grid/python/evaluateOne.py @@ -12,7 +12,6 @@ benchmarkName = sys.argv[3] integs = [float(x) for x in parameterString.split(',')] -print (integs) if (benchmarkName == "p1b1"): import p1b1_runner diff --git a/workflows/rnd_or_grid/python/test/test_runners.py b/workflows/rnd_or_grid/python/test/test_runners.py index c3f0cb8e..1082be43 100644 --- a/workflows/rnd_or_grid/python/test/test_runners.py +++ b/workflows/rnd_or_grid/python/test/test_runners.py @@ -22,29 +22,27 @@ def main(): print("DONE##########P1B1#####") -#2 # p1b3 - works too big +#2 # p1b3 - works too big for desktop print("STARTING#####P1B3##########") - # p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) + p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) print("DONE######P1B3#########") #3 # p2b1 - works print("STARTING#####P2B1##########") - # p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) + p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) print("DONE#####P2B1##########") #4 # p3b1 - fails - ValueError: invalid literal for int() with base 10: '1200;1200' print("STARTING#####P3B1##########") - # p3b1_validation_loss = p3b1_runner.run(hyper_parameter_map) + p3b1_validation_loss = p3b1_runner.run(hyper_parameter_map) print("DONE#####P3B1##########") -#5 # NT3 - works - too big +#5 # NT3 - works - too big print("STARTING#####NT3##########") hyper_parameter_map['model_name'] = 'nt3' nt3tc1_validation_losss = nt3_tc1_runner.run(hyper_parameter_map) print("DONE#####NT3##########") - - # # print("Validation Loss: ", p1b1_validation_loss) if __name__ == '__main__': main() diff --git a/workflows/rnd_or_grid/swift/determineParameters.sh b/workflows/rnd_or_grid/swift/determineParameters.sh index 6aa2bab8..098d26ec 100755 --- a/workflows/rnd_or_grid/swift/determineParameters.sh +++ b/workflows/rnd_or_grid/swift/determineParameters.sh @@ -1,3 +1,2 @@ #!/bin/bash -echo $APP_HOME python 
$APP_HOME/../python/determineParameters.py $1 $2 $3 $4 diff --git a/workflows/rnd_or_grid/swift/rnd_or_grid.swift b/workflows/rnd_or_grid/swift/rnd_or_grid.swift index b4fb48f3..04333235 100644 --- a/workflows/rnd_or_grid/swift/rnd_or_grid.swift +++ b/workflows/rnd_or_grid/swift/rnd_or_grid.swift @@ -52,7 +52,6 @@ parameters = split(parametersString, ":"); foreach param in parameters { string rName = turbine_output+"/result-"+param+".txt"; - printf(rName); file resultFile = evaluateOne(param, benchmark); results[param] = string2float(read(resultFile)); } @@ -60,9 +59,5 @@ foreach param in parameters // Compute stats of this array of results // Write directly to a file with write file tmp = write(repr(results)); - -// Find the name of a file with filename -//trace("Temporary filename is: " + filename(tmp)); - computeStats(filename(tmp)); From b1ea2783f7bd449b83b8b8d8450d7f8bc7266747 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Jul 2017 00:09:08 -0500 Subject: [PATCH 3/5] o Add json file as input argument o Need to formalize input variables for better/more variables for variation --- workflows/rnd_or_grid/data/p3b1_settings.json | 4 +- .../rnd_or_grid/python/determineParameters.py | 12 +++--- workflows/rnd_or_grid/python/evaluateOne.py | 2 - .../rnd_or_grid/python/nt3_tc1_runner.py | 2 + workflows/rnd_or_grid/python/p1b1_runner.py | 3 -- workflows/rnd_or_grid/python/p2b1_runner.py | 2 + workflows/rnd_or_grid/python/p3b1_runner.py | 41 +++++++++++++++++++ .../rnd_or_grid/python/test/test_runners.py | 24 +++++------ workflows/rnd_or_grid/swift/rnd_or_grid.swift | 3 +- workflows/rnd_or_grid/swift/run | 10 +++-- 10 files changed, 72 insertions(+), 31 deletions(-) diff --git a/workflows/rnd_or_grid/data/p3b1_settings.json b/workflows/rnd_or_grid/data/p3b1_settings.json index ee13ec76..c84ca192 100644 --- a/workflows/rnd_or_grid/data/p3b1_settings.json +++ b/workflows/rnd_or_grid/data/p3b1_settings.json @@ -2,9 +2,7 @@ "parameters": { "epochs": [1 , 2 ], - 
"batch_size": [20, 40], - "shared_nnet_spec": [1200, 1400], - "n_fold": [1, 2] + "batch_size": [20, 40] }, "samples": { diff --git a/workflows/rnd_or_grid/python/determineParameters.py b/workflows/rnd_or_grid/python/determineParameters.py index f772f5e4..06ef2e1b 100644 --- a/workflows/rnd_or_grid/python/determineParameters.py +++ b/workflows/rnd_or_grid/python/determineParameters.py @@ -52,12 +52,10 @@ def generate_random(values, n_samples, benchmarkName): t_wd= random.uniform(values[4][0], values[4][1]) result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_me) + ',' + str(t_wd) elif(benchmarkName=="p3b1"): - # values = {1:epochs, 2: batch_size, 3: shared_nnet_spec, 4: n_fold} + # values = {1:epochs, 2: batch_size}//, 3: learning_rate, 4: n_fold} t_epoch= random.randint(values[1][0], values[1][1]) t_batch_size= random.randint(values[2][0], values[2][1]) - t_sns= random.randint(values[3][0], values[3][1]) - t_nf= random.randint(values[4][0], values[4][1]) - result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_sns) + ',' + str(t_nf) + result+=str(t_epoch) + ',' + str(t_batch_size) else: print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') sys.exit(1) @@ -100,8 +98,8 @@ def generate_random(values, n_samples, benchmarkName): molecular_epochs = settings.get('parameters').get('molecular_epochs') weight_decay = settings.get('parameters').get('weight_decay') #P3B1 -shared_nnet_spec = settings.get('parameters').get('shared_nnet_spec') -n_fold = settings.get('parameters').get('n_fold') +# learning_rate = settings.get('parameters').get('learning_rate') +# n_fold = settings.get('parameters').get('n_fold') #P1B3 test_cell_split = settings.get('parameters').get('test_cell_split') drop = settings.get('parameters').get('drop') @@ -122,7 +120,7 @@ def generate_random(values, n_samples, benchmarkName): elif(benchmarkName=="p2b1"): values = {1:epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay} elif(benchmarkName=="p3b1"): - 
values = {1:epochs, 2: batch_size, 3: shared_nnet_spec, 4: n_fold} + values = {1:epochs, 2: batch_size} else: print('ERROR: Tried all possible benchmarks, Invalid benchmark name or json file') sys.exit(1) diff --git a/workflows/rnd_or_grid/python/evaluateOne.py b/workflows/rnd_or_grid/python/evaluateOne.py index c99b5083..73d5db74 100644 --- a/workflows/rnd_or_grid/python/evaluateOne.py +++ b/workflows/rnd_or_grid/python/evaluateOne.py @@ -62,8 +62,6 @@ hyper_parameter_map = {'epochs' : int(integs[0])} hyper_parameter_map['framework'] = 'keras' hyper_parameter_map['batch_size'] = int(integs[1]) - hyper_parameter_map['shared_nnet_spec'] = int(integs[2]) - hyper_parameter_map['n_fold'] = int(integs[3]) hyper_parameter_map['run_id'] = parameterString # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) diff --git a/workflows/rnd_or_grid/python/nt3_tc1_runner.py b/workflows/rnd_or_grid/python/nt3_tc1_runner.py index e447c075..678f8a00 100644 --- a/workflows/rnd_or_grid/python/nt3_tc1_runner.py +++ b/workflows/rnd_or_grid/python/nt3_tc1_runner.py @@ -61,10 +61,12 @@ def run(hyper_parameter_map): framework = sys.argv[4] exp_id = sys.argv[5] run_id = sys.argv[6] + benchmark_timeout = int(sys.argv[7]) hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save') hyper_parameter_map['model_name'] = model_name hyper_parameter_map['experiment_id'] = exp_id hyper_parameter_map['run_id'] = run_id + hyper_parameter_map['timeout'] = benchmark_timeout # clear sys.argv so that argparse doesn't object sys.argv = ['nt3_tc1_runner'] result = run(hyper_parameter_map) diff --git a/workflows/rnd_or_grid/python/p1b1_runner.py b/workflows/rnd_or_grid/python/p1b1_runner.py index 7ceb0c59..20ce7e7d 100644 --- a/workflows/rnd_or_grid/python/p1b1_runner.py +++ b/workflows/rnd_or_grid/python/p1b1_runner.py @@ -7,7 +7,6 @@ import json import os import p1b1 
-import runner_utils def run(hyper_parameter_map): framework = hyper_parameter_map['framework'] @@ -25,8 +24,6 @@ def run(hyper_parameter_map): # params is python dictionary params = pkg.initialize_parameters() - runner_utils.format_params(hyper_parameter_map) - for k,v in hyper_parameter_map.items(): #if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) diff --git a/workflows/rnd_or_grid/python/p2b1_runner.py b/workflows/rnd_or_grid/python/p2b1_runner.py index 30de6b58..b3583c7d 100644 --- a/workflows/rnd_or_grid/python/p2b1_runner.py +++ b/workflows/rnd_or_grid/python/p2b1_runner.py @@ -46,10 +46,12 @@ def run(hyper_parameter_map): framework = sys.argv[3] exp_id = sys.argv[4] run_id = sys.argv[5] + benchmark_timeout = int(sys.argv[6]) hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save_path') hyper_parameter_map['experiment_id'] = exp_id hyper_parameter_map['run_id'] = run_id + hyper_parameter_map['timeout'] = benchmark_timeout # clear sys.argv so that argparse doesn't object sys.argv = ['p2b1_runner'] result = run(hyper_parameter_map) diff --git a/workflows/rnd_or_grid/python/p3b1_runner.py b/workflows/rnd_or_grid/python/p3b1_runner.py index f5002e93..385d3e26 100644 --- a/workflows/rnd_or_grid/python/p3b1_runner.py +++ b/workflows/rnd_or_grid/python/p3b1_runner.py @@ -8,14 +8,39 @@ import os import p3b1 import runner_utils +import socket + +node_pid = "%s,%i" % (socket.gethostname(), os.getpid()) +print("node,pid: " + node_pid) + +logger = None + +def get_logger(): + """ Set up logging """ + global logger + if logger is not None: + return logger + import logging, sys + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + h = logging.StreamHandler(stream=sys.stdout) + fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-9s %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + h.setFormatter(fmtr) + logger.addHandler(h) + return logger def 
run(hyper_parameter_map): + + logger = get_logger() framework = hyper_parameter_map['framework'] + logger.debug("IMPORT START") if framework == 'keras': import p3b1_baseline_keras2 pkg = p3b1_baseline_keras2 else: raise ValueError("Unsupported framework: {}".format(framework)) + logger.debug("IMPORT STOP") # params is python dictionary params = pkg.initialize_parameters() @@ -26,8 +51,12 @@ def run(hyper_parameter_map): # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v + logger.debug("WRITE_PARAMS START") runner_utils.write_params(params, hyper_parameter_map) + logger.debug("WRITE_PARAMS STOP") + logger.debug("DO_N_FOLD START") avg_loss = pkg.do_n_fold(params) + logger.debug("DO_N_FOLD STOP") if framework == 'keras': # works around this error: @@ -41,16 +70,28 @@ def run(hyper_parameter_map): return avg_loss if __name__ == '__main__': + logger = get_logger() + logger.debug("RUN START") + param_string = sys.argv[1] instance_directory = sys.argv[2] framework = sys.argv[3] exp_id = sys.argv[4] run_id = sys.argv[5] + benchmark_timeout = int(sys.argv[6]) + + logger.debug("RUN INIT START") + hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save_path') + logger.debug("RUN INIT STOP") hyper_parameter_map['experiment_id'] = exp_id hyper_parameter_map['run_id'] = run_id + hyper_parameter_map['timeout'] = benchmark_timeout # clear sys.argv so that argparse doesn't object sys.argv = ['p3b1_runner'] result = run(hyper_parameter_map) + logger.debug("WRITE OUTPUT START") runner_utils.write_output(result, instance_directory) + logger.debug("WRITE OUTPUT STOP") + logger.debug("RUN STOP") diff --git a/workflows/rnd_or_grid/python/test/test_runners.py b/workflows/rnd_or_grid/python/test/test_runners.py index 1082be43..a6218837 100644 --- a/workflows/rnd_or_grid/python/test/test_runners.py +++ b/workflows/rnd_or_grid/python/test/test_runners.py @@ -16,21 +16,21 @@ def main(): #1 # p1b1 - works -# 
hyper_parameter_map['save'] = './p1bl1_testing_failure' - print("STARTING#####P1B1##########") - p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map) - print("DONE##########P1B1#####") +# # hyper_parameter_map['save'] = './p1bl1_testing_failure' +# print("STARTING#####P1B1##########") +# p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map) +# print("DONE##########P1B1#####") -#2 # p1b3 - works too big for desktop - print("STARTING#####P1B3##########") - p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) - print("DONE######P1B3#########") +# #2 # p1b3 - works too big for desktop +# print("STARTING#####P1B3##########") +# p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) +# print("DONE######P1B3#########") -#3 # p2b1 - works - print("STARTING#####P2B1##########") - p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) - print("DONE#####P2B1##########") +# #3 # p2b1 - works +# print("STARTING#####P2B1##########") +# p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) +# print("DONE#####P2B1##########") #4 # p3b1 - fails - ValueError: invalid literal for int() with base 10: '1200;1200' print("STARTING#####P3B1##########") diff --git a/workflows/rnd_or_grid/swift/rnd_or_grid.swift b/workflows/rnd_or_grid/swift/rnd_or_grid.swift index 04333235..ea050373 100644 --- a/workflows/rnd_or_grid/swift/rnd_or_grid.swift +++ b/workflows/rnd_or_grid/swift/rnd_or_grid.swift @@ -42,7 +42,8 @@ make_dir(turbine_output); // Get parameters benchmark = argv("benchmark_name"); searchType = argv("search_type"); -settingsFilename = app_home+"/../data/"+benchmark+"_settings.json"; +inputFile = argv("input_file"); +settingsFilename = app_home+"/../data/"+inputFile; string sweepParamFile = turbine_output+"/sweep-parameters.txt"; file parametersFile = determineParameters(settingsFilename, benchmark, searchType); parametersString = read(parametersFile); diff --git a/workflows/rnd_or_grid/swift/run b/workflows/rnd_or_grid/swift/run index 
7ed387cd..2b9d8d63 100755 --- a/workflows/rnd_or_grid/swift/run +++ b/workflows/rnd_or_grid/swift/run @@ -3,9 +3,12 @@ # Usage: ./run # -if [ "$#" -ne 3 ]; then +if [ "$#" -ne 4 ]; then script_name=$(basename $0) - echo "Usage: ${script_name} EXPERIMENT_ID (run1_p1b1) BENCHMARKS_NAME (eg. p1b1) SEARCH_TYPE (eg. grid or random" + echo "Usage: ${script_name} EXPERIMENT_ID (run1_p1b1) BENCHMARKS_NAME (eg. p1b1) SEARCH_TYPE (eg. grid or random) INPUT_JSON" + echo "Example: ./run p1b1_experiment1 p1b1 random p1b1_settings.json" + echo "-This creates a p1b1_experiment1 directory in ../experiments" + echo " uses random scheme for variables specified in ../data/p1b1_settings.json file" exit 1 fi @@ -23,6 +26,7 @@ export PYTHONPATH=$PYTHONPATH:$PROJECT_ROOT/python:$RUNNERS_DIR:$PROJECT_ROOT/.. export EXPID=$1 B_NAME=$2 S_NAME=$3 +JSON_F=$4 export TURBINE_OUTPUT=$APP_HOME/../experiments/$EXPID @@ -51,4 +55,4 @@ echo $PYTHONPATH # remove -l option for removing printing processors ranks # settings.json file has all the parameter combinations to be tested echo swift-t -n $PROCS $APP_HOME/grid-sweep.swift $* -swift-t -l -n $PROCS $APP_HOME/rnd_or_grid.swift $* --benchmark_name=$B_NAME --search_type=$S_NAME +swift-t -l -n $PROCS $APP_HOME/rnd_or_grid.swift $* --benchmark_name=$B_NAME --search_type=$S_NAME --input_file=$JSON_F From f08818c22b679b50cd28a72906e8cc0dbadb4a6c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Jul 2017 00:32:23 -0500 Subject: [PATCH 4/5] o Add scripts for running on theta --- .../python/test/run_theta_runners.sh | 26 ++++++ workflows/rnd_or_grid/swift/theta_run.sh | 81 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 workflows/rnd_or_grid/python/test/run_theta_runners.sh create mode 100644 workflows/rnd_or_grid/swift/theta_run.sh diff --git a/workflows/rnd_or_grid/python/test/run_theta_runners.sh b/workflows/rnd_or_grid/python/test/run_theta_runners.sh new file mode 100644 index 00000000..20f00a74 --- /dev/null +++ 
b/workflows/rnd_or_grid/python/test/run_theta_runners.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -eu + +# Theta / Tensorflow env vars +export KMP_BLOCKTIME=30 +export KMP_SETTINGS=1 +export KMP_AFFINITY=granularity=fine,verbose,compact,1,0 +export OMP_NUM_THREADS=128 + +export PYTHONHOME="/lus/theta-fs0/projects/Candle_ECP/ncollier/py2_tf_gcc6.3_eigen3_native" +#export PYTHONHOME="/home/rjain/anaconda2" +PYTHON="$PYTHONHOME/bin/python" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" +export PATH="$PYTHONHOME/bin:$PATH" + +RUNNER_DIR=../../../../../Benchmarks/Pilot1/P1B1:../../../../../Benchmarks/Pilot2/P2B1:../../../../../Benchmarks/Pilot3/P3B1:../../../../../Benchmarks/Pilot1/NT3:../../../../../Benchmarks/Pilot1/P1B3 +COMMON_DIR=../../../common/python +PYTHONPATH="$PYTHONHOME/lib/python2.7:" +PYTHONPATH+="../:$RUNNER_DIR:$COMMON_DIR:" +PYTHONPATH+="$PYTHONHOME/lib/python2.7/site-packages" +export PYTHONPATH +export PROJECT=Candle_ECP + +echo $PYTHONPATH +$PYTHON test_runners.py diff --git a/workflows/rnd_or_grid/swift/theta_run.sh b/workflows/rnd_or_grid/swift/theta_run.sh new file mode 100644 index 00000000..9002ffb0 --- /dev/null +++ b/workflows/rnd_or_grid/swift/theta_run.sh @@ -0,0 +1,81 @@ +#! 
/usr/bin/env bash +set -eu + +# Autodetect this workflow directory +export APP_HOME=$( cd $( dirname $0 ) ; /bin/pwd ) + +#### set this variable to add new benchmarks directory +RUNNERS_DIR=$APP_HOME/../../../../Benchmarks/Pilot1/P1B1:$APP_HOME/../../../../Benchmarks/Pilot2/P2B1:$APP_HOME/../../../../Benchmarks/Pilot3/P3B1:$APP_HOME/../../../../Benchmarks/Pilot1/NT3:$APP_HOME/../../../../Benchmarks/Pilot1/P1B3 +### +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-36} +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} +export QUEUE=${QUEUE:-default} +export WALLTIME=${WALLTIME:-01:20:00} + + +if [ "$#" -ne 4 ]; then + script_name=$(basename $0) + echo "Usage: ${script_name} EXPERIMENT_ID (run1_p1b1) BENCHMARKS_NAME (eg. p1b1) SEARCH_TYPE (eg. grid or random) INPUT_JSON" + echo "Example: ./run p1b1_experiment1 p1b1 random p1b1_settings.json" + echo "-This creates a p1b1_experiment1 directory in ../experiments" + echo " uses random scheme for variables specified in ../data/p1b1_settings.json file" + exit 1 +fi + +# uncomment to turn on swift/t logging. 
Can also set TURBINE_LOG, +# TURBINE_DEBUG, and ADLB_DEBUG to 0 to turn off logging +export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1 + +export EXPID=$1 +export B_NAME=$2 +export S_NAME=$3 +export JSON_F=$4 + +export TURBINE_OUTPUT=$APP_HOME/../experiments/$EXPID +export PROJECT=Candle_ECP +export TURBINE_JOBNAME="${EXPID}_job" + +TCL=/home/wozniak/Public/sfw/theta/tcl-8.6.1 +export R=/home/wozniak/Public/sfw/theta/R-3.4.0/lib64/R +export PY=/home/rjain/anaconda2 +export LD_LIBRARY_PATH=$PY/lib:$R/lib:$LD_LIBRARY_PATH +COMMON_DIR=$APP_HOME/../../common/python +PYTHONPATH=$APP_HOME/../python:$RUNNERS_DIR:$COMMON_DIR +PYTHONHOME=/home/rjain/anaconda2 + +export PATH=/home/rjain/install/stc/bin:$TCL/bin:$PATH +#$PYTHONHOME/bin:$TCL/bin:$PATH + +# Resident task workers and ranks +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + + +TURBINE_DIR=/home/rjain/install/turbine/lib + +# set machine to your scheduler type (e.g. pbs, slurm, cobalt etc.), +# or empty for an immediate non-queued unscheduled run +MACHINE="theta" + +if [ -n "$MACHINE" ]; then + MACHINE="-m $MACHINE" +fi + +set -x +WORKFLOW_SWIFT=rnd_or_grid.swift +swift-t -n $PROCS $MACHINE -r $TURBINE_DIR \ + -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e APP_HOME=$APP_HOME \ + -e PYTHONPATH=$PYTHONPATH \ + -e PYTHONHOME=$PYTHONHOME \ + -e TURBINE_DEBUG=$TURBINE_DEBUG\ + -e ADLB_DEBUG=$ADLB_DEBUG \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + $APP_HOME/$WORKFLOW_SWIFT --benchmark_name=$B_NAME --search_type=$S_NAME --input_file=$JSON_F & From 0a3177f76b702fde945161309bb62c9caabfe1a6 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Jul 2017 13:07:17 -0500 Subject: [PATCH 5/5] add time --- .../rnd_or_grid/python/test/test_runners.py | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/workflows/rnd_or_grid/python/test/test_runners.py 
b/workflows/rnd_or_grid/python/test/test_runners.py index a6218837..0f2c2b14 100644 --- a/workflows/rnd_or_grid/python/test/test_runners.py +++ b/workflows/rnd_or_grid/python/test/test_runners.py @@ -14,35 +14,49 @@ def main(): # hyper_parameter_map['dense'] = [1219, 536] # hyper_parameter_map['framework'] = 'keras' - -#1 # p1b1 - works -# # hyper_parameter_map['save'] = './p1bl1_testing_failure' -# print("STARTING#####P1B1##########") -# p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map) -# print("DONE##########P1B1#####") - - -# #2 # p1b3 - works too big for desktop -# print("STARTING#####P1B3##########") -# p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) -# print("DONE######P1B3#########") - -# #3 # p2b1 - works -# print("STARTING#####P2B1##########") -# p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) -# print("DONE#####P2B1##########") - -#4 # p3b1 - fails - ValueError: invalid literal for int() with base 10: '1200;1200' +#1 # p1b1 + # hyper_parameter_map['save'] = './p1bl1_testing_failure' + print("STARTING#####P1B1##########") + ts_p1b1 = datetime.now() + p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map) + te_p1b1 = datetime.now() + print("Validation loss=",p1b1_validation_loss) + print("DONE##########P1B1#####, TIME=", te_p1b1 - ts_p1b1) + + +#2 # p1b3 + print("STARTING#####P1B3##########") + ts_p1b3 = datetime.now() + p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map) + te_p1b3 = datetime.now() + print("Validation loss=",p1b3_validation_loss) + print("DONE##########P1B3#####, TIME=", te_p1b3 - ts_p1b3) + +#3 # p2b1 + print("STARTING#####P2B1##########") + ts_p2b1 = datetime.now() + p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map) + te_p2b1 = datetime.now() + print("Validation loss=",p2b1_validation_loss) + print("DONE##########P2B1#####, TIME=", te_p2b1 - ts_p2b1) + +#4 # p3b1 print("STARTING#####P3B1##########") + ts_p3b1 = datetime.now() p3b1_validation_loss = 
p3b1_runner.run(hyper_parameter_map) - print("DONE#####P3B1##########") + te_p3b1 = datetime.now() + print("Validation loss=",p3b1_validation_loss) + print("DONE##########P3B1#####, TIME=", te_p3b1 - ts_p3b1) -#5 # NT3 - works - too big +#5 # NT3 print("STARTING#####NT3##########") hyper_parameter_map['model_name'] = 'nt3' - nt3tc1_validation_losss = nt3_tc1_runner.run(hyper_parameter_map) - print("DONE#####NT3##########") + ts_nt3 = datetime.now() + nt3tc1_validation_loss = nt3_tc1_runner.run(hyper_parameter_map) + te_nt3 = datetime.now() + print("Validation loss=",nt3tc1_validation_loss) + print("DONE##########NT3#####, TIME=", te_nt3 - ts_nt3) -# # print("Validation Loss: ", p1b1_validation_loss) if __name__ == '__main__': main() +