From 217e9fc598eec2aef79a536d2137bd5671af49aa Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Thu, 30 Jan 2020 15:21:02 -0800 Subject: [PATCH 01/18] merge deployment branch into local branch --- cryptolytic/model/cron_model.py | 11 ++++++----- cryptolytic/start.py | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cryptolytic/model/cron_model.py b/cryptolytic/model/cron_model.py index 4fae450..9fbb208 100644 --- a/cryptolytic/model/cron_model.py +++ b/cryptolytic/model/cron_model.py @@ -342,10 +342,11 @@ def xgb_cron_pred(model_type='trade'): model_path = mfw.get_path( 'models', model_type, exchange_id, trading_pair, '.pkl' ) - if not os.path.exists(model_path): - print(f'Model not available for {exchange_id}, {trading_pair}') - continue + aws.download_file(model_path) + if not os.path.exists(model_path): + print(f'File does not exist for {exchange_id}, {trading_pair} in function xgb_cron_pred') + print(model_path) model = pickle.load(open(model_path, 'rb')) n = params['history_size']+params['lahead'] @@ -365,7 +366,7 @@ def xgb_cron_pred(model_type='trade'): x_train, y_train, x_test, y_test = xtrade.data_splice(dataset, target) if x_train.shape[0] < n: - print(f'Invalid shape {x_train.shape[0]} in function cron_pred2') + print(f'Invalid shape {x_train.shape[0]} in function xgb_cron_pred') continue preds = model.predict(x_train) @@ -373,7 +374,7 @@ def xgb_cron_pred(model_type='trade'): last_timestamp = df.timestamp[-1] timestamps = [last_timestamp + params['period'] * i for i in range(len(preds))] pd.DataFrame( - {'close': preds, 'exchange': exchange_id, + {'preds': preds, 'exchange': exchange_id, 'timestamp': timestamps}).to_csv(preds_path) preds_path = mfw.get_path( 'preds', model_type, exchange_id, trading_pair, '.csv' diff --git a/cryptolytic/start.py b/cryptolytic/start.py index e05e05d..29d9aad 100644 --- a/cryptolytic/start.py +++ b/cryptolytic/start.py @@ -11,18 +11,19 @@ def start_logging(): 
logging.basicConfig(filename="log.txt") # also print to stderr - stderrLogger=logging.StreamHandler() + stderrLogger = logging.StreamHandler() stderrLogger.setFormatter(logging.Formatter(logging.BASIC_FORMAT)) logging.getLogger().addHandler(stderrLogger) w = 20 logging.debug(' '*w) - logging.debug('-'*w) + logging.debug('-'*w) logging.debug('LOG START') - logging.debug('-'*w) + logging.debug('-'*w) + def init(): load_dotenv(verbose=True) start_logging() - cryptolytic.session = boto3.session.Session(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], - aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']) + cryptolytic.session = boto3.session.Session(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], + aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']) From 2d031f59f40088abcafa61e6c4fd0a8ac4f97b85 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Sun, 2 Feb 2020 11:26:43 -0800 Subject: [PATCH 02/18] Merge deployment branch to local branch --- cryptolytic/data/aws.py | 2 +- cryptolytic/data/historical.py | 6 ++-- cryptolytic/model/cron_model.py | 45 ++++++++++++++-------------- cryptolytic/model/model_framework.py | 8 ++--- cryptolytic/model/xgboost_model.py | 2 +- 5 files changed, 32 insertions(+), 31 deletions(-) diff --git a/cryptolytic/data/aws.py b/cryptolytic/data/aws.py index 422a45d..462d4b5 100644 --- a/cryptolytic/data/aws.py +++ b/cryptolytic/data/aws.py @@ -32,4 +32,4 @@ def get_path(folder_name, model_type, exchange_id, trading_pair, ext): aws_folder = os.path.join('aws', folder_name) if not os.path.exists(aws_folder): os.mkdir(aws_folder) - return os.path.join(aws_folder, f'model_{model_type}_{exchange_id}_{trading_pair}{ext}') + return os.path.join(aws_folder, f'model_{model_type}_{exchange_id}_{trading_pair}{ext}').replace('\\', '/') diff --git a/cryptolytic/data/historical.py b/cryptolytic/data/historical.py index f275714..8bbd5ce 100644 --- a/cryptolytic/data/historical.py +++ b/cryptolytic/data/historical.py @@ -300,9 +300,9 @@ def 
yield_unique_pair(return_api=True): def update_pair(api, exchange_id, trading_pair, timestamp, period=300, - num_retries=0): + num_retries=0): """This functional inserts candlestick information into the database, - called by live_update function. + called by live_update function. Returns true if updated, or None if the task should be dropped""" # exit if num retries > 20 if num_retries > 10: @@ -395,7 +395,7 @@ def fill_missing_candles(): update_pair(api, exchange, trading_pair, int(timestamp), int(period)) -# TODO should place this in the same file with get_df and get_df should probably +# TODO should place this in the same file with get_df and get_df should probably # be just impute_df and this shoudl candle sql.get_some_candles, pass that to impute_df, # and then also call feaure_engineer_df, instead. def get_data(exchange_id, trading_pair, period, start, n=8000): diff --git a/cryptolytic/model/cron_model.py b/cryptolytic/model/cron_model.py index 269f838..241c259 100644 --- a/cryptolytic/model/cron_model.py +++ b/cryptolytic/model/cron_model.py @@ -51,7 +51,7 @@ def cron_train(): now = int(time.time()) pull_size = 5000 - h.live_update() + # h.live_update() for exchange_id, trading_pair in h.yield_unique_pair(return_api=False): print(exchange_id, trading_pair) @@ -99,11 +99,11 @@ def cron_train(): # Get the data in the same format that the model expects for its training x_train, y_train, x_val, y_val = dw.windowed( - dataset, + dataset, target, - params['batch_size'], + params['batch_size'], params['history_size'], - params['step'], + params['step'], lahead=params['lahead'], ratio=0.8) @@ -143,7 +143,10 @@ def cron_pred(): for exchange_id, trading_pair in h.yield_unique_pair(return_api=False): model_path = aws.get_path('models', model_type, exchange_id, trading_pair, '.h5') - aws.download_file(model_path) + try: + aws.download_file(model_path) + except Exception: + print(f'Model not available for {exchange_id}, {trading_pair}') if not os.path.exists(model_path): 
print(f'Model not available for {exchange_id}, {trading_pair}') continue @@ -155,9 +158,9 @@ def cron_pred(): df, dataset = h.get_latest_data( exchange_id, trading_pair, params['period'], - # Pull history_size + lahead length, shouldn't need more to make a + # Pull history_size + lahead length, shouldn't need more to make a # prediction - n=n) + n=15000) if df is None: continue @@ -169,35 +172,34 @@ def cron_pred(): # Get the data in the same format that the model expects for its training x_train, y_train, x_val, y_val = dw.windowed( dataset, target, - params['batch_size'], + params['batch_size'], params['history_size'], - params['step'], - lahead=0, # Zero look ahead, don't truncate any data for the prediction + params['step'], + lahead=0, # Zero look ahead, don't truncate any data for the prediction ratio=1.0) - if x_train.shape[0] < n: + if x_train.shape[0] < (params['history_size']+params['step']): print(f'Invalid shape {x_train.shape[0]} in function cron_pred2') continue - - preds = model.predict(x_train)[:, 0][-params['lahead']] + preds = model.predict(x_train)[:, 0][-params['lahead']:] last_timestamp = df.timestamp[-1] timestamps = [last_timestamp + params['period'] * i for i in range(len(preds))] preds = pd.DataFrame( - {'prediction': preds, + {'prediction': preds, 'exchange': exchange_id, - 'timestamp': timestamps, - 'trading_pair' : trading_pair, + 'timestamp': timestamps, + 'trading_pair': trading_pair, 'period': params['period'], - 'model_type' : model_type + 'model_type': model_type }) sql.add_data_to_table(preds, table='predictions') -# preds_path = aws.get_path('preds', model_type, exchange_id, trading_pair, '.csv') -# preds.to_csv(preds_path) -# aws.upload_file(preds_path) + preds_path = aws.get_path('preds', model_type, exchange_id, trading_pair, '.csv') + preds.to_csv(preds_path) + aws.upload_file(preds_path) # be able to have model train on a large series of time without crashing @@ -231,7 +233,6 @@ def xgb_cron_train(model_type): model_path = 
aws.get_path( 'models', model_type, exchange_id, trading_pair, '.pkl') - n = params['train_size'] # train in batches of 3000 @@ -250,7 +251,7 @@ def xgb_cron_train(model_type): target = df.columns.get_loc('arb_signal_class') x_train, y_train, x_test, y_test = xtrade.data_splice(dataset, target) - + print(df) print(n) print(df.shape, dataset.shape) diff --git a/cryptolytic/model/model_framework.py b/cryptolytic/model/model_framework.py index 9a3ab25..2175048 100644 --- a/cryptolytic/model/model_framework.py +++ b/cryptolytic/model/model_framework.py @@ -121,15 +121,15 @@ def predictions(): return preds -# https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d +# https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d # TODO try downsampling and then upsampling in the first part of the network # , can do downsampling by increasing stride or Pooling # dilation in a convolution will add spacing between values in a kernel, -# so a 3x3 kernel with a dilation of 2 will have the same field view as -# 5x5 kernel, while only taking 9 parameters, so can get a wide field of view +# so a 3x3 kernel with a dilation of 2 will have the same field view as +# 5x5 kernel, while only taking 9 parameters, so can get a wide field of view # at low cost. # Transposed convolutions, sometimes called deconvolutions. -# Can be used for upsampling the image, might need padding. +# Can be used for upsampling the image, might need padding. # Utilize skip connections into RNN and should improve model performance. 
def create_model(x_train, params): batch_size = params['batch_size'] diff --git a/cryptolytic/model/xgboost_model.py b/cryptolytic/model/xgboost_model.py index 1986c6d..b410565 100644 --- a/cryptolytic/model/xgboost_model.py +++ b/cryptolytic/model/xgboost_model.py @@ -43,7 +43,7 @@ def performance(X_test, y_preds): # calculate the percentage paid in fees fees_pct = number_of_entries * 2 * fee_rate/100 - # calculate fees in USD + # calculate fees in USD fees = number_of_entries * 2 * fee_rate / 100 * 10000 # calculate net profit in USD From 7d50f3871fab9f950f8463e1f48f9d40c117590b Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 13:38:30 -0800 Subject: [PATCH 03/18] pep8 conformity --- cryptolytic/model/cron_model.py | 52 ++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/cryptolytic/model/cron_model.py b/cryptolytic/model/cron_model.py index 241c259..49363b2 100644 --- a/cryptolytic/model/cron_model.py +++ b/cryptolytic/model/cron_model.py @@ -1,5 +1,7 @@ # begin inter imports from cryptolytic.start import init +# init call early in imports so that cryptolytic.session imports correctly on +# Windows operating systems init() import cryptolytic.model.model_framework as model_framework import cryptolytic.model.data_work as dw @@ -62,8 +64,8 @@ def cron_train(): time_counter = start while True: gc.collect() - model_path = aws.get_path('models', 'neural', exchange_id, trading_pair, '.h5') - + model_path = aws.get_path('models', 'neural', exchange_id, + trading_pair, '.h5') # model = tf.keras.load_model(path) @@ -86,7 +88,8 @@ def cron_train(): # finished training for this if time_counter >= now - params['period']: - print('Finished training for {api}, {exchange_id}, {trading_pair}') + print('Finished training for {api}, {exchange_id}, ' + '{trading_pair}') break if df.shape[0] < params['history_size']: @@ -97,7 +100,8 @@ def cron_train(): print(n) print(df.shape, dataset.shape) - # Get the data in the same 
format that the model expects for its training + # Get the data in the same format that the + # model expects for its training x_train, y_train, x_val, y_val = dw.windowed( dataset, target, @@ -112,7 +116,8 @@ def cron_train(): print(x_val.shape) print(y_val.shape) if x_train.shape[0] < 10: - print(f'Invalid shape {x_train.shape[0]} in function cron_train') + print(f'Invalid shape {x_train.shape[0]} ' + 'in function cron_train') break # Create a model if not exists, else load model if it @@ -138,11 +143,13 @@ def cron_pred(): availble for that trading pair complete with """ init() - all_preds = pd.DataFrame(columns=['close', 'api', 'trading_pair', 'exchange_id', 'timestamp']) + all_preds = pd.DataFrame(columns=['close', 'api', 'trading_pair', + 'exchange_id', 'timestamp']) model_type = 'neural' for exchange_id, trading_pair in h.yield_unique_pair(return_api=False): - model_path = aws.get_path('models', model_type, exchange_id, trading_pair, '.h5') + model_path = aws.get_path('models', model_type, exchange_id, + trading_pair, '.h5') try: aws.download_file(model_path) except Exception: @@ -158,8 +165,8 @@ def cron_pred(): df, dataset = h.get_latest_data( exchange_id, trading_pair, params['period'], - # Pull history_size + lahead length, shouldn't need more to make a - # prediction + # Pull history_size + lahead length, + # shouldn't need more to make a prediction n=15000) if df is None: @@ -169,16 +176,17 @@ def cron_pred(): print(dataset.shape) - # Get the data in the same format that the model expects for its training + # Get the data in the same format that the + # model expects for its training x_train, y_train, x_val, y_val = dw.windowed( dataset, target, params['batch_size'], params['history_size'], params['step'], - lahead=0, # Zero look ahead, don't truncate any data for the prediction + # Zero look ahead, don't truncate any data for the prediction + lahead=0, ratio=1.0) - if x_train.shape[0] < (params['history_size']+params['step']): print(f'Invalid shape 
{x_train.shape[0]} in function cron_pred2') continue @@ -186,7 +194,8 @@ def cron_pred(): preds = model.predict(x_train)[:, 0][-params['lahead']:] last_timestamp = df.timestamp[-1] - timestamps = [last_timestamp + params['period'] * i for i in range(len(preds))] + timestamps = [last_timestamp + params['period'] + * i for i in range(len(preds))] preds = pd.DataFrame( {'prediction': preds, @@ -197,7 +206,8 @@ def cron_pred(): 'model_type': model_type }) sql.add_data_to_table(preds, table='predictions') - preds_path = aws.get_path('preds', model_type, exchange_id, trading_pair, '.csv') + preds_path = aws.get_path('preds', model_type, exchange_id, + trading_pair, '.csv') preds.to_csv(preds_path) aws.upload_file(preds_path) @@ -296,7 +306,8 @@ def xgb_cron_pred(model_type='trade'): aws.download_file(model_path) if not os.path.exists(model_path): - print(f'File does not exist for {exchange_id}, {trading_pair} in function xgb_cron_pred') + print(f'File does not exist for {exchange_id}, {trading_pair} ' + 'in function xgb_cron_pred') print(model_path) model = pickle.load(open(model_path, 'rb')) @@ -323,14 +334,15 @@ def xgb_cron_pred(model_type='trade'): preds = model.predict(x_train) last_timestamp = df.timestamp[-1] - timestamps = [last_timestamp + params['period'] * i for i in range(len(preds))] + timestamps = [last_timestamp + params['period'] + * i for i in range(len(preds))] preds = pd.DataFrame( - {'prediction': preds, + {'prediction': preds, 'exchange': exchange_id, - 'timestamp': timestamps, - 'trading_pair' : trading_pair, + 'timestamp': timestamps, + 'trading_pair': trading_pair, 'period': params['period'], - 'model_type' : model_type + 'model_type': model_type }) sql.add_data_to_table(preds, table='predictions') From ac1d570046c90f75047cbf5b26f8ea4f553e0544 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 13:41:15 -0800 Subject: [PATCH 04/18] cryptolytic.model.data_work up to pep8 conformity --- cryptolytic/model/data_work.py | 6 ++++-- 1 file 
changed, 4 insertions(+), 2 deletions(-) diff --git a/cryptolytic/model/data_work.py b/cryptolytic/model/data_work.py index a49171c..bb66da1 100644 --- a/cryptolytic/model/data_work.py +++ b/cryptolytic/model/data_work.py @@ -14,7 +14,8 @@ import tensorflow as tf from tensorflow.keras.preprocessing import sequence from tensorflow.keras.models import Sequential, Model -from tensorflow.keras.layers import Dense, Embedding, Conv1D, Activation, Add, Input, LSTM +from tensorflow.keras.layers import Dense, Embedding, Conv1D,\ + Activation, Add, Input, LSTM import tensorflow.keras.layers as layers import tensorflow.keras.losses as losses import tensorflow.keras.models as models @@ -67,7 +68,8 @@ def eq(x, mu, std): return values -def windowed(dataset, target, batch_size, history_size, step, lahead=1, ratio=0.8): +def windowed(dataset, target, batch_size, history_size, + step, lahead=1, ratio=0.8): xs = [] ys = [] From e09b4b100d1fbf0fadf48b011896c2aad81e0f7e Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 13:46:53 -0800 Subject: [PATCH 05/18] cryptolytic.model pep8 conformed. 
blank files deleted --- cryptolytic/model/cron_model.py | 1 - cryptolytic/model/data_work.py | 3 +-- cryptolytic/model/hyperparameter.py | 2 +- cryptolytic/model/model_framework.py | 4 +-- cryptolytic/model/predict.py | 3 --- cryptolytic/model/train.py | 1 - cryptolytic/model/xgboost_arb_model.py | 1 - cryptolytic/model/xgboost_model.py | 35 ++++++++++++++++---------- 8 files changed, 26 insertions(+), 24 deletions(-) delete mode 100644 cryptolytic/model/predict.py delete mode 100644 cryptolytic/model/train.py diff --git a/cryptolytic/model/cron_model.py b/cryptolytic/model/cron_model.py index 49363b2..343686f 100644 --- a/cryptolytic/model/cron_model.py +++ b/cryptolytic/model/cron_model.py @@ -30,7 +30,6 @@ import gc - params = { 'history_size': 400, 'lahead': 12*3, diff --git a/cryptolytic/model/data_work.py b/cryptolytic/model/data_work.py index bb66da1..31d7b64 100644 --- a/cryptolytic/model/data_work.py +++ b/cryptolytic/model/data_work.py @@ -80,7 +80,7 @@ def windowed(dataset, target, batch_size, history_size, end = dataset.shape[0] - lahead # 4990 # 4990 - 1000 = 3990 for i in range(start, end): - # grab rows from start y-history_size to end + # grab rows from start y-history_size to end indices = range(i-history_size, i, step) xs.append(x[indices]) ys.append(y[i:i+lahead]) @@ -100,4 +100,3 @@ def windowed(dataset, target, batch_size, history_size, ys = ys[:total_size] return xs[:train_size], ys[:train_size], xs[train_size:], ys[train_size:] - diff --git a/cryptolytic/model/hyperparameter.py b/cryptolytic/model/hyperparameter.py index bba904a..a960cad 100644 --- a/cryptolytic/model/hyperparameter.py +++ b/cryptolytic/model/hyperparameter.py @@ -10,7 +10,7 @@ import os from kerastuner.tuners import RandomSearch -#internal imports +# internal imports import cryptolytic.data.historical as h import cryptolytic.model as m import cryptolytic.model.lstm_framework as lstm diff --git a/cryptolytic/model/model_framework.py b/cryptolytic/model/model_framework.py 
index 2175048..a878cdd 100644 --- a/cryptolytic/model/model_framework.py +++ b/cryptolytic/model/model_framework.py @@ -202,7 +202,8 @@ def get_path(folder_name, model_type, exchange_id, trading_pair, ext): Example models/model_trade_binance_eth_usd.pkl preds/model_trade_binance_eth_usd.csv """ - return f'{folder_name}/model_{model_type}_{exchange_id}_{trading_pair}{ext}' + return f'{folder_name}/model_{model_type}_{exchange_id}_'\ + '{trading_pair}{ext}' def load_all_models(folder): @@ -257,4 +258,3 @@ def model_compile(model_framework, optimizer=optimizer, metrics=metrics) return compiled_model - diff --git a/cryptolytic/model/predict.py b/cryptolytic/model/predict.py deleted file mode 100644 index e7cefa8..0000000 --- a/cryptolytic/model/predict.py +++ /dev/null @@ -1,3 +0,0 @@ -# File for running predictions on latest data -import cryptolytic. - diff --git a/cryptolytic/model/train.py b/cryptolytic/model/train.py deleted file mode 100644 index 11246f7..0000000 --- a/cryptolytic/model/train.py +++ /dev/null @@ -1 +0,0 @@ -# File for automatic retraining of them odel diff --git a/cryptolytic/model/xgboost_arb_model.py b/cryptolytic/model/xgboost_arb_model.py index 0e8ee4a..3c76bd0 100644 --- a/cryptolytic/model/xgboost_arb_model.py +++ b/cryptolytic/model/xgboost_arb_model.py @@ -25,4 +25,3 @@ def fit_model(model, x_train, y_train): model.fit(x_train, y_train) # Return the fitted model return model - diff --git a/cryptolytic/model/xgboost_model.py b/cryptolytic/model/xgboost_model.py index b410565..c6ff70d 100644 --- a/cryptolytic/model/xgboost_model.py +++ b/cryptolytic/model/xgboost_model.py @@ -7,22 +7,29 @@ def performance(X_test, y_preds): - """ Takes in a test dataset and a model's predictions, calculates and returns - the profit or loss. When the model generates consecutive buy predictions, - anything after the first one are considered a hold and fees are not added - for the hold trades. 
""" + """ + Takes in a test dataset and a model's predictions, calculates and returns + the profit or loss. When the model generates consecutive buy predictions, + anything after the first one are considered a hold and fees are not added + for the hold trades. + """ - fee_rate = 0.35 + fee_rate = 0.35 # creates dataframe for features and predictions df_preds = X_test df_preds['y_preds'] = y_preds # creates column with 0s for False predictions and 1s for True predictions - df_preds['binary_y_preds'] = df_preds['y_preds'].shift(1).apply(lambda x: 1 if x == True else 0) + df_preds['binary_y_preds'] = df_preds['y_preds'].shift(1).\ + apply(lambda x: 1 if x is True else 0) - # performance results from adding the closing difference percentage of the rows where trades were executed - performance = ((10000 * df_preds['binary_y_preds']*df_preds['close_diff']).sum()) + # performance results from adding the closing difference percentage of + # the rows where trades were executed + performance = ( + (10000 * df_preds['binary_y_preds']*df_preds['close_diff']) + .sum() + ) # calculating fees and improve trading strategy # creates a count list for when trades were triggered @@ -35,10 +42,12 @@ def performance(X_test, y_preds): df_preds['trade_trig'] = df_preds['increase_count'].diff(1) # number of total entries(1s) - number_of_entries = (df_preds.trade_trig.values==1).sum() + number_of_entries = (df_preds.trade_trig.values == 1).sum() - # performance takes into account fees given the rate at the beginning of this function - pct_performance = ((df_preds['binary_y_preds']*df_preds['close_diff']).sum()) + # performance takes into account fees given the rate at the + # beginning of this function + pct_performance = ((df_preds['binary_y_preds']*df_preds['close_diff']) + .sum()) # calculate the percentage paid in fees fees_pct = number_of_entries * 2 * fee_rate/100 @@ -52,7 +61,8 @@ def performance(X_test, y_preds): # calculate net profit percent performance_net_pct = 
performance_net/10000 - return pct_performance, performance, fees, performance_net, performance_net_pct + return pct_performance, performance, fees,\ + performance_net, performance_net_pct def trade_model(df, params={}): @@ -104,7 +114,6 @@ def data_splice(dataset, target): train = dataset[0:train_size] test = dataset[train_size:] - # Categorical target for the model to achieve # 1 = price increase # 0 = no price increase From bdf37a1be2ba5e59fd53170553a80231fd665fa4 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 13:51:43 -0800 Subject: [PATCH 06/18] cryptolytic.data.__init conformed to pep8 standards --- cryptolytic/data/__init__.py | 80 +++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/cryptolytic/data/__init__.py b/cryptolytic/data/__init__.py index bd0ef3b..0b00397 100644 --- a/cryptolytic/data/__init__.py +++ b/cryptolytic/data/__init__.py @@ -27,9 +27,12 @@ def denoise(signal, repeat): def resample_ohlcv(df, period=None): - """this function resamples ohlcv csvs for a specified candle interval; while - this can be used to change the candle interval for the data, it can also be - used to fill in gaps in the ohlcv data without changing the candle interval""" + """ + this function resamples ohlcv csvs for a specified candle interval; + while this can be used to change the candle interval for the data, + it can also be used to fill in gaps in the ohlcv data without changing + the candle interval + """ # dictionary specifying which columns to use for resampling ohlcv_dict = {'open': 'first', 'high': 'max', @@ -38,7 +41,7 @@ def resample_ohlcv(df, period=None): 'volume': 'sum'} # apply resampling - if period==None: + if period is None: period = df['period'][0] period = pd.to_timedelta(period, unit='s') df_new = df.resample(period, how=ohlcv_dict) @@ -52,8 +55,9 @@ def nan_df(df): def merge_candle_dfs(df1, df2): """Merge candle dataframes""" - merge_cols = ['trading_pair', 'exchange', 'period', 
'datetime', 'timestamp'] - df_merged = df1.merge(df2, how='inner', on=merge_cols) + merge_cols = ['trading_pair', 'exchange', + 'period', 'datetime', 'timestamp'] + df_merged = df1.merge(df2, how='inner', on=merge_cols) return df_merged @@ -66,10 +70,14 @@ def outer_merge(df1, df2): def fix_df(df): - """Changes columns to the right type if needed and makes sure the index is set as the - datetime of the timestamp. Maybe better to have pandas infer numeric.""" + """ + Changes columns to the right type if needed and makes sure the index is + set as the datetime of the timestamp. Maybe better to have pandas + infer numeric. + """ df['datetime'] = pd.to_datetime(df['timestamp'], unit='s') - numeric = ['period', 'open', 'close', 'high', 'low', 'volume', 'arb_diff', 'arb_signal'] + numeric = ['period', 'open', 'close', 'high', 'low', 'volume', + 'arb_diff', 'arb_signal'] for col in numeric: if col not in df.columns: continue @@ -80,18 +88,19 @@ def fix_df(df): def impute_df(df): """ - Finds the gaps in the time series data for the dataframe, and pulls the average market - price and its last volume for those values and places those values into the gaps. Any remaining - gaps or new nan values are filled with backwards fill. + Finds the gaps in the time series data for the dataframe, and pulls the + average market price and its last volume for those values and places those + values into the gaps. Any remaining gaps or new nan values are filled + with backwards fill. 
""" df = df.copy() return df # resample ohclv will reveal missing timestamps to impute - gapped = resample_ohlcv(df) + gapped = resample_ohlcv(df) gaps = nan_df(gapped).index # stop psycopg2 error with int conversion convert_datetime = compose(int, convert_datetime_to_timestamp) - timestamps = mapl(convert_datetime, list(gaps)) + timestamps = mapl(convert_datetime, list(gaps)) info = {'trading_pair': df['trading_pair'][0], 'period': int(df['period'][0]), 'exchange': df['exchange'][0], @@ -107,24 +116,24 @@ def impute_df(df): df = fix_df(df) df['volume'] = df['volume'].ffill() df = df.bfill().ffill() - assert df.isna().any().any() == False + assert df.isna().any().any() is False return df def get_df(info, n=1000): """ - Pull info from database and give it some useful augmentation for analysis. + Pull info from database and give it some useful augmentation for analysis. TODO move functionality into get_data function in historical. """ df = sql.get_some_candles(info=info, n=n, verbose=True) df = impute_df(df) - + df['high_m_low'] = df['high'] - df['low'] df['close_m_open'] = df['close'] - df['open'] dfarb = sql.get_arb_info(info, n) merged = merge_candle_dfs(df, dfarb) - assert merged.isna().any().any() == False + assert merged.isna().any().any() is False return merged @@ -135,11 +144,11 @@ def thing(arg, axis=0): return x, mu, std -# Version 2 +# Version 2 def normalize(A): if isinstance(A, pd.DataFrame) or isinstance(A, pd.Series): A = A.values - if np.ndim(A)==1: + if np.ndim(A) == 1: A = np.expand_dims(A, axis=1) A = A.copy() x, mu, std = thing(A, axis=0) @@ -149,22 +158,24 @@ def normalize(A): # from sql A[:, i] = (x[:, i] - mu[i]) / std[i] return A - + def denormalize(values, df, col=None): - """Denormalize, needs the original information to be able to denormalize.""" + """ + Denormalize, needs the original information to be able to denormalize. 
+ """ values = values.copy() - + def eq(x, mu, std): return np.exp((x * std) + mu) - 1 - + if np.ndim(values) == 1 and col is not None: x, mu, std = thing(df[col]) return eq(values, mu, std) else: - for i in range(values.shape[1]): + for i in range(values.shape[1]): x, mu, std = thing(df.iloc[:, i]) - if isinstance(values, pd.DataFrame): + if isinstance(values, pd.DataFrame): values.iloc[:, i] = eq(values.iloc[:, i], mu, std) else: values[:, i] = eq(values[:, i], mu, std) @@ -177,29 +188,30 @@ def windowed(df, target, batch_size, history_size, step, lahead=1, ratio=0.8): """ xs = [] ys = [] - + x = df y = df[:, target] - start = history_size # 1000 - end = df.shape[0] - lahead # 4990 + start = history_size # 1000 + end = df.shape[0] - lahead # 4990 # 4990 - 1000 = 3990 for i in range(start, end): indices = range(i-history_size, i, step) xs.append(x[indices]) ys.append(y[i:i+lahead]) - + xs = np.array(xs) ys = np.array(ys) - + nrows = xs.shape[0] train_size = int(nrows * ratio) - # make sure the sizes are multiples of the batch size (needed for types of models) + # make sure the sizes are multiples of the batch size + # (needed for types of models) train_size -= train_size % batch_size val_size = nrows - train_size - val_size -= val_size % batch_size + val_size -= val_size % batch_size total_size = train_size + val_size xs = xs[:total_size] ys = ys[:total_size] - + return xs[:train_size], ys[:train_size], xs[train_size:], ys[train_size:] From f71518c601870d8764979342c75f7ef5bc70db3b Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 13:52:30 -0800 Subject: [PATCH 07/18] cryptolytic.data.aws conformed to pep8 standards --- cryptolytic/data/aws.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cryptolytic/data/aws.py b/cryptolytic/data/aws.py index 462d4b5..b7a87ba 100644 --- a/cryptolytic/data/aws.py +++ b/cryptolytic/data/aws.py @@ -32,4 +32,6 @@ def get_path(folder_name, model_type, exchange_id, trading_pair, ext): aws_folder = 
os.path.join('aws', folder_name) if not os.path.exists(aws_folder): os.mkdir(aws_folder) - return os.path.join(aws_folder, f'model_{model_type}_{exchange_id}_{trading_pair}{ext}').replace('\\', '/') + return os.path.join( + aws_folder, f'model_{model_type}_{exchange_id}_{trading_pair}{ext}' + ).replace('\\', '/') From 059fd926a867d821d195a16ea4f8f3322619cd14 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:04:09 -0800 Subject: [PATCH 08/18] cryptolytic.data.historical pep8 conformed --- cryptolytic/data/historical.py | 200 ++++++++++++++++++++------------- 1 file changed, 119 insertions(+), 81 deletions(-) diff --git a/cryptolytic/data/historical.py b/cryptolytic/data/historical.py index 8bbd5ce..d2c87c3 100644 --- a/cryptolytic/data/historical.py +++ b/cryptolytic/data/historical.py @@ -1,5 +1,5 @@ """ - Description: Contains functions on APIs and turning that into candlestick data. +Description: Contains functions on APIs and turning that into candlestick data. """ import requests from cryptolytic.util import date @@ -32,8 +32,8 @@ to configure your AWS VPC security groups to allow outside access """ -# api_info.json file is used to store information regarding the api -# such as url for the api call, the trading pairs and exchanges +# api_info.json file is used to store information regarding the api +# such as url for the api call, the trading pairs and exchanges # supported for that api, etc. api_info = None with open('data/api_info.json', 'r') as f: @@ -52,9 +52,11 @@ def crypto_full_name(crypto_short): def trading_pair_info(api, trading_pair): - """Returns full info for the trading pair necessary for the etrading_pairchange. + """Returns full info for the trading pair necessary + for the etrading_pairchange. trading_pair: e.g. btc_eth - Returns: e.g. BTC ETH if the pair was reveresed and uppercased + Returns: e.g. 
BTC ETH + if the pair was reveresed and uppercased """ if api_info[api].get('rename_pairs') is not None: if trading_pair in api_info[api]['rename_pairs']: @@ -79,22 +81,22 @@ def trading_pair_info(api, trading_pair): trading_pair = trading_pair.replace('_', '-') handled = True if api in {'coincap'}: - # coincap uses full crypto names, and uses - in place of spaces + # coincap uses full crypto names, and uses - in place of spaces baseId = crypto_full_name(baseId).lower().replace(' ', '-') quoteId = crypto_full_name(quoteId).lower().replace(' ', '-') handled = True - if not handled: raise Exception('API not supported ', api) - return {'baseId' : baseId, - 'quoteId' : quoteId, + return {'baseId': baseId, + 'quoteId': quoteId, 'trading_pair': trading_pair} def convert_candlestick(candlestick, api, timestamp): - # dict with keys :open, :high, :low, :close, :volume, :quoteVolume, :timestamp + # dict with keys + # :open, :high, :low, :close, :volume, :quoteVolume, :timestamp candlestick_old = candlestick candlestick = candlestick.copy() ohclv = ["open", "high", "close", "low", "volume", "timestamp"] @@ -104,14 +106,14 @@ def convert_candlestick(candlestick, api, timestamp): pass # reorder candlestick information elif corder is not None: - candlestick = {k: candlestick[i] for k, i in + candlestick = {k: candlestick[i] for k, i in zip(ohclv, corder)} - elif api=='hitbtc': + elif api == 'hitbtc': candlestick['high'] = candlestick.pop('max') candlestick['low'] = candlestick.pop('min') - elif api=='coincap': + elif api == 'coincap': candlestick['timestamp'] = candlestick.pop('period') - elif api=='poloniex': + elif api == 'poloniex': candlestick['timestamp'] = candlestick.pop('date') else: raise Exception('API not supported ', api) @@ -121,7 +123,9 @@ def convert_candlestick(candlestick, api, timestamp): if timestamp_format == "milliseconds": candlestick['timestamp'] = candlestick['timestamp'] // 1000 elif timestamp_format == "iso8601": - candlestick['timestamp'] = 
int(ciso8601.parse_datetime(candlestick['timestamp']).timestamp()) + candlestick['timestamp'] = int( + ciso8601.parse_datetime(candlestick['timestamp']).timestamp() + ) elif timestamp == "replace": candlestick['timestamp'] = timestamp @@ -129,10 +133,13 @@ def convert_candlestick(candlestick, api, timestamp): # no less than year 2000 or greater than 2030. timesamp must be an int if (not all(x in candlestick.keys() for x in ohclv)) or \ - not isinstance(candlestick['timestamp'], int) or candlestick['timestamp'] >= 1894131876 or candlestick['timestamp'] <= 947362914: + not isinstance(candlestick['timestamp'], int) or \ + candlestick['timestamp'] >= 1894131876 or \ + candlestick['timestamp'] <= 947362914: raise Exception() except Exception: - raise Exception("API: ", api, "\nInvalid Candle: ", candlestick, "\nOld candle: ", candlestick_old) + raise Exception("API: ", api, "\nInvalid Candle: ", candlestick, + "\nOld candle: ", candlestick_old) return {key: candlestick[key] for key in ohclv} @@ -146,7 +153,7 @@ def format_apiurl(api, params={}): # Coincap expects milliseconds in its url query if api_info[api].get("timestamp_format") == "milliseconds": params['start'] *= 1000 - params['end'] *= 1000 + params['end'] *= 1000 if api_info[api].get("timestamp_format") == "iso8601": params['start'] = datetime.utcfromtimestamp(params['start']) params['end'] = datetime.utcfromtimestamp(params['end']) @@ -161,7 +168,8 @@ def format_apiurl(api, params={}): return url -# Coincap uses time intervals like h1, m15 etc. so need a function to convert to the seconds to that format +# Coincap uses time intervals like h1, m15 etc. so need a function to +# convert to the seconds to that format def get_time_interval(api, period): """For coincap, hitbtc, etc. 
which use a format like 'm1' instead of period like 60 for 60 seconds.""" @@ -175,39 +183,47 @@ def get_time_interval(api, period): if accepted_values is None: raise Exception('API not supported', api) elif weeks >= 1 and accepted_values.has('weeks'): - x = list(filter(lambda x: x<=weeks, accepted_values['weeks'])) + x = list(filter(lambda x: x <= weeks, accepted_values['weeks'])) interval = 'w'+str(np.max(x)) elif days >= 1: - x = list(filter(lambda x: x<=days, accepted_values['days'])) + x = list(filter(lambda x: x <= days, accepted_values['days'])) interval = 'd'+str(np.max(x)) elif hours >= 1: - x = list(filter(lambda x: x<=hours, accepted_values['hours'])) + x = list(filter(lambda x: x <= hours, accepted_values['hours'])) interval = 'h'+str(np.max(x)) else: - x = [1] + list(filter(lambda x: x<=minutes, accepted_values['minutes'])) + x = [1] + list( + filter(lambda x: x <= minutes, accepted_values['minutes']) + ) interval = 'm'+str(np.max(x)) # expects uppercase like 'M1' for 1 minute if api_info.get(api).get("uppercase_timeinterval"): interval = interval.upper() - elif api_info.get(api).get("reverse_timeinterval"): # expect 1m format instead + # expect 1m format instead + elif api_info.get(api).get("reverse_timeinterval"): interval = interval[1:] + interval[0] return interval def conform_json_response(api, json_response): - """Get the right data from the json response. Expects a list, either like [[],...], or like [{},..]""" - if api=='cryptowatch': + """ + Get the right data from the json response. Expects a list, + either like [[],...], or like [{},..] 
+ """ + if api == 'cryptowatch': return list(json_response['result'].values())[0] - elif api=='coincap': + elif api == 'coincap': return json_response['data'] elif api in {'poloniex', 'hitbtc', 'bitfinex', 'coinbase'}: return json_response else: - raise Exception('API not supported', api, 'Response was ', json_response) + raise Exception('API not supported', api, + 'Response was ', json_response) return None + def lookup_apikey(api): apikey = api_info[api].get('apikey') if apikey is not None: @@ -221,41 +237,48 @@ def get_from_api(api='cryptowatch', exchange='binance', trading_pair='eth_btc', start: start time in unix timestamp format """ - # apikey = lookup_apikey(api) # Variable initialization pair_info = trading_pair_info(api, trading_pair) - baseId = pair_info.get('baseId') # the first coin in the pair - quoteId = pair_info.get('quoteId') # the second coin in the pair - trading_pair_api = pair_info.get('trading_pair') # e.g. eth_usd in the form of what the api expects - start = start # start time unix timestamp - end = start+period*limit # end time unix timestamp - cutoff_time = int(time.time()-(period / 2)) # don't ask for a time greater than -period / 2 seconds ago + # the first coin in the pair + baseId = pair_info.get('baseId') + # the second coin in the pair + quoteId = pair_info.get('quoteId') + # e.g. 
eth_usd in the form of what the api expects + trading_pair_api = pair_info.get('trading_pair') + # start time unix timestamp + start = start + # end time unix timestamp + end = start+period*limit + # don't ask for a time greater than -period / 2 seconds ago + cutoff_time = int(time.time()-(period / 2)) end = min(end, cutoff_time) assert start < end print("Start", start) print("End", end) - # parameters for the url to get candle data from - urlparams = dict(exchange=exchange, trading_pair=trading_pair_api, apikey=apikey, - period=period, end=start+(limit*period), baseId=baseId, quoteId=quoteId, - limit=limit) + urlparams = dict( + exchange=exchange, trading_pair=trading_pair_api, apikey=apikey, + period=period, end=start+(limit*period), baseId=baseId, + quoteId=quoteId, limit=limit + ) # The API uses another notation for period (like 1m for 1 minute) if api in {'coincap', 'hitbtc', 'bitfinex'}: urlparams['interval'] = get_time_interval(api, period) - urlparams['start']=start - urlparams['end']=end + urlparams['start'] = start + urlparams['end'] = end url = format_apiurl(api, urlparams) response = requests.get(url) if response.status_code != 200: - raise Exception(f"In function get_from_api, got bad response {response.status_code}. Exiting early.", - f"Response Content: {response.content}") + raise Exception(f"In function get_from_api, got bad response " + "{response.status_code}. 
Exiting early.", + f"Response Content: {response.content}") # load and convert the candlestick info to be a common format json_response = json.loads(response.content) @@ -317,11 +340,11 @@ def update_pair(api, exchange_id, trading_pair, timestamp, period=300, try: # Get candle information candle_info = get_from_api(api=api, - exchange=exchange_id, - trading_pair=trading_pair, - start=timestamp, - period=period, - limit=limit) + exchange=exchange_id, + trading_pair=trading_pair, + start=timestamp, + period=period, + limit=limit) except Exception as e: print(f'Error encountered: {e}') @@ -329,12 +352,15 @@ def update_pair(api, exchange_id, trading_pair, timestamp, period=300, # that there is such a gap in candle data that the time frame cannot # advance to new candles, so continue with this task at an updated timestep if candle_info is None or candle_info['last_timestamp'] == timestamp: - # If the timestamp is from a day ago but there is no candle information, - # probably because such historical information is not available. No retry. - if timestamp >= now - 86400: + # If the timestamp is from a day ago but there is no + # candle information, probably because such historical + # information is not available. No retry. 
+ if timestamp >= now - 86400: return - print(f'Retry {api} {exchange_id} {trading_pair} {timestamp} {num_retries}') - return update_pair(api, exchange_id, trading_pair, timestamp+limit*period, period, num_retries + 1) + print(f'Retry {api} {exchange_id} {trading_pair} {timestamp} ' + '{num_retries}') + return update_pair(api, exchange_id, trading_pair, + timestamp+limit*period, period, num_retries + 1) # Print the timestamp ts = candle_info['last_timestamp'] @@ -342,7 +368,8 @@ def update_pair(api, exchange_id, trading_pair, timestamp, period=300, # Insert into sql try: - print("Adding Candlestick to database", api, exchange_id, trading_pair, timestamp) + print("Adding Candlestick to database", api, exchange_id, + trading_pair, timestamp) sql.candlestick_to_sql(candle_info) return True # ran without error except AssertionError as e: @@ -351,9 +378,9 @@ def update_pair(api, exchange_id, trading_pair, timestamp, period=300, def live_update(period=300): # Period default is 5 minutes """ - Updates the database based on the info in data/api_info.json with - new candlestick info, grabbing data from the last timestamp until now, - with the start date set at the start of 2019. + Updates the database based on the info in data/api_info.json with + new candlestick info, grabbing data from the last timestamp until now, + with the start date set at the start of 2019. 
""" now = time.time() now = int(now) @@ -369,7 +396,8 @@ def live_update(period=300): # Period default is 5 minutes actual_pair = None print(api, exchange_id, trading_pair) - start = sql.get_latest_date(exchange_id, trading_pair, period) or 1546300800 # timestamp is January 1st 2019 + start = sql.get_latest_date(exchange_id, trading_pair, period) or \ + 1546300800 # timestamp is January 1st 2019 # already at the latest date, remove if start >= now-period: @@ -395,19 +423,24 @@ def fill_missing_candles(): update_pair(api, exchange, trading_pair, int(timestamp), int(period)) -# TODO should place this in the same file with get_df and get_df should probably -# be just impute_df and this shoudl candle sql.get_some_candles, pass that to impute_df, -# and then also call feaure_engineer_df, instead. +# TODO should place this in the same file with get_df and get_df should +# probably be just impute_df and this shoudl candle sql.get_some_candles, +# pass that to impute_df, and then also call feaure_engineer_df, instead. def get_data(exchange_id, trading_pair, period, start, n=8000): """ - Get data for the given trading pair and perform feature engineering on that data - for usage in models. + Get data for the given trading pair and perform feature engineering on + that data for usage in models. 
""" - print(mapl(lambda x: type(x), [exchange_id, trading_pair, period, start, n])) + print(mapl(lambda x: type(x), [exchange_id, trading_pair, + period, start, n])) - # Pull in data for the given trading pair at the given time on the given exchange - df = d.get_df({'start': start, 'period': period, 'trading_pair': trading_pair, - 'exchange_id': exchange_id}, n=n) + # Pull in data for the given trading pair at the + # given time on the given exchange + df = d.get_df({'start': start, + 'period': period, + 'trading_pair': trading_pair, + 'exchange_id': exchange_id}, + n=n) def price_increase(percent_diff, bottom5percent, top5percent): """Classify price changes into three types of categories""" @@ -424,35 +457,39 @@ def price_increase(percent_diff, bottom5percent, top5percent): df = df.filter(regex="(?!timestamp_.*)", axis=1) # Feature engineering - df = ta.add_all_ta_features(df, open="open", high="high", low="low", - close="close", volume="volume").fillna(axis=1, value=0) - df_shifted = df.shift(1,fill_value=0) + df = ta.add_all_ta_features( + df, open="open", high="high", low="low", + close="close", volume="volume" + ).fillna(axis=1, value=0) + df_shifted = df.shift(1, fill_value=0) df_diff = (df - df_shifted).rename(lambda x: x+'_diff', axis=1) df = pd.concat([df, df_diff], axis=1) df['diff_percent'] = df['close'].pct_change(1).fillna(0) df = df.drop(['volume_adi'], axis=1) - # Categorical feature for xgboost trading model + # Categorical feature for xgboost trading model bottom5percent = df['diff_percent'].quantile(0.05) top5percent = df['diff_percent'].quantile(0.95) - df['price_increased'] = df['diff_percent'].apply(lambda x: price_increase(x, bottom5percent, top5percent)) + df['price_increased'] = df['diff_percent'].apply( + lambda x: price_increase(x, bottom5percent, top5percent)) # Categorical feature for xgboost arbitrage model df['arb_signal_class'] = 0 - # if the next candle is in positive arbitrage (1%), assign it the category 1 - mask = 
df['arb_signal'].shift(1) > 0.01 - df['arb_signal_class'][mask] = 1 - # if the next candle is in negative arbitrage (-1%), assign it the category -1 - mask = df['arb_signal'].shift(1) < 0.01 + # if the next candle is in positive arbitrage (1%), + # assign it the category 1 + mask = df['arb_signal'].shift(1) > 0.01 + df['arb_signal_class'][mask] = 1 + # if the next candle is in negative arbitrage (-1%), + # assign it the category -1 + mask = df['arb_signal'].shift(1) < 0.01 df['arb_signal_class'][mask] = -1 - dataset = np.nan_to_num(dw.normalize(df.values), nan=0) idx = np.isinf(dataset) dataset[idx] = 0 - - # Don't normalize columns which are percentages, check for some other things later + # Don't normalize columns which are percentages, + # check for some other things later # TODO check for infs and nans and maybe not normalize those features # , especially if that number is high. # Also, categoricals should not be normalized. @@ -474,7 +511,8 @@ def price_increase(percent_diff, bottom5percent, top5percent): def get_latest_data(exchange_id, trading_pair, period, n=8000): """ - Get data for the given trading pair and perform feature engineering on that data for the latest date + Get data for the given trading pair and perform feature + engineering on that data for the latest date """ now = int(time.time()) start = now - n*period From 4b630b0d8257f2a45e5c4094578a714cf19987fd Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:05:10 -0800 Subject: [PATCH 09/18] cryptolytic.data.metrics pep8 conformed --- cryptolytic/data/metrics.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cryptolytic/data/metrics.py b/cryptolytic/data/metrics.py index 66c93c5..dbc1d6c 100644 --- a/cryptolytic/data/metrics.py +++ b/cryptolytic/data/metrics.py @@ -2,25 +2,26 @@ def get_higher_closing(df1, df2): # true if df is higher categories = (df1['close'] - df2['close']) print('something') -# categories = +# categories = + def 
get_higher_closing_test(): - df1 = + df1 = # function to create column showing percentage by which higher price is higher def get_pct_higher(df): # i.e., if exchange 1 has a higher closing price than exchange 2... if df['higher_closing_price'] == 1: - # return the percentage by which the exchange 1 closing price is + # return the percentage by which the exchange 1 closing price is # greater than the exchange 2 closing price - return ((df['close_exchange_1'] / + return ((df['close_exchange_1'] / df['close_exchange_2'])-1)*100 # otherwise, if exchange 2 has a higher closing price than exchange 1... elif df['higher_closing_price'] == 2: # return the percentage by which the exchange 2 closing price is # greater than the exchange 1 closing price - return ((df['close_exchange_2'] / + return ((df['close_exchange_2'] / df['close_exchange_1'])-1)*100 # otherwise, i.e., if the closing prices are equivalent... else: From 7899802d5b0cf1ae22bc83cbb862a38d1691611c Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:13:01 -0800 Subject: [PATCH 10/18] cryptolytic.data.spl pep8 conformed --- cryptolytic/data/sql.py | 119 ++++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/cryptolytic/data/sql.py b/cryptolytic/data/sql.py index d90c6be..36cd85b 100644 --- a/cryptolytic/data/sql.py +++ b/cryptolytic/data/sql.py @@ -12,6 +12,7 @@ ohlc = ["open", "high", "low", "close"] + def get_credentials(): """Get the credentials for a psycopg2.connect""" return { @@ -48,6 +49,7 @@ def safe_q1(q, args={}, return_conn=False): if result is not None: return result[0] + # like q1 but more appropriate in some cases def safe_q2(q, args={}, return_conn=False): result = safe_q(q, args, return_conn).fetchone() @@ -115,17 +117,18 @@ def get_table_columns(table_name): def add_data_to_table(df, cur=None, table='candlesticks'): - """Builds a string from our data-set using the mogrify method which is - then called once using the execute method to 
insert the candlestick - information (collected using functions in the historical file), into the - database. + """ + Builds a string from our data-set using the mogrify method which is + then called once using the execute method to insert the candlestick + information (collected using functions in the historical file), into the + database. """ order = get_table_columns(table) n = len(order) query = "("+",".join(repeat("%s", n))+")" df = d.fix_df(df) - + print(df.head()) print(len(df)) args_str = None @@ -146,7 +149,8 @@ def add_data_to_table(df, cur=None, table='candlesticks'): except Exception as e: print('ERROR', e) try: - cur.execute(f"INSERT INTO {table} VALUES" + args_str + " on conflict do nothing;") + cur.execute(f"INSERT INTO {table} VALUES" + args_str + + " on conflict do nothing;") if conn is not None: conn.commit() except ps.OperationalError as e: @@ -156,7 +160,8 @@ def add_data_to_table(df, cur=None, table='candlesticks'): def candlestick_to_sql(data): """ - Inserts candlesticks data into database. See get_from_api in data/historical.py for more info. + Inserts candlesticks data into database. + See get_from_api in data/historical.py for more info. 
""" conn = ps.connect(**get_credentials()) @@ -172,18 +177,18 @@ def candlestick_to_sql(data): def get_latest_date(exchange_id, trading_pair, period): """ - Return the latest date for a given trading pair on a given exchange + Return the latest date for a given trading pair on a given exchange """ q = """ - SELECT timestamp FROM candlesticks - WHERE exchange=%(exchange_id)s AND trading_pair=%(trading_pair)s AND period=%(period)s - ORDER BY timestamp desc - LIMIT 1; + SELECT timestamp FROM candlesticks + WHERE exchange=%(exchange_id)s AND trading_pair=%(trading_pair)s AND period=%(period)s + ORDER BY timestamp desc + LIMIT 1; """ - latest_date = safe_q1(q, {'exchange_id': exchange_id, - 'trading_pair': trading_pair, - 'period': period - }) + latest_date = safe_q1(q, + {'exchange_id': exchange_id, + 'trading_pair': trading_pair, + 'period': period}) if latest_date is None: print('No latest date') @@ -201,8 +206,8 @@ def get_earliest_date(exchange_id, trading_pair, period): LIMIT 1; """ latest_date = safe_q1(q, {'exchange_id': exchange_id, - 'trading_pair': trading_pair, - 'period': period}) + 'trading_pair': trading_pair, + 'period': period}) if latest_date is None: print('No latest date') @@ -211,16 +216,20 @@ def get_earliest_date(exchange_id, trading_pair, period): def get_some_candles(info, n=10000, verbose=False): """ - Return n candles - info: can contain start (unix-timestamp or str), end, exchange_id, - period (in seconds), trading_pair - Example: info={'start':1546300800, 'end':1546309800, 'exchange_id':'bitfinex', - 'trading_pair':'eth_btc', 'period':300} + Return n candles + info: can contain start (unix-timestamp or str), end, exchange_id, + period (in seconds), trading_pair + Example: info={'start':1546300800, + 'end':1546309800, + 'exchange_id':'bitfinex', + 'trading_pair':'eth_btc', + 'period':300} """ n = min(n, 50000) # no number larger than 50_000 - select = "open, close, high, low, timestamp, volume" if not verbose else "*" + select = "open, close, 
high, low,"\ + "timestamp, volume" if not verbose else "*" where = '' - + if 'period' not in info.keys(): info['period'] = 300 @@ -228,7 +237,7 @@ def get_some_candles(info, n=10000, verbose=False): if 'start' in info: info['start'] = date.convert_datetime(info['start']) if 'end' in info: - info['end'] = date.convert_datetime(info['end']) + info['end'] = date.convert_datetime(info['end']) def add_clause(where, key, clause): if key in info.keys(): @@ -251,12 +260,15 @@ def add_clause(where, key, clause): LIMIT {n}; """ results = safe_qall(q, info) - columns = get_table_columns('candlesticks') if select == "*" else ["open", "close", "high", "low", "timestamp", "volume"] - # TODO instead of returning a dataframe, return the query and then either convert to a dataframe (with get_candles) or to json - df = pd.DataFrame(results, columns=columns) + columns = get_table_columns('candlesticks') if select == "*" else \ + ["open", "close", "high", "low", "timestamp", "volume"] + # TODO instead of returning a dataframe, return the query and then either + # convert to a dataframe (with get_candles) or to json + df = pd.DataFrame(results, columns=columns) df['period'] = info['period'] return d.fix_df(df) + def get_api(api): q = "SELECT * FROM candlesticks WHERE api = %(api)s" safe_qall(q, {'api': api}) @@ -273,17 +285,17 @@ def get_bad_timestamps(info): def remove_duplicates(): - """ - Remove any duplicate candlestick information from the database. + """ + Remove any duplicate candlestick information from the database. 
""" q = """ with q as (select *, "timestamp" - lag(timestamp, 1) - over (partition by(exchange, trading_pair, period) + over (partition by(exchange, trading_pair, period) order by "timestamp" ) as diff from candlesticks) delete from candlesticks where ctid in ( - select ctid + select ctid from q where diff=0 order by timestamp); @@ -304,8 +316,9 @@ def get_missing_timesteps(): """ missing = safe_qall(q) - return pd.DataFrame(missing, columns = ["api", "exchange", "period", "trading_pair", "timestamp", "ntimestamp"]) - + return pd.DataFrame(missing, columns=[ + "api", "exchange", "period", "trading_pair", "timestamp", "ntimestamp" + ]) # Used for filling in missing candlestick values @@ -314,9 +327,10 @@ def get_avg_candle(query): """Query to get avg price values for a candlestick at a certain timestamp. TODO batch query for improved performance.""" - assert {'timestamp', 'trading_pair', 'period', 'exchange'}.issubset(query.keys()) + assert {'timestamp', 'trading_pair', 'period', 'exchange'}.\ + issubset(query.keys()) - q = """select avg("open"), avg(high), avg(low), avg("close") from candlesticks + q = """select avg("open"), avg(high), avg(low), avg("close") from candlesticks where "timestamp"=%(timestamp)s and trading_pair=%(trading_pair)s and period=%(period)s;""" intermediate = safe_qall(q, query) @@ -327,14 +341,16 @@ def get_avg_candle(query): where trading_pair=%(trading_pair)s and exchange=%(exchange)s and period=%(period)s and timestamp=%(timestamp)s ; """ ohlc = ["open", "high", "low", "close", "timestamp"] - result = {key: intermediate[i] for i, key in zip(range(len(intermediate)), ohlc)} + result = {key: intermediate[i] for i, key in zip(range(len(intermediate)), + ohlc)} result['volume'] = safe_q1(q2, query) return result def batch_avg_candles(info): - assert {'timestamps', 'trading_pair', 'period', 'exchange'}.issubset(info.keys()) + assert {'timestamps', 'trading_pair', 'period', 'exchange'}.\ + issubset(info.keys()) assert len(info['timestamps']) >= 
2 info['timestamps'] = tuple(info['timestamps']) @@ -346,15 +362,17 @@ def batch_avg_candles(info): """ result = safe_qall(q, info) - df = pd.DataFrame(result, columns=['timestamp', 'open', 'high', 'low', 'close']) + df = pd.DataFrame(result, columns=['timestamp', 'open', 'high', + 'low', 'close']) df[ohlc] = df[ohlc].apply(pd.to_numeric) return d.fix_df(df) def batch_last_volume_candles(info): - assert {'timestamps', 'trading_pair', 'period', 'exchange'}.issubset(info.keys()) + assert {'timestamps', 'trading_pair', 'period', 'exchange'}.\ + issubset(info.keys()) info_copy = info.copy() - # get the previous volumes, so minus the period, forward + # get the previous volumes, so minus the period, forward # fills if this still has nans info['timestamps'] = mapl(lambda x: x-info['period'], info['timestamps']) @@ -372,16 +390,18 @@ def batch_last_volume_candles(info): df = pd.DataFrame({'timestamp': info_copy['timestamps']}) df = df.merge(volumes, how='left', on='timestamp') return d.fix_df(df.ffill().bfill()) - - def get_arb_info(info, n=1000): """ - Example: info := {'start':1556668800, 'period':300, 'trading_pair':'eth_btc', 'exchange_id':'binance'} + Example: info := {'start':1556668800, + 'period':300, + 'trading_pair':'eth_btc', + 'exchange_id':'binance'} """ - assert {'exchange_id', 'trading_pair', 'period', 'start'}.issubset(info.keys()) + assert {'exchange_id', 'trading_pair', 'period', 'start'}.\ + issubset(info.keys()) info['n'] = n q = """with sub as ( @@ -403,8 +423,11 @@ def get_arb_info(info, n=1000): results = safe_qall(q, info) if results is not None: - # arb_signal is more interpretable than arb_diff but the signal is the same - df = pd.DataFrame(results, columns=["exchange", "trading_pair", "timestamp", "period", "avg", "arb_diff", "arb_signal"]) + # arb_signal is more interpretable than arb_diff + # but the signal is the same + df = pd.DataFrame(results, columns=["exchange", "trading_pair", + "timestamp", "period", "avg", + "arb_diff", "arb_signal"]) 
return d.fix_df(df) From 3e5266c89074a1f193d1a3643988a6e5be639df9 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:17:18 -0800 Subject: [PATCH 11/18] cryptolytic.data.utils pep8 conformed --- cryptolytic/data/utils.py | 80 ++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/cryptolytic/data/utils.py b/cryptolytic/data/utils.py index 02f0d30..d1de800 100644 --- a/cryptolytic/data/utils.py +++ b/cryptolytic/data/utils.py @@ -1,12 +1,13 @@ import psycopg2 as ps # Credentials -credentials = {'POSTGRES_ADDRESS' : '#', - 'POSTGRES_PORT' : '#', - 'POSTGRES_USERNAME' : '#', - 'POSTGRES_PASSWORD' : '#', - 'POSTGRES_DBNAME' : '#', - 'API_KEY' : '#'} +credentials = {'POSTGRES_ADDRESS': '#', + 'POSTGRES_PORT': '#', + 'POSTGRES_USERNAME': '#', + 'POSTGRES_PASSWORD': '#', + 'POSTGRES_DBNAME': '#', + 'API_KEY': '#'} + # Define postgres_db_conn function. def postgres_db_conn(credentials): @@ -27,6 +28,7 @@ def postgres_db_conn(credentials): # Define currency pairs within each exchange and create the names of the # tables for each exchange. 
+ coinbase_pro_pairs = ['bch_btc', 'bch_usd', 'btc_usd', 'btc_usdc', 'dash_btc', 'dash_usd', 'eos_btc', 'eos_usd', 'etc_usd', 'eth_btc', 'eth_usd', 'eth_usdc', 'ltc_btc', 'ltc_usd', 'xrp_btc', @@ -37,7 +39,7 @@ def postgres_db_conn(credentials): 'etc_usd', 'eth_btc', 'eth_usd', 'eth_usdt', 'ltc_btc', 'ltc_usd', 'ltc_usdt', 'xrp_btc', 'xrp_usd', 'zec_usd', 'zrx_usd'] - + hitbtc_pairs = ['bch_btc', 'bch_usdt', 'btc_usdc', 'btc_usdt', 'dash_btc', 'dash_usdt', 'eos_btc', 'eos_usdt', 'etc_usdt', 'eth_btc', 'eth_usdc', 'eth_usdt', 'ltc_btc', 'ltc_usdt', 'xrp_btc', @@ -54,13 +56,14 @@ def postgres_db_conn(credentials): bitfinex_table_list = ['bitfinex_' + pair for pair in bitfinex_pairs] -coinbase_pro_table_list = ['coinbase_pro_' + pair for pair in +coinbase_pro_table_list = ['coinbase_pro_' + pair for pair in coinbase_pro_pairs] gemini_table_list = ['gemini_' + pair for pair in gemini_pairs] kraken_table_list = ['kraken_' + pair for pair in kraken_pairs] + # Define create_tables function. def create_tables(credentials): '''Connects to a PostgreSQL database and adds tables to each respective @@ -88,22 +91,26 @@ def create_tables(credentials): close float, base_volume float );'''.format(schema=schema, table_name=table_name)) - + # Commit and close. Verify that tables were created successfully. conn.commit() - + print("Tables created successfully!") conn.close() - + + # Define insert_csv_to_db function. -# Within Jupyter Lab, a folder entitled "data" was created. A folder for each exchange was -# then created within the "data" folder. Each exchange folder then included csv files for all -# of the trading pairs supported on that respective exchange and for all intervals (300 and 3600). +# Within Jupyter Lab, a folder entitled "data" was created. +# A folder for each exchange wasthen created within the "data" folder. +# Each exchange folder then included csv files for all of the trading pairs +# supported on that respective exchange and for all intervals (300 and 3600). 
def insert_csv_to_db(credentials): - '''Connects to a PostgreSQL database and imports csv files into a specified schema - and table name.''' - - # Create connection and cursor to database. + ''' + Connects to a PostgreSQL database and imports csv + files into a specified schema and table name. + ''' + +# Create connection and cursor to database. conn, cur = create_conn(credentials) print("Connect to database.") @@ -119,36 +126,41 @@ def insert_csv_to_db(credentials): table_name = filename.replace('_3600.csv', '') with open('data/' + directory + '/' + filename, 'r') as f: # Skip the header row. - next(f) - cur.copy_from(f, '{schema}.{table_name}'.format(schema=schema, table_name=table_name), sep=',') + next(f) + cur.copy_from(f, '{schema}.{table_name}'.format( + schema=schema, table_name=table_name), sep=',' + ) conn.commit() conn.close() print('Done!') + # Define drop_column function. def drop_column(credentials): - '''Connects to a PostgreSQL database and drops a table column, in this case ID, - from all tables within each schema.''' - + ''' + Connects to a PostgreSQL database and drops a table column, + in this case ID, from all tables within each schema. + ''' + # Create connection and cursor to database. conn, cur = create_conn(credentials) - + print("Connect to database.") - + schemas = ['fiveminute', 'onehour'] - table_list = (hitbtc_table_list + bitfinex_table_list + coinbase_pro_table_list + gemini_table_list + kraken_table_list) - + table_list = (hitbtc_table_list + bitfinex_table_list + + coinbase_pro_table_list + gemini_table_list + + kraken_table_list) + for schema in schemas: for table_name in table_list: - cur.execute('''ALTER TABLE {schema}.{table_name} DROP COLUMN ID;'''.format(schema=schema, table_name=table_name)) - + cur.execute( + '''ALTER TABLE {schema}.{table_name} DROP COLUMN ID;''' + .format(schema=schema, table_name=table_name)) + # Commit and close. Verify that tables were created successfully. 
conn.commit() - + print("Column removed.") conn.close() - - -# Utilities Involing dates -def datetime From df0dfb7f46c5459f423268c6db30c339f5b73747 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:19:30 -0800 Subject: [PATCH 12/18] cryptolytic.util pep8 conformed --- cryptolytic/util/__init__.py | 8 ++++++-- cryptolytic/util/date.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cryptolytic/util/__init__.py b/cryptolytic/util/__init__.py index 8d27f3a..0c7ec08 100644 --- a/cryptolytic/util/__init__.py +++ b/cryptolytic/util/__init__.py @@ -1,10 +1,11 @@ import functools import signal -import multiprocessing +import multiprocessing import pprint pprint = pprint.PrettyPrinter().pprint + def timeout_handler(signum, frame): raise TimeoutError() @@ -30,7 +31,7 @@ def timeout(fn, time, timeout_handler=None, success_handler=None): success_handler() except Exception as e: print(e) - return + return def bdir(x): @@ -57,12 +58,15 @@ def first(x): if isinstance(x, list) or isinstance(x, tuple): return x[0] + def dict_matches(cond, b): return set(cond.items()).issubset(set(b.items())) + def select_keys(d, keys): return {k: d[k] for k in keys if k in d} + class adict(dict): def __init__(self, *args, **kwargs): super(adict, self).__init__(*args, **kwargs) diff --git a/cryptolytic/util/date.py b/cryptolytic/util/date.py index 63c6155..b249b11 100644 --- a/cryptolytic/util/date.py +++ b/cryptolytic/util/date.py @@ -1,6 +1,7 @@ from datetime import datetime import time + def convert_datetime(t): """Convert string to unix time stamp if not. 
Currently handles %d-%m-%Y Consider using something more convenient like the arrow library.""" From 2e8ef3ec8b395dd8075400939efb27f782872926 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:20:34 -0800 Subject: [PATCH 13/18] cryptolytic.start pep8 conformed --- cryptolytic/start.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cryptolytic/start.py b/cryptolytic/start.py index 29d9aad..71c331d 100644 --- a/cryptolytic/start.py +++ b/cryptolytic/start.py @@ -25,5 +25,7 @@ def start_logging(): def init(): load_dotenv(verbose=True) start_logging() - cryptolytic.session = boto3.session.Session(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], - aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']) + cryptolytic.session = boto3.session.Session( + aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], + aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'] + ) From c45fa9c5a6a7f9ebae1551f1d2436a67905959a6 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Mon, 3 Feb 2020 14:23:17 -0800 Subject: [PATCH 14/18] more commenting on cryptolytic.data.aws --- cryptolytic/data/aws.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cryptolytic/data/aws.py b/cryptolytic/data/aws.py index b7a87ba..9651ba5 100644 --- a/cryptolytic/data/aws.py +++ b/cryptolytic/data/aws.py @@ -34,4 +34,6 @@ def get_path(folder_name, model_type, exchange_id, trading_pair, ext): os.mkdir(aws_folder) return os.path.join( aws_folder, f'model_{model_type}_{exchange_id}_{trading_pair}{ext}' + # Windows operating systems use \\ instead of /, replace function + # required to conform with Unix operating systems ).replace('\\', '/') From 6845b6c0f7f219f40ac2f1721e6fce74b0178141 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Tue, 4 Feb 2020 17:57:42 -0800 Subject: [PATCH 15/18] Updated README.md with new data source API documentations and locations of data in AWS --- README.md | 17 ++++++++++- cryptolytic/notebooks/..ipynb | 30 ------------------- .../{ => 
Archive}/arbitrage_models.ipynb | 0 3 files changed, 16 insertions(+), 31 deletions(-) delete mode 100644 cryptolytic/notebooks/..ipynb rename cryptolytic/notebooks/{ => Archive}/arbitrage_models.ipynb (100%) diff --git a/README.md b/README.md index ae7ca4d..a096849 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ Each trade recommender model recommends trades for a particular trading pair on The arbitrage models predict arbitrage opportunities between two exchanges for a particular trading pair. Predictions are made ten minutes in advance. To count as an arbitrage opportunity, a price disparity between two exchanges must last for at least thirty minutes, and the disparity must be great enough to cover the costs of buying on one exchange and selling on the other. +The models can also be accessed via the organization's AWS S3 bucket by the name of "crypto-buckit". Current code (as of 4 February 2020) will upload all future models into this S3 bucket. + ### Features Each of the nine trade recommender models is trained on 67 features. Of those 67 features, five are taken directly from the OHLCV data (open, high, low, close, base_volume), one indicates where gaps were present in the data (nan_ohlcv), three indicate the time (year, month, day), and the remainder are technical analysis features. @@ -81,7 +83,7 @@ Documentation for the technical analysis features features is available here: We obtained all of our data from the Cryptowatch, Bitfinex, Coinbase Pro, and HitBTC APIs. 
Documentation for obtaining that data is listed below: -[Cryptowatch API OHLCV Data Documentation](https://developer.cryptowat.ch/reference/rest-api-markets#market-ohlc-candlesticks) +[Cryptowatch REST API OHLCV Data Documentation](https://docs.cryptowat.ch/rest-api/) [Bitfinex API OHLCV Data Documentation](https://docs.bitfinex.com/reference#rest-public-candles) @@ -89,6 +91,14 @@ We obtained all of our data from the Cryptowatch, Bitfinex, Coinbase Pro, and Hi [HitBTC OHLCV Data Documentation](https://api.hitbtc.com/#candles) +[Binance API OHLCV Documentation](https://github.com/binance-exchange/binance-official-api-docs/blob/master/rest-api.md) + +[Gemini REST API OHLCV Data Documentation](https://docs.gemini.com/rest-api/) + +[Kraken REST API OHLCV Data Documentation](https://www.kraken.com/en-us/features/api) + +[Poloniex API OHLCV Data Documentation](https://docs.poloniex.com/#introduction) + ### Python Notebooks [Notebook Folder](https://github.com/Lambda-School-Labs/cryptolytic-ds/tree/master/finalized_notebooks) @@ -117,6 +127,11 @@ Returns: ``` {"results":"{ 'prediction': 'result'} ]} ``` +### Internal Access via AWS +The raw data and models can also be internally accessed through the Organization's AWS accounts. +-AWS RDS: RDS databases hold historical candlestick data from the cryptocurrency market APIs as well as the predictions from the models. +-AWS S3: S3 buckets hold the pickled, trained models for both Trading and Arbitrage.
+ ## Contributing diff --git a/cryptolytic/notebooks/..ipynb b/cryptolytic/notebooks/..ipynb deleted file mode 100644 index 0e90a77..0000000 --- a/cryptolytic/notebooks/..ipynb +++ /dev/null @@ -1,30 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/cryptolytic/notebooks/arbitrage_models.ipynb b/cryptolytic/notebooks/Archive/arbitrage_models.ipynb similarity index 100% rename from cryptolytic/notebooks/arbitrage_models.ipynb rename to cryptolytic/notebooks/Archive/arbitrage_models.ipynb From d01d79efbef314558b4d623134938ea618d24a71 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Tue, 4 Feb 2020 18:50:06 -0800 Subject: [PATCH 16/18] Updated predicitons and features sections of the README.md to reflect the most recent iteration of the code --- README.md | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a096849..c33fd0a 100644 --- a/README.md +++ b/README.md @@ -43,25 +43,21 @@ Python, AWS, PostgreSQL, SQL, Flask ### Predictions -The models folder contains two zip files, with a total of 30 models: - -tr_pickles.zip contains nine pickled trade recommender models. - -arb_models.zip contains 21 pickled arbitrage models. +Each trade recommender model recommends trades for a particular trading pair on a particular exchange by predicting whether the closing price will increase by enough to cover the costs of executing a trade. -All 30 models use a RandomForestClassifier algorithm. 
+The arbitrage models predict arbitrage opportunities between two exchanges for a particular trading pair. Predictions are made five minutes in advance. To count as an arbitrage opportunity, a price disparity between two exchanges must last for at least thirty minutes, and the disparity must be great enough to cover the costs of buying on one exchange and selling on the other. -Each trade recommender model recommends trades for a particular trading pair on a particular exchange by predicting whether the closing price will increase by enough to cover the costs of executing a trade. +The trained and pickled models can be accessed via the organization's AWS S3 bucket by the name of "crypto-buckit" under folder aws/models. Current code (as of 4 February 2020) will upload all future models into this S3 bucket. -The arbitrage models predict arbitrage opportunities between two exchanges for a particular trading pair. Predictions are made ten minutes in advance. To count as an arbitrage opportunity, a price disparity between two exchanges must last for at least thirty minutes, and the disparity must be great enough to cover the costs of buying on one exchange and selling on the other. +The naming convention for the models is model\_{arbitrage/trade}\_{api}\_{trading\_pair}.pkl -The models can also be accessed via the organization's AWS S3 bucket by the name of "crypto-buckit". Current code (as of 4 February 2020) will upload all future models into this S3 bucket. +The predictions themselves can be accessed via the Organization's AWS RDS database with the table name "predictions". ### Features -Each of the nine trade recommender models is trained on 67 features. Of those 67 features, five are taken directly from the OHLCV data (open, high, low, close, base_volume), one indicates where gaps were present in the data (nan_ohlcv), three indicate the time (year, month, day), and the remainder are technical analysis features.
+Each of the nine trade recommender models is trained on 80 features. Of those 80 features, five are taken directly from the OHLCV data (open, high, low, close, base\_volume), and the remainder are technical analysis features. We are filling NaN values of open, high, low, close with the average price, and forward filling NaN values for base\_volume -Each of the 21 arbitrage models is trained on 91 features. Of those 91 features, three features indicate the time (year, month, day), and four indicate the degree and length of price disparities between two exchanges (higher_closing_price, pct_higher, arbitrage_opportunity, window_length). Half of the remaining 84 features are specific to the first of the two exchanges in a given arbitrage dataset and are labelled with the suffix "exchange_1"; the other half are specific to the second of those two exchanges and are labelled with the suffix "exchange_2". In each of these two sets of 42 features, two are taken directly from the OHLCV data (close_exchange_#, base_volume_exchange_#), one indicates where gaps were present in the data (nan_ohlcv), and the remainder are technical analysis features. +Each of the arbitrage models is trained on 80 features. Of those 80 features, and four indicate the degree and length of price disparities between two exchanges (higher_closing_price, pct_higher, arbitrage_opportunity, window_length). Arbitrage is calculated by comparing the price of the primary exchange against the mean price of the other exchanges. This allows us to compare the market against every other market with minimal computation cost. 
Technical analysis features were engineered with the Technical Analysis Library; they fall into five types: From 56d2fdace5ac7cc7666dcce85610f321f69597ad Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Tue, 4 Feb 2020 18:52:18 -0800 Subject: [PATCH 17/18] Deleted unneeded whitespace --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index c33fd0a..68538ee 100644 --- a/README.md +++ b/README.md @@ -60,15 +60,10 @@ Each of the nine trade recommender models is trained on 80 features. Of those 8 Each of the arbitrage models is trained on 80 features. Of those 80 features, and four indicate the degree and length of price disparities between two exchanges (higher_closing_price, pct_higher, arbitrage_opportunity, window_length). Arbitrage is calculated by comparing the price of the primary exchange against the mean price of the other exchanges. This allows us to compare the market against every other market with minimal computation cost. Technical analysis features were engineered with the Technical Analysis Library; they fall into five types: - (1) Momentum indicators - (2) Volume indicators - (3) Volatility indicators - (4) Trend indicators - (5) Others indicators Documentation for the technical analysis features features is available here: From 96d640042f124f8c813a6ade0926b0e7034fb421 Mon Sep 17 00:00:00 2001 From: KyleHaggin Date: Tue, 4 Feb 2020 18:54:12 -0800 Subject: [PATCH 18/18] Deleted unneeded whitespace --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 68538ee..2697af8 100644 --- a/README.md +++ b/README.md @@ -59,12 +59,12 @@ Each of the nine trade recommender models is trained on 80 features. Of those 8 Each of the arbitrage models is trained on 80 features. Of those 80 features, and four indicate the degree and length of price disparities between two exchanges (higher_closing_price, pct_higher, arbitrage_opportunity, window_length). 
Arbitrage is calculated by comparing the price of the primary exchange against the mean price of the other exchanges. This allows us to compare the market against every other market with minimal computation cost. -Technical analysis features were engineered with the Technical Analysis Library; they fall into five types: -(1) Momentum indicators -(2) Volume indicators -(3) Volatility indicators -(4) Trend indicators -(5) Others indicators +Technical analysis features were engineered with the Technical Analysis Library; they fall into five types:
+(1) Momentum indicators
+(2) Volume indicators
+(3) Volatility indicators
+(4) Trend indicators
+(5) Others indicators
Documentation for the technical analysis features features is available here: