Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 162 additions & 24 deletions modelTraining/trainingScripts/exploration.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -395,23 +395,54 @@
"metadata": {},
"outputs": [],
"source": [
"# factorize cat variables\n",
"cols_to_factorize = ['chain', 'project', 'ilRisk', 'exposure', 'stablecoin']\n",
"# I will use lgbm; it does not need encoding — the category dtype is enough\n",
"df['chain'] = df['chain'].astype('category')\n",
"df['project'] = df['project'].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fb82cd2",
"metadata": {},
"outputs": [],
"source": [
"# more features \n",
"df = df.sort_values(['pool', 'timestamp']).reset_index(drop=True)\n",
"pool_group = df.groupby('pool', group_keys=False)\n",
"\n",
"for window in [7, 14]:\n",
" df[f'apyMean{window}d'] = pool_group['apy'].apply(lambda x:\n",
"x.rolling(window, min_periods=1).mean())\n",
" df[f'apyStd{window}d'] = pool_group['apy'].apply(lambda x:\n",
"x.rolling(window, min_periods=1).std()).fillna(0)\n",
" df[f'tvlMean{window}d'] = pool_group['tvlUsd'].apply(lambda x:\n",
"x.rolling(window, min_periods=1).mean())\n",
"\n",
"df['apyChange7d'] = pool_group['apy'].apply(lambda x:\n",
"x.pct_change(periods=7)).fillna(0)\n",
"df['tvlChange7d'] = pool_group['tvlUsd'].apply(lambda x:\n",
"x.pct_change(periods=7)).fillna(0)\n",
"\n",
"df['apyDevFromMean'] = (df['apy'] - df['apyMeanExpanding']) /df['apyMeanExpanding'].replace(0, np.nan)\n",
"df['apyDevFromMean'] = df['apyDevFromMean'].fillna(0)\n",
"\n",
"for i in cols_to_factorize:\n",
" df[f'{i}_factorized'] = pd.factorize(df[i])[0]\n",
"\n",
"# save mapping (which we use on the triggerEnrichment lambda)\n",
"project = df[['project', 'project_factorized']].set_index('project')\n",
"chain = df[['chain', 'chain_factorized']]\n",
"pool_first_date = df.groupby('pool')['timestamp'].transform('min')\n",
"df['poolAgeDays'] = (df['timestamp'] - pool_first_date).dt.days\n",
"\n",
"mapping_project = df.set_index('project')[['project_factorized']].to_dict()\n",
"mapping_chain = df.set_index('chain')[['chain_factorized']].to_dict()\n",
"df['logTvlUsd'] = np.log1p(df['tvlUsd'])\n",
"\n",
"d_cat_map = {}\n",
"df = df.sort_values(['timestamp', 'pool']).reset_index(drop=True)\n",
"\n",
"d_cat_map.update(mapping_project)\n",
"d_cat_map.update(mapping_chain)"
"df['chainMeanApy'] = df.groupby('chain',\n",
"group_keys=False)['apy'].apply(lambda x: x.expanding().mean())\n",
"df['projectMeanApy'] = df.groupby('project',\n",
"group_keys=False)['apy'].apply(lambda x: x.expanding().mean())\n",
"\n",
"\n",
"df['apyDevFromChain'] = df['apy'] - df['chainMeanApy']\n",
"df['apyDevFromProject'] = df['apy'] - df['projectMeanApy']\n"
]
},
{
Expand All @@ -425,12 +456,25 @@
"source": [
"# leaving out all weak features for now\n",
"features = [\n",
" 'apy',\n",
" 'tvlUsd',\n",
" 'apyMeanExpanding',\n",
" 'apyStdExpanding',\n",
" 'chain_factorized',\n",
" 'project_factorized',\n",
" \"apy\",\n",
" \"tvlUsd\",\n",
" \"logTvlUsd\",\n",
" \"apyMeanExpanding\",\n",
" \"apyStdExpanding\",\n",
" \"apyMean7d\",\n",
" \"apyStd7d\",\n",
" \"tvlMean7d\",\n",
" \"apyMean14d\",\n",
" \"apyStd14d\",\n",
" \"tvlMean14d\",\n",
" \"apyChange7d\",\n",
" \"tvlChange7d\",\n",
" \"apyDevFromMean\",\n",
" \"apyDevFromChain\",\n",
" \"apyDevFromProject\",\n",
" \"poolAgeDays\",\n",
" \"chain\",\n",
" \"project\",\n",
"]"
]
},
Expand Down Expand Up @@ -544,6 +588,33 @@
"y_test = X_test['target']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37cf0f0a",
"metadata": {},
"outputs": [],
"source": [
"# adding eval set\n",
"X_test = df[df.timestamp > cutoff_date]\n",
"y_test = X_test['target']\n",
"\n",
"# split train into train + eval for early stopping (80/20 by time)\n",
"df_train_full = df[df.timestamp <=\n",
"cutoff_date].sort_values('timestamp').reset_index(drop=True)\n",
"eval_split = int(len(df_train_full) * 0.8)\n",
"\n",
"X_train = df_train_full.iloc[:eval_split]\n",
"X_eval = df_train_full.iloc[eval_split:]\n",
"y_train = X_train['target']\n",
"y_eval = X_eval['target']\n",
"\n",
"print(f\"Train: {X_train.shape[0]}, Eval: {X_eval.shape[0]}, Test:{X_test.shape[0]}\")\n",
"print(f\"Train: {X_train.timestamp.min()} -{X_train.timestamp.max()}\")\n",
"print(f\"Eval: {X_eval.timestamp.min()} - {X_eval.timestamp.max()}\")\n",
"print(f\"Test: {X_test.timestamp.min()} - {X_test.timestamp.max()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -604,6 +675,14 @@
"y_train.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c998862f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -613,12 +692,52 @@
},
"outputs": [],
"source": [
"# using 2 algos with out of box settings, logreg cause basic and random forest cause usually better\n",
"# and checking for increase in cv -> increase in test? (as a consistency\n",
"# check if test set is similarly distributed to train)\n",
"# gradient boosting is the best classic ml algorithm\n",
"# I will use lgbm because it is fast and can work with cat features \n",
"# also I will use optuna to tune hyperparameters, it is faster than gridsearchcv\n",
"import optuna\n",
"from lightgbm import LGBMClassifier, early_stopping, log_evaluation\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"\n",
"clf_lr = LogisticRegression()\n",
"clf_rf = RandomForestClassifier(random_state=random_state, n_estimators=100, n_jobs=-1, oob_score=True)"
"optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
"\n",
"\n",
"def objective(trial):\n",
" params = {\n",
" \"n_estimators\": trial.suggest_int(\"n_estimators\", 100, 2500),\n",
" \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.001, 0.1, log=True),\n",
" \"max_depth\": trial.suggest_int(\"max_depth\", 2, 20),\n",
" \"num_leaves\": trial.suggest_int(\"num_leaves\", 4, 128),\n",
" \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 5, 100),\n",
" \"subsample\": trial.suggest_float(\"subsample\", 0.5, 1.0),\n",
" \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.3, 1.0),\n",
" \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-8, 10.0, log=True),\n",
" \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-8, 10.0, log=True),\n",
" \"scale_pos_weight\": trial.suggest_float(\"scale_pos_weight\", 0.5, 3.0),\n",
" \"random_state\": 42,\n",
" \"verbosity\": -1,\n",
" \"objective\": \"binary\",\n",
" \"metric\": \"auc\",\n",
" }\n",
"\n",
" model = LGBMClassifier(**params)\n",
" model.fit(\n",
" X_train[features],\n",
" y_train,\n",
" eval_set=[(X_eval[features], y_eval)],\n",
" callbacks=[early_stopping(stopping_rounds=50), log_evaluation(0)],\n",
" )\n",
"\n",
" y_pred = model.predict_proba(X_eval[features])[:, 1]\n",
" return roc_auc_score(y_eval, y_pred)\n",
"\n",
"\n",
"study = optuna.create_study(direction=\"maximize\")\n",
"study.optimize(objective, n_trials=1000, show_progress_bar=True)\n",
"\n",
"print(f\"Best AUC: {study.best_value:.4f}\")\n",
"print(f\"Best params: {study.best_params}\")"
]
},
{
Expand All @@ -630,7 +749,26 @@
},
"outputs": [],
"source": [
"random_state = 1993"
"# final model with best params\n",
"best_params = study.best_params\n",
"best_params[\"random_state\"] = 42\n",
"best_params[\"verbosity\"] = -1\n",
"\n",
"model = LGBMClassifier(**best_params)\n",
"model.fit(\n",
" pd.concat([X_train[features], X_eval[features]]),\n",
" pd.concat([y_train, y_eval]),\n",
" # eval_set=[(X_eval, y_eval)],\n",
" # callbacks=[early_stopping(stopping_rounds=50), log_evaluation(0)]\n",
")\n",
"\n",
"y_pred = model.predict_proba(X_test[features])[:, 1]\n",
"print(f\"Test ROC AUC: {roc_auc_score(y_test, y_pred):.4f}\")\n",
"\n",
"# Prediction distribution\n",
"print(f\"\\nPrediction distribution:\")\n",
"print(f\"Mean: {y_pred.mean():.3f}, Std: {y_pred.std():.3f}\")\n",
"print(f\"Min: {y_pred.min():.3f}, Max: {y_pred.max():.3f}\")"
]
},
{
Expand Down