Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,6 @@ docs/_build

# Cookiecutter
output/

# rck_jr dev files
rosey/delme.py
71 changes: 71 additions & 0 deletions rosey/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<!-- ruff: noqa -->
<!-- linting: ignore -->
# send_time_p13n

# GOAL:

Predict the ideal send time for each contact to maximize probability of interaction

# HYPOTHESIS:

The closer an email is to the top of a contact's inbox, the more probable it is that the contact will interact with it.

# LEGACY MODEL:

Creates a pdf of historical interactions for each contact. Essentially, a count is performed of how often a contact has interacted (opens or clicks) with emails in the past; these counts populate a pdf that represents every possible hour of the week (168 hours). This count is normalized to create probabilities that a custom `argmax` function ingests and selects send times from.

NOTE: If a contact does not meet a certain threshold of interactions in the lookback window, a default `mode` distribution is used for that contact.

# NEW MODEL:

A catboost classifier is trained on enriched historical interaction data for each contact. The features used from enrichment are provided by the customer or publicly available (ie H3 features).

# WHY SHOULD THE NEW MODEL BE BETTER?

- Generalized learning from `high` interaction users to `low` to `no` interaction users
- Minimize lookback window due to learning from population level data
- Go beyond interaction data to make predictions (ie demographic data)

# RESULTS OF RECENT AB TEST

We lost :-( (I told everyone that it was because of Stephen....)

# Things I need help on:

- Auto feature selection to replace manual EDA
- How should I evaluate a model offline? Currently use LogLoss and ROC curves.
- Model selection, is Catboost the correct model? I've been told the idea is train one and then try to beat it.

# AUTO FEATURE SELECTION PROCESS

AIM FOR CAUSALLY UPSTREAM OF THE VARIABLE

- Ingest contact features provided by customer
- Filter out based on `feasible_dtypes`
- Filter out based on `null_frac_threshold` (0.8 threshold)
- MAKE SURE THE FEATURES DROPPED ARE OBVIOUSLY BAD, WHAT ABOUT FEATURES CLOSE TO THE THRESHOLD?
- Rules of thumb can be misleading
- Filter out highly correlated features (0.95 threshold)
- Tree based models are not unstable with highly correlated features, but feature importance can be messed up
- Use `catboost.select_features()` functionality, basically trains a model and filters features using `feature_importance`
- Select the `top_n` features

# OFFLINE TESTING:

- What is the AUC on the low interaction audience?
- Actually check when the email was sent vs when we thought it should. (Plot of residual of when it was actually sent vs desired send time.)
CONFIRM IT IS ACTUALLY A MODEL PROBLEM! (Compliance Testing)
- Customer Splitting (hold out customers that the model never sees during training)
- Hold out the last week of historical data to test on. (Need to match up all the datetime features properly)

# TODO:

- Catboost Baseline functionality (see catboost website)
- Predict click_hour?
- Data Reduction?
- Include Mode Distribution as an input feature to the model
- Add conversion probability to the `send_to_interaction` distribution plots
- Check on `data_drift` between training, test and live AB
- Check on contacts that got emails from both the legacy and the new model.

# REPORT BACK IN ~2 WEEKS
261 changes: 261 additions & 0 deletions rosey/gbm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
# mypy: disable-error-code="import-untyped"

from pathlib import Path
from typing import Dict, List, Literal, Optional

import numpy as np
import pandas as pd
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split

FILEPATH_MODEL = "catboost_model_storage/catboost.cbm"


def evaluate_model(
    model: CatBoost,
    x: pd.DataFrame,
    y: pd.Series,
    metrics: List[str],
    verbose=False,
    **kwargs,
) -> Dict[str, float]:
    """
    Evaluate a fitted CatBoost model on processed test data.

    Builds a ``Pool`` from ``x``/``y`` (extra ``kwargs`` are forwarded to
    the ``Pool`` constructor), computes the requested ``metrics``, and
    collects the final value of each metric plus per-feature importances
    into a single log dictionary, which is printed and returned.
    """
    eval_pool = Pool(
        x,
        y,
        **kwargs,
    )
    results = model.eval_metrics(  # type: ignore[attr-defined]
        eval_pool,
        metrics,
    )

    # Keep only the value from the final iteration of each metric.
    logs = {}
    for metric_name in metrics:
        final_value = results[metric_name][-1]
        logs[metric_name] = final_value
        if verbose:
            print(f"{metric_name}: {final_value}")

    # Record feature importances alongside the metrics.
    importance_table = model.get_feature_importance(  # type: ignore[attr-defined]
        prettified=True
    )
    if verbose:
        print(importance_table)
    for feature, importance in zip(
        importance_table["Feature Id"], importance_table["Importances"]
    ):
        logs[f"Importance of {feature}"] = importance

    logs["Testing Sample Count"] = len(x)
    # NOTE: the summary is always printed, independent of `verbose`.
    print(logs)
    return logs


def write_model(model: CatBoost, path: Path) -> None:
    """
    Save a fitted CatBoost model under ``path / FILEPATH_MODEL``.

    NOTE(review): the previous implementation called ``path.cwd()``.
    ``Path.cwd()`` is a classmethod that returns the process working
    directory, so the ``path`` argument was silently ignored; the model
    always landed in ``<cwd>/catboost_model_storage/catboost.cbm``.
    """
    # Honor the caller-supplied base directory.
    model.save_model(str(path / FILEPATH_MODEL))


def load_model(
    path: Path, model_type: Literal["classifier", "regressor"]
) -> CatBoost:
    """
    Load a CatBoost model from ``path / FILEPATH_MODEL``.

    Args:
        path: base directory containing the model storage folder.
        model_type: either "classifier" or "regressor" (selects which
            CatBoost wrapper class is instantiated before loading).

    Raises:
        ValueError: if ``model_type`` is not one of the two allowed values.

    NOTE(review): the previous implementation called ``path.cwd()``.
    ``Path.cwd()`` is a classmethod that returns the process working
    directory, so the ``path`` argument was silently ignored.
    """
    if model_type == "classifier":
        model = CatBoostClassifier()
    elif model_type == "regressor":
        model = CatBoostRegressor()
    else:
        raise ValueError("model_type must be either 'classifier' or 'regressor'")

    # Honor the caller-supplied base directory.
    model_path = path / FILEPATH_MODEL

    # CatBoost's load_model loads in place and returns the model itself.
    return model.load_model(str(model_path))


class _BoostingModelTrainer(BaseEstimator):
    """Base class for CatBoost model trainers.

    Stores the hyper-parameters shared by the classifier/regressor trainers
    and implements the common fit/predict plumbing. Subclasses are expected
    to construct ``self.model_`` (a CatBoost estimator) in ``__init__``.
    """

    def __init__(
        self,
        iterations: int,
        cat_features: Optional[List[str]],
        text_features: Optional[List[str]],
        embedding_features: Optional[List[str]],
        early_stopping_rounds: int,
        use_best_model: bool,
        verbose: bool,
        random_state: int,
        loss_function: str,
        eval_metric: str,
    ):
        # sklearn convention: store constructor params verbatim, no logic.
        self.iterations = iterations
        self.cat_features = cat_features
        self.text_features = text_features
        self.embedding_features = embedding_features
        self.early_stopping_rounds = early_stopping_rounds
        self.use_best_model = use_best_model
        self.verbose = verbose
        self.random_state = random_state
        self.loss_function = loss_function
        self.eval_metric = eval_metric

        # Set by subclasses; replaced by the fitted estimator in `fit()`.
        self.model_ = None

    @property
    def get_model(self):
        """Return the underlying CatBoost model.

        Raises:
            NotFittedError: if no model has been constructed/fitted yet.
        """
        # Explicit identity check instead of truthiness: relying on an
        # arbitrary estimator's __bool__ is fragile.
        if self.model_ is not None:
            return self.model_
        raise NotFittedError("You must call `.fit()` first")

    def fit(
        self,
        x: pd.DataFrame,
        y: pd.Series,
        stratify_by: Optional[pd.DataFrame] = None,
        x_val: Optional[pd.DataFrame] = None,
        y_val: Optional[pd.Series] = None,
        **kwargs,
    ) -> CatBoost:
        """
        Ingest processed train data, perform a train/validation split when
        no explicit validation set is supplied, and fit the model.

        Returns the fitted CatBoost model.
        """
        x, y = x.reset_index(drop=True), y.reset_index(drop=True)
        if x_val is None or y_val is None:
            # Seed the split with the configured random_state so training
            # runs are reproducible (the split was previously unseeded).
            x_train, x_val, y_train, y_val = train_test_split(
                x, y, stratify=stratify_by, random_state=self.random_state
            )
        else:
            x_train, y_train = x, y

        self.model_ = self.model_.fit(  # type: ignore[attr-defined]
            x_train,
            y_train,
            eval_set=(x_val, y_val),
            **kwargs,
        )

        # TODO (2024/07/03) @srose: Implement batched fitting if required

        print(
            {
                # best_iteration_ is the last improving round; add the
                # early-stopping patience (+1), capped at the configured
                # iteration budget.
                "Iterations Completed": min(
                    (
                        self.model_.best_iteration_  # type: ignore[attr-defined]
                        + self.early_stopping_rounds
                        + 1
                    ),
                    self.iterations,
                ),
                "Training Sample Count": len(x_train),
            }
        )
        return self.model_

    def predict(self, x: pd.DataFrame) -> np.ndarray:
        """Predict with the fitted model.

        Raises:
            NotFittedError: if no model has been constructed/fitted yet.
        """
        # Route through `get_model` so an unfitted trainer fails with a
        # clear NotFittedError instead of an AttributeError on None.
        return self.get_model.predict(x)


class CBClassifierTrainer(_BoostingModelTrainer):
    """
    Trainer wrapping ``CatBoostClassifier``.

    NOTE: this class is set up for binary classification
    """

    def __init__(
        self,
        iterations=10_000,
        cat_features=None,
        text_features=None,
        embedding_features=None,
        early_stopping_rounds=25,
        use_best_model=True,
        verbose=True,
        random_state=42,
        loss_function="Logloss",
        eval_metric="AUC",  # TODO: Does this trigger early stopping or Loss Function?
        **kwargs,
    ):
        # Keyword arguments make the mapping onto the base-class
        # hyper-parameters explicit.
        super().__init__(
            iterations=iterations,
            cat_features=cat_features,
            text_features=text_features,
            embedding_features=embedding_features,
            early_stopping_rounds=early_stopping_rounds,
            use_best_model=use_best_model,
            verbose=verbose,
            random_state=random_state,
            loss_function=loss_function,
            eval_metric=eval_metric,
        )

        # Instantiate the estimator immediately; `fit()` in the base class
        # replaces this attribute with the fitted model.
        self.model_ = CatBoostClassifier(
            iterations=self.iterations,
            cat_features=self.cat_features,
            text_features=self.text_features,
            embedding_features=self.embedding_features,
            early_stopping_rounds=self.early_stopping_rounds,
            use_best_model=self.use_best_model,
            verbose=self.verbose,
            random_state=self.random_state,
            loss_function=self.loss_function,
            eval_metric=self.eval_metric,
            **kwargs,
        )


class CBRegressorTrainer(_BoostingModelTrainer):
    """Trainer wrapping ``CatBoostRegressor``."""

    def __init__(
        self,
        iterations=10_000,
        cat_features=None,
        text_features=None,
        embedding_features=None,
        early_stopping_rounds=25,
        use_best_model=True,
        verbose=True,
        random_state=42,
        loss_function="RMSE",
        eval_metric="RMSE",
        **kwargs,
    ):
        # Keyword arguments make the mapping onto the base-class
        # hyper-parameters explicit and keep this constructor consistent
        # with CBClassifierTrainer.
        super().__init__(
            iterations=iterations,
            cat_features=cat_features,
            text_features=text_features,
            embedding_features=embedding_features,
            early_stopping_rounds=early_stopping_rounds,
            use_best_model=use_best_model,
            verbose=verbose,
            random_state=random_state,
            loss_function=loss_function,
            eval_metric=eval_metric,
        )
        # NOTE(review): a redundant `self.eval_metric = eval_metric` was
        # removed here — super().__init__ already sets it.

        # Instantiate the estimator immediately; `fit()` in the base class
        # replaces this attribute with the fitted model.
        self.model_ = CatBoostRegressor(
            iterations=self.iterations,
            cat_features=self.cat_features,
            text_features=self.text_features,
            embedding_features=self.embedding_features,
            early_stopping_rounds=self.early_stopping_rounds,
            use_best_model=self.use_best_model,
            verbose=self.verbose,
            random_state=self.random_state,
            loss_function=self.loss_function,
            eval_metric=self.eval_metric,
            **kwargs,
        )

Loading