Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,6 @@ docs/_build

# Cookiecutter
output/

# rck_jr dev files
rosey/delme.py
71 changes: 71 additions & 0 deletions rosey/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<!-- ruff: noqa -->
<!-- linting: ignore -->
# send_time_p13n

# GOAL:

Predict the ideal send time for each contact to maximize probability of interaction

# HYPOTHESIS:

The closer an email is to the top of a contact's inbox, the more probable it is that the contact will interact with it.

# LEGACY MODEL:

Creates a pdf of historical interactions for each contact. Essentially, a count is performed of how often a contact has interacted (opens or clicks) with emails in the past; these counts populate a pdf that represents every possible hour of the week (168 hours). This count is normalized to create probabilities that a custom `argmax` function ingests and selects send times from.

NOTE: If a contact does not meet a certain threshold of interactions in the lookback window, a default `mode` distribution is used for that contact.

# NEW MODEL:

A catboost classifier is trained on enriched historical interaction data for each contact. The features used from enrichment are provided by the customer or publicly available (ie H3 features).

# WHY SHOULD THE NEW MODEL BE BETTER?

- Generalized learning from `high` interaction users to `low` to `no` interaction users
- Minimize lookback window due to learning from population level data
- Go beyond interaction data to make predictions (ie demographic data)

# RESULTS OF RECENT AB TEST

We lost :-( (I told everyone that it was because of Stephen....)

# Things I need help on:

- Auto feature selection to replace manual EDA
- How should I evaluate a model offline? Currently use LogLoss and ROC curves.
- Model selection, is Catboost the correct model? I've been told the idea is train one and then try to beat it.

# AUTO FEATURE SELECTION PROCESS

AIM FOR CAUSALLY UPSTREAM OF THE VARIABLE

- Ingest contact features provided by customer
- Filter out based on `feasible_dtypes`
- Filter out based on `null_frac_threshold` (0.8 threshold)
- MAKE SURE THE FEATURES DROPPED ARE OBVIOUSLY BAD, WHAT ABOUT FEATURES CLOSE TO THE THRESHOLD?
- Rules of thumb can be misleading
- Filter out highly correlated features (0.95 threshold)
- Tree based models are not unstable with highly correlated features, but feature importance can be messed up
- Use `catboost.select_features()` functionality, basically trains a model and filters features using `feature_importance`
- Select the `top_n` features

# OFFLINE TESTING:

- What is the AUC on the low interaction audience?
- Actually check when the email was sent vs when we thought it should. (Plot of residual of when it was actually sent vs desired send time.)
CONFIRM IT IS ACTUALLY A MODEL PROBLEM! (Compliance Testing)
- Customer Splitting (hold out customers that the model never sees during training)
- Hold out the last week of historical data to test on. (Need to match up all the datetime features properly)

# TODO:

- Catboost Baseline functionality (see catboost website)
- Predict click_hour?
- Data Reduction?
- Include Mode Distribution as an input feature to the model
- Add conversion probability to the `send_to_interaction` distribution plots
- Check on `data_drift` between training, test and live AB
- Check on contacts that got emails from both the legacy and the new model.

# REPORT BACK IN ~2 WEEKS
261 changes: 261 additions & 0 deletions rosey/gbm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
# mypy: disable-error-code="import-untyped"

from pathlib import Path
from typing import Dict, List, Literal, Optional

import numpy as np
import pandas as pd
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split

FILEPATH_MODEL = "catboost_model_storage/catboost.cbm"


def evaluate_model(
    model: CatBoost,
    x: pd.DataFrame,
    y: pd.Series,
    metrics: List[str],
    verbose=False,
    **kwargs,
) -> Dict[str, float]:
    """
    Evaluate a fitted CatBoost model on processed test data.

    Builds a ``Pool`` from ``x``/``y`` (extra ``kwargs`` are forwarded to
    the ``Pool`` constructor), computes the requested ``metrics``, and
    collects the final value of each metric plus per-feature importances
    into a single log dictionary, which is printed and returned.
    """
    eval_pool = Pool(
        x,
        y,
        **kwargs,
    )
    results = model.eval_metrics(  # type: ignore[attr-defined]
        eval_pool,
        metrics,
    )

    # Keep only the value from the final iteration of each metric.
    logs = {}
    for metric_name in metrics:
        final_value = results[metric_name][-1]
        logs[metric_name] = final_value
        if verbose:
            print(f"{metric_name}: {final_value}")

    # Record feature importances alongside the metrics.
    importance_table = model.get_feature_importance(  # type: ignore[attr-defined]
        prettified=True
    )
    if verbose:
        print(importance_table)
    for feature, importance in zip(
        importance_table["Feature Id"], importance_table["Importances"]
    ):
        logs[f"Importance of {feature}"] = importance

    logs["Testing Sample Count"] = len(x)
    # NOTE: the summary is always printed, independent of `verbose`.
    print(logs)
    return logs


def write_model(model: CatBoost, path: Path) -> None:
    """
    Save a fitted CatBoost model under ``path / FILEPATH_MODEL``.

    NOTE(review): the previous implementation called ``path.cwd()``.
    ``Path.cwd()`` is a classmethod that returns the process working
    directory, so the ``path`` argument was silently ignored; the model
    always landed in ``<cwd>/catboost_model_storage/catboost.cbm``.
    """
    # Honor the caller-supplied base directory.
    model.save_model(str(path / FILEPATH_MODEL))


def load_model(
    path: Path, model_type: Literal["classifier", "regressor"]
) -> CatBoost:
    """
    Load a CatBoost model from ``path / FILEPATH_MODEL``.

    Args:
        path: base directory containing the model storage folder.
        model_type: either "classifier" or "regressor" (selects which
            CatBoost wrapper class is instantiated before loading).

    Raises:
        ValueError: if ``model_type`` is not one of the two allowed values.

    NOTE(review): the previous implementation called ``path.cwd()``.
    ``Path.cwd()`` is a classmethod that returns the process working
    directory, so the ``path`` argument was silently ignored.
    """
    if model_type == "classifier":
        model = CatBoostClassifier()
    elif model_type == "regressor":
        model = CatBoostRegressor()
    else:
        raise ValueError("model_type must be either 'classifier' or 'regressor'")

    # Honor the caller-supplied base directory.
    model_path = path / FILEPATH_MODEL

    # CatBoost's load_model loads in place and returns the model itself.
    return model.load_model(str(model_path))


class _BoostingModelTrainer(BaseEstimator):
    """Base class for CatBoost model trainers.

    Stores the hyper-parameters shared by the classifier/regressor trainers
    and implements the common fit/predict plumbing. Subclasses are expected
    to construct ``self.model_`` (a CatBoost estimator) in ``__init__``.
    """

    def __init__(
        self,
        iterations: int,
        cat_features: Optional[List[str]],
        text_features: Optional[List[str]],
        embedding_features: Optional[List[str]],
        early_stopping_rounds: int,
        use_best_model: bool,
        verbose: bool,
        random_state: int,
        loss_function: str,
        eval_metric: str,
    ):
        # sklearn convention: store constructor params verbatim, no logic.
        self.iterations = iterations
        self.cat_features = cat_features
        self.text_features = text_features
        self.embedding_features = embedding_features
        self.early_stopping_rounds = early_stopping_rounds
        self.use_best_model = use_best_model
        self.verbose = verbose
        self.random_state = random_state
        self.loss_function = loss_function
        self.eval_metric = eval_metric

        # Set by subclasses; replaced by the fitted estimator in `fit()`.
        self.model_ = None

    @property
    def get_model(self):
        """Return the underlying CatBoost model.

        Raises:
            NotFittedError: if no model has been constructed/fitted yet.
        """
        # Explicit identity check instead of truthiness: relying on an
        # arbitrary estimator's __bool__ is fragile.
        if self.model_ is not None:
            return self.model_
        raise NotFittedError("You must call `.fit()` first")

    def fit(
        self,
        x: pd.DataFrame,
        y: pd.Series,
        stratify_by: Optional[pd.DataFrame] = None,
        x_val: Optional[pd.DataFrame] = None,
        y_val: Optional[pd.Series] = None,
        **kwargs,
    ) -> CatBoost:
        """
        Ingest processed train data, perform a train/validation split when
        no explicit validation set is supplied, and fit the model.

        Returns the fitted CatBoost model.
        """
        x, y = x.reset_index(drop=True), y.reset_index(drop=True)
        if x_val is None or y_val is None:
            # Seed the split with the configured random_state so training
            # runs are reproducible (the split was previously unseeded).
            x_train, x_val, y_train, y_val = train_test_split(
                x, y, stratify=stratify_by, random_state=self.random_state
            )
        else:
            x_train, y_train = x, y

        self.model_ = self.model_.fit(  # type: ignore[attr-defined]
            x_train,
            y_train,
            eval_set=(x_val, y_val),
            **kwargs,
        )

        # TODO (2024/07/03) @srose: Implement batched fitting if required

        print(
            {
                # best_iteration_ is the last improving round; add the
                # early-stopping patience (+1), capped at the configured
                # iteration budget.
                "Iterations Completed": min(
                    (
                        self.model_.best_iteration_  # type: ignore[attr-defined]
                        + self.early_stopping_rounds
                        + 1
                    ),
                    self.iterations,
                ),
                "Training Sample Count": len(x_train),
            }
        )
        return self.model_

    def predict(self, x: pd.DataFrame) -> np.ndarray:
        """Predict with the fitted model.

        Raises:
            NotFittedError: if no model has been constructed/fitted yet.
        """
        # Route through `get_model` so an unfitted trainer fails with a
        # clear NotFittedError instead of an AttributeError on None.
        return self.get_model.predict(x)


class CBClassifierTrainer(_BoostingModelTrainer):
    """
    Trainer wrapping ``CatBoostClassifier``.

    NOTE: this class is set up for binary classification
    """

    def __init__(
        self,
        iterations=10_000,
        cat_features=None,
        text_features=None,
        embedding_features=None,
        early_stopping_rounds=25,
        use_best_model=True,
        verbose=True,
        random_state=42,
        loss_function="Logloss",
        eval_metric="AUC",  # TODO: Does this trigger early stopping or Loss Function?
        **kwargs,
    ):
        # Keyword arguments make the mapping onto the base-class
        # hyper-parameters explicit.
        super().__init__(
            iterations=iterations,
            cat_features=cat_features,
            text_features=text_features,
            embedding_features=embedding_features,
            early_stopping_rounds=early_stopping_rounds,
            use_best_model=use_best_model,
            verbose=verbose,
            random_state=random_state,
            loss_function=loss_function,
            eval_metric=eval_metric,
        )

        # Instantiate the estimator immediately; `fit()` in the base class
        # replaces this attribute with the fitted model.
        self.model_ = CatBoostClassifier(
            iterations=self.iterations,
            cat_features=self.cat_features,
            text_features=self.text_features,
            embedding_features=self.embedding_features,
            early_stopping_rounds=self.early_stopping_rounds,
            use_best_model=self.use_best_model,
            verbose=self.verbose,
            random_state=self.random_state,
            loss_function=self.loss_function,
            eval_metric=self.eval_metric,
            **kwargs,
        )


class CBRegressorTrainer(_BoostingModelTrainer):
    """Trainer wrapping ``CatBoostRegressor``."""

    def __init__(
        self,
        iterations=10_000,
        cat_features=None,
        text_features=None,
        embedding_features=None,
        early_stopping_rounds=25,
        use_best_model=True,
        verbose=True,
        random_state=42,
        loss_function="RMSE",
        eval_metric="RMSE",
        **kwargs,
    ):
        # Keyword arguments make the mapping onto the base-class
        # hyper-parameters explicit and keep this constructor consistent
        # with CBClassifierTrainer.
        super().__init__(
            iterations=iterations,
            cat_features=cat_features,
            text_features=text_features,
            embedding_features=embedding_features,
            early_stopping_rounds=early_stopping_rounds,
            use_best_model=use_best_model,
            verbose=verbose,
            random_state=random_state,
            loss_function=loss_function,
            eval_metric=eval_metric,
        )
        # NOTE(review): a redundant `self.eval_metric = eval_metric` was
        # removed here — super().__init__ already sets it.

        # Instantiate the estimator immediately; `fit()` in the base class
        # replaces this attribute with the fitted model.
        self.model_ = CatBoostRegressor(
            iterations=self.iterations,
            cat_features=self.cat_features,
            text_features=self.text_features,
            embedding_features=self.embedding_features,
            early_stopping_rounds=self.early_stopping_rounds,
            use_best_model=self.use_best_model,
            verbose=self.verbose,
            random_state=self.random_state,
            loss_function=self.loss_function,
            eval_metric=self.eval_metric,
            **kwargs,
        )

Loading