Skip to content

Latest commit

 

History

History
533 lines (445 loc) · 14.6 KB

File metadata and controls

533 lines (445 loc) · 14.6 KB

將樹狀機器學習模型編譯成 SQL 推理查詢,讓模型預測直接在資料庫內執行。

一、介紹

ML2SQL 將訓練好的機器學習模型(DecisionTree、RandomForest、AdaBoost、XGBoost、LGBM)編譯成純 SQL 推理查詢,讓模型預測可直接在資料庫內執行,無需額外部署推理服務。

核心流程:

訓練模型 → 匯出模型結構 → 生成 SQL → 在資料庫中執行推理

二、支援模型與功能

特徵項目 DecisionTree RandomForest AdaBoost XGBoost LGBM (官方)
支援類別型特徵
支援二分類
支援多分類
權重支援 (sample_weight)
支援 early stopping
支援特徵抽樣
支援樣本抽樣
支援剪枝
匯出成 SQL 推理
轉 JSON
可視化 (dot)

三、安裝

pip install -e .

四、用法

4.1 LGBMClassifier

使用官方 LightGBM 訓練後,透過 LightGBMManualPredictor 將模型編譯成 SQL。

from Classifiers.BoostClassifier import LightGBMManualPredictor
from utils import generate_synthetic_data
import lightgbm as lgb
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Large synthetic training set plus a small held-out evaluation set.
    train_frame = generate_synthetic_data(n=200000)
    eval_frame = generate_synthetic_data(n=100, seed=43)

    X = train_frame.drop(columns=["play"])
    y = (train_frame["play"] == "Yes").astype(int)
    X_test = eval_frame.drop(columns=["play"])
    y_test = (eval_frame["play"] == "Yes").astype(int)

    # Convert categorical columns and pin the test set to the
    # training set's category levels so codes line up.
    cat_cols = ["humidity", "wind", "outlook"]
    for col in cat_cols:
        train_levels = sorted(X[col].unique())
        X[col] = X[col].astype("category").cat.set_categories(train_levels)
        X_test[col] = X_test[col].astype("category").cat.set_categories(train_levels)

    # Integer code -> category name, required by the SQL generator to
    # turn LightGBM's categorical splits back into readable predicates.
    feature_cat_mapping = {
        col: dict(enumerate(X[col].cat.categories)) for col in cat_cols
    }

    booster = lgb.LGBMClassifier(max_depth=3, n_estimators=3, learning_rate=0.3)
    booster.fit(X, y, categorical_feature=cat_cols)

    print(f"LightGBM Accuracy: {accuracy_score(y_test, booster.predict(X_test)):.4f}")

    # Round-trip the model through a saved file, then compile it to SQL.
    booster.booster_.save_model("lgb_model.txt")
    reloaded = lgb.Booster(model_file="lgb_model.txt")

    predictor = LightGBMManualPredictor(
        reloaded.dump_model(), X.columns.tolist(), feature_cat_mapping
    )
    print(predictor.generate_sql(threshold=0.5))
SQL 生成範例(點擊展開)
WITH predictions AS (
    SELECT
        id,
        CASE
          WHEN outlook = 'Overcast' THEN
            CASE
              WHEN cloud_cover <= 60.5 THEN 0.2905411340700754
              ELSE
                CASE
                  WHEN wind = 'Weak' THEN -0.3534347768742837
                  ELSE 0.2905411340700754
                END
            END
          ELSE
            CASE
              WHEN temperature <= 32.5 THEN -0.9885879150965038
              ELSE
                CASE
                  WHEN wind = 'Weak' THEN -0.9885879150965038
                  ELSE 0.2905411340700754
                END
            END
        END AS tree_1_score,
        -- tree_2_score, tree_3_score ...
    FROM input_data
),
logits AS (
    SELECT
        id,
        0.0 + (tree_1_score + tree_2_score + tree_3_score) AS logit
    FROM predictions
)
SELECT
    id,
    1.0 / (1.0 + EXP(-logit)) AS probability,
    CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

4.2 LinearBoostingClassifier

自實作的線性 Boosting 分類器,生成的 SQL 為線性公式,可讀性高。

from Classifiers.BoostClassifier import LinearBoostingClassifier
# FIX: generate_merged_linear_sql was used below but never imported (NameError).
from utils import (
    generate_linear_sql,
    generate_merged_linear_sql,
    generate_synthetic_data,
)
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    data = generate_synthetic_data(n=200000, seed=42)
    X = data.drop(columns=["play"])
    y = (data["play"] == "Yes").astype(int)

    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = (test_df["play"] == "Yes").astype(int)

    clf = LinearBoostingClassifier(
        n_estimators=200,
        learning_rate=0.3,
        early_stopping_rounds=20,
        colsample_bytree=1,
        subsample=0.8,
    )
    # The (X_test, y_test) pair is the evaluation set for early stopping.
    clf.fit(X, y, (X_test, y_test))

    print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

    # Round-trip the model through JSON before generating SQL.
    clf.save("linear_boost_model.json")
    model = LinearBoostingClassifier.load("linear_boost_model.json")

    # Full SQL: every tree expanded as its own additive term.
    sql = generate_linear_sql(model)
    print(sql)

    # Merged SQL: per-feature coefficients summed across trees (more compact).
    sql = generate_merged_linear_sql(model)
    print(sql)

    # Global feature importances (normalized; plot=True renders a chart).
    features = model.compute_feature_contributions(X_test, plot=True, normalize=True)
    # Example output:
    # temperature      0.492538
    # outlook_Sunny    0.162274
    # wind_Weak        0.096765

    # Per-sample feature contributions.
    per_sample = model.per_sample_contributions(X_test)
    # per_sample['raw'], per_sample['normalized'], per_sample['signed']
SQL 生成範例 — 完整版(點擊展開)
WITH linear_score AS (
  SELECT
    id,
    -0.508096 +
    0.3 * (0.315567 + 0.090054 * temperature + -0.005837 * cloud_cover + 0.321502 * humidity_Low + -0.979199 * wind_Weak + -3.287133 * outlook_Rainy + -3.289431 * outlook_Sunny) +
    0.3 * (-0.166722 + 0.077027 * temperature + -0.004919 * cloud_cover + 0.271886 * humidity_Low + -0.833306 * wind_Weak + -2.371971 * outlook_Rainy + -2.369790 * outlook_Sunny) +
    -- 更多樹...
    AS raw_score
  FROM input_data
)
SELECT *,
  1 / (1 + EXP(-raw_score)) AS prob,
  CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;
SQL 生成範例 — 合併版(點擊展開)
WITH linear_score AS (
  SELECT
    id,
    -1.499403
    + -0.011357 * cloud_cover
    + 0.619758  * humidity_Low
    + -4.512565 * outlook_Rainy
    + -4.519462 * outlook_Sunny
    + 0.177894  * temperature
    + -1.979982 * wind_Weak
    AS raw_score
  FROM input_data
)
SELECT *,
  1 / (1 + EXP(-raw_score)) AS prob,
  CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;

4.3 XGBoostClassifier

自實作的 XGBoost 分類器。

from Classifiers.BoostClassifier import XGBoostClassifier
from utils import generate_synthetic_data, generate_xgboost_sql
# FIX: hoisted from the middle of the script body to the top (PEP 8).
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Small synthetic train/test split; same generator, different seeds.
    data = generate_synthetic_data(n=100, seed=42)
    X = data.drop(columns=["play"])
    y = (data["play"] == "Yes").astype(int)

    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = (test_df["play"] == "Yes").astype(int)

    # Per-column type declarations: pick numeric-threshold vs. equality splits.
    feature_types = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical",
    }

    clf = XGBoostClassifier(
        n_estimators=15,
        max_depth=3,
        learning_rate=0.3,
        reg_lambda=1.0,
        feature_types=feature_types,
        colsample_bytree=1.0,
        subsample=1.0,
    )
    # The (X_test, y_test) pair is used as the evaluation set while fitting.
    clf.fit(X, y, (X_test, y_test))

    print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

    # Round-trip through JSON, then compile the reloaded model to SQL.
    clf.save("xgboost_model.json")
    model = XGBoostClassifier.load("xgboost_model.json")

    sql = generate_xgboost_sql(model)
    print(sql)
SQL 生成範例(點擊展開)
WITH predictions AS (
    SELECT
        id,
        CASE
          WHEN outlook = 'Overcast' THEN
            CASE
              WHEN cloud_cover <= 70 THEN 2.412220
              ELSE
                CASE
                  WHEN wind = 'Strong' THEN 1.454234
                  ELSE -0.204320
                END
            END
          ELSE
            CASE
              WHEN temperature <= 32 THEN -1.353249
              ELSE
                CASE
                  WHEN wind = 'Strong' THEN 1.628352
                  ELSE -0.934034
                END
            END
        END AS tree_1_score,
        -- tree_2_score ... tree_N_score
    FROM input_data
),
logits AS (
    SELECT
        id,
        -0.7537718 + 0.3 * (tree_1_score + tree_2_score + ... + tree_N_score) AS logit
    FROM predictions
)
SELECT
    id,
    1.0 / (1.0 + EXP(-logit)) AS probability,
    CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

4.4 AdaBoostClassifier

自實作的 AdaBoost 分類器,以加權投票方式生成 SQL。

from Classifiers.BoostClassifier import AdaBoostClassifier
from Classifiers.TreeClassifier import DecisionTreeClassifier
from utils import generate_adaboost_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Training data plus per-column type declarations for the base trees.
    train_data = generate_synthetic_data(n=100, seed=42)
    column_kinds = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical",
    }

    ensemble = AdaBoostClassifier(
        base_estimator_class=DecisionTreeClassifier,
        feature_types=column_kinds,
        n_estimators=10,
        max_depth=3,
    )
    ensemble.fit(train_data, target_key="play")

    # Round-trip the ensemble through JSON before evaluating it.
    ensemble.save("adaboost.json")
    model = AdaBoostClassifier.load("adaboost.json", DecisionTreeClassifier)

    # Held-out evaluation set (different seed) and three metrics.
    holdout = generate_synthetic_data(n=100, seed=43)
    X_test = holdout.drop(columns=["play"])
    y_test = holdout["play"]
    print("Accuracy:", model.score(X_test, y_test, metric="accuracy")[0])
    print("Recall:", model.score(X_test, y_test, metric="recall", average="binary")[0])
    print("F1 score:", model.score(X_test, y_test, metric="f1", average="binary")[0])

    # Compile the weighted-vote ensemble into a single SQL query.
    print(generate_adaboost_sql(model))
SQL 生成範例(點擊展開)
WITH predictions AS (
    SELECT
        id,
        CASE
          WHEN outlook = 'Rainy' THEN
            CASE
              WHEN temperature <= 33.5 THEN 'No'
              ELSE
                CASE
                  WHEN wind = 'Strong' THEN 'Yes'
                  ELSE 'No'
                END
            END
          WHEN outlook = 'Overcast' THEN
            CASE
              WHEN cloud_cover <= 59.0 THEN 'Yes'
              ELSE
                CASE
                  WHEN wind = 'Strong' THEN 'Yes'
                  ELSE 'No'
                END
            END
          ELSE 'No'
        END AS tree_1_pred,
        -- tree_2_pred ... tree_N_pred
    FROM input_data
),
weighted_scores AS (
    SELECT
        id,
        (CASE WHEN tree_1_pred = 'No'  THEN 3.8918 ELSE 0 END) + ... AS score_No,
        (CASE WHEN tree_1_pred = 'Yes' THEN 3.8918 ELSE 0 END) + ... AS score_Yes
    FROM predictions
)
SELECT
    id,
    CASE WHEN score_Yes >= score_No THEN 'Yes' ELSE 'No' END AS prediction
FROM weighted_scores;

4.5 RandomForestClassifier

自實作的 Random Forest 分類器,支援多數決投票 SQL。

from Classifiers.ForestClassifier import RandomForestClassifier
from utils import generate_forest_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

if __name__ == "__main__":
    data = generate_synthetic_data(n=100, seed=42)
    # Per-column type declarations for the tree builder.
    feature_types = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical",
    }

    rf = RandomForestClassifier(
        feature_types=feature_types,
        n_estimators=5,
        max_depth=5,
        bootstrap=True,
        oob_score=True,
        max_features=None,
        criterion="gini",
        min_samples_split=2,
        min_samples_leaf=1,
        tie_break_random=True,
        random_state=42,
    )
    rf.fit(data, target_key="play")

    # FIX: previously rf_loaded was created but never used, so the JSON
    # round-trip was dead code. Evaluate and compile the *loaded* model,
    # matching how sections 4.2-4.4 exercise save/load.
    rf.save("random_forest_model.json")
    rf_loaded = RandomForestClassifier.load("random_forest_model.json")

    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = test_df["play"]
    print("Accuracy:", rf_loaded.score(X_test, y_test, metric="accuracy")[0])
    print("Recall:",   rf_loaded.score(X_test, y_test, metric="recall",  average="binary")[0])
    print("F1 score:", rf_loaded.score(X_test, y_test, metric="f1",      average="binary")[0])
    # OOB score lives on the fitted estimator (requires bootstrap=True, oob_score=True).
    print("OOB Score:", rf.oob_score_)

    sql = generate_forest_sql(rf_loaded, input_table="input_data", id_col="id")
    print(sql)

    # Feature importances from the fitted forest, sorted descending.
    importances = rf.feature_importances_()
    items = sorted(importances.items(), key=lambda x: x[1], reverse=True)
    features, scores = zip(*items)

    plt.barh(features, scores)
    plt.xlabel("Importance")
    plt.title("Random Forest Feature Importances")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
SQL 生成範例(點擊展開)
WITH predictions AS (
    SELECT
        id,
        CASE
          WHEN outlook = 'Rainy' THEN
            CASE
              WHEN temperature <= 34.0 THEN 'No'
              ELSE
                CASE
                  WHEN cloud_cover <= 49.0 THEN 'Yes'
                  ELSE 'No'
                END
            END
          WHEN outlook = 'Overcast' THEN
            CASE
              WHEN wind = 'Strong' THEN 'Yes'
              ELSE
                CASE
                  WHEN cloud_cover <= 52.0 THEN 'Yes'
                  ELSE
                    CASE
                      WHEN humidity = 'High' THEN 'No'
                      ELSE 'Yes'
                    END
                END
            END
          WHEN outlook = 'Sunny' THEN 'No'
          ELSE 'No'
        END AS tree_1_pred,
        -- tree_2_pred ... tree_N_pred
    FROM input_data
)
SELECT
    id,
    CASE
        WHEN (tree_1_pred = 'Yes')::int + ... >= 3 THEN 'Yes'
        ELSE 'No'
    END AS prediction
FROM predictions;