將樹狀機器學習模型編譯成 SQL 推理查詢,讓模型預測直接在資料庫內執行。
ML2SQL 將訓練好的機器學習模型(DecisionTree、RandomForest、AdaBoost、XGBoost、LGBM)編譯成純 SQL 推理查詢,讓模型預測可直接在資料庫內執行,無需額外部署推理服務。
核心流程:
訓練模型 → 匯出模型結構 → 生成 SQL → 在資料庫中執行推理
| 特徵項目 | DecisionTree | RandomForest | AdaBoost | XGBoost | LGBM (官方) |
|---|---|---|---|---|---|
| 支援類別型特徵 | ✅ | ✅ | ✅ | ✅ | ✅ |
| 支援二分類 | ✅ | ✅ | ✅ | ✅ | ✅ |
| 支援多分類 | ✅ | ✅ | ✅ | ❌ | ✅ |
| 權重支援 (sample_weight) | ✅ | ✅ | | | |
| 支援 early stopping | ✅ | ✅ | ✅ | ✅ | |
| 支援特徵抽樣 | ✅ | ✅ | ✅ | | |
| 支援樣本抽樣 | ✅ | ✅ | ✅ | | |
| 支援剪枝 | ✅ | ✅ | ✅ | | |
| 匯出成 SQL 推理 | ✅ | ✅ | ✅ | ✅ | ✅ |
| 轉 JSON | ✅ | ✅ | ✅ | ✅ | ❌ |
| 可視化 (dot) | ✅ | ✅ | ✅ | ✅ | ✅ |
pip install -e .

使用官方 LightGBM 訓練後,透過 LightGBMManualPredictor 將模型編譯成 SQL。
from Classifiers.BoostClassifier import LightGBMManualPredictor
from utils import generate_synthetic_data
import lightgbm as lgb
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Large synthetic training set; a different seed keeps the test set disjoint.
    train_df = generate_synthetic_data(n=200000)
    test_df = generate_synthetic_data(n=100, seed=43)

    X = train_df.drop(columns=["play"])
    y = (train_df["play"] == "Yes").astype(int)
    X_test = test_df.drop(columns=["play"])
    y_test = (test_df["play"] == "Yes").astype(int)

    # Encode categorical columns with a deterministic (sorted) category order.
    # The test frame reuses the TRAINING categories so category codes line up.
    cat_cols = ["humidity", "wind", "outlook"]
    for name in cat_cols:
        X[name] = X[name].astype("category").cat.set_categories(sorted(X[name].unique()))
        X_test[name] = X_test[name].astype("category").cat.set_categories(sorted(X[name].unique()))

    # code -> category-name lookup per categorical feature, consumed by the SQL generator.
    feature_cat_mapping = {
        name: dict(enumerate(X[name].cat.categories))
        for name in cat_cols
    }

    lgbm = lgb.LGBMClassifier(max_depth=3, n_estimators=3, learning_rate=0.3)
    lgbm.fit(X, y, categorical_feature=cat_cols)
    print(f"LightGBM Accuracy: {accuracy_score(y_test, lgbm.predict(X_test)):.4f}")

    # Round-trip the booster through a text dump, then compile the dump to SQL.
    lgbm.booster_.save_model("lgb_model.txt")
    model = lgb.Booster(model_file="lgb_model.txt")
    feature_names = X.columns.tolist()
    predictor = LightGBMManualPredictor(model.dump_model(), feature_names, feature_cat_mapping)
    sql = predictor.generate_sql(threshold=0.5)
    print(sql)

SQL 生成範例(點擊展開)
-- Per-row leaf scores: each boosted tree is compiled into one nested CASE expression.
WITH predictions AS (
SELECT
id,
CASE
WHEN outlook = 'Overcast' THEN
CASE
WHEN cloud_cover <= 60.5 THEN 0.2905411340700754
ELSE
CASE
WHEN wind = 'Weak' THEN -0.3534347768742837
ELSE 0.2905411340700754
END
END
ELSE
CASE
WHEN temperature <= 32.5 THEN -0.9885879150965038
ELSE
CASE
WHEN wind = 'Weak' THEN -0.9885879150965038
ELSE 0.2905411340700754
END
END
END AS tree_1_score,
-- tree_2_score, tree_3_score ...
FROM input_data
),
-- Raw margin: base score (0.0) plus the sum of all per-tree scores.
logits AS (
SELECT
id,
0.0 + (tree_1_score + tree_2_score + tree_3_score) AS logit
FROM predictions
)
-- Sigmoid maps the margin to a probability; 0.5 is the classification threshold.
SELECT
id,
1.0 / (1.0 + EXP(-logit)) AS probability,
CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

自實作的線性 Boosting 分類器,生成的 SQL 為線性公式,可讀性高。
from Classifiers.BoostClassifier import LinearBoostingClassifier
# FIX: generate_merged_linear_sql was called below but never imported.
from utils import generate_linear_sql, generate_merged_linear_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Large synthetic training set; a different seed keeps the test set disjoint.
    data = generate_synthetic_data(n=200000, seed=42)
    X = data.drop(columns=["play"])
    y = (data["play"] == "Yes").astype(int)
    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = (test_df["play"] == "Yes").astype(int)

    clf = LinearBoostingClassifier(
        n_estimators=200,
        learning_rate=0.3,
        early_stopping_rounds=20,
        colsample_bytree=1,
        subsample=0.8,
    )
    # The (X_test, y_test) tuple serves as the early-stopping validation set.
    clf.fit(X, y, (X_test, y_test))
    print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

    # Persist and reload to demonstrate the JSON round-trip used for deployment.
    clf.save("linear_boost_model.json")
    model = LinearBoostingClassifier.load("linear_boost_model.json")

    # Full SQL: every tree is expanded as its own linear term.
    sql = generate_linear_sql(model)
    print(sql)

    # Merged SQL: per-feature coefficients summed across all trees (more compact).
    sql = generate_merged_linear_sql(model)
    print(sql)

    # Global feature importances (normalized contributions).
    features = model.compute_feature_contributions(X_test, plot=True, normalize=True)
    # Sample output:
    # temperature 0.492538
    # outlook_Sunny 0.162274
    # wind_Weak 0.096765

    # Per-sample feature contributions.
    per_sample = model.per_sample_contributions(X_test)
    # per_sample['raw'], per_sample['normalized'], per_sample['signed']

SQL 生成範例 — 完整版(點擊展開)
-- Full form: each boosted linear model appears as its own learning-rate-scaled term.
WITH linear_score AS (
SELECT
id,
-- initial bias, then 0.3 * (per-tree linear expression) for every tree
-0.508096 +
0.3 * (0.315567 + 0.090054 * temperature + -0.005837 * cloud_cover + 0.321502 * humidity_Low + -0.979199 * wind_Weak + -3.287133 * outlook_Rainy + -3.289431 * outlook_Sunny) +
0.3 * (-0.166722 + 0.077027 * temperature + -0.004919 * cloud_cover + 0.271886 * humidity_Low + -0.833306 * wind_Weak + -2.371971 * outlook_Rainy + -2.369790 * outlook_Sunny) +
-- more trees...
AS raw_score
FROM input_data
)
-- Sigmoid of the raw score gives the probability; threshold at 0.5.
SELECT *,
1 / (1 + EXP(-raw_score)) AS prob,
CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;

SQL 生成範例 — 合併版(點擊展開)
-- Merged form: per-feature coefficients are pre-summed across all trees,
-- leaving one compact linear expression.
WITH linear_score AS (
SELECT
id,
-1.499403
+ -0.011357 * cloud_cover
+ 0.619758 * humidity_Low
+ -4.512565 * outlook_Rainy
+ -4.519462 * outlook_Sunny
+ 0.177894 * temperature
+ -1.979982 * wind_Weak
AS raw_score
FROM input_data
)
-- Sigmoid of the raw score gives the probability; threshold at 0.5.
SELECT *,
1 / (1 + EXP(-raw_score)) AS prob,
CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;

自實作的 XGBoost 分類器。
from Classifiers.BoostClassifier import XGBoostClassifier
from utils import generate_synthetic_data, generate_xgboost_sql

if __name__ == "__main__":
    # Synthetic train/test split via different seeds.
    data = generate_synthetic_data(n=100, seed=42)
    X = data.drop(columns=["play"])
    y = (data["play"] == "Yes").astype(int)
    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = (test_df["play"] == "Yes").astype(int)

    # Declare how each feature is split on: ranges vs. equality tests.
    feature_types = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical"
    }

    clf = XGBoostClassifier(
        n_estimators=15,
        max_depth=3,
        learning_rate=0.3,
        reg_lambda=1.0,
        feature_types=feature_types,
        colsample_bytree=1.0,
        subsample=1.0,
    )
    # The (X_test, y_test) tuple serves as the validation set.
    clf.fit(X, y, (X_test, y_test))

    from sklearn.metrics import accuracy_score
    print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

    # JSON round-trip, then compile the loaded model to SQL.
    clf.save("xgboost_model.json")
    model = XGBoostClassifier.load("xgboost_model.json")
    sql = generate_xgboost_sql(model)
    print(sql)

SQL 生成範例(點擊展開)
-- Per-row leaf scores: one nested CASE expression per boosted tree.
WITH predictions AS (
SELECT
id,
CASE
WHEN outlook = 'Overcast' THEN
CASE
WHEN cloud_cover <= 70 THEN 2.412220
ELSE
CASE
WHEN wind = 'Strong' THEN 1.454234
ELSE -0.204320
END
END
ELSE
CASE
WHEN temperature <= 32 THEN -1.353249
ELSE
CASE
WHEN wind = 'Strong' THEN 1.628352
ELSE -0.934034
END
END
END AS tree_1_score,
-- tree_2_score ... tree_N_score
FROM input_data
),
-- Raw margin: base score plus learning rate (0.3) times the summed tree scores.
logits AS (
SELECT
id,
-0.7537718 + 0.3 * (tree_1_score + tree_2_score + ... + tree_N_score) AS logit
FROM predictions
)
-- Sigmoid converts the margin to a probability; 0.5 is the decision threshold.
SELECT
id,
1.0 / (1.0 + EXP(-logit)) AS probability,
CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

自實作的 AdaBoost 分類器,以加權投票方式生成 SQL。
from Classifiers.BoostClassifier import AdaBoostClassifier
from Classifiers.TreeClassifier import DecisionTreeClassifier
from utils import generate_adaboost_sql, generate_synthetic_data
# FIX: removed unused `from sklearn.metrics import accuracy_score`;
# all metrics below go through model.score().

if __name__ == "__main__":
    data = generate_synthetic_data(n=100, seed=42)

    # Declare how each feature is split on: ranges vs. equality tests.
    feature_types = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical"
    }

    boost = AdaBoostClassifier(
        base_estimator_class=DecisionTreeClassifier,
        feature_types=feature_types,
        n_estimators=10,
        max_depth=3
    )
    boost.fit(data, target_key="play")

    # JSON round-trip; load() needs the base-estimator class to rebuild the trees.
    boost.save("adaboost.json")
    model = AdaBoostClassifier.load("adaboost.json", DecisionTreeClassifier)

    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = test_df["play"]
    print("Accuracy:", model.score(X_test, y_test, metric="accuracy")[0])
    print("Recall:", model.score(X_test, y_test, metric="recall", average="binary")[0])
    print("F1 score:", model.score(X_test, y_test, metric="f1", average="binary")[0])

    sql = generate_adaboost_sql(model)
    print(sql)

SQL 生成範例(點擊展開)
-- Each weak learner votes with a class label ('Yes'/'No') per row.
WITH predictions AS (
SELECT
id,
CASE
WHEN outlook = 'Rainy' THEN
CASE
WHEN temperature <= 33.5 THEN 'No'
ELSE
CASE
WHEN wind = 'Strong' THEN 'Yes'
ELSE 'No'
END
END
WHEN outlook = 'Overcast' THEN
CASE
WHEN cloud_cover <= 59.0 THEN 'Yes'
ELSE
CASE
WHEN wind = 'Strong' THEN 'Yes'
ELSE 'No'
END
END
ELSE 'No'
END AS tree_1_pred,
-- tree_2_pred ... tree_N_pred
FROM input_data
),
-- Accumulate each estimator's weight (alpha) into a per-class score.
weighted_scores AS (
SELECT
id,
(CASE WHEN tree_1_pred = 'No' THEN 3.8918 ELSE 0 END) + ... AS score_No,
(CASE WHEN tree_1_pred = 'Yes' THEN 3.8918 ELSE 0 END) + ... AS score_Yes
FROM predictions
)
-- The class with the larger weighted score wins; ties go to 'Yes'.
SELECT
id,
CASE WHEN score_Yes >= score_No THEN 'Yes' ELSE 'No' END AS prediction
FROM weighted_scores;

自實作的 Random Forest 分類器,支援多數決投票 SQL。
from Classifiers.ForestClassifier import RandomForestClassifier
from utils import generate_forest_sql, generate_synthetic_data
# FIX: removed unused `from sklearn.metrics import accuracy_score`;
# all metrics below go through rf.score().
import matplotlib.pyplot as plt

if __name__ == "__main__":
    data = generate_synthetic_data(n=100, seed=42)

    # Declare how each feature is split on: ranges vs. equality tests.
    feature_types = {
        "temperature": "numerical",
        "humidity": "categorical",
        "wind": "categorical",
        "outlook": "categorical",
        "cloud_cover": "numerical"
    }

    rf = RandomForestClassifier(
        feature_types=feature_types,
        n_estimators=5,
        max_depth=5,
        bootstrap=True,
        oob_score=True,
        max_features=None,
        criterion="gini",
        min_samples_split=2,
        min_samples_leaf=1,
        tie_break_random=True,
        random_state=42
    )
    rf.fit(data, target_key="play")

    # JSON round-trip demonstration; rf_loaded is interchangeable with rf.
    rf.save("random_forest_model.json")
    rf_loaded = RandomForestClassifier.load("random_forest_model.json")

    test_df = generate_synthetic_data(n=100, seed=43)
    X_test = test_df.drop(columns=["play"])
    y_test = test_df["play"]
    print("Accuracy:", rf.score(X_test, y_test, metric="accuracy")[0])
    print("Recall:", rf.score(X_test, y_test, metric="recall", average="binary")[0])
    print("F1 score:", rf.score(X_test, y_test, metric="f1", average="binary")[0])
    print("OOB Score:", rf.oob_score_)  # requires bootstrap=True and oob_score=True

    sql = generate_forest_sql(rf, input_table="input_data", id_col="id")
    print(sql)

    # Plot feature importances, largest at the top.
    importances = rf.feature_importances_()
    items = sorted(importances.items(), key=lambda x: x[1], reverse=True)
    features, scores = zip(*items)
    plt.barh(features, scores)
    plt.xlabel("Importance")
    plt.title("Random Forest Feature Importances")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

SQL 生成範例(點擊展開)
-- Each tree votes with a class label per row.
WITH predictions AS (
SELECT
id,
CASE
WHEN outlook = 'Rainy' THEN
CASE
WHEN temperature <= 34.0 THEN 'No'
ELSE
CASE
WHEN cloud_cover <= 49.0 THEN 'Yes'
ELSE 'No'
END
END
WHEN outlook = 'Overcast' THEN
CASE
WHEN wind = 'Strong' THEN 'Yes'
ELSE
CASE
WHEN cloud_cover <= 52.0 THEN 'Yes'
ELSE
CASE
WHEN humidity = 'High' THEN 'No'
ELSE 'Yes'
END
END
END
WHEN outlook = 'Sunny' THEN 'No'
ELSE 'No'
END AS tree_1_pred,
-- tree_2_pred ... tree_N_pred
FROM input_data
)
-- Majority vote: count the 'Yes' votes; 3 of 5 trees decide the class.
-- NOTE: the (expr)::int boolean cast is PostgreSQL-specific shorthand.
SELECT
id,
CASE
WHEN (tree_1_pred = 'Yes')::int + ... >= 3 THEN 'Yes'
ELSE 'No'
END AS prediction
FROM predictions;