# Python梯度提升树
# 梯度提升 (Gradient Boosting) 通过逐步添加决策树来修正残差
# 是目前表格数据上最强大的机器学习方法之一
# 1. 导入库
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
# 2. Load the breast-cancer dataset and hold out 30% of it for testing.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
)
# 3. Baseline gradient boosting classifier
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)
# Accuracy is computed from hard labels; AUC is computed from the second
# column of predict_proba (the probability assigned to class label 1).
y_pred = gb.predict(X_test)
y_prob = gb.predict_proba(X_test)[:, 1]
print("=== GradientBoosting ===")
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_prob):.4f}")
# 4. Effect of the learning rate: refit the same model configuration with
# several learning rates and report test accuracy for each.
print("\n不同学习率的对比 (n_estimators=100):")
for lr in [0.01, 0.05, 0.1, 0.2, 0.5]:
    gb_lr = GradientBoostingClassifier(
        n_estimators=100, learning_rate=lr, max_depth=3, random_state=42
    )
    gb_lr.fit(X_train, y_train)
    print(f" lr={lr:.2f}: 准确率={gb_lr.score(X_test, y_test):.4f}")
# 5. Early stopping: allow up to 1000 trees, but let the estimator stop
# early based on validation_fraction / n_iter_no_change / tol.
gb_early = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    validation_fraction=0.2,
    n_iter_no_change=10,
    tol=1e-4,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the estimator itself
n_trees_used = gb_early.n_estimators_
early_acc = gb_early.score(X_test, y_test)
print(f"\n早停法实际使用树数: {n_trees_used} (最大 1000)")
print(f"早停法准确率: {early_acc:.4f}")
# 6. Feature importances of the baseline model
importances = gb.feature_importances_
# argsort is ascending, so reverse it and keep the five largest.
top_5_idx = np.argsort(importances)[::-1][:5]
print("\n前 5 个重要特征:")
for rank, idx in enumerate(top_5_idx, start=1):
    print(f" {rank}. {cancer.feature_names[idx]}: {importances[idx]:.4f}")
# 7. Use subsample to guard against overfitting: same configuration as the
# baseline model, with subsample=0.8 added.
gb_sub = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the estimator itself
sub_acc = gb_sub.score(X_test, y_test)
print(f"\nsubsample=0.8 准确率: {sub_acc:.4f}")
# 8. Loss function and other hyperparameters stored on the baseline model.
# `loss` was not passed when `gb` was constructed, so the value printed here
# is the library default.
print("\n=== 模型参数 ===")
print(f"损失函数: {gb.loss}, 学习率: {gb.learning_rate}")
print(f"树数量: {gb.n_estimators}, 最大深度: {gb.max_depth}")
# 9. GBDT vs 随机森林
# GBDT: 逐步优化,每棵树拟合残差(boosting)
# RandomForest: 并行训练,每棵树独立(bagging)
# GBDT 通常精度更高,但调参较复杂
# 10. 调参指南
# n_estimators: 更多树降低偏差但可能过拟合
# learning_rate: 越小泛化越好但需更多树
# max_depth: 通常 3-5
# subsample: 0.5-0.8 引入随机性防过拟合
