当前位置：首页 > news >正文

DAY 15

news 2026/7/6 1:22:14

浙大疏锦行

代码极简逻辑

1. 数据预处理

读取信贷csv，分离特征与违约标签；文本特征编码，数值缺失值用中位数填充；查看样本是否不平衡。

2. 数据集拆分

分层划分训练/测试集，保证两组违约样本比例一致。

3. 搭建4套对比流水线（采样步骤放在流水线内，只作用于训练折，防数据泄露）

基线：标准化+随机森林（对照组）
SMOTE过采样：合成少数违约样本平衡数据
SMOTEENN混合采样：过采样+剔除噪声
权重平衡：不改动数据，训练时加重少数类损失

4. 网格搜索+分层5折交叉验证

批量训练4套模型，以F1为优化目标，输出召回率、精确率、AUC。

5. 横向对比

汇总所有方案指标，自动选出F1最高的最优模型。

6. 阈值优化

不用默认0.5阈值，通过PR曲线找到F1最佳分割点，优化风控预测效果并绘图展示。

7. 可选

保存最优模型用于后续预测。

# DAY 15 不平衡数据集的处理
# ============================================================
# 信贷数据集 + 缺失值填充 + 不平衡处理 + 交叉验证 + 超参数调优
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score, precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import warnings
# Ignore all warnings so the console output stays readable.
warnings.filterwarnings("ignore")

# Matplotlib setup for Chinese text: candidate fonts are tried in order, and
# the Unicode minus glyph is disabled for axis tick labels.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 1. Load the data
file_path = r"C:\Python Study\Python60DaysChallenge-main\data.csv"
data = pd.read_csv(file_path)
print("数据集形状:", data.shape)
print("\n数据集全部列名：")
print(data.columns.tolist())

# Name of the label column that is split off from the feature columns.
TARGET_COL = 'Credit Default'

# Separate the features from the target label.
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

# Split the feature columns into numeric and text (object-dtype) columns.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Report how many values are missing in each column.
print("\n各列缺失值统计：")
print(X.isnull().sum())

# Encode each text column as integer codes; the fitted encoder is kept per
# column in le_dict.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill BEFORE casting: astype(str) turns NaN into the literal string
    # "nan", after which fillna() has nothing left to replace and missing
    # values would never receive the "Missing" category.
    X[col] = le.fit_transform(X[col].fillna("Missing").astype(str))
    le_dict[col] = le

# Median-fill the numeric gaps up front so that no NaN reaches the samplers.
# NOTE(review): the medians are fitted on the full data set, i.e. before the
# train/test split, so test rows influence the fill values. Moving the imputer
# into each pipeline would confine it to the training folds.
X[num_cols] = SimpleImputer(strategy="median").fit_transform(X[num_cols])

# Show the class balance of the target: counts on the console, then a bar chart.
print("\n原始数据集目标变量分布:")
print(y.value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x=y)
plt.title('信贷违约类别分布')
plt.show()

# 2. Stratified train/test split (80/20); stratify=y keeps the class ratio
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\n训练集分布: {pd.Series(y_train).value_counts().to_dict()}")
print(f"测试集分布: {pd.Series(y_test).value_counts().to_dict()}")
# 3. Shared configuration
# Unweighted random forest used as the classifier step of the pipelines.
base_clf = RandomForestClassifier(random_state=42)
# Hyper-parameter grid common to every strategy; the "classifier__" prefix
# routes each entry to the pipeline step named 'classifier'.
param_grid_common = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [2, 5],
}
# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. Four pipelines (the data was imputed beforehand and holds no NaN, so
# SMOTE can run on it)

# Baseline: standardisation + random forest, no resampling (control group).
pipeline_baseline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', base_clf),
])
param_baseline = param_grid_common.copy()

# SMOTE: a SMOTE over-sampling step sits between scaling and the classifier.
pipeline_smote = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', base_clf),
])
# On top of the common grid, the sampler's neighbour count is searched too.
param_smote = {**param_grid_common, 'sampler__k_neighbors': [3, 5]}

# SMOTEENN: combined sampling step between scaling and the classifier.
pipeline_smotenn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTEENN(random_state=42)),
    ('classifier', base_clf),
])
param_smotenn = param_grid_common.copy()

# Class weighting: no sampler step; the forest is built with
# class_weight='balanced' instead.
pipeline_weighted = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_weighted = param_grid_common.copy()

# 5. Grid-search helper
def run_gridsearch(pipeline, param_grid, name):
    """Tune one strategy with GridSearchCV and score the winner on the test split.

    Reads the module-level ``cv``, ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the best parameters, the cross-validated F1, the test
    metrics, a classification report and a confusion matrix.

    Args:
        pipeline: Estimator (pipeline) handed to ``GridSearchCV``.
        param_grid: Parameter grid to search.
        name: Strategy label shown in the console output.

    Returns:
        Tuple ``(best_estimator, test_f1, test_recall, test_precision,
        test_auc)``.
    """
    print(f"\n{'=' * 60}")
    print(f"正在运行策略: {name}")
    print(f"{'=' * 60}")
    gs = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1, verbose=1
    )
    gs.fit(X_train, y_train)
    print(f"最佳参数组合: {gs.best_params_}")
    print(f"交叉验证最佳 F1 (平均): {gs.best_score_:.4f}")

    # Score the best estimator found by the search on the held-out test split.
    best_estimator = gs.best_estimator_
    y_pred = best_estimator.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    # AUC is computed from the probability column at index 1.
    test_auc = roc_auc_score(y_test, best_estimator.predict_proba(X_test)[:, 1])

    print(f"测试集 F1: {test_f1:.4f}")
    print(f"测试集 召回率(Recall): {test_recall:.4f}")
    print(f"测试集 精确率(Precision): {test_precision:.4f}")
    print(f"测试集 AUC: {test_auc:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))
    return best_estimator, test_f1, test_recall, test_precision, test_auc

# 6. Train and compare every strategy
results = {}      # strategy name -> dict of test metrics
best_models = {}  # strategy name -> tuned estimator
strategies = [
    ('Baseline', pipeline_baseline, param_baseline),
    ('SMOTE', pipeline_smote, param_smote),
    ('SMOTEENN', pipeline_smotenn, param_smotenn),
    ('Weighted', pipeline_weighted, param_weighted),
]
for name, pipe, params in strategies:
    model, f1, rec, prec, auc = run_gridsearch(pipe, params, name)
    results[name] = {'F1': f1, 'Recall': rec, 'Precision': prec, 'AUC': auc}
    best_models[name] = model

# 7. Summarise the results
print("\n\n" + "=" * 60)
print("各策略性能对比")
print("=" * 60)
# Transpose so that each row is a strategy and each column a metric.
df_results = pd.DataFrame(results).T
print(df_results.round(4))
# NOTE(review): the winner is chosen by F1 measured on the test split, so the
# reported score of the selected model is optimistically biased; selecting on
# the cross-validated F1 would keep the test split untouched.
best_strategy = df_results['F1'].idxmax()
best_model = best_models[best_strategy]
print(f"\n最优策略: {best_strategy}，F1 = {df_results.loc[best_strategy, 'F1']:.4f}")

# 8. Threshold tuning
# NOTE(review): the threshold is tuned on the same test split it is then
# evaluated on, so the report below is optimistic; a separate validation
# split would give an unbiased estimate.
print("\n阈值微调")
y_proba = best_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
# The small epsilon guards against 0/0 where precision and recall are both 0.
fscores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)
# The final precision/recall point is dropped so that the chosen index is
# also a valid index into thresholds.
ix = np.argmax(fscores[:-1])
best_threshold = thresholds[ix]
print(f"最优阈值: {best_threshold:.4f}")
print(f"对应F1:{fscores[ix]:.4f} 召回:{recalls[ix]:.4f} 精确:{precisions[ix]:.4f}")
y_pred_new = (y_proba >= best_threshold).astype(int)
print("\n调整阈值后分类报告：")
print(classification_report(y_test, y_pred_new))

# Plots: metrics against the threshold (left) and the PR curve (right).
plt.figure(figsize=(12, 5))

# Left panel: precision, recall and F1 as functions of the decision threshold.
plt.subplot(1, 2, 1)
# '--' is the dashed line style; the pasted source carried an en dash here,
# which matplotlib does not accept as a format string.
plt.plot(thresholds, precisions[:-1], '--', label='Precision')
plt.plot(thresholds, recalls[:-1], ':', label='Recall')
plt.plot(thresholds, fscores[:-1], linewidth=2, label='F1')
# Red marker at the selected threshold.
plt.scatter(best_threshold, fscores[ix], c='red', s=100)
plt.xlabel("Threshold")
plt.grid(True)
plt.legend()

# Right panel: precision-recall curve with the selected operating point.
plt.subplot(1, 2, 2)
plt.plot(recalls, precisions)
plt.scatter(recalls[ix], precisions[ix], c='red', s=100)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)
plt.tight_layout()
plt.show()

print("全部执行完毕！")

查看全文

http://www.cnnetsun.cn/news/3172556.html