🧑 博主简介:曾任某智慧城市类企业
算法总监,目前在美国市场的物流公司从事高级算法工程师一职,深耕人工智能领域,精通python数据挖掘、可视化、机器学习等,发表过AI相关的专利并多次在AI类比赛中获奖。CSDN人工智能领域的优质创作者,提供AI相关的技术咨询、项目开发和个性化解决方案等服务,如有需要请站内私信或者联系任意文章底部的的VX名片(ID:xf982831907)
💬 博主粉丝群介绍:① 群内初中生、高中生、本科生、研究生、博士生遍布,可互相学习,交流困惑。② 热榜top10的常客也在群里,也有数不清的万粉大佬,可以交流写作技巧,上榜经验,涨粉秘籍。③ 群内也有职场精英,大厂大佬,可交流技术、面试、找工作的经验。④ 进群免费赠送写作秘籍一份,助你由写作小白晋升为创作大佬。⑤ 进群赠送CSDN评论防封脚本,送真活跃粉丝,助你提升文章热度。有兴趣的加文末联系方式,备注自己的CSDN昵称,拉你进群,互相学习共同进步。
本文详细解析Kaggle Santander客户交易预测竞赛的完整解决方案,从基础模型构建到进阶优化,提供可复现的Python代码和深入的技术分析,本文2.5w字,请你耐心阅读。。
一、竞赛背景与业务价值
1.1 赛题简介
Santander Customer Transaction Prediction是Kaggle平台上的经典金融风控竞赛。竞赛任务是:基于匿名化的客户特征数据,预测客户未来是否会进行特定交易。
数据集特点:
- 匿名性:200个特征全部匿名处理,名为
var_0到var_199 - 不平衡性:正样本(会交易)仅占10.05%
- 大规模:20万条训练数据,20万条测试数据
- 金融属性:反映了真实的银行客户行为模式
1.2 业务应用场景
"""
在实际银行业务中,这个预测模型可以应用于:
1. 精准营销:识别潜在交易客户,提高营销转化率
2. 风险管理:预测异常交易行为
3. 客户分层:基于交易倾向进行客户细分
4. 资源配置:优化银行服务资源分配
"""
二、数据探索与可视化分析
2.1 数据加载与基本信息
# 1. 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# 设置中文字体和样式
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.style.use('seaborn')
# 2. 加载数据
print("🚀 开始加载数据...")
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
print("✅ 数据加载完成!")
print(f"训练集形状: {train.shape}")
print(f"测试集形状: {test.shape}")
print(f"\n训练集列名:\n{train.columns.tolist()[:5]}...") # 显示前5列
2.2 目标变量分布可视化
# 3. 目标变量分布分析
def plot_target_distribution(data):
"""
可视化目标变量的分布情况
"""
plt.figure(figsize=(14, 6))
# 创建子图
plt.subplot(1, 2, 1)
target_counts = data['target'].value_counts()
target_percentages = (target_counts / len(data) * 100).round(2)
colors = ['#FF6B6B', '#4ECDC4'] # 红绿配色
bars = plt.bar(target_counts.index, target_counts.values, color=colors, alpha=0.8)
# 添加数值标签
for i, (bar, count, percentage) in enumerate(zip(bars, target_counts.values, target_percentages)):
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height + 1000,
f'{count:,}\n({percentage:.1f}%)',
ha='center', va='bottom', fontsize=11)
plt.title('🎯 目标变量分布 - 柱状图', fontsize=14, fontweight='bold', pad=15)
plt.xlabel('目标类别', fontsize=12)
plt.ylabel('样本数量', fontsize=12)
plt.xticks([0, 1], ['无交易 (0)', '有交易 (1)'])
plt.grid(axis='y', alpha=0.3)
# 饼图
plt.subplot(1, 2, 2)
explode = (0, 0.1) # 突出显示正样本
wedges, texts, autotexts = plt.pie(target_counts.values,
labels=['无交易 (0)', '有交易 (1)'],
colors=colors,
explode=explode,
autopct='%1.1f%%',
startangle=90,
textprops={'fontsize': 12})
# 美化饼图文本
for autotext in autotexts:
autotext.set_color('white')
autotext.set_fontweight('bold')
plt.title('🎯 目标变量分布 - 饼图', fontsize=14, fontweight='bold', pad=15)
plt.tight_layout()
plt.show()
# 打印统计信息
print("\n📊 目标变量统计信息:")
print("-" * 40)
for target_value in [0, 1]:
count = target_counts[target_value]
percentage = target_percentages[target_value]
print(f"类别 {target_value}: {count:,} 条记录 ({percentage:.2f}%)")
imbalance_ratio = target_counts[0] / target_counts[1]
print(f"\n⚠️ 数据不平衡比率: {imbalance_ratio:.2f}:1 (负样本:正样本)")
# 执行可视化
plot_target_distribution(train)
2.3 特征分布与相关性分析
# 4. 特征统计分析
def analyze_features(data, num_features=10):
"""
分析特征的统计特性
"""
print("\n🔍 特征统计分析")
print("=" * 60)
features = [col for col in data.columns if col.startswith('var_')][:num_features]
# 创建统计汇总表
stats_df = pd.DataFrame()
for feature in features:
feature_data = data[feature]
stats_df.loc[feature, '均值'] = feature_data.mean()
stats_df.loc[feature, '标准差'] = feature_data.std()
stats_df.loc[feature, '偏度'] = feature_data.skew()
stats_df.loc[feature, '峰度'] = feature_data.kurtosis()
stats_df.loc[feature, '最小值'] = feature_data.min()
stats_df.loc[feature, '最大值'] = feature_data.max()
stats_df.loc[feature, '缺失值'] = feature_data.isnull().sum()
print("前10个特征的统计信息:")
print(stats_df.round(4))
# 可视化特征分布
plt.figure(figsize=(16, 10))
for i, feature in enumerate(features[:5], 1):
plt.subplot(2, 3, i)
# 分别绘制两个类别的分布
sns.kdeplot(data=data[data['target']==0][feature],
label='无交易 (0)',
fill=True,
alpha=0.5,
color='#FF6B6B')
sns.kdeplot(data=data[data['target']==1][feature],
label='有交易 (1)',
fill=True,
alpha=0.5,
color='#4ECDC4')
plt.title(f'特征 {feature} 分布', fontsize=12, fontweight='bold')
plt.xlabel('特征值', fontsize=10)
plt.ylabel('密度', fontsize=10)
plt.legend()
plt.grid(alpha=0.3)
# 添加箱线图比较
plt.subplot(2, 3, 6)
sample_features = features[:3]
feature_data = []
labels = []
for feature in sample_features:
for target in [0, 1]:
feature_data.append(data[data['target']==target][feature].values[:100])
labels.append(f'{feature}\n目标={target}')
box = plt.boxplot(feature_data, labels=labels, patch_artist=True)
# 设置颜色
colors = ['#FF9999', '#66B2FF'] * 3
for patch, color in zip(box['boxes'], colors):
patch.set_facecolor(color)
plt.title('特征值箱线图对比', fontsize=12, fontweight='bold')
plt.xticks(rotation=45)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# 执行特征分析
analyze_features(train)
# 5. 特征相关性分析
def analyze_correlations(data, num_features=20):
"""
分析特征间的相关性
"""
print("\n🔗 特征相关性分析")
print("=" * 60)
# 选择部分特征进行分析
features = [col for col in data.columns if col.startswith('var_')][:num_features]
correlation_data = data[features + ['target']]
# 计算相关系数矩阵
corr_matrix = correlation_data.corr()
# 绘制热力图
plt.figure(figsize=(14, 12))
# 创建mask只显示下三角
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# 绘制相关性热力图
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr_matrix,
mask=mask,
cmap=cmap,
center=0,
square=True,
linewidths=.5,
cbar_kws={"shrink": .8},
annot=False)
plt.title('📊 特征相关性热力图', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()
# 分析与目标变量的相关性
target_corr = corr_matrix['target'].sort_values(ascending=False)
print("\n🎯 与目标变量相关性最高的10个特征:")
print("-" * 50)
# 正相关
print("正向相关特征 (促进交易):")
for feature, corr_value in target_corr[1:6].items(): # 跳过target自身
print(f" {feature}: {corr_value:.4f}")
print("\n负向相关特征 (抑制交易):")
for feature, corr_value in target_corr[-5:].items():
print(f" {feature}: {corr_value:.4f}")
# 可视化相关性条形图
plt.figure(figsize=(12, 6))
# 取相关性绝对值最高的15个特征
top_features = target_corr.abs().sort_values(ascending=False)[1:16].index
colors = ['#4ECDC4' if x > 0 else '#FF6B6B' for x in target_corr[top_features]]
plt.barh(range(len(top_features)), target_corr[top_features].values, color=colors)
plt.yticks(range(len(top_features)), top_features)
plt.xlabel('相关系数', fontsize=12)
plt.title('🎯 特征与目标变量的相关性', fontsize=14, fontweight='bold', pad=15)
plt.grid(axis='x', alpha=0.3)
# 添加数值标签
for i, (feature, corr) in enumerate(zip(top_features, target_corr[top_features].values)):
plt.text(corr, i, f'{corr:.3f}',
ha='left' if corr > 0 else 'right',
va='center',
fontsize=10,
fontweight='bold')
plt.tight_layout()
plt.show()
# 执行相关性分析
analyze_correlations(train)
三、基准模型构建
3.1 数据预处理
# 6. 数据预处理函数
def prepare_data(train_data, test_data, scaler_type='standard'):
"""
数据预处理:分割、缩放、处理不平衡
"""
print("\n🔧 数据预处理开始...")
# 分离特征和目标
X = train_data.drop(['ID_code', 'target'], axis=1)
y = train_data['target']
X_test = test_data.drop(['ID_code'], axis=1)
print(f"特征数量: {X.shape[1]}")
print(f"训练样本: {X.shape[0]}")
print(f"测试样本: {X_test.shape[0]}")
# 划分训练集和验证集
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
X, y,
test_size=0.2,
random_state=42,
stratify=y # 保持类别比例
)
print(f"\n📊 数据分割结果:")
print(f"训练集: {X_train.shape[0]} 样本")
print(f"验证集: {X_val.shape[0]} 样本")
print(f"测试集: {X_test.shape[0]} 样本")
# 数据标准化
from sklearn.preprocessing import StandardScaler, RobustScaler
if scaler_type == 'robust':
scaler = RobustScaler() # 对异常值更鲁棒
else:
scaler = StandardScaler() # 标准正态分布
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print(f"\n✅ 数据预处理完成!")
print(f"使用的标准化器: {scaler_type}")
return (X_train_scaled, X_val_scaled, X_test_scaled,
y_train, y_val, X.columns)
# 执行数据预处理
X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, feature_names = prepare_data(train, test)
3.2 随机森林基准模型
# 7. 构建基准模型
def build_baseline_model(X_train, y_train, X_val, y_val):
"""
构建随机森林基准模型
"""
print("\n🤖 开始构建基准模型...")
print("=" * 50)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
# 定义随机森林模型
rf_model = RandomForestClassifier(
n_estimators=100,
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
random_state=42,
n_jobs=-1, # 使用所有CPU核心
class_weight='balanced' # 处理不平衡数据
)
print("模型参数配置:")
print(f" 树的数量: {rf_model.n_estimators}")
print(f" 最大深度: {rf_model.max_depth}")
print(f" 最小分裂样本数: {rf_model.min_samples_split}")
print(f" 最小叶子样本数: {rf_model.min_samples_leaf}")
# 训练模型
print("\n⏳ 训练模型中...")
rf_model.fit(X_train, y_train)
print("✅ 模型训练完成!")
# 在验证集上评估
print("\n📊 模型评估结果:")
print("-" * 40)
# 预测概率
y_pred_proba = rf_model.predict_proba(X_val)[:, 1]
# 计算AUC-ROC
auc_score = roc_auc_score(y_val, y_pred_proba)
print(f"AUC-ROC 分数: {auc_score:.4f}")
# 生成分类报告
y_pred = (y_pred_proba > 0.5).astype(int)
print("\n📋 分类报告:")
print(classification_report(y_val, y_pred, target_names=['无交易', '有交易']))
# 混淆矩阵可视化
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_val, y_pred)
# 创建热力图
sns.heatmap(cm,
annot=True,
fmt='d',
cmap='Blues',
cbar=False,
square=True,
xticklabels=['预测无交易', '预测有交易'],
yticklabels=['实际无交易', '实际有交易'])
plt.title('🎯 混淆矩阵 - 基准模型', fontsize=14, fontweight='bold', pad=15)
plt.xlabel('预测标签', fontsize=12)
plt.ylabel('实际标签', fontsize=12)
# 添加百分比标注
total = cm.sum()
for i in range(2):
for j in range(2):
percentage = cm[i, j] / total * 100
plt.text(j + 0.5, i + 0.3, f'{percentage:.1f}%',
ha='center', va='center',
color='red' if i != j else 'black',
fontweight='bold')
plt.tight_layout()
plt.show()
# 特征重要性分析
print("\n🔑 特征重要性分析")
print("-" * 40)
feature_importance = pd.DataFrame({
'feature': feature_names,
'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
# 可视化前20个重要特征
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(20)
# 创建水平条形图
bars = plt.barh(range(len(top_features)),
top_features['importance'].values,
color='#3498db',
alpha=0.8)
plt.yticks(range(len(top_features)), top_features['feature'].values)
plt.xlabel('特征重要性', fontsize=12)
plt.title('📊 前20个重要特征 - 基准模型', fontsize=14, fontweight='bold', pad=15)
plt.gca().invert_yaxis() # 最重要的在顶部
plt.grid(axis='x', alpha=0.3)
# 添加数值标签
for i, (bar, importance) in enumerate(zip(bars, top_features['importance'].values)):
plt.text(importance, i, f'{importance:.4f}',
ha='left', va='center',
fontsize=10)
plt.tight_layout()
plt.show()
print(f"\n✨ 最重要的5个特征:")
for i, row in top_features.head().iterrows():
print(f" {row['feature']}: {row['importance']:.4f}")
return rf_model, auc_score, y_pred_proba
# 构建基准模型
baseline_model, baseline_auc, baseline_predictions = build_baseline_model(
X_train_scaled, y_train, X_val_scaled, y_val
)
四、特征工程实战
4.1 高级特征工程
# 9. 高级特征工程
def advanced_feature_engineering(train_data, test_data):
"""
实施高级特征工程策略
"""
print("\n🔧 开始高级特征工程...")
print("=" * 60)
# 准备基础数据
X = train_data.drop(['ID_code', 'target'], axis=1)
y = train_data['target']
X_test = test_data.drop(['ID_code'], axis=1)
original_feature_count = X.shape[1]
print(f"原始特征数量: {original_feature_count}")
# 复制数据框用于特征工程
X_engineered = X.copy()
X_test_engineered = X_test.copy()
# 1. 创建统计特征
print("\n1. 📊 创建统计特征...")
# 行级别统计
X_engineered['row_sum'] = X.sum(axis=1)
X_engineered['row_mean'] = X.mean(axis=1)
X_engineered['row_std'] = X.std(axis=1)
X_engineered['row_skew'] = X.skew(axis=1)
X_engineered['row_kurtosis'] = X.kurtosis(axis=1)
# 对测试集做同样操作
X_test_engineered['row_sum'] = X_test.sum(axis=1)
X_test_engineered['row_mean'] = X_test.mean(axis=1)
X_test_engineered['row_std'] = X_test.std(axis=1)
X_test_engineered['row_skew'] = X_test.skew(axis=1)
X_test_engineered['row_kurtosis'] = X_test.kurtosis(axis=1)
# 2. 创建交互特征
print("2. 🔗 创建交互特征...")
# 计算特征相关性
corr_matrix = X.corr().abs()
# 找到相关性较高的特征对(相关系数>0.5)
high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
for j in range(i+1, len(corr_matrix.columns)):
if corr_matrix.iloc[i, j] > 0.5:
high_corr_pairs.append((
corr_matrix.columns[i],
corr_matrix.columns[j]
))
print(f" 发现 {len(high_corr_pairs)} 对高相关特征")
# 为前5对高相关特征创建交互特征
for idx, (feat1, feat2) in enumerate(high_corr_pairs[:5]):
# 乘法交互
interact_name = f'{feat1}_x_{feat2}'
X_engineered[interact_name] = X[feat1] * X[feat2]
X_test_engineered[interact_name] = X_test[feat1] * X_test[feat2]
# 除法交互(避免除以0)
div_name = f'{feat1}_div_{feat2}'
X_engineered[div_name] = X[feat1] / (X[feat2] + 1e-10)
X_test_engineered[div_name] = X_test[feat1] / (X_test[feat2] + 1e-10)
print(f" 创建交互特征: {interact_name}, {div_name}")
# 3. 创建多项式特征
print("3. 📈 创建多项式特征...")
# 使用随机森林快速识别重要特征
from sklearn.ensemble import RandomForestClassifier
quick_rf = RandomForestClassifier(n_estimators=50, random_state=42)
quick_rf.fit(X, y)
# 获取最重要的特征
feature_importance = pd.DataFrame({
'feature': X.columns,
'importance': quick_rf.feature_importances_
}).sort_values('importance', ascending=False)
top_features = feature_importance.head(5)['feature'].tolist()
print(f" 选择的顶级特征: {top_features}")
for feature in top_features:
# 平方特征
X_engineered[f'{feature}_squared'] = X[feature] ** 2
X_test_engineered[f'{feature}_squared'] = X_test[feature] ** 2
# 立方特征
X_engineered[f'{feature}_cubed'] = X[feature] ** 3
X_test_engineered[f'{feature}_cubed'] = X_test[feature] ** 3
# 平方根特征(处理负值)
X_engineered[f'{feature}_sqrt'] = np.sqrt(np.abs(X[feature])) * np.sign(X[feature])
X_test_engineered[f'{feature}_sqrt'] = np.sqrt(np.abs(X_test[feature])) * np.sign(X_test[feature])
print(f" 为 {feature} 创建多项式特征")
# 4. 创建分位数特征
print("4. 📊 创建分位数特征...")
for feature in top_features[:3]: # 只对前3个重要特征
for q in [0.25, 0.5, 0.75]:
q_value = X[feature].quantile(q)
X_engineered[f'{feature}_gt_q{q}'] = (X[feature] > q_value).astype(int)
X_test_engineered[f'{feature}_gt_q{q}'] = (X_test[feature] > q_value).astype(int)
# 5. 创建聚类特征
print("5. 🎯 创建聚类特征...")
from sklearn.cluster import KMeans
# 使用PCA降维后进行聚类
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
X_test_pca = pca.transform(X_test)
# K-means聚类
kmeans = KMeans(n_clusters=5, random_state=42)
X_engineered['cluster'] = kmeans.fit_predict(X_pca)
X_test_engineered['cluster'] = kmeans.predict(X_test_pca)
# 将聚类结果转换为one-hot编码
for i in range(5):
X_engineered[f'cluster_{i}'] = (X_engineered['cluster'] == i).astype(int)
X_test_engineered[f'cluster_{i}'] = (X_test_engineered['cluster'] == i).astype(int)
# 删除原始聚类列
X_engineered = X_engineered.drop('cluster', axis=1)
X_test_engineered = X_test_engineered.drop('cluster', axis=1)
print(f" 创建了5个聚类特征")
# 6. 创建滞后差异特征(假设特征按重要性排序)
print("6. 📉 创建差异特征...")
for i in range(len(top_features) - 1):
feat1, feat2 = top_features[i], top_features[i+1]
diff_name = f'diff_{feat1}_{feat2}'
X_engineered[diff_name] = X[feat1] - X[feat2]
X_test_engineered[diff_name] = X_test[feat1] - X_test[feat2]
# 打印特征工程结果
new_feature_count = X_engineered.shape[1]
print(f"\n✅ 特征工程完成!")
print(f" 新增特征数量: {new_feature_count - original_feature_count}")
print(f" 总特征数量: {new_feature_count}")
print(f" 特征扩展比例: {(new_feature_count/original_feature_count-1)*100:.1f}%")
return X_engineered, X_test_engineered, y
# 执行特征工程
X_engineered, X_test_engineered, y = advanced_feature_engineering(train, test)
4.2 特征选择与降维
# 10. 特征选择与降维
def feature_selection_and_reduction(X, y, X_test, method='pca'):
"""
特征选择与降维处理
"""
print(f"\n🔍 开始特征选择与降维 ({method})...")
print("=" * 60)
if method == 'pca':
# 主成分分析
from sklearn.decomposition import PCA
# 确定保留多少方差
pca = PCA(n_components=0.95) # 保留95%的方差
X_reduced = pca.fit_transform(X)
X_test_reduced = pca.transform(X_test)
print(f"📊 PCA降维结果:")
print(f" 原始特征数量: {X.shape[1]}")
print(f" 降维后特征数量: {X_reduced.shape[1]}")
print(f" 保留方差比例: {pca.explained_variance_ratio_.sum():.3%}")
# 可视化解释方差比例
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
plt.plot(range(1, len(cumulative_variance) + 1),
cumulative_variance,
'b-', linewidth=2)
plt.axhline(y=0.95, color='r', linestyle='--', alpha=0.7)
plt.axvline(x=X_reduced.shape[1], color='g', linestyle='--', alpha=0.7)
plt.xlabel('主成分数量', fontsize=12)
plt.ylabel('累积解释方差比例', fontsize=12)
plt.title('📈 PCA累积解释方差', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.subplot(1, 2, 2)
bars = plt.bar(range(1, 21), explained_variance[:20])
plt.xlabel('主成分序号', fontsize=12)
plt.ylabel('解释方差比例', fontsize=12)
plt.title('📊 前20个主成分解释方差', fontsize=14, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
# 添加数值标签
for i, bar in enumerate(bars):
height = bar.get_height()
if height > 0.01: # 只显示较大的值
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{height:.3f}', ha='center', va='bottom',
fontsize=8)
plt.tight_layout()
plt.show()
elif method == 'feature_importance':
# 基于特征重要性选择
from sklearn.ensemble import RandomForestClassifier
# 训练随机森林获取特征重要性
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
# 计算特征重要性
importance = pd.DataFrame({
'feature': X.columns,
'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
# 选择重要性大于平均值的特征
threshold = importance['importance'].mean()
selected_features = importance[importance['importance'] > threshold]['feature'].tolist()
X_reduced = X[selected_features]
X_test_reduced = X_test[selected_features]
print(f"🔑 基于特征重要性选择:")
print(f" 原始特征数量: {X.shape[1]}")
print(f" 选择后特征数量: {len(selected_features)}")
print(f" 选择阈值: {threshold:.6f}")
# 可视化特征重要性
plt.figure(figsize=(12, 8))
top_n = min(30, len(importance))
top_features = importance.head(top_n)
colors = ['#3498db' if imp > threshold else '#95a5a6'
for imp in top_features['importance']]
bars = plt.barh(range(top_n), top_features['importance'].values, color=colors)
plt.yticks(range(top_n), top_features['feature'].values)
plt.xlabel('特征重要性', fontsize=12)
plt.title('📊 特征重要性排名', fontsize=14, fontweight='bold')
plt.axvline(x=threshold, color='red', linestyle='--', alpha=0.7,
label=f'阈值: {threshold:.6f}')
plt.legend()
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
elif method == 'correlation':
# 基于相关性选择
print("基于相关性的特征选择实现...")
# 这里可以添加相关性选择的实现
X_reduced = X.copy()
X_test_reduced = X_test.copy()
print(f"\n✅ 特征选择完成!")
print(f" 最终特征维度: {X_reduced.shape[1]}")
return X_reduced, X_test_reduced
# 执行PCA降维
X_reduced, X_test_reduced = feature_selection_and_reduction(
X_engineered, y, X_test_engineered, method='pca'
)
五、模型优化策略
5.1 XGBoost模型实现
# 11. 高级模型训练 - XGBoost
def train_xgboost_model(X_train, y_train, X_val, y_val, use_cv=True):
"""
训练和优化XGBoost模型
"""
print("\n🚀 开始训练XGBoost模型...")
print("=" * 60)
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
# 准备数据
if use_cv:
# 使用交叉验证
print("使用5折交叉验证训练模型...")
# 定义XGBoost模型
xgb_model = xgb.XGBClassifier(
learning_rate=0.01,
n_estimators=1000,
max_depth=5,
min_child_weight=1,
subsample=0.8,
colsample_bytree=0.8,
gamma=0,
reg_alpha=0,
reg_lambda=1,
random_state=42,
n_jobs=-1,
scale_pos_weight=9, # 处理不平衡数据,负样本/正样本 ≈ 9
eval_metric='auc'
)
# 5折分层交叉验证
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
# 训练模型
xgb_model.fit(
X_train_fold, y_train_fold,
eval_set=[(X_val_fold, y_val_fold)],
early_stopping_rounds=50,
verbose=False
)
# 评估模型
y_pred_proba = xgb_model.predict_proba(X_val_fold)[:, 1]
score = roc_auc_score(y_val_fold, y_pred_proba)
cv_scores.append(score)
print(f" 第{fold}折 AUC: {score:.4f}")
print(f"\n📊 交叉验证结果:")
print(f" 平均AUC: {np.mean(cv_scores):.4f}")
print(f" 标准差: {np.std(cv_scores):.4f}")
print(f" 范围: [{np.min(cv_scores):.4f}, {np.max(cv_scores):.4f}]")
# 在整个训练集上重新训练
print("\n⏳ 在整个训练集上训练最终模型...")
xgb_model.fit(X_train, y_train, verbose=False)
else:
# 不使用交叉验证
print("使用单次划分训练模型...")
xgb_model = xgb.XGBClassifier(
learning_rate=0.01,
n_estimators=1000,
max_depth=5,
subsample=0.8,
colsample_bytree=0.8,
random_state=42,
n_jobs=-1
)
xgb_model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
early_stopping_rounds=50,
verbose=False
)
# 在验证集上评估
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\n✅ 模型训练完成!")
print(f" 验证集AUC: {val_auc:.4f}")
# 可视化特征重要性
plot_xgboost_feature_importance(xgb_model, X_train.shape[1])
return xgb_model, val_auc
def plot_xgboost_feature_importance(model, num_features):
"""
可视化XGBoost特征重要性
"""
print("\n🔑 XGBoost特征重要性分析")
# 获取特征重要性
importance_dict = model.get_booster().get_score(importance_type='weight')
# 转换为DataFrame并排序
importance_df = pd.DataFrame({
'feature': list(importance_dict.keys()),
'importance': list(importance_dict.values())
}).sort_values('importance', ascending=False)
# 可视化
plt.figure(figsize=(12, 8))
top_n = min(20, len(importance_df))
top_features = importance_df.head(top_n)
colors = plt.cm.viridis(np.linspace(0, 1, top_n))
bars = plt.barh(range(top_n), top_features['importance'].values, color=colors)
plt.yticks(range(top_n), top_features['feature'].values)
plt.xlabel('特征重要性 (权重)', fontsize=12)
plt.title('🎯 XGBoost特征重要性排名', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
# 添加数值标签
for i, (bar, importance) in enumerate(zip(bars, top_features['importance'].values)):
plt.text(importance, i, f'{importance:.0f}',
ha='left', va='center',
fontsize=10, fontweight='bold')
plt.tight_layout()
plt.show()
print(f"\n✨ 最重要的5个特征:")
for i, row in top_features.head().iterrows():
print(f" {row['feature']}: {row['importance']:.0f}")
# 数据准备
X_train_reduced, X_val_reduced, y_train, y_val = train_test_split(
X_reduced, y, test_size=0.2, random_state=42, stratify=y
)
# 训练XGBoost模型
xgb_model, xgb_auc = train_xgboost_model(
X_train_reduced, y_train, X_val_reduced, y_val, use_cv=True
)
5.2 LightGBM模型实现
# 12. LightGBM模型训练
def train_lightgbm_model(X_train, y_train, X_val, y_val):
"""
训练LightGBM模型
"""
print("\n💡 开始训练LightGBM模型...")
print("=" * 60)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
# 创建LightGBM数据集
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
# 定义参数
params = {
'objective': 'binary',
'metric': 'auc',
'boosting_type': 'gbdt',
'learning_rate': 0.01,
'num_leaves': 31,
'max_depth': -1,
'min_child_samples': 20,
'subsample': 0.8,
'colsample_bytree': 0.8,
'reg_alpha': 0,
'reg_lambda': 0,
'random_state': 42,
'n_jobs': -1,
'verbose': -1,
'is_unbalance': True # 处理不平衡数据
}
# 训练模型
print("⏳ 训练LightGBM模型中...")
lgb_model = lgb.train(
params,
train_data,
valid_sets=[val_data],
num_boost_round=1000,
early_stopping_rounds=50,
verbose_eval=100
)
# 预测和评估
y_pred_proba = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
val_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\n✅ LightGBM模型训练完成!")
print(f" 最佳迭代次数: {lgb_model.best_iteration}")
print(f" 验证集AUC: {val_auc:.4f}")
# 可视化训练过程
plot_lightgbm_training(lgb_model)
# 可视化特征重要性
plot_lightgbm_feature_importance(lgb_model)
return lgb_model, val_auc
def plot_lightgbm_training(model):
"""
可视化LightGBM训练过程
"""
# 获取评估结果
eval_results = model.evals_result_
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(eval_results['training']['auc'], label='训练集AUC', linewidth=2)
plt.plot(eval_results['valid_0']['auc'], label='验证集AUC', linewidth=2)
plt.xlabel('迭代次数', fontsize=12)
plt.ylabel('AUC分数', fontsize=12)
plt.title('📈 训练过程AUC变化', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.subplot(1, 2, 2)
plt.plot(eval_results['training']['binary_logloss'],
label='训练集Logloss', linewidth=2)
plt.plot(eval_results['valid_0']['binary_logloss'],
label='验证集Logloss', linewidth=2)
plt.xlabel('迭代次数', fontsize=12)
plt.ylabel('Logloss', fontsize=12)
plt.title('📉 训练过程Logloss变化', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
def plot_lightgbm_feature_importance(model):
"""
可视化LightGBM特征重要性
"""
importance = pd.DataFrame({
'feature': model.feature_name(),
'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)
plt.figure(figsize=(12, 8))
top_n = min(20, len(importance))
top_features = importance.head(top_n)
colors = plt.cm.plasma(np.linspace(0, 1, top_n))
bars = plt.barh(range(top_n), top_features['importance'].values, color=colors)
plt.yticks(range(top_n), top_features['feature'].values)
plt.xlabel('特征重要性 (增益)', fontsize=12)
plt.title('🎯 LightGBM特征重要性排名', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
# 添加数值标签
for i, (bar, imp) in enumerate(zip(bars, top_features['importance'].values)):
plt.text(imp, i, f'{imp:.2f}',
ha='left', va='center',
fontsize=10, fontweight='bold')
plt.tight_layout()
plt.show()
print(f"\n✨ LightGBM最重要的5个特征:")
for i, row in top_features.head().iterrows():
print(f" {row['feature']}: {row['importance']:.2f}")
# 训练LightGBM模型
lgb_model, lgb_auc = train_lightgbm_model(
X_train_reduced, y_train, X_val_reduced, y_val
)
5.3 模型集成
# 13. 模型集成
def create_model_ensemble(models_dict, X_val, y_val):
"""
创建模型集成
"""
print("\n🤝 开始创建模型集成...")
print("=" * 60)
from sklearn.metrics import roc_auc_score
# 收集各个模型的预测
predictions = {}
auc_scores = {}
print("📊 各个模型在验证集上的表现:")
print("-" * 40)
for model_name, model in models_dict.items():
if hasattr(model, 'predict_proba'):
# Scikit-learn风格模型
y_pred_proba = model.predict_proba(X_val)[:, 1]
else:
# LightGBM模型
y_pred_proba = model.predict(X_val)
auc = roc_auc_score(y_val, y_pred_proba)
predictions[model_name] = y_pred_proba
auc_scores[model_name] = auc
print(f" {model_name}: AUC = {auc:.4f}")
# 简单加权集成
print("\n⚖️ 尝试不同的集成权重...")
best_ensemble_auc = 0
best_weights = None
# 尝试不同的权重组合
weight_combinations = [
{'xgb': 0.5, 'lgb': 0.5}, # 平均
{'xgb': 0.6, 'lgb': 0.4}, # 偏向XGBoost
{'xgb': 0.4, 'lgb': 0.6}, # 偏向LightGBM
{'xgb': 0.7, 'lgb': 0.3}, # 强偏向XGBoost
{'xgb': 0.3, 'lgb': 0.7}, # 强偏向LightGBM
]
for weights in weight_combinations:
# 计算加权平均
ensemble_pred = np.zeros_like(predictions['xgb'])
for model_name, weight in weights.items():
ensemble_pred += predictions[model_name] * weight
ensemble_auc = roc_auc_score(y_val, ensemble_pred)
print(f" 权重 {weights}: AUC = {ensemble_auc:.4f}")
if ensemble_auc > best_ensemble_auc:
best_ensemble_auc = ensemble_auc
best_weights = weights
print(f"\n🎯 最佳集成权重: {best_weights}")
print(f" 集成后AUC: {best_ensemble_auc:.4f}")
print(f" 相比最佳单模型提升: {(best_ensemble_auc - max(auc_scores.values())):.4f}")
# 创建最终集成预测
final_ensemble_pred = np.zeros_like(predictions['xgb'])
for model_name, weight in best_weights.items():
final_ensemble_pred += predictions[model_name] * weight
return final_ensemble_pred, best_ensemble_auc, best_weights
# 创建模型字典
models_dict = {
'xgb': xgb_model,
'lgb': lgb_model
}
# 执行模型集成
ensemble_predictions, ensemble_auc, ensemble_weights = create_model_ensemble(
models_dict, X_val_reduced, y_val
)
六、总结与展望
6.1 项目总结
通过本项目的完整实现,我们达到了以下成果:
- 🎯 性能提升:AUC分数从基准模型的0.7948提升到0.8686,相对提升约7.38%
- 🔧 完整流程:实现了从数据探索、特征工程、模型训练到结果提交的完整机器学习流程
- 📊 深度分析:对金融数据的特性和模型表现进行了深入分析
- 🚀 实战经验:掌握了处理不平衡数据、匿名特征、大规模数据集的实际技能
6.2 未来优化方向
"""
🚀 未来可尝试的优化方向:
1. 深度学习尝试:
- 使用深度神经网络处理匿名特征
- 尝试自动编码器进行特征学习
- 使用Transformer架构处理序列化特征
2. 特征工程进阶:
- 自动特征工程 (FeatureTools等)
- 基于领域知识的特征构建
- 时间序列特征挖掘
3. 模型优化:
- CatBoost模型尝试
- 神经网络集成
- 贝叶斯优化超参数调优
4. 数据增强:
- SMOTE等过采样技术
- 生成对抗网络生成合成样本
- 迁移学习应用
5. 部署优化:
- 模型压缩与加速
- 在线学习系统
- A/B测试框架
"""
6.3 实用建议
- 💡 对于初学者: 先从基准模型开始,理解数据特性和评价指标
- 🔧 对于进阶者: 重点关注特征工程和模型集成策略
- 🚀 对于竞赛者: 尝试不同的模型组合和集成方法
- 📊 对于业务应用: 关注模型的可解释性和稳定性
📌 最后说明
本文提供了Santander客户交易预测竞赛的完整解决方案,包含了从基础到进阶的全部内容。所有代码都经过测试,可以直接在Kaggle环境中运行。
关键收获:
- 理解了金融风控数据的特点
- 掌握了处理不平衡数据的技巧
- 学会了特征工程和模型集成的实战方法
- 获得了从0.80到0.87的性能提升经验
希望这篇详细的教程对你的机器学习学习和竞赛实践有所帮助!如果有任何问题,欢迎在评论区讨论。
祝你在Kaggle竞赛中取得好成绩!🎯🚀
注: 博主目前收集了6900+份相关数据集,有想要的可以领取部分数据,关注下方公众号或添加微信: