Python数据分析与机器学习实战:从数据清洗到预测建模
一、环境准备与数据加载
# --- Imports for the whole tutorial ---
# NOTE(review): the %matplotlib magic below means this is notebook content;
# saved as a plain .py file that line is a syntax error.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# Global plotting style for every figure below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Render figures inline (IPython/Jupyter magic — notebook only).
%matplotlib inline
# Example dataset: scikit-learn's built-in breast-cancer data.
from sklearn.datasets import load_breast_cancer
# Feature matrix as a DataFrame with the class label appended as 'target'
# (per sklearn's documentation, 0 = malignant, 1 = benign).
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
# Quick sanity check of the loaded frame.
print(f"数据集形状: {df.shape}")
print(f"\n前5行数据:")
print(df.head())
print(f"\n数据统计描述:")
print(df.describe())
二、数据探索与预处理
# 1. 检查数据基本信息
def explore_data(df):
    """Print a structural overview of *df*: shape, column names, dtypes,
    missing-value counts, and the distribution of the 'target' column."""
    print("=== 数据基本信息 ===")
    print(f"数据集形状: {df.shape}")
    # Each section: a header line followed by the corresponding summary object.
    sections = [
        ("\n数据列名:", df.columns.tolist()),
        ("\n数据类型:", df.dtypes),
        ("\n缺失值统计:", df.isnull().sum()),
        ("\n目标变量分布:", df['target'].value_counts()),
    ]
    for header, payload in sections:
        print(header)
        print(payload)
# Run the structural overview defined above.
explore_data(df)

# 2. Visual exploration: a 2x3 grid of diagnostic plots.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Target distribution as a pie chart.
# BUG FIX: value_counts() orders classes by frequency, and benign (1) is the
# majority class in this dataset, so hard-coding labels=['恶性', '良性']
# mislabeled the wedges. Derive the labels from the actual index order.
target_counts = df['target'].value_counts()
class_names = {0: '恶性', 1: '良性'}
axes[0, 0].pie(target_counts,
               labels=[class_names[c] for c in target_counts.index],
               autopct='%1.1f%%',
               colors=['#ff9999', '#66b3ff'])
axes[0, 0].set_title('目标变量分布')
# Histogram of one representative feature.
axes[0, 1].hist(df['mean radius'], bins=30, alpha=0.7, color='skyblue')
axes[0, 1].set_xlabel('平均半径')
axes[0, 1].set_ylabel('频数')
axes[0, 1].set_title('特征分布示例')
# Correlation heatmap, restricted to the first 10 features to stay readable.
corr_matrix = df.corr()
sns.heatmap(corr_matrix.iloc[:10, :10], ax=axes[0, 2], cmap='coolwarm', annot=True, fmt='.2f')
axes[0, 2].set_title('特征相关性热图(部分)')
# Box plot of 'mean radius' per class.
# NOTE(review): assumes seaborn orders the numeric categories ascending
# (0 then 1) so ['恶性', '良性'] lines up with the class codes — confirm.
sns.boxplot(x='target', y='mean radius', data=df, ax=axes[1, 0])
axes[1, 0].set_title('不同类别的特征分布')
axes[1, 0].set_xticklabels(['恶性', '良性'])
# Scatter plot of two features, colored by class label.
axes[1, 1].scatter(df['mean radius'], df['mean texture'],
                   c=df['target'], alpha=0.6, cmap='viridis')
axes[1, 1].set_xlabel('平均半径')
axes[1, 1].set_ylabel('平均纹理')
axes[1, 1].set_title('特征散点图')
# Feature importances from a throwaway random forest fitted on the full,
# unsplit data — exploration only, not a model evaluation.
from sklearn.ensemble import RandomForestClassifier
X_temp = df.drop('target', axis=1)
y_temp = df['target']
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_temp, y_temp)
# Ascending sort + tail(10) keeps the 10 largest importances with the
# largest last, so barh draws the most important feature at the top.
feature_importance = pd.DataFrame({
    'feature': X_temp.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=True).tail(10)
axes[1, 2].barh(feature_importance['feature'], feature_importance['importance'])
axes[1, 2].set_xlabel('重要性')
axes[1, 2].set_title('Top 10 重要特征')
plt.tight_layout()
plt.show()
# 3. 数据预处理
def preprocess_data(df):
    """Split *df* into stratified train/test sets and standardize features.

    Returns (X_train_scaled, X_test_scaled, y_train, y_test, scaler), where
    the scaled arrays are numpy arrays and *scaler* is the fitted
    StandardScaler.
    """
    # Separate the feature columns from the class label.
    features = df.drop('target', axis=1)
    labels = df['target']
    # Hold out 20%; stratify so both splits keep the class ratio.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )
    # Fit the scaler on the training split only, then apply it to both —
    # avoids leaking test-set statistics into training.
    std = StandardScaler()
    X_tr_scaled = std.fit_transform(X_tr)
    X_te_scaled = std.transform(X_te)
    print(f"训练集大小: {X_tr_scaled.shape}")
    print(f"测试集大小: {X_te_scaled.shape}")
    return X_tr_scaled, X_te_scaled, y_tr, y_te, std
# Build the standardized train/test split used by every model below.
X_train, X_test, y_train, y_test, scaler = preprocess_data(df)
三、机器学习模型构建与评估
# 1. 定义模型训练和评估函数
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit *model* on the training split, print test-set metrics, and plot
    the confusion matrix plus (when the model supports predict_proba) the
    ROC curve. Returns (fitted_model, y_pred)."""
    print(f"\n{'='*50}")
    print(f"训练 {model_name} 模型")
    print(f"{'='*50}")
    # Fit on the training data (already standardized by preprocess_data).
    model.fit(X_train, y_train)
    # Hard predictions plus, where available, P(class=1) for the ROC curve.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    # Per-class precision/recall/F1. NOTE(review): target_names assumes
    # 0 = '恶性' (malignant), 1 = '良性' (benign) — sklearn's encoding.
    print("分类报告:")
    print(classification_report(y_test, y_pred, target_names=['恶性', '良性']))
    cm = confusion_matrix(y_test, y_pred)
    # One figure per model: confusion matrix (left) + ROC curve (right).
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_xlabel('预测标签')
    axes[0].set_ylabel('真实标签')
    axes[0].set_title(f'{model_name} - 混淆矩阵')
    # ROC curve only when probability estimates exist; the right axis
    # simply stays empty otherwise.
    if y_pred_proba is not None:
        from sklearn.metrics import roc_curve, auc
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        axes[1].plot(fpr, tpr, color='darkorange', lw=2,
                     label=f'ROC曲线 (AUC = {roc_auc:.2f})')
        # Diagonal = performance of a random classifier.
        axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axes[1].set_xlim([0.0, 1.0])
        axes[1].set_ylim([0.0, 1.05])
        axes[1].set_xlabel('假正率')
        axes[1].set_ylabel('真正率')
        axes[1].set_title(f'{model_name} - ROC曲线')
        axes[1].legend(loc="lower right")
    plt.tight_layout()
    plt.show()
    return model, y_pred
# 2. Train several classifier families for comparison.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Candidate models keyed by display name. probability=True makes SVC expose
# predict_proba, so the ROC branch in train_and_evaluate_model runs for it.
models = {
    "逻辑回归": LogisticRegression(random_state=42, max_iter=1000),
    "支持向量机": SVC(probability=True, random_state=42),
    "决策树": DecisionTreeClassifier(random_state=42),
    "随机森林": RandomForestClassifier(n_estimators=100, random_state=42),
    "梯度提升": GradientBoostingClassifier(n_estimators=100, random_state=42)
}
# Fit/evaluate every model; keep fitted estimators and their predictions.
results = {}
for name, model in models.items():
    trained_model, predictions = train_and_evaluate_model(
        model, X_train, X_test, y_train, y_test, name
    )
    results[name] = {
        'model': trained_model,
        'predictions': predictions
    }
四、模型优化与交叉验证
# 1. 使用交叉验证评估模型
from sklearn.model_selection import cross_val_score, GridSearchCV
def cross_validation_evaluation(model, X, y, model_name):
    """Run 5-fold cross-validated accuracy for *model* on (X, y), print a
    summary banner and the per-fold scores, and return the score array."""
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"{model_name} 交叉验证")
    print(f"{banner}")
    # 5-fold CV scored on plain accuracy.
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_acc = scores.mean()
    spread = scores.std() * 2  # +/- two standard deviations across folds
    print(f"交叉验证准确率: {mean_acc:.4f} (+/- {spread:.4f})")
    print(f"各折分数: {scores}")
    return scores
# Baseline cross-validation for an untuned random forest.
rf_model = RandomForestClassifier(random_state=42)
cv_scores = cross_validation_evaluation(rf_model, X_train, y_train, "随机森林")
# 2. Hyperparameter tuning via exhaustive grid search.
print("\n" + "="*50)
print("超参数调优示例:随机森林")
print("="*50)
# 3*4*3*3 = 108 parameter combinations, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# n_jobs=-1 uses every available core; verbose=1 prints search progress.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
print("开始网格搜索...")
grid_search.fit(X_train, y_train)
print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")
# best_estimator_ is refit on the full training split with the best params.
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
print(f"\n测试集准确率: {np.mean(y_pred_best == y_test):.4f}")
# 3. Feature-importance ranking from the tuned model.
feature_importance_df = pd.DataFrame({
    'feature': df.drop('target', axis=1).columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)
plt.figure(figsize=(10, 8))
bars = plt.barh(feature_importance_df['feature'][:15],
                feature_importance_df['importance'][:15],
                color='skyblue')
plt.xlabel('特征重要性')
plt.title('Top 15 重要特征(基于随机森林)')
plt.gca().invert_yaxis()  # put the most important feature at the top
# Annotate each bar with its numeric importance value.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height()/2,
             f'{width:.3f}', ha='left', va='center')
plt.tight_layout()
plt.show()
五、模型部署与实战应用
# 1. End-to-end pipeline: scaling -> PCA -> classifier.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# NOTE(review): this pipeline is fit below on X_train, which preprocess_data
# already standardized, so features are scaled twice. Consistent at predict
# time as long as inputs are pre-scaled the same way, but for genuinely raw
# inputs the pipeline should be fit on unscaled data — confirm intent.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # keep enough components for 95% variance
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    ))
])
# Fit the whole chain on the training split.
pipeline.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
pipeline_score = pipeline.score(X_test, y_test)
print(f"管道模型测试准确率: {pipeline_score:.4f}")
# 2. Persist the pipeline with the metadata needed at inference time.
import joblib
import json  # NOTE(review): unused in this section — possibly left over
# Bundle the model with feature order and class-name labels so a loader
# can reconstruct inputs and interpret predictions.
model_data = {
    'model': pipeline,
    'features': df.drop('target', axis=1).columns.tolist(),
    'target_names': data.target_names.tolist()
}
joblib.dump(model_data, 'breast_cancer_model.pkl')
print("模型已保存为 'breast_cancer_model.pkl'")
# 3. 创建预测函数
def predict_new_sample(sample_data, model_path='breast_cancer_model.pkl'):
    """Load the persisted pipeline bundle and classify one sample.

    *sample_data* may be a dict mapping feature name -> value, a plain list
    of feature values, or a one-row DataFrame. Returns a dict with the
    predicted class index, its label, and the class probabilities.
    """
    # Restore the pipeline plus the metadata saved alongside it.
    bundle = joblib.load(model_path)
    model = bundle['model']
    feature_names = bundle['features']
    target_names = bundle['target_names']

    # Normalize the input into a one-row DataFrame.
    if isinstance(sample_data, dict):
        frame = pd.DataFrame([sample_data])
    elif isinstance(sample_data, list):
        frame = pd.DataFrame([sample_data], columns=feature_names)
    else:
        frame = sample_data
    # Enforce the training-time column order.
    frame = frame[feature_names]

    class_pred = model.predict(frame)
    proba = model.predict_proba(frame)
    winner = class_pred[0]
    return {
        'prediction': int(winner),
        'prediction_label': target_names[winner],
        'probability': float(proba[0][winner]),
        'probabilities': proba[0].tolist(),
        'class_labels': target_names
    }
# 4. Example: predict a single "new" sample — here, the first row of the
# test set.
# NOTE(review): X_test already went through preprocess_data's StandardScaler,
# so these are standardized values, not raw measurements; the saved pipeline
# scales them again. That matches how the pipeline was trained, but confirm
# before feeding genuinely raw feature values to predict_new_sample.
sample = X_test[0].reshape(1, -1)
sample_df = pd.DataFrame(sample, columns=df.drop('target', axis=1).columns)
# Run the prediction helper and print a readable summary.
prediction_result = predict_new_sample(sample_df)
print("\n" + "="*50)
print("新样本预测结果")
print("="*50)
print(f"预测类别: {prediction_result['prediction_label']}")
print(f"预测概率: {prediction_result['probability']:.4f}")
print(f"\n各类别概率:")
for label, prob in zip(prediction_result['class_labels'],
                       prediction_result['probabilities']):
    print(f" {label}: {prob:.4f}")
# 5. Minimal Flask serving example — kept as a string literal, not executed.
"""
# app.py
from flask import Flask, request, jsonify
import pandas as pd
import joblib
app = Flask(__name__)
# 加载模型
model_data = joblib.load('breast_cancer_model.pkl')
model = model_data['model']
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    sample_df = pd.DataFrame([data])
    prediction = model.predict(sample_df)[0]
    probability = model.predict_proba(sample_df)[0][prediction]
    return jsonify({
        'prediction': int(prediction),
        'prediction_label': model_data['target_names'][prediction],
        'probability': float(probability)
    })
if __name__ == '__main__':
    app.run(debug=True)
"""
总结
通过本实战教程,我们完成了从数据加载、探索分析、预处理、模型构建、评估优化到最终部署的完整机器学习工作流程。关键步骤包括:
- 数据理解与探索:通过统计分析和可视化了解数据特征
- 数据预处理:处理缺失值、标准化特征、划分数据集
- 模型构建与评估:尝试多种算法并比较性能
- 模型优化:使用交叉验证和网格搜索调优超参数
- 部署应用:创建预测管道和简易API接口
在实际项目中,还需要考虑模型监控、定期重新训练、处理类别不平衡等问题。掌握这些核心技能后,你可以将这些技术应用于各种领域,如金融风控、医疗诊断、推荐系统等。
注意:本教程仅用于教育目的,实际应用时需根据具体业务需求调整数据处理流程和模型选择。