Python数据分析与机器学习实战

预计阅读时间:约 7 分钟

5cea31a197234eb19e4d8a79bbee6b9d~tplv-obj.jpg

Python数据分析与机器学习实战:从数据清洗到预测建模

一、环境准备与数据加载

# Import the core analysis, visualization and modelling libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Configure global visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# BUG FIX: '%matplotlib inline' is an IPython/Jupyter *magic*, not Python
# syntax — in a plain .py script it raises a SyntaxError. It is kept here
# as a comment; re-enable it only inside a notebook cell.
# %matplotlib inline

# Load the built-in breast cancer data set as the working example
from sklearn.datasets import load_breast_cancer

# Build a DataFrame: 30 numeric features plus the binary 'target' column
# (0 = malignant, 1 = benign per scikit-learn's target_names ordering)
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

print(f"数据集形状: {df.shape}")
print(f"\n前5行数据:")
print(df.head())
print(f"\n数据统计描述:")
print(df.describe())

二、数据探索与预处理

# 1. Inspect the basic structure of the data set
def explore_data(df):
    """Print a structural summary of *df* to stdout.

    Reports the frame's shape, column names, dtypes, per-column
    missing-value counts, and the class distribution of the 'target'
    column. Returns nothing; output only.
    """
    print("=== 数据基本信息 ===")
    print(f"数据集形状: {df.shape}")
    # Each section is a (label, value) pair; the label is preceded by a
    # blank line exactly as the original sequence of prints produced.
    sections = (
        ("数据列名:", df.columns.tolist()),
        ("数据类型:", df.dtypes),
        ("缺失值统计:", df.isnull().sum()),
        ("目标变量分布:", df['target'].value_counts()),
    )
    for label, value in sections:
        print(f"\n{label}")
        print(value)
    
# Run the exploratory summary on the loaded breast-cancer frame
explore_data(df)

# 2. Visual exploration of the data set: six panels in a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Target class distribution.
# BUG FIX: Series.value_counts() orders classes by frequency — in this
# data set benign (1, 357 samples) outnumbers malignant (0, 212), so the
# hard-coded labels=['恶性', '良性'] labelled the slices in the WRONG
# order. Derive the labels from the actual index order instead.
target_counts = df['target'].value_counts()
class_names = {0: '恶性', 1: '良性'}
axes[0, 0].pie(target_counts,
              labels=[class_names[c] for c in target_counts.index],
              autopct='%1.1f%%',
              colors=['#ff9999', '#66b3ff'])
axes[0, 0].set_title('目标变量分布')

# Histogram of one representative feature
axes[0, 1].hist(df['mean radius'], bins=30, alpha=0.7, color='skyblue')
axes[0, 1].set_xlabel('平均半径')
axes[0, 1].set_ylabel('频数')
axes[0, 1].set_title('特征分布示例')

# Correlation heat map, restricted to the first 10 columns for readability
corr_matrix = df.corr()
sns.heatmap(corr_matrix.iloc[:10, :10], ax=axes[0, 2], cmap='coolwarm', annot=True, fmt='.2f')
axes[0, 2].set_title('特征相关性热图(部分)')

# Box plot of 'mean radius' split by class. seaborn orders x categories by
# label value (0 then 1), so these tick labels ARE in the correct order.
sns.boxplot(x='target', y='mean radius', data=df, ax=axes[1, 0])
axes[1, 0].set_title('不同类别的特征分布')
axes[1, 0].set_xticklabels(['恶性', '良性'])

# Scatter plot of two features, coloured by class label
axes[1, 1].scatter(df['mean radius'], df['mean texture'],
                   c=df['target'], alpha=0.6, cmap='viridis')
axes[1, 1].set_xlabel('平均半径')
axes[1, 1].set_ylabel('平均纹理')
axes[1, 1].set_title('特征散点图')

# Feature importance from a quick random-forest fit on the raw features
from sklearn.ensemble import RandomForestClassifier

X_temp = df.drop('target', axis=1)
y_temp = df['target']
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_temp, y_temp)

# Keep the 10 largest importances; ascending sort + tail() so barh()
# draws the most important feature at the top of the panel.
feature_importance = pd.DataFrame({
    'feature': X_temp.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=True).tail(10)

axes[1, 2].barh(feature_importance['feature'], feature_importance['importance'])
axes[1, 2].set_xlabel('重要性')
axes[1, 2].set_title('Top 10 重要特征')

plt.tight_layout()
plt.show()

# 3. Data preprocessing
def preprocess_data(df):
    """Split *df* into stratified train/test sets and standardize features.

    The 'target' column is the label; everything else is a feature.
    Uses an 80/20 stratified split (random_state=42) and fits a
    StandardScaler on the training portion only.

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
    """
    features = df.drop('target', axis=1)
    labels = df['target']

    # Stratify so both splits keep the original class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Fit the scaler on the training data only, then apply to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"训练集大小: {X_train_scaled.shape}")
    print(f"测试集大小: {X_test_scaled.shape}")

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

X_train, X_test, y_train, y_test, scaler = preprocess_data(df)

三、机器学习模型构建与评估

# 1. Define the model training and evaluation helper
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit *model*, report test-set metrics, and plot diagnostics.

    Prints a classification report, draws a confusion-matrix heat map and
    (when the estimator supports predict_proba) an ROC curve, then returns
    the fitted model together with its test-set predictions.

    Args:
        model: an unfitted scikit-learn classifier.
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: corresponding label vectors.
        model_name: display name used in printed banners and plot titles.

    Returns:
        (fitted_model, y_pred) — the trained estimator and its predictions
        on X_test.
    """
    print(f"\n{'='*50}")
    print(f"训练 {model_name} 模型")
    print(f"{'='*50}")
    
    # Fit on the training split
    model.fit(X_train, y_train)
    
    # Hard predictions; positive-class probabilities only if supported
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # classification_report assigns target_names in sorted label order,
    # i.e. 0 -> '恶性' (malignant), 1 -> '良性' (benign)
    print("分类报告:")
    print(classification_report(y_test, y_pred, target_names=['恶性', '良性']))
    
    # Confusion matrix (rows = true labels, columns = predicted labels)
    cm = confusion_matrix(y_test, y_pred)
    
    # Two side-by-side diagnostic panels
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Confusion-matrix heat map
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_xlabel('预测标签')
    axes[0].set_ylabel('真实标签')
    axes[0].set_title(f'{model_name} - 混淆矩阵')
    
    # ROC curve, only when the model exposes probability estimates
    if y_pred_proba is not None:
        from sklearn.metrics import roc_curve, auc
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        
        axes[1].plot(fpr, tpr, color='darkorange', lw=2, 
                    label=f'ROC曲线 (AUC = {roc_auc:.2f})')
        # Diagonal reference line = random-guess classifier
        axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axes[1].set_xlim([0.0, 1.0])
        axes[1].set_ylim([0.0, 1.05])
        axes[1].set_xlabel('假正率')
        axes[1].set_ylabel('真正率')
        axes[1].set_title(f'{model_name} - ROC曲线')
        axes[1].legend(loc="lower right")
    
    plt.tight_layout()
    plt.show()
    
    return model, y_pred

# 2. Train several candidate models and compare them
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate classifiers keyed by display name; fixed random_state for
# reproducibility, probability=True so SVC can draw an ROC curve.
models = {
    "逻辑回归": LogisticRegression(random_state=42, max_iter=1000),
    "支持向量机": SVC(probability=True, random_state=42),
    "决策树": DecisionTreeClassifier(random_state=42),
    "随机森林": RandomForestClassifier(n_estimators=100, random_state=42),
    "梯度提升": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Fit and evaluate each candidate, collecting the trained estimator and
# its test-set predictions under the model's display name.
results = {}
for display_name, estimator in models.items():
    fitted, preds = train_and_evaluate_model(
        estimator, X_train, X_test, y_train, y_test, display_name
    )
    results[display_name] = {'model': fitted, 'predictions': preds}

四、模型优化与交叉验证

# 1. Evaluate a model with k-fold cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV

def cross_validation_evaluation(model, X, y, model_name):
    """Run 5-fold accuracy cross-validation and print a summary.

    Args:
        model: an unfitted scikit-learn estimator.
        X, y: feature matrix and label vector to cross-validate on.
        model_name: display name for the printed banner.

    Returns:
        The array of per-fold accuracy scores.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"{model_name} 交叉验证")
    print(f"{banner}")

    # 5-fold CV, scored by plain accuracy
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    # mean +/- two standard deviations as a rough confidence interval
    print(f"交叉验证准确率: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"各折分数: {cv_scores}")

    return cv_scores

# Cross-validate a random forest on the training split
rf_model = RandomForestClassifier(random_state=42)
cv_scores = cross_validation_evaluation(rf_model, X_train, y_train, "随机森林")

# 2. Hyper-parameter tuning via exhaustive grid search
divider = "=" * 50
print("\n" + divider)
print("超参数调优示例:随机森林")
print(divider)

# Search space for the random forest (3 * 4 * 3 * 3 = 108 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold CV per combination, all CPU cores, progress messages on
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

print("开始网格搜索...")
grid_search.fit(X_train, y_train)

print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Score the tuned estimator on the held-out test split
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
print(f"\n测试集准确率: {np.mean(y_pred_best == y_test):.4f}")

# 3. Feature-importance analysis based on the tuned model
feature_importance_df = pd.DataFrame({
    'feature': df.drop('target', axis=1).columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot the 15 most important features as horizontal bars
top15 = feature_importance_df.head(15)
plt.figure(figsize=(10, 8))
bars = plt.barh(top15['feature'], top15['importance'], color='skyblue')
plt.xlabel('特征重要性')
plt.title('Top 15 重要特征(基于随机森林)')
plt.gca().invert_yaxis()  # most important feature at the top

# Annotate each bar with its numeric importance value
for bar in bars:
    bar_width = bar.get_width()
    plt.text(bar_width + 0.001, bar.get_y() + bar.get_height() / 2,
             f'{bar_width:.3f}', ha='left', va='center')

plt.tight_layout()
plt.show()

五、模型部署与实战应用

# 1. Build an end-to-end preprocessing + modelling pipeline
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Scale -> reduce dimensionality -> classify, as one fit/predict object.
# NOTE(review): X_train was already standardized by preprocess_data, so the
# scaler step here re-standardizes scaled data — harmless but redundant;
# feeding the pipeline raw features would be the cleaner setup. Confirm
# intent before changing.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # keep enough components for 95% variance
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    ))
])

# Fit the whole chain on the training split
pipeline.fit(X_train, y_train)

# Accuracy of the full pipeline on the held-out test split
pipeline_score = pipeline.score(X_test, y_test)
print(f"管道模型测试准确率: {pipeline_score:.4f}")

# 2. Persist the trained pipeline together with its metadata
import joblib
import json  # NOTE(review): imported but unused in this file — confirm before removing

# Bundle the model with the exact feature order and class names it expects,
# so the prediction side can validate/reorder incoming samples.
# SECURITY NOTE: joblib uses pickle under the hood — only load model files
# from trusted sources.
feature_columns = df.drop('target', axis=1).columns.tolist()
model_data = {
    'model': pipeline,
    'features': feature_columns,
    'target_names': data.target_names.tolist()
}

joblib.dump(model_data, 'breast_cancer_model.pkl')
print("模型已保存为 'breast_cancer_model.pkl'")

# 3. Prediction helper for new samples
def predict_new_sample(sample_data, model_path='breast_cancer_model.pkl'):
    """Classify one new sample with the persisted pipeline.

    Args:
        sample_data: a dict of feature->value, a list of feature values in
            the model's feature order, or a single-row DataFrame.
        model_path: path to the joblib bundle written at save time.

    Returns:
        dict with keys 'prediction' (int class index), 'prediction_label',
        'probability' (of the predicted class), 'probabilities' (full
        vector) and 'class_labels'.
    """
    # Reload the bundle on every call; the stored feature list defines the
    # column order the pipeline was trained with.
    bundle = joblib.load(model_path)
    estimator = bundle['model']
    feature_names = bundle['features']
    target_names = bundle['target_names']

    # Normalize the input into a single-row DataFrame
    if isinstance(sample_data, dict):
        row = pd.DataFrame([sample_data])
    elif isinstance(sample_data, list):
        row = pd.DataFrame([sample_data], columns=feature_names)
    else:
        row = sample_data

    # Reorder columns to match the training-time feature order
    row = row[feature_names]

    predicted = estimator.predict(row)
    probabilities = estimator.predict_proba(row)

    winner = predicted[0]
    return {
        'prediction': int(winner),
        'prediction_label': target_names[winner],
        'probability': float(probabilities[0][winner]),
        'probabilities': probabilities[0].tolist(),
        'class_labels': target_names
    }

# 4. Example: predict on fresh data (first test-set row reused as a demo)
first_row = X_test[0].reshape(1, -1)
sample_df = pd.DataFrame(first_row, columns=df.drop('target', axis=1).columns)

# Run the saved-model prediction helper
prediction_result = predict_new_sample(sample_df)

separator = "=" * 50
print("\n" + separator)
print("新样本预测结果")
print(separator)
print(f"预测类别: {prediction_result['prediction_label']}")
print(f"预测概率: {prediction_result['probability']:.4f}")
print(f"\n各类别概率:")
for class_label, class_prob in zip(prediction_result['class_labels'],
                                   prediction_result['probabilities']):
    print(f"  {class_label}: {class_prob:.4f}")

# 5. Minimal web API example (Flask).
# NOTE: deliberately kept inside a string literal so this tutorial script
# runs without Flask installed; copy the contents into app.py to use it.
"""
# app.py
from flask import Flask, request, jsonify
import pandas as pd
import joblib

app = Flask(__name__)

# 加载模型
model_data = joblib.load('breast_cancer_model.pkl')
model = model_data['model']

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    sample_df = pd.DataFrame([data])
    prediction = model.predict(sample_df)[0]
    probability = model.predict_proba(sample_df)[0][prediction]
    
    return jsonify({
        'prediction': int(prediction),
        'prediction_label': model_data['target_names'][prediction],
        'probability': float(probability)
    })

if __name__ == '__main__':
    app.run(debug=True)
"""

总结

通过本实战教程,我们完成了从数据加载、探索分析、预处理、模型构建、评估优化到最终部署的完整机器学习工作流程。关键步骤包括:

  1. 数据理解与探索:通过统计分析和可视化了解数据特征
  2. 数据预处理:处理缺失值、标准化特征、划分数据集
  3. 模型构建与评估:尝试多种算法并比较性能
  4. 模型优化:使用交叉验证和网格搜索调优超参数
  5. 部署应用:创建预测管道和简易API接口

在实际项目中,还需要考虑模型监控、定期重新训练、处理类别不平衡等问题。掌握这些核心技能后,你可以将这些技术应用于各种领域,如金融风控、医疗诊断、推荐系统等。

注意:本教程仅用于教育目的,实际应用时需根据具体业务需求调整数据处理流程和模型选择。