在前面的课程中,我们学习了各种机器学习算法。然而,在实际项目中,选择合适的模型和进行有效的特征工程往往比算法本身更重要。本节将深入探讨特征工程的高级技巧和模型选择的策略,帮助你构建更强大的机器学习系统。
特征工程的重要性
特征工程是机器学习项目中最耗时但最关键的环节。好的特征工程可以:
- 显著提升模型性能:精心设计的特征能让简单模型达到复杂模型的效果
- 降低模型复杂度:通过特征选择可以减少过拟合风险
- 提高可解释性:有意义的特征更容易理解和解释
- 加速训练和推理:减少特征数量可以加快模型速度
graph LR
A[原始特征] --> B[特征工程]
B --> C[特征创建]
B --> D[特征变换]
B --> E[特征选择]
C --> F[高质量特征]
D --> F
E --> F
F --> G[更好的模型性能]
style A fill:#ff6b6b
style F fill:#51cf66
style G fill:#4dabf7
特征创建技巧
1. 数值特征变换
# Third-party imports: numerics, dataframes, plotting, and sklearn transforms/selectors.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import warnings
# Silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')
# Configure matplotlib to render CJK labels (SimHei) and minus signs correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
class AdvancedFeatureEngineering:
    """Stateless helpers for advanced feature construction.

    All methods are static: they take a DataFrame (plus column names) and
    return new feature columns without mutating the input.
    """

    @staticmethod
    def create_polynomial_features(df, columns, degree=2, interaction_only=False):
        """Expand numeric columns into polynomial / interaction terms.

        Args:
            df: source DataFrame.
            columns: list of numeric column names to expand.
            degree: maximum polynomial degree.
            interaction_only: if True, only cross-terms (no powers).

        Returns:
            DataFrame of expanded features, aligned with ``df``'s index.
            (Fix: the original dropped the index, which misaligns rows when
            concatenated with a DataFrame that has a non-default index; it
            also built an unused ``feature_names`` list.)
        """
        poly = PolynomialFeatures(degree=degree,
                                  interaction_only=interaction_only,
                                  include_bias=False)
        poly_features = poly.fit_transform(df[columns])
        return pd.DataFrame(poly_features,
                            columns=poly.get_feature_names_out(columns),
                            index=df.index)

    @staticmethod
    def create_interaction_features(df, col1, col2):
        """Return the element-wise product of two columns (interaction term)."""
        return df[col1] * df[col2]

    @staticmethod
    def create_ratio_features(df, numerator_col, denominator_col):
        """Return numerator/denominator; the epsilon guards against division by zero."""
        return df[numerator_col] / (df[denominator_col] + 1e-8)

    @staticmethod
    def create_binning_features(df, column, n_bins=5, strategy='uniform'):
        """Discretize a column into integer bin labels.

        strategy 'uniform' uses equal-width bins (pd.cut); 'quantile' uses
        equal-frequency bins (pd.qcut, dropping duplicate edges).

        Raises:
            ValueError: for an unknown strategy (the original silently
            returned None, hiding caller typos).
        """
        if strategy == 'uniform':
            return pd.cut(df[column], bins=n_bins, labels=False)
        if strategy == 'quantile':
            return pd.qcut(df[column], q=n_bins, labels=False, duplicates='drop')
        raise ValueError(f"unknown binning strategy: {strategy!r}")

    @staticmethod
    def create_statistical_features(df, group_col, value_col):
        """Per-row group statistics of ``value_col`` grouped by ``group_col``.

        Uses ``transform`` so each statistic is broadcast back to every row
        of its group, keeping the result aligned with ``df``.
        """
        grouped = df.groupby(group_col)[value_col]
        return pd.DataFrame({
            f'{value_col}_mean': grouped.transform('mean'),
            f'{value_col}_std': grouped.transform('std'),
            f'{value_col}_min': grouped.transform('min'),
            f'{value_col}_max': grouped.transform('max'),
            f'{value_col}_median': grouped.transform('median')
        })

    @staticmethod
    def apply_power_transform(df, columns, method='yeo-johnson'):
        """Power-transform skewed columns toward normality (standardized output).

        'yeo-johnson' handles zero/negative values; 'box-cox' requires
        strictly positive data.
        """
        transformer = PowerTransformer(method=method, standardize=True)
        transformed = transformer.fit_transform(df[columns])
        return pd.DataFrame(transformed, columns=columns, index=df.index)
# Build a synthetic customer dataset to demonstrate the helpers above.
np.random.seed(42)
sample_data = {
    '年龄': np.random.randint(18, 65, 1000),
    '收入': np.random.normal(50000, 15000, 1000),
    '工作经验': np.random.randint(0, 30, 1000),
    '城市': np.random.choice(['北京', '上海', '广州', '深圳'], 1000),
    '教育水平': np.random.choice(['本科', '硕士', '博士'], 1000),
    '购买金额': np.random.normal(1000, 300, 1000)
}
df_features = pd.DataFrame(sample_data)
print("=" * 60)
print("高级特征工程示例")
print("=" * 60)
# 1. Polynomial features: degree-2 expansion of age and income.
print("\n1. 创建多项式特征")
poly_features = AdvancedFeatureEngineering.create_polynomial_features(
    df_features, ['年龄', '收入'], degree=2
)
print(f"原始特征数: 2")
print(f"多项式特征数: {poly_features.shape[1]}")
print("前5个多项式特征:")
print(poly_features.head())
# 2. Interaction feature: age x income product.
print("\n2. 创建交互特征")
df_features['年龄_收入_交互'] = AdvancedFeatureEngineering.create_interaction_features(
    df_features, '年龄', '收入'
)
print("交互特征已创建")
# 3. Ratio feature: income per year of age.
print("\n3. 创建比率特征")
df_features['收入_年龄比'] = AdvancedFeatureEngineering.create_ratio_features(
    df_features, '收入', '年龄'
)
print("比率特征已创建")
# 4. Binning feature: income into 5 equal-frequency (quantile) bins.
print("\n4. 创建分箱特征")
df_features['收入_分箱'] = AdvancedFeatureEngineering.create_binning_features(
    df_features, '收入', n_bins=5, strategy='quantile'
)
print("分箱特征已创建")
# 5. Group statistics of income by city, broadcast back to every row.
print("\n5. 创建统计特征")
stat_features = AdvancedFeatureEngineering.create_statistical_features(
    df_features, '城市', '收入'
)
df_features = pd.concat([df_features, stat_features], axis=1)
print("统计特征已创建")
# 6. Power transform: map a skewed distribution toward normality.
print("\n6. 应用幂变换")
# Exponential samples are strongly right-skewed on purpose.
skewed_data = np.random.exponential(scale=1000, size=1000)
df_skewed = pd.DataFrame({'偏态特征': skewed_data})
transformed = AdvancedFeatureEngineering.apply_power_transform(
    df_skewed, ['偏态特征']
)
# Side-by-side histograms: raw skewed data vs. power-transformed data.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes[0].hist(skewed_data, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_title('原始偏态分布')
axes[0].set_xlabel('值')
axes[0].set_ylabel('频数')
axes[1].hist(transformed['偏态特征'], bins=50, edgecolor='black', alpha=0.7)
axes[1].set_title('幂变换后(接近正态分布)')
axes[1].set_xlabel('值')
axes[1].set_ylabel('频数')
plt.tight_layout()
plt.show()
print(f"\n最终特征数: {df_features.shape[1]}")
print(f"特征列表: {list(df_features.columns)}")
2. 时间特征工程
class TimeFeatureEngineering:
    """Calendar and cyclical feature extraction from datetime columns."""

    @staticmethod
    def extract_time_features(df, datetime_col):
        """Extract calendar features (and cyclical encodings) from a datetime column.

        Args:
            df: source DataFrame (not mutated).
            datetime_col: name of the column to parse with ``pd.to_datetime``.

        Returns:
            DataFrame (aligned with ``df``'s index) with year/month/day/
            dayofweek/dayofyear/week/quarter/is_weekend, sin/cos cyclical
            encodings of month and dayofweek, and — only when at least one
            timestamp carries a non-midnight time component — hour/minute.

        Fix: the original gated hour/minute on ``.dt.hour.notna().any()``,
        which is true for any valid datetime, so the intended "drop
        intraday features for date-only data" step never fired.
        """
        dt = pd.to_datetime(df[datetime_col]).dt
        features = {
            'year': dt.year,
            'month': dt.month,
            'day': dt.day,
            'dayofweek': dt.dayofweek,
            'dayofyear': dt.dayofyear,
            'week': dt.isocalendar().week,
            'quarter': dt.quarter,
            'is_weekend': (dt.dayofweek >= 5).astype(int),
        }
        # Intraday features are meaningful only if some timestamp is not midnight.
        if ((dt.hour != 0) | (dt.minute != 0) | (dt.second != 0)).any():
            features['hour'] = dt.hour
            features['minute'] = dt.minute
        # Cyclical sin/cos encodings keep the wrap-around continuity
        # (December is adjacent to January, Sunday to Monday).
        features['month_sin'] = np.sin(2 * np.pi * features['month'] / 12)
        features['month_cos'] = np.cos(2 * np.pi * features['month'] / 12)
        features['dayofweek_sin'] = np.sin(2 * np.pi * features['dayofweek'] / 7)
        features['dayofweek_cos'] = np.cos(2 * np.pi * features['dayofweek'] / 7)
        return pd.DataFrame(features)
# Demo: extract calendar features from one year of daily dates.
dates = pd.date_range('2023-01-01', periods=365, freq='D')
df_time = pd.DataFrame({'日期': dates})
time_features = TimeFeatureEngineering.extract_time_features(df_time, '日期')
print("\n时间特征工程示例")
print("=" * 60)
print(f"提取的时间特征数: {time_features.shape[1]}")
print(time_features.head(10))
# Visualize cyclical encodings and calendar distributions.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes[0, 0].plot(time_features['month_sin'][:90], label='month_sin', alpha=0.7)
axes[0, 0].plot(time_features['month_cos'][:90], label='month_cos', alpha=0.7)
axes[0, 0].set_title('月份周期性特征(前90天)')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
axes[0, 1].plot(time_features['dayofweek_sin'][:30], label='dayofweek_sin', alpha=0.7)
axes[0, 1].plot(time_features['dayofweek_cos'][:30], label='dayofweek_cos', alpha=0.7)
axes[0, 1].set_title('星期周期性特征(前30天)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
axes[1, 0].bar(time_features['month'].value_counts().sort_index().index,
               time_features['month'].value_counts().sort_index().values)
axes[1, 0].set_title('月份分布')
axes[1, 0].set_xlabel('月份')
axes[1, 0].set_ylabel('天数')
axes[1, 1].bar(time_features['dayofweek'].value_counts().sort_index().index,
               time_features['dayofweek'].value_counts().sort_index().values)
axes[1, 1].set_title('星期分布')
axes[1, 1].set_xlabel('星期(0=周一)')
axes[1, 1].set_ylabel('天数')
plt.tight_layout()
plt.show()
特征选择策略
特征选择是减少过拟合、提高模型可解释性的重要方法:
class FeatureSelector:
    """Feature selection toolbox: univariate, correlation- and variance-based."""

    @staticmethod
    def univariate_selection(X, y, k=10, score_func=None):
        """Keep the k features with the highest univariate scores.

        Args:
            X, y: feature matrix and target.
            k: number of features to keep.
            score_func: sklearn scoring function; defaults to ``f_regression``
                (resolved lazily so the default is not evaluated at class
                definition time).

        Returns:
            (X_selected, selected feature indices, per-feature scores).
        """
        if score_func is None:
            score_func = f_regression  # imported at module level
        selector = SelectKBest(score_func=score_func, k=k)
        X_selected = selector.fit_transform(X, y)
        selected_features = selector.get_support(indices=True)
        scores = selector.scores_
        return X_selected, selected_features, scores

    @staticmethod
    def correlation_selection(df, target_col, threshold=0.9):
        """Suggest features to drop because they duplicate another feature.

        For each FEATURE pair whose absolute correlation exceeds
        ``threshold``, drop the member less correlated with the target.

        Fix: the original built the correlation matrix over ALL columns,
        including ``target_col``; a feature strongly correlated with the
        target was then compared against corr(target, target) == 1, so the
        target itself could be "dropped" and good predictors were penalized
        for correlating with the target. The matrix now covers features only.
        """
        feature_df = df.drop(columns=[target_col])
        corr_matrix = feature_df.corr().abs()
        # Upper triangle only, so each pair is examined once.
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        to_drop = set()
        for col in upper_triangle.columns:
            for idx in upper_triangle.index:
                pair_corr = upper_triangle.loc[idx, col]
                if pd.notna(pair_corr) and pair_corr > threshold:
                    # Keep whichever feature correlates more with the target.
                    corr1 = abs(df[idx].corr(df[target_col]))
                    corr2 = abs(df[col].corr(df[target_col]))
                    to_drop.add(idx if corr1 < corr2 else col)
        return list(to_drop)

    @staticmethod
    def variance_selection(X, threshold=0.01):
        """Drop near-constant features (variance below ``threshold``).

        Returns (X_selected, selected feature indices).
        """
        from sklearn.feature_selection import VarianceThreshold
        selector = VarianceThreshold(threshold=threshold)
        X_selected = selector.fit_transform(X)
        selected_features = selector.get_support(indices=True)
        return X_selected, selected_features
# Feature-selection demo on a synthetic regression problem.
from sklearn.datasets import make_regression
# 20 features, of which only 10 are informative.
X_reg, y_reg = make_regression(n_samples=1000, n_features=20, n_informative=10,
                               noise=10, random_state=42)
X_reg_df = pd.DataFrame(X_reg, columns=[f'特征{i+1}' for i in range(20)])
print("\n特征选择示例")
print("=" * 60)
# 1. Univariate selection: keep the 10 best-scoring features.
X_selected, selected_indices, scores = FeatureSelector.univariate_selection(
    X_reg, y_reg, k=10
)
print(f"\n1. 单变量特征选择")
print(f"原始特征数: {X_reg.shape[1]}")
print(f"选择后特征数: {X_selected.shape[1]}")
print(f"选中的特征索引: {selected_indices}")
print(f"特征得分(前10): {sorted(scores, reverse=True)[:10]}")
# Plot the ten highest univariate scores (descending).
plt.figure(figsize=(12, 6))
feature_names = [f'特征{i+1}' for i in range(len(scores))]
sorted_indices = np.argsort(scores)[::-1]
plt.barh(range(len(selected_indices)), scores[sorted_indices][:10])
plt.yticks(range(len(selected_indices)), [feature_names[i] for i in sorted_indices[:10]])
plt.xlabel('特征得分')
plt.title('单变量特征选择得分(Top 10)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
# 2. Correlation-based selection: flag near-duplicate features.
X_reg_df['target'] = y_reg
to_drop = FeatureSelector.correlation_selection(X_reg_df, 'target', threshold=0.95)
print(f"\n2. 相关性特征选择")
print(f"高度相关的特征对,建议删除: {to_drop}")
# 3. Variance-based selection: drop near-constant features.
X_var_selected, var_selected_indices = FeatureSelector.variance_selection(
    X_reg, threshold=0.01
)
print(f"\n3. 方差特征选择")
print(f"原始特征数: {X_reg.shape[1]}")
print(f"选择后特征数: {X_var_selected.shape[1]}")
print(f"选中的特征索引: {var_selected_indices}")
模型选择策略
选择合适的模型是项目成功的关键:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import time
class ModelSelector:
    """Register candidate models, tune/evaluate them, and track the winner."""

    def __init__(self):
        # name -> {'model': estimator, 'param_grid': dict or None}
        self.models = {}
        # Populated by compare_models(). Fix: the original declared
        # best_model / best_score but never assigned them, and assigned
        # best_model_name without declaring it here.
        self.best_model = None
        self.best_model_name = None
        self.best_score = None

    def add_model(self, name, model, param_grid=None):
        """Register a candidate estimator, optionally with a grid-search space."""
        self.models[name] = {
            'model': model,
            'param_grid': param_grid
        }

    def compare_models(self, X_train, y_train, X_val, y_val, cv=5):
        """Cross-validate every registered model and evaluate on the validation split.

        Models with a ``param_grid`` are tuned via GridSearchCV; the rest
        are scored with plain cross-validation. Records the best fitted
        model (lowest validation MSE) in ``best_model`` / ``best_model_name``
        / ``best_score`` and returns a results DataFrame.
        """
        results = []
        fitted_models = {}
        for name, model_info in self.models.items():
            model = model_info['model']
            param_grid = model_info['param_grid']
            print(f"\n评估模型: {name}")
            # Grid provided: search it; otherwise just cross-validate.
            if param_grid:
                grid_search = GridSearchCV(
                    model, param_grid, cv=cv,
                    scoring='neg_mean_squared_error', n_jobs=-1
                )
                grid_search.fit(X_train, y_train)
                best_model = grid_search.best_estimator_
                best_params = grid_search.best_params_
                cv_score = -grid_search.best_score_
            else:
                best_model = model
                best_params = None
                cv_scores = cross_val_score(
                    model, X_train, y_train, cv=cv,
                    scoring='neg_mean_squared_error'
                )
                cv_score = -cv_scores.mean()
            # Refit on the full training split (timed) and score on validation.
            start_time = time.time()
            best_model.fit(X_train, y_train)
            train_time = time.time() - start_time
            y_pred = best_model.predict(X_val)
            val_mse = mean_squared_error(y_val, y_pred)
            val_r2 = r2_score(y_val, y_pred)
            fitted_models[name] = best_model
            results.append({
                'model': name,
                'cv_mse': cv_score,
                'val_mse': val_mse,
                'val_r2': val_r2,
                'train_time': train_time,
                'best_params': best_params
            })
            print(f"  CV MSE: {cv_score:.4f}")
            print(f"  Val MSE: {val_mse:.4f}")
            print(f"  Val R²: {val_r2:.4f}")
            print(f"  训练时间: {train_time:.4f}s")
            if best_params:
                print(f"  最佳参数: {best_params}")
        results_df = pd.DataFrame(results)
        # Record the winner by validation MSE for later use.
        best_idx = results_df['val_mse'].idxmin()
        self.best_model_name = results_df.loc[best_idx, 'model']
        self.best_model = fitted_models[self.best_model_name]
        self.best_score = results_df.loc[best_idx, 'val_mse']
        return results_df
# Demo: compare several regression models on the univariate-selected features.
print("\n" + "=" * 60)
print("模型选择示例")
print("=" * 60)
# Hold out the last 200 rows for validation.
X_train_sel = X_selected[:800]
X_val_sel = X_selected[800:]
y_train_sel = y_reg[:800]
y_val_sel = y_reg[800:]
# Build the selector and register candidates: linear baselines plus trees
# and ensembles, with parameter grids where tuning matters.
selector = ModelSelector()
selector.add_model('线性回归', LinearRegression())
selector.add_model('Ridge回归', Ridge(), {
    'alpha': [0.1, 1.0, 10.0, 100.0]
})
selector.add_model('Lasso回归', Lasso(), {
    'alpha': [0.1, 1.0, 10.0, 100.0]
})
selector.add_model('决策树', DecisionTreeRegressor(), {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
})
selector.add_model('随机森林', RandomForestRegressor(n_estimators=100), {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5]
})
selector.add_model('梯度提升', GradientBoostingRegressor(), {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
})
# Tune/evaluate everything and collect the comparison table.
results = selector.compare_models(X_train_sel, y_train_sel, X_val_sel, y_val_sel)
# Visualize the comparison: MSE, R² and training time per model.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# Validation-set MSE (lower is better).
axes[0].barh(results['model'], results['val_mse'])
axes[0].set_xlabel('验证集MSE')
axes[0].set_title('模型MSE对比')
axes[0].invert_yaxis()
# Validation-set R² (higher is better).
axes[1].barh(results['model'], results['val_r2'])
axes[1].set_xlabel('验证集R²')
axes[1].set_title('模型R²对比')
axes[1].invert_yaxis()
# Wall-clock training time.
axes[2].barh(results['model'], results['train_time'])
axes[2].set_xlabel('训练时间 (秒)')
axes[2].set_title('模型训练时间对比')
axes[2].invert_yaxis()
plt.tight_layout()
plt.show()
print("\n模型比较结果:")
print(results.sort_values('val_mse'))
print(f"\n最佳模型: {selector.best_model_name}")
超参数调优
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
def hyperparameter_tuning_example(X_train=None, y_train=None, X_val=None, y_val=None,
                                  n_iter=50, cv=5, random_state=42):
    """Randomized hyperparameter search for a RandomForestRegressor.

    Generalized from the original, which silently depended on the
    module-level demo split (``X_train_sel`` etc.). Call with no arguments
    for the original demo behaviour, or pass data explicitly to reuse it.

    Args:
        X_train, y_train: training data (default: module-level demo split).
        X_val, y_val: validation data (default: module-level demo split).
        n_iter: number of random parameter samples to try.
        cv: number of cross-validation folds.
        random_state: seed for both the forest and the search (reproducible).

    Returns:
        The fitted RandomizedSearchCV object.
    """
    # Fall back to the module-level demo split for backward compatibility.
    if X_train is None or y_train is None:
        X_train, y_train = X_train_sel, y_train_sel
    if X_val is None or y_val is None:
        X_val, y_val = X_val_sel, y_val_sel
    rf = RandomForestRegressor(random_state=random_state)
    # Parameter distributions: randint draws a fresh value per iteration;
    # lists are sampled uniformly.
    param_distributions = {
        'n_estimators': randint(50, 200),
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
        'max_features': ['sqrt', 'log2', None]
    }
    # Random search: cheaper than a full grid over this space.
    random_search = RandomizedSearchCV(
        rf, param_distributions,
        n_iter=n_iter, cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1, random_state=random_state
    )
    random_search.fit(X_train, y_train)
    print("超参数调优结果")
    print("=" * 60)
    print(f"最佳参数: {random_search.best_params_}")
    print(f"最佳CV得分: {-random_search.best_score_:.4f}")
    # Evaluate the tuned model on the validation split.
    y_pred_tuned = random_search.predict(X_val)
    val_mse_tuned = mean_squared_error(y_val, y_pred_tuned)
    val_r2_tuned = r2_score(y_val, y_pred_tuned)
    print(f"验证集MSE: {val_mse_tuned:.4f}")
    print(f"验证集R²: {val_r2_tuned:.4f}")
    return random_search
# Run the hyperparameter-tuning demo (uses the selected-feature split above).
tuned_model = hyperparameter_tuning_example()
特征工程与模型选择的完整流程
graph TD
A[原始数据] --> B[特征工程]
B --> C[特征创建]
B --> D[特征变换]
B --> E[特征选择]
C --> F[特征集]
D --> F
E --> F
F --> G[模型候选集]
G --> H[模型训练与评估]
H --> I[超参数调优]
I --> J[最佳模型]
J --> K[模型验证]
K --> L{性能满足要求?}
L -->|否| B
L -->|是| M[模型部署]
style A fill:#ff6b6b
style M fill:#51cf66
最佳实践总结
- 特征工程优先:好的特征比复杂的模型更重要
- 迭代改进:特征工程是一个迭代过程,需要不断优化
- 领域知识:结合业务领域知识创建有意义的特征
- 避免数据泄露:确保特征工程不引入未来信息
- 模型选择:从简单模型开始,逐步尝试复杂模型
- 交叉验证:使用交叉验证评估模型,避免过拟合
- 自动化工具:使用AutoML工具辅助特征工程和模型选择
课后练习
1. 实践任务:
- 选择一个真实数据集
- 创建至少10个新特征
- 比较不同特征选择方法的效果
- 尝试至少5种不同的模型
- 进行超参数调优
2. 思考题:
- 什么情况下应该创建交互特征?
- 如何判断一个特征是否有用?
- 模型选择时应该考虑哪些因素?
3. 扩展练习:
- 实现一个自动特征工程管道
- 使用AutoML工具(如TPOT)进行自动化模型选择
- 对比手动特征工程和自动特征工程的效果
总结
本节我们深入学习了:
- 高级特征工程:多项式特征、交互特征、时间特征等
- 特征选择策略:单变量选择、相关性选择、方差选择
- 模型选择方法:多模型比较、超参数调优
- 最佳实践:特征工程和模型选择的完整流程
掌握这些技能将帮助你构建更强大、更可靠的机器学习系统。记住:在大多数情况下,好的特征工程比复杂的模型更重要!
特征工程是艺术与科学的结合,需要创造力、领域知识和实验验证。