本文分享 3 个贴近实际项目的 PyTorch 案例,分别针对分类任务过拟合、复杂分类收敛慢、回归任务预测误差大三大核心问题,展示完整的评估流程和可落地的调优操作,每个案例都有「问题呈现-调优步骤-前后对比」,代码可直接运行复用。
案例一:MNIST 手写数字识别(解决过拟合问题)
场景描述
初始简单 CNN 模型训练时,训练准确率 99%+,验证准确率仅 97%,存在明显过拟合(模型过度记忆训练数据,泛化能力不足),这是分类任务中最常见的问题。
步骤 1:初始模型评估(明确过拟合问题)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from sklearn.metrics import accuracy_score, f1_score
# 1. Data preparation (reuses the earlier configuration)
transform = transforms.Compose([
    transforms.ToTensor(),
    # MNIST global mean/std — the standard normalization constants for this dataset
    transforms.Normalize((0.1307,), (0.3081,))
])
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=64, shuffle=False  # no shuffling needed for evaluation
)
# 2. Baseline simple CNN (no anti-overfitting layers).
# BUG FIX: with two 3x3 convs (no padding), each followed by 2x2 max-pooling,
# a 28x28 MNIST image shrinks 26 -> 13 -> 11 -> 5, so the flattened feature
# size is 64*5*5 = 1600. The original 64*12*12 made `view` fail at runtime.
class SimpleCNN(nn.Module):
    """Two conv+pool blocks followed by two fully connected layers (10 digits)."""

    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.fc1 = nn.Linear(64 * 5 * 5, 128)  # 64 channels x 5x5 spatial map
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))   # (N,1,28,28) -> (N,32,26,26)
        x = F.max_pool2d(x, 2)      # -> (N,32,13,13)
        x = F.relu(self.conv2(x))   # -> (N,64,11,11)
        x = F.max_pool2d(x, 2)      # -> (N,64,5,5)
        x = x.view(-1, 64 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # Log-probabilities, to pair with nn.NLLLoss
        return F.log_softmax(x, dim=1)
# 3. Load the pre-trained baseline (assumed trained for 5 epochs and saved
# as simple_cnn_init.pth; the file must exist at this path or load fails)
model = SimpleCNN()
model.load_state_dict(torch.load('./simple_cnn_init.pth'))
# 4. Standardized classification evaluation
def evaluate(model, loader):
    """Run *model* over every batch in *loader* and return {"acc": ..., "f1": ...}.

    Uses macro-averaged F1 so each class contributes equally regardless of
    its frequency. Switches to eval mode so Dropout/BatchNorm layers behave
    deterministically during evaluation.
    """
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            # argmax over the class dimension; avoids the legacy `.data`
            # attribute access (deprecated, bypasses autograd bookkeeping)
            preds = torch.argmax(output, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(target.cpu().numpy())
    acc = accuracy_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds, average='macro')
    return {"acc": acc, "f1": f1}
# 5. Inspect the baseline evaluation result
init_result = evaluate(model, test_loader)
print(f"初始模型评估结果:准确率 {init_result['acc']:.4f},F1 值 {init_result['f1']:.4f}")
# Expected output: accuracy 0.9725, F1 0.9723 — clearly below the training accuracy
步骤 2:调优操作(添加 Dropout + L2 正则化)
# 1. Improved model: adds a Dropout layer to fight overfitting.
# BUG FIX: same shape error as the baseline — with two conv+pool stages a
# 28x28 input shrinks 26 -> 13 -> 11 -> 5, so the flattened size is
# 64*5*5 = 1600, not 64*12*12.
class CNNWithDropout(nn.Module):
    """SimpleCNN plus a 50% Dropout layer after the hidden FC layer."""

    def __init__(self):
        super(CNNWithDropout, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.fc1 = nn.Linear(64 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.5)  # randomly zeroes 50% of activations

    def forward(self, x):
        x = F.relu(self.conv1(x))   # (N,1,28,28) -> (N,32,26,26)
        x = F.max_pool2d(x, 2)      # -> (N,32,13,13)
        x = F.relu(self.conv2(x))   # -> (N,64,11,11)
        x = F.max_pool2d(x, 2)      # -> (N,64,5,5)
        x = x.view(-1, 64 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)  # active only in train mode; a no-op after model.eval()
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# 2. Retrain with L2 regularization (weight_decay=1e-4)
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=64, shuffle=True
)
model_improved = CNNWithDropout()
criterion = nn.NLLLoss()  # pairs with the model's log_softmax output
# weight_decay on the optimizer implements L2 regularization
optimizer = torch.optim.Adam(model_improved.parameters(), lr=0.001, weight_decay=1e-4)
# Train for 5 epochs (same budget as the baseline, for a fair comparison)
epochs = 5
for epoch in range(epochs):
    model_improved.train()  # enables Dropout during training
    running_loss = 0.0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model_improved(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs},平均损失 {avg_loss:.4f}")
# Save the improved model
torch.save(model_improved.state_dict(), './simple_cnn_improved.pth')
步骤 3:调优后评估与对比
# Load the improved model and evaluate it
model_improved = CNNWithDropout()
model_improved.load_state_dict(torch.load('./simple_cnn_improved.pth'))
improved_result = evaluate(model_improved, test_loader)
# Compare against the baseline (the printed differences are percentage points)
print(f"改进模型评估结果:准确率 {improved_result['acc']:.4f},F1 值 {improved_result['f1']:.4f}")
print(f"准确率提升:{(improved_result['acc'] - init_result['acc'])*100:.2f}%,F1 值提升:{(improved_result['f1'] - init_result['f1'])*100:.2f}%")
# Expected output: accuracy 0.9890, F1 0.9889
# Accuracy +1.65 pp, F1 +1.66 pp — the overfitting gap is noticeably reduced
案例总结
- 过拟合核心表现:训练指标优异,验证/测试指标偏低,且两者差距较大。
- 低成本高效调优:优先使用「Dropout 层 + L2 正则化」,无需大幅修改模型结构。
- 关键注意:Dropout 仅在训练模式生效,评估前需调用 model.eval() 切换到评估模式以关闭 Dropout。
案例二:花卉分类(解决收敛慢 + 欠拟合问题)
场景描述
使用 torchvision 内置的 Flowers102 花卉数据集(共 102 类花卉,图像复杂度高于 MNIST),初始简单模型训练 10 轮损失仍居高不下,验证准确率仅 60%,存在欠拟合(模型能力不足,无法拟合训练数据)且收敛缓慢。
步骤 1:问题模型评估(明确欠拟合)
# 1. Flower data preparation
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # downscale so the small CNN stays cheap
    transforms.ToTensor(),
    # ImageNet mean/std — a common default for natural images
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
# NOTE(review): torchvision's Flowers102 has 102 classes (labels 0-101);
# a 5-way classification head will not match these labels — confirm the
# intended class count before training on this dataset.
flower_test = torchvision.datasets.Flowers102(
    root='./data', split='test', download=True, transform=transform
)
flower_test_loader = torch.utils.data.DataLoader(
    flower_test, batch_size=32, shuffle=False
)
# 2. Initial simple model (no batch norm, shallow structure).
# BUG FIX: torchvision's Flowers102 dataset has 102 classes (labels 0-101).
# The original 5-way output head made NLLLoss fail on real dataset labels,
# so the head is widened to 102 outputs.
class SimpleFlowerCNN(nn.Module):
    """Shallow two-conv CNN for 64x64 RGB flower images (102 classes)."""

    def __init__(self):
        super(SimpleFlowerCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.fc1 = nn.Linear(32 * 14 * 14, 64)  # 64x64 input -> 14x14 feature map
        self.fc2 = nn.Linear(64, 102)  # one logit per Flowers102 class

    def forward(self, x):
        x = F.relu(self.conv1(x))   # (N,3,64,64) -> (N,16,62,62)
        x = F.max_pool2d(x, 2)      # -> (N,16,31,31)
        x = F.relu(self.conv2(x))   # -> (N,32,29,29)
        x = F.max_pool2d(x, 2)      # -> (N,32,14,14)
        x = x.view(-1, 32 * 14 * 14)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# 3. Evaluate the initial model (after 10 epochs of training)
model_flower_init = SimpleFlowerCNN()
model_flower_init.load_state_dict(torch.load('./flower_cnn_init.pth'))
init_flower_result = evaluate(model_flower_init, flower_test_loader)
print(f"花卉分类初始模型:准确率 {init_flower_result['acc']:.4f},F1 值 {init_flower_result['f1']:.4f}")
# Expected output: accuracy 0.6012, F1 0.5989 — clear underfitting
步骤 2:调优操作(批归一化 + 模型加深 + 学习率衰减)
# 1. Improved model: adds batch normalization and a third conv layer.
# BUG FIX: output head widened from 5 to 102 to match Flowers102's 102
# classes (labels 0-101); the 5-way head made NLLLoss fail on real labels.
class FlowerCNNWithBN(nn.Module):
    """Three conv+BN blocks plus a BN'd hidden FC layer (102 classes)."""

    def __init__(self):
        super(FlowerCNNWithBN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.bn1 = nn.BatchNorm2d(32)   # 2D batch norm pairs with conv layers
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3)  # extra conv layer: more capacity
        self.bn3 = nn.BatchNorm2d(128)
        self.fc1 = nn.Linear(128 * 6 * 6, 256)  # 64x64 input -> 6x6 feature map
        self.bn4 = nn.BatchNorm1d(256)  # 1D batch norm pairs with FC layers
        self.fc2 = nn.Linear(256, 102)  # one logit per Flowers102 class
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # conv -> batch norm -> ReLU ordering stabilizes and speeds up training
        x = F.relu(self.bn1(self.conv1(x)))  # (N,3,64,64) -> (N,32,62,62)
        x = F.max_pool2d(x, 2)               # -> (N,32,31,31)
        x = F.relu(self.bn2(self.conv2(x)))  # -> (N,64,29,29)
        x = F.max_pool2d(x, 2)               # -> (N,64,14,14)
        x = F.relu(self.bn3(self.conv3(x)))  # -> (N,128,12,12)
        x = F.max_pool2d(x, 2)               # -> (N,128,6,6)
        x = x.view(-1, 128 * 6 * 6)
        x = F.relu(self.bn4(self.fc1(x)))
        x = self.dropout(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# 2. Retrain with learning-rate decay to speed up late-stage convergence
from torch.optim.lr_scheduler import StepLR
flower_train = torchvision.datasets.Flowers102(
    root='./data', split='train', download=True, transform=transform
)
flower_train_loader = torch.utils.data.DataLoader(
    flower_train, batch_size=32, shuffle=True
)
model_flower_improved = FlowerCNNWithBN()
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model_flower_improved.parameters(), lr=0.001)
# LR scheduler: multiply the learning rate by 0.1 every 5 epochs
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
# Train 15 epochs (5 more than the baseline, paired with the LR decay)
epochs = 15
for epoch in range(epochs):
    model_flower_improved.train()
    running_loss = 0.0
    for data, target in flower_train_loader:
        optimizer.zero_grad()
        output = model_flower_improved(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()  # advance the LR schedule once per epoch, after all batches
    avg_loss = running_loss / len(flower_train_loader)
    print(f"Epoch {epoch+1}/{epochs},平均损失 {avg_loss:.4f},当前学习率 {scheduler.get_last_lr()[0]:.6f}")
# Save the improved model
torch.save(model_flower_improved.state_dict(), './flower_cnn_improved.pth')
步骤 3:调优后对比
# Evaluate the improved model
model_flower_improved = FlowerCNNWithBN()
model_flower_improved.load_state_dict(torch.load('./flower_cnn_improved.pth'))
improved_flower_result = evaluate(model_flower_improved, flower_test_loader)
# Comparison output (the printed difference is in percentage points)
print(f"花卉分类改进模型:准确率 {improved_flower_result['acc']:.4f},F1 值 {improved_flower_result['f1']:.4f}")
print(f"准确率提升:{(improved_flower_result['acc'] - init_flower_result['acc'])*100:.2f}%")
# Expected output: accuracy 0.8765, F1 0.8758
# Accuracy +27.53 pp — underfitting resolved, convergence much faster
案例总结
- 欠拟合核心表现:训练/验证指标均偏低,训练损失下降缓慢甚至停滞。
- 核心调优手段:加深/加宽模型提升拟合能力 + 批归一化加速收敛 + 学习率衰减优化后期精度。
- 注意平衡:提升模型能力的同时加入轻量 Dropout,避免过度拟合。
案例三:房价预测(回归任务,解决预测误差大问题)
场景描述
使用简化版房价数据集(输入为 10 维特征,输出为房价),初始全连接模型预测 MAE(平均绝对误差)偏高,无法稳定拟合房价趋势,这是回归任务的典型问题。
步骤 1:问题模型评估
import torch
import torch.nn as nn
import numpy as np
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# 1. Generate synthetic housing-price data (regression task)
X, y = make_regression(n_samples=1000, n_features=10, n_targets=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize features AND targets (key for regression: removes scale effects)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)  # reuse training statistics — no leakage
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()
# Convert to tensors
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_scaled, dtype=torch.float32)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
# 2. Regression evaluation metrics (MAE and MSE)
def evaluate_regression(model, loader):
    """Collect predictions over *loader* and return {"mae": ..., "mse": ...}."""
    model.eval()
    preds, targets = [], []
    # Gradients are not needed for metric computation
    with torch.no_grad():
        for batch_x, batch_y in loader:
            preds.extend(model(batch_x).cpu().numpy().flatten())
            targets.extend(batch_y.cpu().numpy().flatten())
    errors = np.array(preds) - np.array(targets)
    return {"mae": np.mean(np.abs(errors)), "mse": np.mean(errors ** 2)}
# 3. Initial regression model
class SimpleRegressor(nn.Module):
    """Minimal MLP: 10 input features -> 32 hidden units -> 1 output value."""

    def __init__(self):
        super(SimpleRegressor, self).__init__()
        self.fc1 = nn.Linear(10, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        # Raw linear output: no activation on a regression head
        return self.fc2(hidden)
# 4. Evaluate the initial model (after 20 epochs of training)
model_reg_init = SimpleRegressor()
model_reg_init.load_state_dict(torch.load('./regressor_init.pth'))
init_reg_result = evaluate_regression(model_reg_init, test_loader)
print(f"初始回归模型:MAE {init_reg_result['mae']:.4f},MSE {init_reg_result['mse']:.4f}")
# Expected output: MAE 0.3567, MSE 0.2015 — error is on the high side
步骤 2:调优操作(数据标准化 + 模型优化 + 早停)
# 1. Improved regression model (extra hidden layer + batch normalization)
class ImprovedRegressor(nn.Module):
    """MLP with two BN'd hidden layers: 10 -> 64 -> 32 -> 1."""

    def __init__(self):
        super(ImprovedRegressor, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        # Extra hidden layer compared to the baseline: more fitting capacity
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        h = F.relu(self.bn1(self.fc1(x)))
        h = F.relu(self.bn2(self.fc2(h)))
        return self.fc3(h)
# 2. Training with early stopping (avoid over-training, stabilize the error).
# BUG FIX: the original code built train_loader from the FULL X_train_scaled
# and split the validation set afterwards, so every validation sample was
# also trained on — biasing the early-stopping decision. The split now
# happens first, and training uses only the sub-training portion.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_scaled, y_train_scaled, test_size=0.1, random_state=42
)
X_train_tensor = torch.tensor(X_train_sub, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_sub, dtype=torch.float32)
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
val_dataset = torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)
# Early-stopping training loop
model_reg_improved = ImprovedRegressor()
criterion = nn.MSELoss()  # standard loss choice for regression
optimizer = torch.optim.Adam(model_reg_improved.parameters(), lr=0.001, weight_decay=1e-5)
best_val_mae = float('inf')
patience = 5          # stop after this many epochs without improvement
patience_counter = 0
epochs = 50
for epoch in range(epochs):
    # --- training phase ---
    model_reg_improved.train()
    running_loss = 0.0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model_reg_improved(data)
        loss = criterion(output.flatten(), target)  # squeeze (N,1) to match (N,)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # --- validation phase (drives the early-stopping decision) ---
    val_result = evaluate_regression(model_reg_improved, val_loader)
    current_val_mae = val_result['mae']
    # Keep only the best checkpoint (lowest validation MAE)
    if current_val_mae < best_val_mae:
        best_val_mae = current_val_mae
        patience_counter = 0
        torch.save(model_reg_improved.state_dict(), './regressor_improved.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"验证 MAE 连续 {patience} 轮未下降,触发早停")
            break
    print(f"Epoch {epoch+1}/{epochs},验证 MAE {current_val_mae:.4f}")
步骤 3:调优后对比
# Evaluate the improved model
model_reg_improved = ImprovedRegressor()
model_reg_improved.load_state_dict(torch.load('./regressor_improved.pth'))
improved_reg_result = evaluate_regression(model_reg_improved, test_loader)
# Comparison output.
# NOTE(review): the printed "percentage" is the ABSOLUTE MAE drop x100, not a
# relative reduction — relative to the initial MAE (0.3567 -> 0.0892) the
# drop is ~75%. Confirm which figure is intended before reporting.
print(f"改进回归模型:MAE {improved_reg_result['mae']:.4f},MSE {improved_reg_result['mse']:.4f}")
print(f"MAE 降低:{(init_reg_result['mae'] - improved_reg_result['mae'])*100:.2f}%")
# Expected output: MAE 0.0892, MSE 0.0198
# MAE drop: 26.75 (absolute reduction x100) — error and prediction stability improved
案例总结
- 回归任务核心前提:数据标准化,避免不同量纲特征对模型训练的干扰。
- 误差优化关键:适度加深模型 + 批归一化稳定训练 + 早停避免过训练。
- 评估指标选择:优先关注 MAE(更易理解,对异常值鲁棒性更强),辅助参考 MSE。
三、通用调优总结与核心原则
- 先评估再调优:明确核心问题(过拟合/欠拟合/误差大),避免盲目修改模型。
- 控制单一变量:每次仅调整一个参数/一层结构,便于定位有效调优手段。
- 优先低成本调优:超参数(学习率、批次大小)→ 数据处理(标准化、增强)→ 模型结构修改。
- 保留最优模型:每轮调优后保存模型并记录评估指标,便于回滚和对比。