实践目标
通过一个预测新冠病毒发病率的题目,学习深度学习的训练过程,注意常见的库的使用、基于pytorch进行深度学习模型训练的流程。
先回顾下上篇提到的深度学习模型训练的流程,数据加载-->训练-->校验-->测试
代码结构
准备工作
- 准备数据集
- Kaggle下载数据: Kaggle: ml2022spring-hw1
- 百度云下载数据: 云盘(提取码:ml22)
下载好后,放到项目根目录,声明数据集path,后面需要用
# Relative paths of the training and test CSV files; both are passed to
# prep_dataloader later in this file.
tr_path = 'covid.train.csv'
tt_path = 'covid.test.csv'
- import 常用库
- torch:张量、数据集积累、自动梯度、计算图、GPU加速、深度学习模块
- 数据处理:numpy、csv
- 绘图工具:Matplotlib,直观的呈现模型训练的效果
# PyTorch
# 导入 PyTorch 核心库,后续所有张量操作、自动求导、模型定义都基于它
import torch
# torch.nn是 PyTorch 中定义各种神经网络层、损失函数等组件的模块。通常我们用 nn 作为别名,后面定义模型时会频繁使用,比如 nn.Linear、nn.Conv2d、nn.ReLU 等
import torch.nn as nn
# Dataset:抽象类,用于封装数据集;子类需要实现 __len__() 和 __getitem__(),以便按索引取样本
# DataLoader:数据加载器,可以将任意 Dataset 封装成可迭代对象,自动支持多进程并行加载(num_workers)、批量采样(batch)、打乱顺序(shuffle)等
from torch.utils.data import Dataset, DataLoader
print(torch.__version__)
print(torch.cuda.is_available())
# For data processing
import numpy as np
import csv
import os
# For plotting
# Matplotlib 是最常用的绘图库,pyplot 提供类似 MATLAB 的绘图接口,用于可视化训练过程中的损失曲线、预测结果、图像样本等
import matplotlib.pyplot as plt
# 直接导入 figure() 函数,用来创建一个新的图表窗口或画布,方便接下来调用 fig = figure(figsize=(x, y)) 来设置画布大小
from matplotlib.pyplot import figure
- 准备好辅助函数
- 设备获取
- 绘制学习曲线、预测值和真实值对比
def get_device():
    """Return 'cuda' when a CUDA device is available, otherwise 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'
def plot_learning_curve(loss_record, title=''):
    """Plot the training loss curve and, when present, the dev loss curve.

    Args:
        loss_record: dict with the keys 'train' and 'dev', each holding a
            sequence of loss values. The x axis is the index into
            loss_record['train']; the dev values are spread over that axis
            at equal intervals.
        title: text appended to the figure title.
    """
    train_losses = loss_record['train']
    dev_losses = loss_record['dev']

    # 1. The x axis spans every recorded training step.
    total_steps = len(train_losses)
    x_1 = range(total_steps)

    # 2. New canvas.
    figure(figsize=(6, 4))

    # 3. Training loss.
    plt.plot(x_1, train_losses, c='tab:red', label='train')

    # 4. Dev loss, sub-sampled onto the training-step axis.
    if len(dev_losses) != 0:
        # Clamp the stride to at least 1: a zero stride (more dev entries than
        # train entries) would make the slice raise ValueError.
        stride = max(1, total_steps // len(dev_losses))
        # When the two lengths do not divide evenly the strided range can be
        # longer than the dev record; truncate both sides to a common length
        # so plt.plot always receives matching x / y sizes.
        x_2 = x_1[::stride][:len(dev_losses)]
        plt.plot(x_2, dev_losses[:len(x_2)], c='tab:cyan', label='dev')

    # 5. Fixed y range.
    plt.ylim(0.0, 20.)

    # 6. Axis labels and title.
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))

    # 7. Legend and render.
    plt.legend()
    plt.show()
def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    """Scatter-plot predictions against ground-truth values.

    When either `preds` or `targets` is missing, both are recomputed by
    running `model` in eval mode over every batch of `dv_set` on `device`.
    A diagonal reference line from -0.2 to `lim` is drawn on both axes.
    """
    have_inputs = preds is not None and targets is not None
    if not have_inputs:
        model.eval()
        pred_batches = []
        target_batches = []
        for x, y in dv_set:
            x = x.to(device)
            y = y.to(device)
            # Inference only: no autograd bookkeeping needed.
            with torch.no_grad():
                batch_pred = model(x)
            pred_batches.append(batch_pred.detach().cpu())
            target_batches.append(y.detach().cpu())
        # Number of collected batches (not samples).
        print("preds shape : {}".format(len(pred_batches)))
        preds = torch.cat(pred_batches, dim=0).numpy()
        targets = torch.cat(target_batches, dim=0).numpy()
        print("preds after cat shape : {}".format(preds.shape))

    lower = -0.2
    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Reference line y = x: points on it are perfect predictions.
    plt.plot([lower, lim], [lower, lim], c='b')
    plt.xlim(lower, lim)
    plt.ylim(lower, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
定义数据结构&网络模型
- 定义数据结构
继承自torch中的基类Dataset,实现三个接口:__init__(读取并预处理数据)、__getitem__(按索引取样本)、__len__(返回样本数)。
注意train(训练)、dev(校验)、test(测试)数据集处理的差异
class COVID19Dataset(Dataset):
    """Dataset for loading and preprocessing the COVID19 CSV files.

    The CSV header row and the first (id) column are dropped. In 'train' and
    'dev' mode the last remaining column is the regression target and the
    rows are split into two disjoint subsets: every 10th row (i % 10 == 0)
    goes to 'dev', all other rows go to 'train'. In 'test' mode there is no
    target column.

    Attributes:
        mode: one of 'train', 'dev', 'test'.
        data: float tensor of the selected feature columns.
        target: float tensor of targets ('train' / 'dev' only).
        mean, std: per-feature statistics computed over all rows of the file
            before splitting ('train' / 'dev' only), kept with a leading
            dimension of 1.
        dim: number of feature columns.
    """

    def __init__(self, path, mode='train', target_only=False):
        """Read `path` and build the tensors for the requested `mode`.

        Args:
            path: path of the CSV file.
            mode: 'train', 'dev' or 'test'.
            target_only: when True use the hand-picked 14-column feature
                subset, otherwise the first 93 columns.

        Raises:
            ValueError: if `mode` is not one of the supported values.
        """
        if mode not in ('train', 'dev', 'test'):
            # Previously an unknown mode surfaced as a NameError on `indices`.
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into a numpy array; newline='' is what the csv module
        # expects for files it parses.
        with open(path, 'r', newline='') as fp:
            rows = list(csv.reader(fp))
        # Drop the header row and the leading id column.
        data = np.array(rows[1:])[:, 1:].astype(float)

        if not target_only:
            feats = list(range(93))
        else:
            # Hand-picked feature subset; the original note attributes the
            # selection to sklearn mutual information.
            feats = [40, 41, 42, 43, 57, 58, 59, 60, 61, 75, 76, 77, 78, 79]

        if mode == 'test':
            # Testing data
            # data: 893 * 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            self.data = torch.FloatTensor(data[:, feats])
        else:
            # Training data (train/dev sets)
            # data: 2700 * 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # Statistics are taken over the whole file so that the train and
            # dev splits are normalized identically.
            all_rows = torch.FloatTensor(data)
            self.mean = all_rows.mean(dim=0, keepdim=True)
            self.std = all_rows.std(dim=0, keepdim=True)

            # Split into disjoint train / dev subsets. The dev rule used to
            # be `i % 5 == 0`, which also selected rows with i % 10 == 5 that
            # belong to the training split, leaking training samples into
            # validation.
            row_ids = np.arange(len(data))
            held_out = row_ids % 10 == 0
            indices = row_ids[held_out] if mode == 'dev' else row_ids[~held_out]

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        self.dim = self.data.shape[1]
        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        """Return (features, target) for 'train' / 'dev', features only for 'test'."""
        if self.mode in ['train', 'dev']:
            return self.data[index], self.target[index]
        # For testing (no target)
        return self.data[index]

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.data)

    def normalization(self, mean=None, std=None):
        """Standardize `self.data` in place as (data - mean) / std.

        In 'train' / 'dev' mode the statistics stored on the instance are
        used and the arguments are ignored. In 'test' mode the caller must
        pass the statistics obtained from the training data. Each call
        normalizes the current `self.data` again, so call it once.

        Returns:
            The (mean, std) pair that was applied.

        Raises:
            ValueError: in 'test' mode when `mean` or `std` is missing.
        """
        if self.mode in ('train', 'dev'):
            mean, std = self.mean, self.std
        elif mean is None or std is None:
            raise ValueError(
                "mean and std from the training data are required in 'test' mode")
        self.data = (self.data - mean) / std
        return mean, std
- 定义数据集加载的函数
使用torch提供的DataLoader,直接把数据灌进去即可,DataLoader能自动处理好batch、shuffle等工作
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False, mean=None, std=None):
    """Build a normalized COVID19Dataset and wrap it in a DataLoader.

    Args:
        path: CSV file to read.
        mode: dataset mode forwarded to COVID19Dataset; batches are shuffled
            only when it equals 'train'.
        batch_size: mini-batch size.
        n_jobs: value passed as `num_workers` to the DataLoader.
        target_only: forwarded to COVID19Dataset.
        mean, std: statistics forwarded to the dataset's `normalization`.

    Returns:
        (dataloader, mean, std), where mean / std are whatever
        `normalization` returned.
    """
    # Construct dataset
    covid_data = COVID19Dataset(path, mode=mode, target_only=target_only)
    stats = covid_data.normalization(mean, std)
    mean, std = stats

    shuffle_batches = mode == 'train'
    loader = DataLoader(
        covid_data,
        batch_size,
        shuffle=shuffle_batches,
        drop_last=False,
        num_workers=n_jobs,
        pin_memory=True,
    )
    return loader, mean, std
- 定义网络模型
网络模型的代码其实很少
模型层级的串联有两种写法,我们这里用左边的Sequential方式,更简洁
模型层级,是由大到小,这是固定的套路,这样的先扩散后收敛的形式,能让神经网络拟合出更丰富的特征空间。
注意!上一篇讲过,在torch中Linear是右乘。
class NeuralNet(nn.Module):
    """A simple fully-connected regression network.

    Five linear layers (input_dim -> 64 -> 16 -> 8 -> 4 -> 1) with ReLU
    between them, trained with mean squared error plus an L1 penalty on all
    parameters.
    """

    def __init__(self, input_dim):
        """Create the layers for inputs with `input_dim` features."""
        super().__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1)
        )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        """Map input of size (batch_size, input_dim) to output of size (batch_size,)."""
        # The last layer yields (batch_size, 1); drop the trailing dimension
        # so predictions line up with the 1-D target tensor.
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target, l1_lambda):
        """Return MSE(pred, target) + l1_lambda * sum(|p|) over all parameters.

        Args:
            pred: predictions from `forward`.
            target: ground-truth values with the same shape as `pred`.
            l1_lambda: weight of the L1 penalty.
        """
        loss = self.criterion(pred, target)
        # L1 regularization. The penalty is accumulated from the parameters
        # themselves, so it always lives on the model's own device. The
        # previous version created the accumulator on the globally detected
        # device, which breaks when the model sits on a different device
        # (e.g. a CPU model on a CUDA-capable machine).
        l1_reg = sum(param.abs().sum() for param in self.parameters())
        return loss + l1_lambda * l1_reg
train
train环节需要注意几点:
- 损失函数
- Mean Squared Error (for regression tasks) criterion = nn.MSELoss()
- Cross Entropy (for classification tasks) criterion = nn.CrossEntropyLoss()
- 参数优化
损失函数驱动模型参数朝哪个方向优化,参数优化方法决定怎么“快速”的将参数降到最优值。
可以粗糙的理解为:前者是指南针,后者是交通工具。合适的优化方法能大幅降低train的迭代次数。
最常见的就是随机梯度下降(Stochastic Gradient Descent (SGD))
torch.optim.SGD(model.parameters(), lr, momentum = 0)
- 整个训练流程为:
train代码:
def train(tr_set, dv_set, model, config, device):
    """Train `model`, checkpointing the weights with the lowest dev loss.

    Args:
        tr_set: iterable of (x, y) training batches.
        dv_set: validation loader, evaluated once per epoch via `dev`.
        model: network exposing `cal_loss(pred, target, l1_lambda)`.
        config: dict providing 'n_epochs', 'optimizer' (a class name in
            torch.optim), 'optim_hparas', 'l1_lambda', 'early_stop' and
            'save_path'.
        device: device the batches are moved to.

    Returns:
        (min_mse, loss_record): the best dev loss seen, and a dict with the
        per-step training losses ('train') and per-epoch dev losses ('dev').
    """
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer: look the class up by name in torch.optim.
    optimizer_cls = getattr(torch.optim, config['optimizer'])
    optimizer = optimizer_cls(model.parameters(), **config['optim_hparas'])

    # Start from +inf so the first finite dev loss always produces a
    # checkpoint. The previous fixed sentinel of 1000. meant that a run whose
    # dev loss never dropped below 1000 saved nothing at all.
    min_mse = float('inf')
    loss_record = {'train': [], 'dev': []}
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()  # Set your model to train mode.
        for x, y in tr_set:
            optimizer.zero_grad()  # Clear gradients left from the previous step.
            x, y = x.to(device), y.to(device)
            pred = model(x)
            # Note: this is the regularized loss (MSE + L1 penalty).
            step_loss = model.cal_loss(pred, y, config['l1_lambda'])
            step_loss.backward()
            optimizer.step()
            loss_record['train'].append(step_loss.item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'.format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop once more than config['early_stop'] consecutive epochs
            # have passed without improvement.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record
校验代码:
注意!校验时,要关闭自动梯度,减少不必要的内存和计算开销
def dev(dv_set, model, device, l1_lambda=None):
    """Return the sample-weighted average loss of `model` over `dv_set`.

    Args:
        dv_set: DataLoader yielding (x, y) batches; `dv_set.dataset` is used
            for the total sample count.
        model: network exposing `cal_loss(pred, target, l1_lambda)`.
        device: device the batches are moved to.
        l1_lambda: L1 weight handed to `model.cal_loss`. When omitted, the
            value is read from the module-level `config` dict, which is what
            this function always did before the parameter existed.

    Returns:
        The average of `model.cal_loss` over all samples, as a float. The
        value includes the L1 penalty, not just the MSE.
    """
    if l1_lambda is None:
        # Backward-compatible fallback to the module-level configuration.
        l1_lambda = config['l1_lambda']

    model.eval()  # Set your model to evaluation mode.
    total_loss = 0.0
    # No gradients are needed for validation; disable autograd once for the
    # whole pass.
    with torch.no_grad():
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            batch_loss = model.cal_loss(pred, y, l1_lambda)
            # Weight by batch size so a smaller last batch is not over-counted.
            total_loss += batch_loss.item() * len(x)

    return total_loss / len(dv_set.dataset)
- 超参数定义
就是一些常量定义,这是学习项目,所以将一些seed定义成确定值,确保随机函数的一致性,便于观察模型的特点和结论复现。
# Resolve the compute device and make sure the checkpoint directory used by
# config['save_path'] exists.
device = get_device()
os.makedirs('models', exist_ok=True)
# Forwarded to prep_dataloader; True selects the reduced feature subset in
# COVID19Dataset.
target_only = True
# Seed NumPy before drawing `delta` so the perturbation below is reproducible.
seed = 459
np.random.seed(seed)
# Tiny Gaussian perturbation (scale 1e-6) that is added to the L1 weight below.
delta = np.random.normal(loc=0, scale = 0.000001)
# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 3000, # maximum number of epochs
    'batch_size': 270, # mini-batch size for dataloader
    'optimizer': 'Adam', # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': { # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.001, # learning rate passed to the optimizer selected above
    },
    'l1_lambda': 0.001 + delta, # weight of the L1 penalty used by model.cal_loss
    'early_stop': 200, # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth' # your model will be saved here
}
# Fix the random seed to the integer 42069 so that every seeded random
# operation below starts from the same state.
myseed = 42069
# Ask cuDNN to use only deterministic algorithm implementations.
torch.backends.cudnn.deterministic = True
# Disable cuDNN auto-tuning (benchmark mode). When enabled, cuDNN times several
# algorithms on first use and picks the fastest, which can itself introduce
# run-to-run differences.
torch.backends.cudnn.benchmark = False
# Re-seed NumPy's global random number generator (this replaces the earlier
# seed 459, which only governed the draw of `delta`).
np.random.seed(myseed)
# Seed PyTorch's random number generator (torch.randn, torch.randperm,
# dropout, DataLoader shuffling, ...).
torch.manual_seed(myseed)
if torch.cuda.is_available():
    # When a GPU is available, also seed the generators of all CUDA devices.
    torch.cuda.manual_seed_all(myseed)
数据加载&训练
- 加载数据
# Build the train / dev / test loaders. The mean and std returned for the
# training loader are forwarded to the other two; COVID19Dataset.normalization
# only uses the forwarded values in 'test' mode.
tr_set, mean, std = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set, _, _ = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only, mean=mean, std=std)
tt_set, _, _ = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only, mean=mean, std=std)
# Log output recorded by the author:
# Finished reading the train set of COVID19 Dataset (2429 samples found, each dim = 14)
# Finished reading the dev set of COVID19 Dataset (540 samples found, each dim = 14)
# Finished reading the test set of COVID19 Dataset (1078 samples found, each dim = 14)
- train触发
# Build the network for the dataset's feature dimension, move it to the
# selected device and run training.
model = NeuralNet(tr_set.dataset.dim).to(device)
model_loss,model_loss_record = train(tr_set, dv_set, model, config, device)
# Elapsed time recorded by the author: 2m 4.0s
- 绘制学习曲线,观察收敛趋势
# Plot the recorded train / dev losses to inspect convergence.
plot_learning_curve(model_loss_record, title='deep model')
- 模型测试
用测试数据集测试模型
del model # the input batch may differ, so drop the model and build a fresh one
model = NeuralNet(tr_set.dataset.dim).to(device)
# The checkpoint is a state dict written by train(); it is loaded onto the CPU
# first and then copied into the freshly built model.
ckpt = torch.load(config['save_path'], map_location='cpu') # Load your best model
model.load_state_dict(ckpt)
if len(dv_set) > 0:
    plot_pred(dv_set, model, device) # Show prediction on the validation set
可以观察到,predicted value和ground truth value组成的(x,y)值很靠近斜对角线,说明模型预测的值还是比较准的。
总结
第一个正式的案例,把深度学习过程涉及到的知识点都带上了。麻雀虽小五脏俱全,要理解透彻需要花一点时间。
可以看到,真正的模型定义部分的代码是很少的,大量的代码都是关于数据处理,数据呈现的。
参考:
- 李宏毅机器学习
- github.com/pai4451/ML2…