One row is one batch
I had never written a dataloader before, so I looked at other people's code, and all it did was output one dimension, reading the csv line by line.
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset

# dataset class
class MyDataset(Dataset):
    def __init__(self, csv_file):
        self.lines = open(csv_file).readlines()

    def __getitem__(self, index):
        # get a line
        cur_line = self.lines[index].split(',')
        # enc_input has multiple features
        full_len_input = np.float32([cur_line[0].strip(), cur_line[1].strip()])
        dec_output = np.float32([cur_line[2].strip(), cur_line[3].strip()])
        # enc_input = np.float32(cur_line[0].strip())
        # dec_output = np.float32(cur_line[1].strip())
        return full_len_input, dec_output

    def __len__(self):
        return len(self.lines)  # number of rows in the dataset
This way my output is [batch_size, features_size], but in practice it ends up acting as [seq_len, features_size].
The downsides: first, there is no real batch_size; second, enc_input, dec_input and dec_output have to be sliced out manually afterwards.
The relationship between enc_input, dec_input and dec_output:
That is, given the encoder input (x1, x2, …, x10) and the decoder input (x10, …, x13),
the decoder aims to output (x11, …, x14).
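As a toy sketch of that relationship (the 14-step window, lengths and values here are made up, not from the real data):
import numpy as np

# hypothetical full window: x1 ... x14, one feature per step
full_seq = np.arange(1, 15, dtype=np.float32).reshape(14, 1)

enc_seq_len = 10                          # encoder sees x1 ... x10
enc_input = full_seq[:enc_seq_len]        # (x1, ..., x10)
dec_input = full_seq[enc_seq_len - 1:-1]  # (x10, ..., x13), target shifted right
dec_output = full_seq[enc_seq_len:]       # (x11, ..., x14), the targets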
One batch has multiple rows
Suppose I now have data like this,
and I want the input to take the first two columns and the output the last two columns.
Define a new dataset:
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset

# dataset class
class MyDataset(Dataset):
    def __init__(self, csv_file, seq_len: int, enc_features_size: int, dec_features_size: int):
        self.lines = open(csv_file).readlines()
        self.seq_len = seq_len
        self.enc_features_size = enc_features_size
        self.dec_features_size = dec_features_size
        print(len(self.lines) // self.seq_len)

    def __getitem__(self, index):
        full_len_input = []
        dec_output = []
        for i in range(self.seq_len):
            cur_line = self.lines[index * self.seq_len + i].split(',')
            full_len_input.append([])
            dec_output.append([])
            for j in range(self.enc_features_size):
                full_len_input[i].append(np.float32(cur_line[j].strip()))
            for j in range(self.dec_features_size):
                dec_output[i].append(np.float32(cur_line[self.enc_features_size + j].strip()))
        return np.array(full_len_input), np.array(dec_output)

    def __len__(self):
        return len(self.lines) // self.seq_len  # number of samples in the dataset
The result:
Also, if the DataLoader is created with drop_last=True, the final, incomplete chunk of data simply never shows up.
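For reference, a minimal way to wire this up (the csv name and the sizes here are made up):
from torch.utils.data import DataLoader

# hypothetical file name and sizes, just to show the call
dataset = MyDataset('train.csv', seq_len=10, enc_features_size=2, dec_features_size=2)
loader = DataLoader(dataset, batch_size=4, shuffle=False, drop_last=True)

for full_len_input, dec_output in loader:
    # full_len_input: [batch_size, seq_len, enc_features_size]
    # dec_output:     [batch_size, seq_len, dec_features_size]
    pass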
Outputting enc_input, dec_input, dec_output
Add the slicing on top of the previous version:
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset

# dataset class
class MyDataset(Dataset):
    def __init__(self,
                 csv_file,
                 full_seq_len: int,
                 enc_seq_len: int,
                 dec_seq_len: int,
                 enc_features_size: int,
                 dec_features_size: int):
        self.lines = open(csv_file).readlines()
        self.full_seq_len = full_seq_len
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.enc_features_size = enc_features_size
        self.dec_features_size = dec_features_size

    def __getitem__(self, index):
        enc_input = []
        dec_input = []
        dec_output = []
        for i in range(self.enc_seq_len):
            cur_line = self.lines[index * self.full_seq_len + i].split(',')
            enc_input.append([])
            for j in range(self.enc_features_size):
                enc_input[i].append(np.float32(cur_line[j].strip()))
        for i in range(self.dec_seq_len + 1):
            cur_line = self.lines[index * self.full_seq_len + i + self.enc_seq_len - 1].split(',')
            dec_input.append([])
            dec_output.append([])
            for j in range(self.dec_features_size):
                dec_input[i].append(np.float32(cur_line[self.enc_features_size + j].strip()))
                dec_output[i].append(np.float32(cur_line[self.enc_features_size + j].strip()))
        # dec_input is the target sequence shifted right, dec_output shifted left
        dec_input = dec_input[:-1]
        dec_output = dec_output[1:]
        enc_input = np.array(enc_input)
        dec_input = np.array(dec_input)
        dec_output = np.array(dec_output)
        return enc_input, dec_input, dec_output

    def __len__(self):
        return len(self.lines) // self.full_seq_len  # number of samples in the dataset
Result:
You can see that it now follows the pattern
enc_input = (x1, x2, …, x5)
dec_input = (x5, …, x9)
dec_output = (x6, …, x10)
But there is a problem: the next sample becomes
enc_input = (x11, x12, …, x15)
dec_input = (x15, …, x19)
dec_output = (x16, …, x20)
so the predicted values are no longer continuous.
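That is because each sample starts full_seq_len rows after the previous one, so consecutive samples never share any rows. A quick illustration with the made-up full_seq_len = 10 used above:
full_seq_len = 10
# start rows of the first few samples under this non-overlapping scheme
starts = [index * full_seq_len for index in range(3)]
print(starts)  # [0, 10, 20] -> windows x1..x10, x11..x20, x21..x30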
Be careful about returning a list from the dataloader
Since the predictions are broken up, I also wanted to return a timestamp so I could tell where the breaks are.
This requires the first column of the dataset to be the timestamp.
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self,
                 csv_file,
                 full_seq_len: int,
                 enc_seq_len: int,
                 dec_seq_len: int,
                 enc_features_size: int,
                 dec_features_size: int):
        self.lines = open(csv_file).readlines()
        self.full_seq_len = full_seq_len
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.enc_features_size = enc_features_size
        self.dec_features_size = dec_features_size

    def __getitem__(self, index):
        """
        By default:
            col 0 is the timestamp,
            cols 1 ~ enc_features_size are the input features,
            cols enc_features_size+1 ~ enc_features_size+dec_features_size are the output features.
        Caution:
            If you return a plain Python list (here, the timestamps), the DataLoader's default
            collate will mess things up when batch_size != 1. Assume you iterate with
                for idx, (enc_timestamp, dec_timestamp, enc_input, dec_input, dec_output) in enumerate(train_loader):
            and __getitem__ returns ['a1', 'a2', ..., 'an'] per sample.
            With batch_size = 1 you actually get [('a1',), ('a2',), ..., ('an',)].
            With batch_size = 2 you might expect
                [['a1', 'a2', ..., 'an'], ['an+1', 'an+2', ..., 'a2n']]
            but you actually get
                [('a1', 'an+1'), ('a2', 'an+2'), ..., ('an', 'a2n')],
            so returning a list type here is a bad idea.
        """
        enc_timestamp = []
        dec_timestamp = []
        enc_input = []
        dec_input = []
        dec_output = []
        for i in range(self.enc_seq_len):
            cur_line = self.lines[index * self.full_seq_len + i].split(',')
            enc_input.append([])
            enc_timestamp.append(cur_line[0].strip())
            for j in range(self.enc_features_size):
                enc_input[i].append(np.float32(cur_line[j + 1].strip()))
        for i in range(self.dec_seq_len + 1):
            cur_line = self.lines[index * self.full_seq_len + i + self.enc_seq_len - 1].split(',')
            dec_input.append([])
            dec_output.append([])
            dec_timestamp.append(cur_line[0].strip())
            for j in range(self.dec_features_size):
                dec_input[i].append(np.float32(cur_line[self.enc_features_size + j + 1].strip()))
                dec_output[i].append(np.float32(cur_line[self.enc_features_size + j + 1].strip()))
        dec_timestamp = dec_timestamp[1:]
        dec_input = dec_input[:-1]
        dec_output = dec_output[1:]
        # enc_timestamp = np.array(enc_timestamp)
        # dec_timestamp = np.array(dec_timestamp)
        enc_input = np.array(enc_input)
        dec_input = np.array(dec_input)
        dec_output = np.array(dec_output)
        return enc_timestamp, dec_timestamp, enc_input, dec_input, dec_output

    def __len__(self):
        return len(self.lines) // self.full_seq_len  # number of samples in the dataset
Result: with batch_size=1 the returned list is fine (some of the timestamps are odd strings, which is why I returned a list in the first place).
But with batch_size>1 the returned lists are automatically stacked together by the dataloader, ending up grouped along dimension 0.
That doesn't match my expectation at all... and I don't know why, when my dataset returns ['a1', 'a2', ..., 'an'], what comes out of the dataloader is [('a1',), ('a2',), ..., ('an',)].
Then again, numpy arrays come out of the dataloader as tensors anyway, so for those it doesn't really matter.
So I think returning the timestamps directly was a dumb idea, and returning a list is annoying in general.
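The reason is the default collate function: it zips the samples in a batch element by element, and strings are passed through untouched. A tiny demonstration (the timestamp strings are made up):
from torch.utils.data.dataloader import default_collate

# two hypothetical samples, each a list of three timestamp strings
batch = [['a1', 'a2', 'a3'], ['a4', 'a5', 'a6']]
print(default_collate(batch))
# -> [('a1', 'a4'), ('a2', 'a5'), ('a3', 'a6')]: grouped per position, not per sample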
Reading multiple csv files, without processing the data inside the dataset
Thinking about it later, processing the data inside the dataset is actually a bad idea, because it makes the whole dataset much less reusable.
The dataset's only job should be to read all the relevant csv files, merge them into one big matrix, and then split that matrix by full_seq_len into [data_len, full_seq_len, features_num].
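The splitting itself is just a reshape after trimming the incomplete tail; a rough sketch with made-up sizes:
import numpy as np

full_seq_len, features_num = 20, 12          # hypothetical sizes
big_matrix = np.zeros((1037, features_num))  # all rows of all csv files stacked

# drop the incomplete tail so the row count divides evenly, then reshape
usable_rows = (len(big_matrix) // full_seq_len) * full_seq_len
data = big_matrix[:usable_rows].reshape(-1, full_seq_len, features_num)
print(data.shape)  # (51, 20, 12) -> [data_len, full_seq_len, features_num]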
my_dataset.py
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset

# make the parent dir importable
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from csv_helper import get_ndarray_from_csv


class MyDataset(Dataset):
    """
    Read all csv files and merge them into one big matrix of size [all_rows_len, features_num].
    Then split the matrix into chunks of full_seq_len, so the resulting data size is
    [data_len, full_seq_len, features_num].
    """
    def __init__(self,
                 file_path,
                 file_prefix,
                 skiprows,
                 seq_len):
        self.seq_len = seq_len
        self.data = get_ndarray_from_csv(file_path, file_prefix, skiprows=skiprows)
        # drop the incomplete tail so the rows divide evenly into sequences
        ceil_row_len = int(len(self.data[0]) / self.seq_len) * self.seq_len
        self.data = self.data[:, :ceil_row_len, :]
        self.data = self.data.reshape((-1, self.seq_len, len(self.data[0, 0])))

    def __getitem__(self, index):
        if len(self.data) == 0:
            return 0
        return self.data[index]

    def __len__(self):
        if len(self.data) == 0:
            return 0
        return len(self.data)


# Test
# pred_len_size = 10
# # batch size
# enc_seq_len = pred_len_size
# dec_seq_len = pred_len_size
# full_seq_len = enc_seq_len + dec_seq_len
#
# dataset_train = MyDataset(file_path='../data/',
#                           file_prefix='train',
#                           skiprows=1,
#                           seq_len=full_seq_len)
The helper functions for reading multiple csv files are:
csv_helper.py
import re
import os
import csv
import numpy as np


def take_csv_number_by_prefix(elem, file_prefix):
    pattern = re.compile(file_prefix + '_(.+).csv')
    num = int(pattern.findall(elem)[0])
    return num


# get all csv files under root_path/file_prefix
# file names look like file_prefix_{number}.csv
def get_csv_names_by_prefix(root_path, file_prefix):
    csvFileNames = [fn for fn in os.listdir(root_path + file_prefix) if fn.endswith('.csv')]
    csvFileNames.sort(key=lambda elem: take_csv_number_by_prefix(elem, file_prefix))
    return csvFileNames


def get_ndarray_from_csv(root_path, file_prefix, delimiter=',', skiprows=0, read_files_count=-1):
    csvFileNames = get_csv_names_by_prefix(root_path, file_prefix)
    data = []
    read_files_count = len(csvFileNames) if read_files_count == -1 else read_files_count
    for i in range(read_files_count):
        # the file is closed automatically when the "with" block ends
        with open(root_path + file_prefix + '/' + csvFileNames[i], 'r', encoding='UTF-8') as csvfile:
            # new dim of data <-> new csv file
            data.append([])
            # add all rows to data[i]
            curr_skiprows = skiprows
            targetReader = csv.reader(csvfile, delimiter=delimiter)
            for row in targetReader:
                if curr_skiprows > 0:
                    curr_skiprows -= 1
                    continue
                try:
                    float_row = [float(x) for x in row]
                except ValueError:
                    # report the bad row and skip it instead of appending stale data
                    print(csvFileNames[i])
                    print(row)
                    continue
                data[i].append(float_row)
    return np.array(data)


# Test
# file_prefix = '../data/train'
# data = get_ndarray_from_csv(file_prefix, skiprows=1)
#
# print("!!")
But even then there still has to be somewhere to do the custom data processing.
I think the best approach is to let the dataset read data that has already been processed: I have a raw csv, I process it myself into a preprocessed csv, and everything downstream works off that preprocessed csv.
So I also wrote a preprocessing script.
For example, I need to drop the timestamp in the first column, min-max scale every column except the last two to [0, 1], and swap the third-to-last and fourth-to-last columns.
So first I need to get the min and max of every column except the last two, and then swap the last two entries of the min-max array (to match the later column swap).
get_old_data_min_max.py
import numpy as np
from dataset.csv_helper import get_ndarray_from_csv

data = get_ndarray_from_csv('./', 'train', skiprows=1)
data_reshaped = data.reshape(-1, len(data[0, 0]))
# get rid of the timestamp column and the last two symbol columns
data_reshaped = data_reshaped[:, 1:-2]

min_max = np.zeros((len(data_reshaped[0]), 2))
for i in range(len(data_reshaped[0])):
    min_max[i, 0] = np.min(data_reshaped[:, i])
    min_max[i, 1] = np.max(data_reshaped[:, i])

# swap rows -2 and -1 (they correspond to the columns swapped later)
tmp = np.copy(min_max[-2, :])
min_max[-2, :] = min_max[-1, :]
min_max[-1, :] = tmp

np.save("min_max.npy", min_max)
Then, when I want to write out the new csv, I use pandas to read the csv, swap the third-to-last and fourth-to-last columns of the dataframe, drop the first (timestamp) column, scale every column except the last two, and write the result out.
preprocess_data.py
# get the header (column names)
import csv

swap_columns = []
with open('../old_data/train/train_1.csv', 'r', encoding='UTF-8') as csvfile:
    targetReader = csv.reader(csvfile, delimiter=',')
    for row in targetReader:
        swap_columns = row
        break

# swap the -3 and -4 column names
tmp = swap_columns[-3]
swap_columns[-3] = swap_columns[-4]
swap_columns[-4] = tmp

# get min_max
import numpy as np
min_max = np.load("min_max.npy")

# preprocess data
import pandas as pd
from dataset.csv_helper import get_csv_names_by_prefix


def preprocess(prefix):
    # get csv file names
    names = get_csv_names_by_prefix('./', prefix)
    for name in names:
        # swap columns by reindexing with the swapped header
        df = pd.read_csv('./' + prefix + '/' + name)
        df = df.reindex(columns=swap_columns)
        # drop the timestamp column
        df = df.drop(columns=[swap_columns[0]])
        new_columns = swap_columns[1:]
        # min-max scaling for every column except the last two
        for i in range(len(min_max)):
            df[new_columns[i]] = (df[new_columns[i]] - min_max[i, 0]) / (min_max[i, 1] - min_max[i, 0])
        # output csv
        df.to_csv('../data/' + prefix + '/' + name, index=False)


preprocess('train')
preprocess('test')
Loading the data then looks like:
# load the data
dataset_train = MyDataset(file_path='../data/',
                          file_prefix='train',
                          skiprows=1,
                          seq_len=full_seq_len)
train_loader = DataLoader(dataset_train,
                          batch_size=batch_size,
                          shuffle=False,
                          drop_last=True)
And the training part is:
def train_transformer(epoch):
    global total_loss
    global start_time
    mode = True
    model.train(mode=mode)  # set the model to training mode
    loss_epoch = 0  # total loss over this epoch
    flag_stop = 0   # count of consecutive batches considered converged
    for idx, train_data in enumerate(train_loader):
        # slice [batch_size, full_seq_len, features] into encoder/decoder pieces
        enc_input = train_data[:, :enc_seq_len, :enc_features_size].to(torch.float32).to(device)                    # [batch_size, enc_seq_len, enc_features_size]
        dec_input = train_data[:, enc_seq_len-1:full_seq_len-1, -dec_features_size:].to(torch.float32).to(device)   # [batch_size, full_seq_len-enc_seq_len, dec_features_size]
        dec_output = train_data[:, enc_seq_len:full_seq_len, -dec_features_size:].to(torch.float32).to(device)      # [batch_size, full_seq_len-enc_seq_len, dec_features_size]
        if not batch_first:
            enc_input = enc_input.permute(1, 0, 2)  # [batch_size, seq_len, features_size] -> [seq_len, batch_size, features_size]
            dec_input = dec_input.permute(1, 0, 2)
            dec_output = dec_output.permute(1, 0, 2)
        # prediction: [dec_seq_len, batch_size, dec_features_size]
        prediction = model(enc_input, dec_input, src_mask, tgt_mask)
        loss = criterion(prediction, dec_output)
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()        # back propagation, compute gradients
        optimizer.step()       # apply gradients
        # scheduler.step()
        # print(scheduler.get_lr())
        loss_epoch += loss.item()  # accumulate each batch's loss until the whole epoch is done
        # If one batch's loss is below a threshold, start a consecutive counter;
        # if a batch's loss goes above the threshold, reset the counter to 0;
        # if the counter exceeds a limit, many consecutive losses were small, so stop early.
        if loss.item() < 0.05:
            flag_stop += 1
            if flag_stop >= 100:
                break
        else:
            flag_stop = 0
    hasSaved = ''
    train_loss_list.append(loss_epoch)
    if loss_epoch < total_loss:  # save the model whenever the epoch loss hits a new minimum
        hasSaved = ' Save model!'
        total_loss = loss_epoch
        # remember to change this path too!
        torch.save(model, '../model/Transformer_model_exp2_params_only_xl_' + name_flag + '.pkl')  # save model
    # if epoch % debug_epoch == 0:
    if epoch != 0:
        end_time = time.time()
        print('Train Epoch: {} Loss: {:.4f} Used time: {:.4f}'.format(epoch, loss_epoch, end_time - start_time) + hasSaved)
        start_time = end_time
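src_mask and tgt_mask are assumed to be defined elsewhere; as a rough sketch (not necessarily the exact masks used here), a causal mask for the decoder could be built like this:
import torch.nn as nn

dec_seq_len = 10  # hypothetical decoder length
# causal mask so the decoder cannot attend to future positions
# (generate_square_subsequent_mask is a static method in recent PyTorch versions)
tgt_mask = nn.Transformer.generate_square_subsequent_mask(dec_seq_len)
src_mask = None  # the encoder input is fully visible in this setup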
Continuous predictions
The previous dataset sampled data like 123 456 789,
but now I want it sampled like 123 234 345, so I rewrote the dataset as:
#!/usr/bin/env python3
# encoding: utf-8
import numpy as np
from torch.utils.data import Dataset

# make the parent dir importable
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from csv_helper import get_ndarray_from_csv


class MyDataset(Dataset):
    """
    Read all csv files and merge them into one big matrix of size [all_rows_len, features_num].
    Then split the matrix into sliding windows of full_seq_len, so the resulting data size is
    [data_len, full_seq_len, features_num].
    """
    def __init__(self,
                 file_path,
                 file_prefix,
                 skiprows,
                 seq_len):
        self.seq_len = seq_len
        self.data = get_ndarray_from_csv(file_path, file_prefix, skiprows=skiprows)
        self.files_len = len(self.data)
        self.rows_len_per_file = len(self.data[0])
        # number of sliding windows per file
        self.item_len_per_file = self.rows_len_per_file - seq_len + 1

    def __getitem__(self, index):
        if len(self.data) == 0:
            return 0
        # map the flat index to (file, start row); windows never cross file boundaries
        file_index = index // self.item_len_per_file
        item_index = index % self.item_len_per_file
        data_item = self.data[file_index, item_index:item_index + self.seq_len, :]
        return data_item

    def __len__(self):
        if len(self.data) == 0:
            return 0
        return self.files_len * self.item_len_per_file  # number of samples in the dataset


# Test
# enc_features_size = 9
# dec_features_size = 3
# batch_first = False
#
# pred_len_size = 10
#
# # batch size
#
# enc_seq_len = pred_len_size
# dec_seq_len = pred_len_size
# full_seq_len = enc_seq_len + dec_seq_len
#
# import torch
# from torch.utils.data import DataLoader
#
# # device GPU or CPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print('You are using: ' + str(device))
#
# dataset_train = MyDataset(file_path='../data/',
#                           file_prefix='train',
#                           skiprows=1,
#                           seq_len=full_seq_len)
# train_loader = DataLoader(dataset_train,
#                           batch_size=1,
#                           shuffle=False,
#                           drop_last=True)
#
# for idx, train_data in enumerate(train_loader):
#     train_data = train_data.squeeze(0)
#     print('!')
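As a quick sanity check of the new index math (sizes made up), consecutive indices now give overlapping windows:
seq_len = 3
rows_per_file = 5  # rows x1 ... x5 in one hypothetical file
item_len_per_file = rows_per_file - seq_len + 1  # 3 windows per file

for index in range(item_len_per_file):
    start = index % item_len_per_file
    print(list(range(start + 1, start + 1 + seq_len)))
# [1, 2, 3]
# [2, 3, 4]
# [3, 4, 5]  -> the "123 234 345" sampling described above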
Although honestly it didn't converge either way... all I can say is that I did my best on the data loading part.