My initial network structure was:
#!/usr/bin/env python3
# encoding: utf-8
import math
import torch
from torch import nn

import positional_encoder as pe


class TransformerTS(nn.Module):
    def __init__(self,
                 enc_features_size,
                 dec_features_size,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation='relu',
                 custom_encoder=None,
                 custom_decoder=None,
                 batch_first=False):
        super(TransformerTS, self).__init__()
        self.transform = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            custom_encoder=custom_encoder,
            custom_decoder=custom_decoder,
            batch_first=batch_first
        )
        self.positional_encoding_layer = pe.PositionalEncoder(
            d_model=d_model,
            dropout=dropout,
            batch_first=batch_first
        )
        self.enc_input_fc = nn.Linear(enc_features_size, d_model)
        self.dec_input_fc = nn.Linear(dec_features_size, d_model)
        self.out_fc = nn.Linear(d_model, dec_features_size)

    def forward(self, enc_input, dec_input, src_mask, tgt_mask):
        # print('enc_input.size(): {}'.format(enc_input.size()))
        # embed_encoder_input: [enc_seq_len, 1, enc_features_size] -> [enc_seq_len, 1, d_model]
        embed_encoder_input = self.enc_input_fc(enc_input)
        # print('embed_encoder_input.size(): {}'.format(embed_encoder_input.size()))
        embed_encoder_input = self.positional_encoding_layer(embed_encoder_input)
        # print('embed_encoder_input.size(): {}'.format(embed_encoder_input.size()))
        # embed_decoder_input: [dec_seq_len, 1, dec_features_size] -> [dec_seq_len, 1, d_model]
        embed_decoder_input = self.dec_input_fc(dec_input)
        # x: [dec_seq_len, 1, d_model]
        x = self.transform(src=embed_encoder_input,
                           tgt=embed_decoder_input,
                           src_mask=src_mask,
                           tgt_mask=tgt_mask)
        # x: [dec_seq_len, 1, d_model] -> [dec_seq_len, 1, dec_features_size]
        x = self.out_fc(x)
        return x
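Both models import a separate positional_encoder module that is not included in this post. For reference, a standard sinusoidal positional encoder (Vaswani et al., 2017) matching the constructor arguments used above would look roughly like the sketch below; the max_seq_len default and the buffer layout are my assumptions, not the actual module.

import math
import torch
from torch import nn, Tensor


class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding added to the embedded inputs (sketch)."""

    def __init__(self, d_model: int, dropout: float = 0.1,
                 max_seq_len: int = 5000, batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first
        position = torch.arange(max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_seq_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)  # not a parameter, but moves with .to(device)

    def forward(self, x: Tensor) -> Tensor:
        # x: [seq_len, batch, d_model] if batch_first=False, else [batch, seq_len, d_model]
        if self.batch_first:
            x = x + self.pe[:x.size(1)].transpose(0, 1)
        else:
            x = x + self.pe[:x.size(0)]
        return self.dropout(x)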
During training, the loss doesn't seem to converge:
Train Epoch:0 Loss:1212.456150816
Train Epoch:50 Loss:1398.332747567
Train Epoch:100 Loss:1380.918499266
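For context, the forward pass above expects a src_mask and a tgt_mask. My full training loop is not shown here; one training step looks roughly like the sketch below, where the sequence lengths, optimizer, learning rate, MSE loss, and the generate_square_subsequent_mask helper are placeholders rather than my exact settings.

import torch
from torch import nn


def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    # Causal mask: -inf above the diagonal so position i cannot attend to later positions.
    return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)


enc_seq_len, dec_seq_len = 48, 12  # placeholder sequence lengths
model = TransformerTS(enc_features_size=1, dec_features_size=1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# Placeholder batch with batch_first=False: [seq_len, batch, features]
enc_input = torch.randn(enc_seq_len, 1, 1)
dec_input = torch.randn(dec_seq_len, 1, 1)
target = torch.randn(dec_seq_len, 1, 1)

tgt_mask = generate_square_subsequent_mask(dec_seq_len)
pred = model(enc_input, dec_input, src_mask=None, tgt_mask=tgt_mask)
loss = criterion(pred, target)

optimizer.zero_grad()
loss.backward()
optimizer.step()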
Then I looked at someone else's implementation, and it seems pretty much the same as mine:
towardsdatascience.com/how-to-make…
import torch.nn as nn
from torch import nn, Tensor
import positional_encoder as pe
import torch.nn.functional as F


class TimeSeriesTransformer(nn.Module):
    """
    This class implements a transformer model that can be used for time series
    forecasting. This time series transformer model is based on the paper by
    Wu et al (2020) [1]. The paper will be referred to as "the paper".

    A detailed description of the code can be found in my article here:
    https://towardsdatascience.com/how-to-make-a-pytorch-transformer-for-time-series-forecasting-69e073d4061e

    In cases where the paper does not specify what value was used for a specific
    configuration/hyperparameter, this class uses the values from Vaswani et al
    (2017) [2] or from PyTorch source code.

    Unlike the paper, this class assumes that input layers, positional encoding
    layers and linear mapping layers are separate from the encoder and decoder,
    i.e. the encoder and decoder only do what is depicted as their sub-layers
    in the paper. For practical purposes, this assumption does not make a
    difference - it merely means that the linear and positional encoding layers
    are implemented inside the present class and not inside the
    Encoder() and Decoder() classes.

    [1] Wu, N., Green, B., Ben, X., O'banion, S. (2020).
    'Deep Transformer Models for Time Series Forecasting:
    The Influenza Prevalence Case'.
    arXiv:2001.08317 [cs, stat] [Preprint].
    Available at: http://arxiv.org/abs/2001.08317 (Accessed: 9 March 2022).

    [2] Vaswani, A. et al. (2017)
    'Attention Is All You Need'.
    arXiv:1706.03762 [cs] [Preprint].
    Available at: http://arxiv.org/abs/1706.03762 (Accessed: 9 March 2022).
    """

    def __init__(self,
                 input_size: int,
                 dec_seq_len: int,
                 batch_first: bool,
                 out_seq_len: int = 58,
                 dim_val: int = 512,
                 n_encoder_layers: int = 4,
                 n_decoder_layers: int = 4,
                 n_heads: int = 8,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.1,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 num_predicted_features: int = 1
                 ):
        """
        Args:
            input_size: int, number of input variables. 1 if univariate.
            dec_seq_len: int, the length of the input sequence fed to the decoder
            dim_val: int, aka d_model. All sub-layers in the model produce
                     outputs of dimension dim_val
            n_encoder_layers: int, number of stacked encoder layers in the encoder
            n_decoder_layers: int, number of stacked decoder layers in the decoder
            n_heads: int, the number of attention heads (aka parallel attention layers)
            dropout_encoder: float, the dropout rate of the encoder
            dropout_decoder: float, the dropout rate of the decoder
            dropout_pos_enc: float, the dropout rate of the positional encoder
            dim_feedforward_encoder: int, number of neurons in the linear layer
                                     of the encoder
            dim_feedforward_decoder: int, number of neurons in the linear layer
                                     of the decoder
            num_predicted_features: int, the number of features you want to predict.
                                    Most of the time, this will be 1 because we're
                                    only forecasting FCR-N prices in DK2, but if
                                    we wanted to also predict FCR-D with the same
                                    model, num_predicted_features should be 2.
        """
        super().__init__()

        self.dec_seq_len = dec_seq_len

        # print("input_size is: {}".format(input_size))
        # print("dim_val is: {}".format(dim_val))

        # Creating the three linear layers needed for the model
        self.encoder_input_layer = nn.Linear(
            in_features=input_size,
            out_features=dim_val
        )
        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
        )
        self.linear_mapping = nn.Linear(
            in_features=dim_val,
            out_features=num_predicted_features
        )

        # Create positional encoder
        self.positional_encoding_layer = pe.PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
        )

        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017) on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
        )

        # Stack the encoder layers in nn.TransformerEncoder
        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerEncoderLayer per default normalizes
        # after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers,
            norm=None
        )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
        )

        # Stack the decoder layers in nn.TransformerDecoder
        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerDecoderLayer per default normalizes
        # after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers,
            norm=None
        )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None,
                tgt_mask: Tensor = None) -> Tensor:
        """
        Returns a tensor of shape:
        [target_sequence_length, batch_size, num_predicted_features]

        Args:
            src: the sequence to the encoder. Shape: (S, E) for unbatched input,
                 (S, N, E) if batch_first=False or (N, S, E) if
                 batch_first=True, where S is the source sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            tgt: the sequence to the decoder. Shape: (T, E) for unbatched input,
                 (T, N, E) if batch_first=False or (N, T, E) if
                 batch_first=True, where T is the target sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            src_mask: the mask for the src sequence to prevent the model from
                      using data points from the target sequence
            tgt_mask: the mask for the tgt sequence to prevent the model from
                      using data points from the target sequence
        """
        # print("From model.forward(): Size of src as given to forward(): {}".format(src.size()))
        # print("From model.forward(): tgt size = {}".format(tgt.size()))

        # Pass through the input layer right before the encoder
        src = self.encoder_input_layer(src)  # src shape: [batch_size, src length, dim_val] regardless of number of input features
        # print("From model.forward(): Size of src after input layer: {}".format(src.size()))

        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(src)  # src shape: [batch_size, src length, dim_val] regardless of number of input features
        # print("From model.forward(): Size of src after pos_enc layer: {}".format(src.size()))

        # Pass through all the stacked encoder layers in the encoder
        # Masking is only needed in the encoder if input sequences are padded
        # which they are not in this time series use case, because all my
        # input sequences are naturally of the same length.
        # (https://github.com/huggingface/transformers/issues/4083)
        src = self.encoder(  # src shape: [batch_size, enc_seq_len, dim_val]
            src=src
        )
        # print("From model.forward(): Size of src after encoder: {}".format(src.size()))

        # Pass decoder input through decoder input layer
        decoder_output = self.decoder_input_layer(tgt)  # decoder_output shape: [target sequence length, batch_size, dim_val] regardless of number of input features
        # print("From model.forward(): Size of decoder_output after linear decoder layer: {}".format(decoder_output.size()))

        # if src_mask is not None:
        #     print("From model.forward(): Size of src_mask: {}".format(src_mask.size()))
        # if tgt_mask is not None:
        #     print("From model.forward(): Size of tgt_mask: {}".format(tgt_mask.size()))

        # Pass through decoder - output shape: [batch_size, target seq len, dim_val]
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )
        # print("From model.forward(): decoder_output shape after decoder: {}".format(decoder_output.shape))

        # Pass through linear mapping
        decoder_output = self.linear_mapping(decoder_output)  # shape [batch_size, target seq len, num_predicted_features]
        # print("From model.forward(): decoder_output size after linear_mapping = {}".format(decoder_output.size()))

        return decoder_output
So, following that example, I changed mine to:
import torch.nn as nn
from torch import nn, Tensor
import positional_encoder as pe
import torch.nn.functional as F


class TimeSeriesTransformer(nn.Module):
    """
    This class implements a transformer model that can be used for time series
    forecasting. This time series transformer model is based on the paper by
    Wu et al (2020) [1]. The paper will be referred to as "the paper".

    A detailed description of the code can be found in my article here:
    https://towardsdatascience.com/how-to-make-a-pytorch-transformer-for-time-series-forecasting-69e073d4061e

    In cases where the paper does not specify what value was used for a specific
    configuration/hyperparameter, this class uses the values from Vaswani et al
    (2017) [2] or from PyTorch source code.

    Unlike the paper, this class assumes that input layers, positional encoding
    layers and linear mapping layers are separate from the encoder and decoder,
    i.e. the encoder and decoder only do what is depicted as their sub-layers
    in the paper. For practical purposes, this assumption does not make a
    difference - it merely means that the linear and positional encoding layers
    are implemented inside the present class and not inside the
    Encoder() and Decoder() classes.

    [1] Wu, N., Green, B., Ben, X., O'banion, S. (2020).
    'Deep Transformer Models for Time Series Forecasting:
    The Influenza Prevalence Case'.
    arXiv:2001.08317 [cs, stat] [Preprint].
    Available at: http://arxiv.org/abs/2001.08317 (Accessed: 9 March 2022).

    [2] Vaswani, A. et al. (2017)
    'Attention Is All You Need'.
    arXiv:1706.03762 [cs] [Preprint].
    Available at: http://arxiv.org/abs/1706.03762 (Accessed: 9 March 2022).
    """

    def __init__(self,
                 enc_features_size: int = 1,
                 dec_features_size: int = 1,
                 batch_first: bool = False,
                 d_model: int = 512,
                 num_encoder_layers: int = 4,
                 num_decoder_layers: int = 4,
                 n_heads: int = 8,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.1,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 ):
        """
        Args:
            enc_features_size: int, number of input variables. 1 if univariate.
            dec_features_size: int, the number of features you want to predict.
                               Most of the time, this will be 1 because we're
                               only forecasting FCR-N prices in DK2, but if
                               we wanted to also predict FCR-D with the same
                               model, dec_features_size should be 2.
            d_model: int, all sub-layers in the model produce
                     outputs of dimension d_model
            num_encoder_layers: int, number of stacked encoder layers in the encoder
            num_decoder_layers: int, number of stacked decoder layers in the decoder
            n_heads: int, the number of attention heads (aka parallel attention layers)
            dropout_encoder: float, the dropout rate of the encoder
            dropout_decoder: float, the dropout rate of the decoder
            dropout_pos_enc: float, the dropout rate of the positional encoder
            dim_feedforward_encoder: int, number of neurons in the linear layer
                                     of the encoder
            dim_feedforward_decoder: int, number of neurons in the linear layer
                                     of the decoder
        """
        super().__init__()

        # print("enc_features_size is: {}".format(enc_features_size))
        # print("d_model is: {}".format(d_model))

        # Creating the three linear layers needed for the model
        self.encoder_input_layer = nn.Linear(
            in_features=enc_features_size,
            out_features=d_model
        )
        self.decoder_input_layer = nn.Linear(
            in_features=dec_features_size,
            out_features=d_model
        )
        self.linear_mapping = nn.Linear(
            in_features=d_model,
            out_features=dec_features_size
        )

        # Create positional encoder
        self.positional_encoding_layer = pe.PositionalEncoder(
            d_model=d_model,
            dropout=dropout_pos_enc
        )

        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017) on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
        )

        # Stack the encoder layers in nn.TransformerEncoder
        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerEncoderLayer per default normalizes
        # after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_encoder_layers,
            norm=None
        )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
        )

        # Stack the decoder layers in nn.TransformerDecoder
        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerDecoderLayer per default normalizes
        # after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=num_decoder_layers,
            norm=None
        )

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None,
                tgt_mask: Tensor = None) -> Tensor:
        """
        Returns a tensor of shape:
        [target_sequence_length, batch_size, dec_features_size]

        Args:
            src: the sequence to the encoder. Shape: (S, E) for unbatched input,
                 (S, N, E) if batch_first=False or (N, S, E) if
                 batch_first=True, where S is the source sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            tgt: the sequence to the decoder. Shape: (T, E) for unbatched input,
                 (T, N, E) if batch_first=False or (N, T, E) if
                 batch_first=True, where T is the target sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            src_mask: the mask for the src sequence to prevent the model from
                      using data points from the target sequence
            tgt_mask: the mask for the tgt sequence to prevent the model from
                      using data points from the target sequence
        """
        # print("From model.forward(): Size of src as given to forward(): {}".format(src.size()))
        # print("From model.forward(): tgt size = {}".format(tgt.size()))

        # Pass through the input layer right before the encoder
        src = self.encoder_input_layer(src)  # src shape: [batch_size, src length, d_model] regardless of number of input features
        # print("From model.forward(): Size of src after input layer: {}".format(src.size()))

        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(src)  # src shape: [batch_size, src length, d_model] regardless of number of input features
        # print("From model.forward(): Size of src after pos_enc layer: {}".format(src.size()))

        # Pass through all the stacked encoder layers in the encoder
        # Masking is only needed in the encoder if input sequences are padded
        # which they are not in this time series use case, because all my
        # input sequences are naturally of the same length.
        # (https://github.com/huggingface/transformers/issues/4083)
        src = self.encoder(  # src shape: [batch_size, enc_seq_len, d_model]
            src=src
        )
        # print("From model.forward(): Size of src after encoder: {}".format(src.size()))

        # Pass decoder input through decoder input layer
        decoder_output = self.decoder_input_layer(tgt)  # decoder_output shape: [target sequence length, batch_size, d_model] regardless of number of input features
        # print("From model.forward(): Size of decoder_output after linear decoder layer: {}".format(decoder_output.size()))

        # if src_mask is not None:
        #     print("From model.forward(): Size of src_mask: {}".format(src_mask.size()))
        # if tgt_mask is not None:
        #     print("From model.forward(): Size of tgt_mask: {}".format(tgt_mask.size()))

        # Pass through decoder - output shape: [batch_size, target seq len, d_model]
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )
        # print("From model.forward(): decoder_output shape after decoder: {}".format(decoder_output.shape))

        # Pass through linear mapping
        decoder_output = self.linear_mapping(decoder_output)  # shape [batch_size, target seq len, dec_features_size]
        # print("From model.forward(): decoder_output size after linear_mapping = {}".format(decoder_output.size()))

        return decoder_output
The principle is the same in both: a linear layer before the transformer's input, and another linear layer after its output.
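As a quick shape check of the adapted model (the sequence lengths and batch size below are arbitrary placeholders, chosen only to show the linear-in / transformer / linear-out flow):

import torch

enc_seq_len, dec_seq_len, batch_size = 48, 12, 4  # arbitrary placeholder sizes

model = TimeSeriesTransformer(enc_features_size=1, dec_features_size=1, batch_first=False)

src = torch.randn(enc_seq_len, batch_size, 1)  # [S, N, enc_features_size]
tgt = torch.randn(dec_seq_len, batch_size, 1)  # [T, N, dec_features_size]
# Causal mask so each decoder position only attends to earlier target positions.
tgt_mask = torch.triu(torch.full((dec_seq_len, dec_seq_len), float('-inf')), diagonal=1)

out = model(src, tgt, src_mask=None, tgt_mask=tgt_mask)
print(out.shape)  # torch.Size([12, 4, 1]) -> [T, N, dec_features_size]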