九、Transformer 江山一统

思考： RNN有什么缺点？大模型为什么不是很多层RNN？

9.1、消除恐惧：我们亲手写一个 Transformer

9.1.1、Embeddings

#%%
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal position encodings, precomputed for ``max_len`` positions.

    The table is stored as a buffer named ``pe`` with shape
    ``(1, max_len, embed_size)``; it is not a trainable parameter.
    """

    def __init__(self, embed_size, max_len=512):
        """
        :param embed_size: width of each position vector
        :param max_len: number of positions to precompute
        """
        super().__init__()

        # Column vector of positions 0..max_len-1, shape (max_len, 1).
        position = torch.arange(0, max_len).float().unsqueeze(1)
        # One frequency per even column: exp(-2i * ln(10000) / embed_size),
        # computed in log space.
        div_term = (torch.arange(0, embed_size, 2).float()
                    * -(math.log(10000.0) / embed_size)).exp()

        pe = torch.zeros(max_len, embed_size).float()
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd embed_size there is one fewer odd column than even
        # columns, so the surplus frequency is dropped to keep shapes aligned.
        pe[:, 1::2] = torch.cos(position * div_term[:embed_size // 2])

        # The leading axis of size 1 broadcasts over the batch. A buffer is
        # excluded from parameters(), so no requires_grad flag has to be set.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Only the size of ``x`` along dim 1 is read; its values are ignored.
        The result has shape ``(1, min(x.size(1), max_len), embed_size)``.
        """
        return self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):
    """Sum of token, position and token-type embeddings, followed by dropout."""

    def __init__(self, vocab_size, embed_size, dropout=0.1, max_len=512):
        """
        :param vocab_size: size of the vocabulary
        :param embed_size: embedding width (e.g. 768)
        :param dropout: dropout probability applied to the summed embedding
        :param max_len: number of positions precomputed by the position table
        """
        super().__init__()
        # Token id 0 is treated as padding: its embedding row stays zero.
        self.token_embedding = nn.Embedding(
            vocab_size, embed_size, padding_idx=0)
        self.position_embedding = PositionalEmbedding(
            embed_size=embed_size, max_len=max_len)
        # Both token-type ids (0 and 1) are real segment ids, so neither row
        # is marked as padding; a padding_idx of 0 would freeze segment 0 to
        # a zero vector that never receives gradient.
        self.token_type_embedding = nn.Embedding(2, embed_size)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, input_ids, token_type_ids):
        """Embed a batch of token ids.

        :param input_ids: integer token ids; dim 1 is the sequence axis
        :param token_type_ids: integer segment ids (0 or 1), same shape as
            ``input_ids``
        :return: the three embeddings summed, with dropout applied
        """
        tokens = self.token_embedding(input_ids)
        # The position module only reads the sequence length of its input and
        # returns a batch-size-1 tensor that broadcasts in the sum.
        positions = self.position_embedding(input_ids)
        segments = self.token_type_embedding(token_type_ids)
        return self.dropout(tokens + positions + segments)

9.1.2、单头 Attention

#%% import torch.nn as nn import torch.nn.functional as F import torch

import math

'''
query = query_linear(x)
key = key_linear(x)
value = value_linear(x)
'''

单个头的注意力计算

class Attention(nn.Module):
    """Scaled dot-product attention for a single head (or a batch of heads)."""

    def forward(self, query, key, value, mask=None, dropout=None):
        """Compute attention weights and the weighted sum of ``value``.

        :param query: tensor whose last two dims are (query_len, dim)
        :param key: tensor whose last two dims are (key_len, dim)
        :param value: tensor whose last two dims are (key_len, value_dim)
        :param mask: optional tensor broadcastable to the score matrix;
            positions where it equals 0 are excluded from attention
        :param dropout: optional callable applied to the attention weights
        :return: tuple ``(weighted values, attention weights)``
        """
        # Dot product of every query with every key, scaled by sqrt(dim).
        scores = torch.matmul(query, key.transpose(-2, -1)) \
            / math.sqrt(query.size(-1))

        if mask is not None:
            # Use the most negative value of the score dtype rather than a
            # fixed -1e9, which cannot be represented in float16.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        # Normalise over the key axis so each query's weights sum to 1.
        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

每个token对应的query向量与每个token对应的key向量做内积

将上述内积除以 $\sqrt{d_k}$（$d_k$ 为每个头的向量维度）后取softmax（得到0~1之间的值，即为attention权重）

计算每个token相对于所有其它token的attention权重（最终构成一个$L\times L$的attention矩阵）

每个token对应的value向量乘以attention权重，并相加，得到当前token的self-attention value向量

将上述操作应用于每个token

以上是一个头的操作，同时（并行）应用于多个独立的头

9.1.3、多头 Attention

将每个头得到的向量拼接在一起，最后乘一个线性矩阵，得到 multi-head attention 的输出

#%%
import torch.nn

class MultiHeadedAttention(nn.Module):
    """Multi-head self-attention: project, split into heads, attend, merge."""

    def __init__(self, head_num, hidden_size, dropout=0.1):
        """
        :param head_num: number of heads; must divide hidden_size evenly
        :param hidden_size: hidden width, same as the embedding width
        :param dropout: dropout probability applied to the attention weights
        """
        super().__init__()
        assert hidden_size % head_num == 0

        self.per_head_dim = hidden_size // head_num
        self.head_num = head_num
        self.hidden_size = hidden_size

        self.query_linear = nn.Linear(hidden_size, hidden_size)
        self.key_linear = nn.Linear(hidden_size, hidden_size)
        self.value_linear = nn.Linear(hidden_size, hidden_size)

        self.output_linear = nn.Linear(hidden_size, hidden_size)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def reshape(self, x, batch_size):
        """Split the last dim into heads.

        Maps ``(batch, seq, hidden)`` to ``(batch, head_num, seq, per_head_dim)``.
        """
        return x.view(batch_size, -1, self.head_num, self.per_head_dim).transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        :param x: input of shape ``(batch, seq, hidden_size)``
        :param mask: optional mask passed through to the attention module
        :return: tensor of shape ``(batch, seq, hidden_size)``
        """
        batch_size = x.size(0)

        # reshape() needs the batch size to split each projection into heads.
        query = self.reshape(self.query_linear(x), batch_size)
        key = self.reshape(self.key_linear(x), batch_size)
        value = self.reshape(self.value_linear(x), batch_size)

        # Attention is computed independently for every head; the returned
        # attention weights are not used here.
        x, _ = self.attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # Concatenate the per-head outputs back into one hidden vector.
        x = x.transpose(1, 2).contiguous().view(
            batch_size, -1, self.hidden_size)

        # Final linear projection.
        return self.output_linear(x)

#%% md

9.1.4、全连接网络（Feed-Forward Network）

#%% import torch.nn as nn

class FeedForward(nn.Module):
    """Position-wise two-layer network: expand to 4x width, GELU, project back."""

    def __init__(self, hidden_size, dropout=0.1):
        """
        :param hidden_size: input and output width
        :param dropout: dropout probability applied after the activation
        """
        super().__init__()
        self.input_layer = nn.Linear(hidden_size, hidden_size * 4)
        self.output_layer = nn.Linear(hidden_size * 4, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Apply linear -> GELU -> dropout -> linear to the last dim of ``x``."""
        hidden = self.activation(self.input_layer(x))
        return self.output_layer(self.dropout(hidden))

9.1.5、拼成一层 Transformer

#%% import torch.nn as nn

class TransformerBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (post-norm arrangement).
    """

    def __init__(self, hidden_size, head_num, dropout=0.1):
        """
        :param hidden_size: hidden width of the layer
        :param head_num: number of attention heads
        :param dropout: dropout probability used by every sub-layer
        """
        super().__init__()
        # Forward the dropout rate so the attention weights use the same
        # probability as the rest of the block.
        self.multi_head_attention = MultiHeadedAttention(
            head_num, hidden_size, dropout=dropout)
        self.feed_forward = FeedForward(hidden_size, dropout=dropout)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.dropout1 = nn.Dropout(dropout)
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        # Not used by forward(); kept so the attribute remains available.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        :param x: input hidden states
        :param mask: optional attention mask handed to the attention layer
        :return: hidden states with the same shape as ``x``
        """
        # Multi-head attention, then residual + LayerNorm (1).
        attended = self.dropout1(self.multi_head_attention(x, mask))
        x = self.layer_norm1(x + attended)

        # Feed-forward network, then residual + LayerNorm (2).
        transformed = self.dropout2(self.feed_forward(x))
        return self.layer_norm2(x + transformed)

Multi-head attention的输出，经过残差和norm之后进入一个两层全连接网络

Layernorm:

$y=\frac{x-\mathrm{E}(x)}{\sqrt{\mathrm{Var}(x)+\epsilon}}*\gamma+\beta$

其中 $\gamma$ 和 $\beta$ 是可训练的参数，$\epsilon=10^{-5}$ 是超参数，用于保持数值稳定性（避免分母为零）。

9.1.6、多层 Transformer 构成 BERT Encoder

#%% import torch.nn as nn

class BERT(nn.Module):
    """BERT encoder: an embedding layer followed by a stack of Transformer blocks."""

    def __init__(self, vocab_size, hidden_size=768, layer_num=12, head_num=12, dropout=0.1):
        """
        :param vocab_size: size of the vocabulary
        :param hidden_size: hidden width shared by embeddings and blocks
        :param layer_num: number of Transformer blocks
        :param head_num: number of attention heads per block
        :param dropout: dropout probability used throughout the model
        """
        super().__init__()
        # Embedding layer; it receives the same dropout rate as the blocks.
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=hidden_size, dropout=dropout)
        # Stack of N Transformer blocks.
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden_size, head_num, dropout)
             for _ in range(layer_num)]
        )

    def forward(self, input_ids, token_type_ids):
        """Encode a batch of token ids.

        Example input, as produced by
        tokenizer(["你好吗","你好"], text_pair=["我很好","我好"], max_length=10,
                  padding='max_length', truncation=True):

            [CLS]你好吗[SEP]我很好[SEP][PAD]
            [CLS]你好[SEP]我好[SEP][PAD][PAD][PAD]

            input_ids: [
                [101, 872, 1962, 1408, 102, 2769, 2523, 1962, 102, 0],
                [101, 872, 1962, 102, 2769, 1962, 102, 0, 0, 0]
            ]
            token_type_ids: [
                [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
                [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]
            ]

        :param input_ids: integer tensor of shape (batch, seq); id 0 is padding
        :param token_type_ids: integer tensor of segment ids, same shape
        :return: the hidden states produced by the last block
        """
        # The mask must be built from the token ids, before any embedding
        # exists: True where the key position holds a non-padding token.
        # Shape (batch, 1, seq, seq), broadcast over the heads.
        seq_len = input_ids.size(1)
        attention_mask = (input_ids > 0).unsqueeze(1).expand(
            -1, seq_len, -1).unsqueeze(1)

        # Compute the embeddings.
        x = self.embedding(input_ids, token_type_ids)

        # Feed through the Transformer blocks one by one; calling the module
        # (rather than .forward) keeps registered hooks working.
        for transformer in self.transformer_blocks:
            x = transformer(x, attention_mask)

        return x