AI全栈大模型工程师(二十五)Transformer

56 阅读3分钟

九、Transformer 江山一统

思考: RNN有什么缺点?大模型为什么不是很多层RNN?

9.1、**消除恐惧:**我们亲手写一个 Transformer

9.1.1、Embeddings

#%% import torch.nn as nn import torch

class PositionalEmbedding(nn.Module):

def __init__(self, embed_size, max_len=512):
    super().__init__()

    # Compute the positional encodings once in log space.
    pe = torch.zeros(max_len, embed_size).float()
    pe.require_grad = False

    position = torch.arange(0, max_len).float().unsqueeze(1)
    div_term = (torch.arange(0, embed_size, 2).float()
                * -(math.log(10000.0) / embed_size)).exp()

    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

def forward(self, x):
    return self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):

def __init__(self, vocab_size, embed_size, dropout=0.1):
    """
    :param vocab_size: 词表大小
    :param embed_size: embedding维度768
    :param dropout: dropout概率
    """
    super().__init__()
    self.token_embedding = nn.Embedding(
        vocab_size, embed_size, padding_idx=0)
    self.position_embedding = PositionalEmbedding(
        embed_size=embed_size, max_len=512)
    self.token_type_embedding = nn.Embedding(2, embed_size, padding_idx=0)
    self.dropout = nn.Dropout(p=dropout)
    self.embed_size = embed_size

def forward(self, input_ids, token_type_ids):
    x = self.token_embedding(input_ids) + self.position_embedding(
        input_ids) + self.token_type_embedding(token_type_ids)
    return self.dropout(x)

9.1.2、单头 Attention

#%% import torch.nn as nn import torch.nn.functional as F import torch

import math

''' query = query_linear(x) key = key_linear(x) value = value_linear(x) '''

单个头的注意力计算

class Attention(nn.Module):

def forward(self, query, key, value, mask=None, dropout=None):
    scores = torch.matmul(query, key.transpose(-2, -1)) \
        / math.sqrt(query.size(-1))

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = F.softmax(scores, dim=-1)

    if dropout is not None:
        p_attn = dropout(p_attn)

    return torch.matmul(p_attn, value), p_attn

每个token对应的query向量与每个token对应的key向量做内积
将上述内积取softmax(得到0~1之间的值,即为attention权重)
计算每个token相对于所有其它token的attention权重(最终构成一个$L\times L$的attention矩阵)
每个token对应的value向量乘以attention权重,并相加,得到当前token的self-attention value向量
将上述操作应用于每个token
以上是一个头的操作,同时(并行)应用于多个独立的头

9.1.3、多头 Attention

将每个头得到向量拼接在一起,最后乘一个线性矩阵,得到 multi-head attention 的输出

#%%
import torch.nn

class MultiHeadedAttention(nn.Module):
    def __init__(self, head_num, hidden_size, dropout=0.1):
        """
        :param head_num: 头的个数,必须能被hidden_size整除
        :param hidden_size: 隐层的维度,与embed_size一致
        """
        super().__init__()
        assert hidden_size % head_num == 0

        self.per_head_dim = hidden_size // head_num
        self.head_num = head_num
        self.hidden_size = hidden_size
    
        self.query_linear = nn.Linear(hidden_size, hidden_size)
        self.key_linear = nn.Linear(hidden_size, hidden_size)
        self.value_linear = nn.Linear(hidden_size, hidden_size)
    
        self.output_linear = nn.Linear(hidden_size, hidden_size)
        self.attention = Attention()
    
        self.dropout = nn.Dropout(p=dropout)
    
    def reshape(self, x, batch_size):
        # 拆成多个头
        return x.view(batch_size, -1, self.head_num, self.per_head_dim).transpose(1, 2)
    
    def forward(self, x, mask=None):
        batch_size = x.size(0)
    
        query = self.reshape(self.query_linear(x))
        key = self.reshape(self.key_linear(x))
        value = self.reshape(self.value_linear(x))
    
        # 每个头计算attention
        x, attn = self.attention(
            query, key, value, mask=mask, dropout=self.dropout
        )
    
        # 把每个头的attention*value拼接在一起
        x = x.transpose(1, 2).contiguous().view(
            batch_size, -1, self.hidden_size)
    
        # 乘一个线性矩阵
        return self.output_linear(x)

#%% md

9.1.4、全连接网络(Feed-Forward Network)

#%% import torch.nn as nn

class FeedForward(nn.Module):

def __init__(self, hidden_size, dropout=0.1):
    super(FeedForward, self).__init__()
    self.input_layer = nn.Linear(hidden_size, hidden_size*4)
    self.output_layer = nn.Linear(hidden_size*4, hidden_size)
    self.dropout = nn.Dropout(dropout)
    self.activation = nn.GELU()

def forward(self, x):
    x = self.input_layer(x)
    x = self.activation(x)
    x = self.dropout(x)
    x = self.output_layer(x)
    return x

9.1.5、拼成一层 Transformer

#%% import torch.nn as nn

class TransformerBlock(nn.Module):

def __init__(self, hidden_size, head_num, dropout=0.1):
    super().__init__()
    self.multi_head_attention = MultiHeadedAttention(head_num, hidden_size)
    self.feed_forward = FeedForward(hidden_size, dropout=dropout)
    self.layer_norm1 = nn.LayerNorm(hidden_size)
    self.dropout1 = nn.Dropout(dropout)
    self.layer_norm2 = nn.LayerNorm(hidden_size)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout = nn.Dropout(p=dropout)

def forward(self, x, mask):
    x0 = x
    # 多头注意力层
    x = self.multi_head_attention(x, mask)

    # 残差和LayerNorm层(1)
    x = self.dropout1(x)
    x = self.layer_norm1(x0+x)

    # 前向网络层
    x1 = x
    x = self.feed_forward(x)

    # 残差和LayerNorm层(2)
    x = self.dropout2(x)
    x = self.layer_norm2(x1+x)
    return x

Multi-head attention的输出,经过残差和norm之后进入一个两层全连接网络

Layernorm:

y=xE(x)Var(x)+ϵγ+βy=\frac{x-\mathrm{E}(x)}{\sqrt{\mathrm{Var}(x)+\epsilon}}*\gamma+\beta

其中 γ\gammaβ\beta 是可训练的参数,ϵ=105\epsilon=10^{-5}是超参,保持数值稳定性。

9.1.6、多层 Transformer 构成 BERT Encoder

#%% import torch.nn as nn

class BERT(nn.Module):

def __init__(self, vocab_size, hidden_size=768, layer_num=12, head_num=12, dropout=0.1):

    super().__init__()
    # Embedding层
    self.embedding = BERTEmbedding(
        vocab_size=vocab_size, embed_size=hidden_size)
    # N层Transformers
    self.transformer_blocks = nn.ModuleList(
        [TransformerBlock(hidden_size, head_num, dropout)
         for _ in range(layer_num)]
    )

def forward(self, input_ids, token_type_ids):
    """
    tokenizer(["你好吗","你好"], text_pair=["我很好","我好"], max_length=10, padding='max_length',truncation=True)
    [CLS]你好吗[SEP]我很好[SEP][PAD]
    [CLS]你好[SEP]我好[SEP][PAD][PAD][PAD]  
    input_ids: [
        [101, 872, 1962, 1408, 102, 2769, 2523, 1962, 102, 0],
        [101, 872, 1962, 102, 2769, 1962, 102, 0, 0, 0]
    ]
    token_type_ids:[
            [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]
        ]
    """
    attention_mask = (x > 0).unsqueeze(
        1).repeat(1, x.size(1), 1).unsqueeze(1)

    # 计算embedding
    x = self.embedding(input_ids, token_type_ids)

    # 逐层代入Tranformers
    for transformer in self.transformer_blocks:
        x = transformer.forward(x, attention_mask)

    return x

9.2 Transformer 怎么用

9.2.1. Encoder-Only LM 用于文本表示

针对不同下游任务,在 Encoder 上面添加不同的输出层

BERT-BiLSTM-CRF一个序列标注的经典网络结构

9.2.2. Encoder-Decoder LM,机器翻译/文本生成(大语言模型的一种形态)

  • Decoder 也是 N 层 transformer 结构
  • 生成一个 token,把它加入上文,再生成下一个 token,以此类推

Decoder 的每个 token 与 encoder 最后一层的输出和 decoder 上文的 token 一起做 attention

Decoder 的 token 只能 attend 到上文的 token(因为此时下文还没有出现)

9.2.3. Decoder-Only LM 也叫 Causal LM 或 Left-to-right LM(GPT 家族)


9.2.4. 大语言模型族谱


后记

📢博客主页:manor.blog.csdn.net

📢欢迎点赞 👍 收藏 ⭐留言 📝 如有错误敬请指正!

📢本文由 Maynor 原创,首发于 CSDN博客🙉

📢不能老盯着手机屏幕,要不时地抬起头,看看老板的位置⭐

📢专栏持续更新,欢迎订阅:blog.csdn.net/xianyu120/c…

本文由mdnice多平台发布