# Bert Pytorch 源码分析：三、Transformer块

# PFF 层，基本相当于两个全连接
# 每个 TF 块中位于注意力层之后
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward sublayer: Linear -> GELU -> Dropout -> Linear.

    Essentially two fully connected layers; inside each Transformer block it
    sits after the attention sublayer.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        :param d_model: size of the last input dimension (and of the output)
        :param d_ff: width of the hidden projection
        :param dropout: dropout probability applied after the activation
        """
        super().__init__()
        # First linear layer, projects d_model -> d_ff.
        self.w_1 = nn.Linear(d_model, d_ff)
        # Second linear layer, projects d_ff -> d_model.
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        # The non-linearity is GELU.
        self.activation = GELU()

    def forward(self, x):
        """Run ``x`` through w_1 -> GELU -> dropout -> w_2 and return the result."""
        hidden = self.w_1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
		
# 处理 TF 块内的残差
class SublayerConnection(nn.Module):
    """
    Residual wrapper around an arbitrary sublayer, combined with layer norm.

    Computes ``x + dropout(sublayer(norm(x)))``. For code simplicity the norm
    is applied first (to the sublayer input) as opposed to last.
    """

    def __init__(self, size, dropout):
        """
        :param size: feature size handed to LayerNorm
        :param dropout: dropout probability applied to the sublayer output
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the residual.

        :param x: input tensor; also used unchanged as the skip connection
        :param sublayer: callable taking the normalized tensor and returning
            a tensor that can be added to ``x``
        """
        # input -> LayerNorm -> sublayer -> Dropout -> (+ input) -> output
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch
		
# GELU 是 RELU 的高斯平滑近似形式
class GELU(nn.Module):
    """
    Tanh approximation of the Gaussian Error Linear Unit, a smooth
    alternative to ReLU.

    BERT paper, Section 3.4, last paragraph: BERT uses GELU instead of ReLU.
    """

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        inner = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(math.sqrt(2 / math.pi) * inner)
        return 0.5 * x * gate
		
# 层级标准化（原理见参考文献）
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension (see citation for details).

    Note: this uses ``Tensor.std`` with its default (Bessel-corrected)
    estimator and adds ``eps`` to the standard deviation rather than to the
    variance, so outputs differ slightly from ``torch.nn.LayerNorm``.
    """

    def __init__(self, features, eps=1e-6):
        """
        :param features: length of the last dimension being normalized
        :param eps: small constant added to the std to avoid division by zero
        """
        super().__init__()
        # Learnable scale, initialized to ones.
        self.a_2 = nn.Parameter(torch.ones(features))
        # Learnable shift, initialized to zeros.
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalize the last dimension of ``x``, then scale and shift it."""
        # Statistics are taken over the last dimension only; keepdim=True
        # leaves a trailing dimension of size 1 so they broadcast against x.
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Scale first, then divide: same evaluation order as
        # a_2 * (x - mean) / (std + eps) + b_2.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2
		
# Transformer 块是任何 Transformer 架构的基本结构，不仅限于 BERT，
# 不同模型只是层数、头数、嵌入维度、词表、训练数据以及解码器（具体任务）不同
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection

    Data flow of ``forward``::

        x -> [LN -> self-attention -> dropout] + x
          -> [LN -> feed-forward  -> dropout] + x
          -> dropout -> output
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: passed to MultiHeadedAttention as ``h``
            (presumably the number of attention heads -- confirm against
            MultiHeadedAttention)
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """

        super().__init__()
        # Part one: the attention layer.
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        # Part two: the position-wise feed-forward layer.
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        # Residual wrapper around the attention layer.
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        # Residual wrapper around the feed-forward layer.
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        # Final dropout applied to the block output.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        """Run one Transformer block.

        :param x: input tensor, used as query, key and value of the attention
        :param mask: forwarded unchanged to the attention layer as ``mask``
        :return: the block output after the final dropout
        """

        def self_attention(normed):
            # Self-attention: the same tensor is used for all three inputs.
            # Call the module itself rather than ``.forward`` so that hooks
            # registered on the attention module are executed.
            return self.attention(normed, normed, normed, mask=mask)

        # x -> LN1 -> attention -> dropout1 -> residual add
        x = self.input_sublayer(x, self_attention)
        # ... -> LN2 -> feed-forward -> dropout2 -> residual add
        x = self.output_sublayer(x, self.feed_forward)
        # ... -> dropout3 -> result
        return self.dropout(x)