Bert Pytorch 源码分析：四、编解码器```py # Bert 编码器模块 # 由一个嵌入层和 NL 个 TF

# Bert 编码器模块
# 由一个嵌入层和 NL 个 TF 层组成
class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
		# 嵌入大小 ES
        self.hidden = hidden
		# TF 层数 NL
        self.n_layers = n_layers
		# 头部数量 HC
        self.attn_heads = attn_heads

        # FFN 层中的隐藏单元数量，记为 FF，一般是 ES 的四倍
        self.feed_forward_hidden = hidden * 4

        # 嵌入层，嵌入矩阵尺寸 VS * ES
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)

        # NL 个 TF 层
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        # 为`<pad>`（ID = 0）设置掩码
		# 尺寸为 BS * 1 * ML * ML，以便与相似性矩阵 S 匹配
		# 在每个 BS 的 ML * ML 矩阵中，`<pad>`标记对应的行为 1，其余为零
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # 单词 ID 传入嵌入层得到词向量
        x = self.embedding(x, segment_info)

        # 依次传入每个 TF 层，得到编码器输出
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)

        return x

# 解码器结构根据具体任务而定
# 任务一般有三种：（1）序列分类，（2）标记分类，（3）序列生成
# 但一般都是全连接的

# 用于下个句子判断的解码器
# 序列分类任务，输入两个句子，输出一个标签，1表示是相邻句子，0表示不是
class NextSentencePrediction(nn.Module):
    """
    2-class classification model : is_next, is_not_next
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
		# 将向量压缩到两维， 尺寸为 ES * 2
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
		# 输入 -> 取第一个向量 -> LL -> softmax -> 输出
		# 输出相邻句子和非相邻句子的概率
        return self.softmax(self.linear(x[:, 0]))

# 用于完型填空的解码器
# 序列生成任务，输入是带有`<mask>`的句子，输出是完整句子
class MaskedLanguageModel(nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
		# 将输入压缩到词汇表大小
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
		# 输入 -> LL -> softmax -> 输出
		# 输出序列中每个词是词汇表中每个词的概率
        return self.softmax(self.linear(x))