Transformers Source Code Analysis (106)
.\models\speech_to_text_2\tokenization_speech_to_text_2.py
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/vocab.json"
),
},
"tokenizer_config_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/tokenizer_config.json"
),
},
"merges_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/merges.txt"
),
},
}
BPE_TOKEN_MERGES = "</w>"
BPE_TOKEN_VOCAB = "@@ "
def get_pairs(word):
"""
    Return the set of symbol pairs in a word. A word is represented as a tuple of symbols (symbols being
    variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
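# Illustrative example (not part of the original file): `get_pairs` collects the adjacent
# symbol pairs of a word; this pair set drives the merge loop in `bpe()` below.
example_word = ("l", "o", "w</w>")  # "low" with the end-of-word marker on the last symbol
print(get_pairs(example_word))  # {('l', 'o'), ('o', 'w</w>')} (set order may vary)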
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/s2t-wav2vec2-large-en-de": 1024}
class Speech2Text2Tokenizer(PreTrainedTokenizer):
"""
构建 Speech2Text2Tokenizer。
该分词器继承自 `PreTrainedTokenizer`,其中包含一些主要方法。用户应参考超类以获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
**kwargs
Additional keyword arguments passed along to `PreTrainedTokenizer`.
```
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
pad_token="<pad>",
eos_token="</s>",
unk_token="<unk>",
do_lower_case=False,
merges_file=None,
**kwargs,
):
self.do_lower_case = do_lower_case
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
if merges_file is None:
logger.info(f"No merges files provided. {self.__class__.__name__} can only be used for decoding.")
self.bpe_ranks = None
self.cache = None
else:
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n " + BPE_TOKEN_MERGES:
word = "\n" + BPE_TOKEN_MERGES
if word.endswith(BPE_TOKEN_MERGES):
word = word.replace(BPE_TOKEN_MERGES, "")
word = word.replace(" ", BPE_TOKEN_VOCAB)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
if self.bpe_ranks is None:
raise ValueError(
"This tokenizer was instantiated without a `merges.txt` file, so"
" that it can only be used for decoding, not for encoding. "
"Make sure to provide `merges.txt` file at instantiation to enable "
"encoding."
)
if self.do_lower_case:
text = text.lower()
text = text.split()
split_tokens = []
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an index (integer) using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
result = self.decoder.get(index, self.unk_token)
return result
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Converts a list of output tokens into a single string.
"""
string = " ".join(tokens)
string = "".join(string.split(BPE_TOKEN_VOCAB))
return string
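# Illustrative example (not part of the original file): "@@ " (BPE_TOKEN_VOCAB) marks a
# non-final BPE piece. `convert_tokens_to_string` removes the markers and glues the pieces
# back together, while real word boundaries remain spaces.
example_tokens = ["hel@@", "lo", "wor@@", "ld"]
joined = " ".join(example_tokens)              # "hel@@ lo wor@@ ld"
print("".join(joined.split(BPE_TOKEN_VOCAB)))  # "hello world"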
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merges_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
if self.bpe_ranks is None:
return (vocab_file,)
with open(merges_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return (vocab_file, merges_file)
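# Minimal usage sketch (not part of the original file). "vocab.json" and "merges.txt" are
# hypothetical local files; with a merges file the tokenizer can encode, without one it is
# decode-only and `_tokenize` raises a ValueError (see the guard above).
from transformers import Speech2Text2Tokenizer

tokenizer = Speech2Text2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")
ids = tokenizer("hallo welt").input_ids
print(tokenizer.decode(ids, skip_special_tokens=True))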
.\models\speech_to_text_2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_speech_available,
is_torch_available,
)
_import_structure = {
"configuration_speech_to_text_2": ["SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2Text2Config"],
"processing_speech_to_text_2": ["Speech2Text2Processor"],
"tokenization_speech_to_text_2": ["Speech2Text2Tokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_to_text_2"] = [
"SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Speech2Text2ForCausalLM",
"Speech2Text2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_speech_to_text_2 import SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2Text2Config
from .processing_speech_to_text_2 import Speech2Text2Processor
from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_to_text_2 import (
SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST,
Speech2Text2ForCausalLM,
Speech2Text2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
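# Note (not part of the original file): because of `_LazyModule`, importing `transformers`
# does not load this package's heavy submodules; a submodule is only imported when one of
# its exported names is first accessed, e.g.:
import transformers

tokenizer_cls = transformers.models.speech_to_text_2.Speech2Text2Tokenizer  # triggers the lazy import
print(tokenizer_cls.__name__)  # Speech2Text2Tokenizer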
.\models\splinter\configuration_splinter.py
"""
Splinter model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/config.json",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/config.json",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/config.json",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/config.json",
}
class SplinterConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`SplinterModel`]. It is used to instantiate a
Splinter model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Splinter
[tau/splinter-base](https://huggingface.co/tau/splinter-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "splinter"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
use_cache=True,
pad_token_id=0,
question_token_id=104,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.question_token_id = question_token_id
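# Usage sketch (not part of the original file): the defaults approximate the
# tau/splinter-base architecture, and any field can be overridden at construction time.
from transformers import SplinterConfig

config = SplinterConfig()
print(config.hidden_size, config.num_hidden_layers, config.question_token_id)  # 768 12 104

tiny_config = SplinterConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4, intermediate_size=256)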
.\models\splinter\modeling_splinter.py
""" PyTorch Splinter model."""
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, ModelOutput, QuestionAnsweringModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_splinter import SplinterConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "tau/splinter-base"
_CONFIG_FOR_DOC = "SplinterConfig"
SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"tau/splinter-base",
"tau/splinter-base-qass",
"tau/splinter-large",
"tau/splinter-large-qass",
]
class SplinterEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: Optional[int] = 0,
) -> Tuple:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class SplinterSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
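# Shape sketch (not part of the original file): `transpose_for_scores` turns the projected
# hidden states [batch, seq, hidden] into [batch, heads, seq, head_size] so attention scores
# can be computed independently per head.
import torch

batch, seq, heads, head_size = 2, 5, 12, 64
x = torch.randn(batch, seq, heads * head_size)  # e.g. the output of self.query(hidden_states)
x = x.view(batch, seq, heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 5, 64])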
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Body elided in this excerpt: standard multi-head self-attention (query/key/value
        # projections, optional relative-position scores, optional cached key/values),
        # returning the context layer plus optional attention probabilities / present key value.
        ...
class SplinterSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class SplinterAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = SplinterSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = SplinterSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class SplinterIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class SplinterOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class SplinterLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = SplinterAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = SplinterAttention(config, position_embedding_type="absolute")
self.intermediate = SplinterIntermediate(config)
self.output = SplinterOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
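# Sketch (not part of the original file) of the chunking idea behind `apply_chunking_to_forward`:
# the feed-forward block is position-wise, so it can be run on slices of the sequence dimension
# and concatenated, trading a little speed for lower peak memory; `chunk_size_feed_forward=0`
# means "no chunking". This is an illustrative re-implementation, not the library helper itself.
import torch

def chunked_feed_forward(forward_fn, hidden_states, chunk_size, seq_len_dim=1):
    if chunk_size == 0:
        return forward_fn(hidden_states)
    chunks = hidden_states.split(chunk_size, dim=seq_len_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_len_dim)

ff = torch.nn.Linear(8, 8)
x = torch.randn(2, 10, 8)
assert torch.allclose(chunked_feed_forward(ff, x, chunk_size=4), ff(x), atol=1e-6)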
class SplinterEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([SplinterLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
"""
SPLINTER_INPUTS_DOCSTRING定义了一个用于SPLINTER模型输入说明文档的字符串常量。
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Splinter Model transformer outputting raw hidden-states without any specific head on top.",
SPLINTER_START_DOCSTRING,
)
class SplinterModel(SplinterPreTrainedModel):
"""
The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
"""
def __init__(self, config):
super().__init__(config)
self.config = config
# Initialize embeddings and encoder layers
self.embeddings = SplinterEmbeddings(config)
self.encoder = SplinterEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over layers and prune specified heads in attention layers
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the SplinterModel. See superclass for more details.
"""
# Implement the forward pass using the specified inputs and return model outputs
# including attentions, hidden states, and past key values if applicable
...
"""
实现了基于问题感知的跨度选择(QASS)头部,参考Splinter的论文描述。
"""
# 初始化函数,设置QASS模型的各个层
def __init__(self, config):
super().__init__()
# 定义转换层,将输入特征转换为问题起始和结束的表示
self.query_start_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
self.query_end_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
# 定义转换层,将输入特征转换为序列起始和结束的表示
self.start_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
self.end_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
# 定义分类器层,用于预测起始位置和结束位置的概率分布
self.start_classifier = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.end_classifier = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
# 前向传播函数,接收输入和位置信息,返回起始位置和结束位置的预测 logits
def forward(self, inputs, positions):
_, _, dim = inputs.size()
index = positions.unsqueeze(-1).repeat(1, 1, dim) # 创建位置索引张量 [batch_size, num_positions, dim]
gathered_reps = torch.gather(inputs, dim=1, index=index) # 根据位置索引收集输入特征表示 [batch_size, num_positions, dim]
# 对问题起始和结束的特征表示进行变换
query_start_reps = self.query_start_transform(gathered_reps) # [batch_size, num_positions, dim]
query_end_reps = self.query_end_transform(gathered_reps) # [batch_size, num_positions, dim]
# 对序列起始和结束的特征表示进行变换
start_reps = self.start_transform(inputs) # [batch_size, seq_length, dim]
end_reps = self.end_transform(inputs) # [batch_size, seq_length, dim]
# 使用分类器预测起始位置的 logits
hidden_states = self.start_classifier(query_start_reps) # [batch_size, num_positions, dim]
start_reps = start_reps.permute(0, 2, 1) # 调整维度顺序为 [batch_size, dim, seq_length]
start_logits = torch.matmul(hidden_states, start_reps) # 计算起始位置的 logits
# 使用分类器预测结束位置的 logits
hidden_states = self.end_classifier(query_end_reps) # [batch_size, num_positions, dim]
end_reps = end_reps.permute(0, 2, 1) # 调整维度顺序为 [batch_size, dim, seq_length]
end_logits = torch.matmul(hidden_states, end_reps) # 计算结束位置的 logits
# 返回起始位置和结束位置的预测 logits
return start_logits, end_logits
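# Shape sketch (not part of the original file): the QASS head scores every context position
# against each gathered question representation with a bilinear form, roughly
# start_logits[b, q, t] = start_classifier(query_start_reps[b, q]) . start_reps[b, t].
import torch

batch, num_questions, seq_len, dim = 2, 3, 7, 16
query_start_reps = torch.randn(batch, num_questions, dim)  # transformed question-token reps
start_reps = torch.randn(batch, seq_len, dim)              # transformed per-position reps
start_classifier = torch.nn.Linear(dim, dim, bias=False)

hidden = start_classifier(query_start_reps)                       # [batch, num_questions, dim]
start_logits = torch.matmul(hidden, start_reps.permute(0, 2, 1))  # [batch, num_questions, seq_len]
print(start_logits.shape)  # torch.Size([2, 3, 7])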
# The decorator below attaches a docstring describing this class's role in extractive question-answering
# tasks such as SQuAD: a linear layer on top of the hidden-states output computes the span start and end logits.
@add_start_docstrings(
"""
Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
    SPLINTER_START_DOCSTRING,  # prepend the shared Splinter start docstring
)
# Splinter model for question answering, inheriting from SplinterPreTrainedModel
class SplinterForQuestionAnswering(SplinterPreTrainedModel):
def __init__(self, config):
super().__init__(config)
        # Initialize the Splinter backbone
        self.splinter = SplinterModel(config)
        # Initialize the question-aware span selection (QASS) head
        self.splinter_qass = QuestionAwareSpanSelectionHead(config)
        # Store the id of the [QUESTION] token
        self.question_token_id = config.question_token_id
        # Initialize weights and apply final processing
self.post_init()
    # Attach the inputs docstring to the forward method, describing the accepted arguments
@add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # Attach a code-sample docstring with the checkpoint, output type and config class
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
    # Forward pass: processes the inputs and returns the question-answering outputs
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
question_positions: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        # Body elided in this excerpt: runs the Splinter backbone, gathers the question-token
        # representations, applies the QASS head and optionally computes the span-extraction loss.
        ...


@dataclass
class SplinterForPreTrainingOutput(ModelOutput):
    """
    Class for outputs of Splinter as a span selection model.

    Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# Optional: A scalar tensor representing the total loss for span extraction tasks.
loss: Optional[torch.FloatTensor] = None
# Optional: Tensor containing scores for the start positions of spans in each question.
start_logits: torch.FloatTensor = None
# Optional: Tensor containing scores for the end positions of spans in each question.
end_logits: torch.FloatTensor = None
# Optional: Tuple of tensors representing hidden states of the model at each layer and embeddings.
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: Tuple of tensors representing attention weights for each layer's self-attention mechanism.
attentions: Optional[Tuple[torch.FloatTensor]] = None
@add_start_docstrings(
"""
Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
instead.
""",
SPLINTER_START_DOCSTRING,
)
class SplinterForPreTraining(SplinterPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.splinter = SplinterModel(config)
self.splinter_qass = QuestionAwareSpanSelectionHead(config)
self.question_token_id = config.question_token_id
self.post_init()
@add_start_docstrings_to_model_forward(
SPLINTER_INPUTS_DOCSTRING.format("batch_size, num_questions, sequence_length")
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
question_positions: Optional[torch.LongTensor] = None,
    ):
        # Body elided in this excerpt: derives `question_positions` from occurrences of
        # `question_token_id` when not provided (see `_prepare_question_positions` below),
        # runs the backbone and the QASS head, and computes the loss when start/end positions are given.
        ...
def _prepare_question_positions(self, input_ids: torch.Tensor) -> torch.Tensor:
rows, flat_positions = torch.where(input_ids == self.config.question_token_id)
num_questions = torch.bincount(rows)
positions = torch.full(
(input_ids.size(0), num_questions.max()),
self.config.pad_token_id,
dtype=torch.long,
device=input_ids.device,
)
cols = torch.cat([torch.arange(n) for n in num_questions])
positions[rows, cols] = flat_positions
return positions
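# Worked example (not part of the original file) of `_prepare_question_positions`, assuming
# `question_token_id=104` and `pad_token_id=0` as in the default SplinterConfig: rows with
# fewer [QUESTION] tokens are padded with `pad_token_id`.
import torch

question_token_id, pad_token_id = 104, 0
input_ids = torch.tensor([
    [101, 104, 7, 104, 102],  # two [QUESTION] tokens, at positions 1 and 3
    [101, 9, 104, 8, 102],    # one [QUESTION] token, at position 2
])
rows, flat_positions = torch.where(input_ids == question_token_id)
num_questions = torch.bincount(rows)  # tensor([2, 1])
positions = torch.full((input_ids.size(0), int(num_questions.max())), pad_token_id, dtype=torch.long)
cols = torch.cat([torch.arange(int(n)) for n in num_questions])
positions[rows, cols] = flat_positions
print(positions)  # tensor([[1, 3], [2, 0]])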
.\models\splinter\tokenization_splinter.py
"""Splinter 的标记化类。"""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/vocab.txt",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/vocab.txt",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/vocab.txt",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"tau/splinter-base": 512,
"tau/splinter-base-qass": 512,
"tau/splinter-large": 512,
"tau/splinter-large-qass": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"tau/splinter-base": {"do_lower_case": False},
"tau/splinter-base-qass": {"do_lower_case": False},
"tau/splinter-large": {"do_lower_case": False},
"tau/splinter-large-qass": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""从词汇文件加载词汇表到一个有序字典中。"""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""对文本进行基本的空格清理和分割。"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class SplinterTokenizer(PreTrainedTokenizer):
r"""
构建一个 Splinter 分词器,基于 WordPiece 算法。
这个分词器继承自 [`PreTrainedTokenizer`],包含大多数主要方法。用户可以参考这个超类获取更多关于这些方法的信息。
"""
Args:
vocab_file (`str`):
包含词汇表的文件。
do_lower_case (`bool`, *optional*, defaults to `True`):
在分词时是否将输入转换为小写。
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
在使用 WordPiece 分词之前是否进行基本分词。
never_split (`Iterable`, *optional*):
在分词时永远不会分割的标记集合。仅在 `do_basic_tokenize=True` 时生效。
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
未知标记。词汇表中不存在的标记会被设置为该标记。
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
分隔符标记,用于从多个序列构建一个序列,例如用于序列分类或问答任务中。
也是使用特殊标记构建序列时的最后一个标记。
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
用于填充的标记,例如在批处理不同长度的序列时使用。
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
分类器标记,用于序列分类任务。使用特殊标记构建序列时的第一个标记。
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
掩码标记,用于掩码语言建模任务中。模型将尝试预测此标记。
question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
用于构建问题表示的标记。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日文应该禁用此选项(参见此处的问题链接)。
strip_accents (`bool`, *optional*):
是否删除所有重音符号。如果未指定此选项,则将根据 `lowercase` 的值确定(与原始的 BERT 行为相同)。
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
question_token="[QUESTION]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
self.question_token = question_token
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def question_token_id(self):
"""
`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
representation.
"""
return self.convert_tokens_to_ids(self.question_token)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) into a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a pair of sequences for question answering tasks by concatenating and adding special
tokens. A Splinter sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if self.padding_side == "right":
return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
else:
return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
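# Illustrative example (not part of the original file), with made-up token IDs
# (cls=101, sep=102, question=104, "."=119): for a (question, context) pair with
# padding_side == "right" the assembled sequence is [CLS] question [QUESTION] . [SEP] context [SEP].
cls, sep, question_suffix = [101], [102], [104, 119]
question_ids, context_ids = [7, 8], [21, 22, 23]
print(cls + question_ids + question_suffix + sep + context_ids + sep)
# [101, 7, 8, 104, 119, 102, 21, 22, 23, 102]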
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve a mask indicating the positions of special tokens in the input sequences.
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
already_has_special_tokens (`bool`):
Whether the input IDs already include special tokens
Returns:
`List[int]`: List indicating positions of special tokens (1 for special token, 0 for others)
"""
special_tokens_mask = [0] * len(token_ids_0)
if token_ids_1 is not None:
special_tokens_mask += [1] * len(token_ids_1)
special_tokens_mask[:1] = [1]
if token_ids_1 is None:
special_tokens_mask[-1:] = [1]
else:
special_tokens_mask[len(token_ids_0) + 2] = 1
if self.question_token_id is not None:
special_tokens_mask[len(token_ids_0):len(token_ids_0) + 2] = [1, 1]
return special_tokens_mask
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
if self.padding_side == "right":
return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Args:
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
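# Illustrative example (not part of the original file): `_tokenize_chinese_chars` surrounds
# every CJK codepoint with spaces, so downstream whitespace tokenization treats each character
# as its own token.
basic = BasicTokenizer()
print(basic._tokenize_chinese_chars("ab中文cd"))  # "ab 中  文 cd"
print(basic.tokenize("ab中文cd"))                 # ['ab', '中', '文', 'cd']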
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
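# Illustrative example (not part of the original file): greedy longest-match-first WordPiece
# with a toy vocabulary (a real vocabulary comes from vocab.txt).
toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
print(wp.tokenize("unaffable"))   # ['un', '##aff', '##able']
print(wp.tokenize("misspelled"))  # ['[UNK]'] -- no prefix of the word is in the vocabulary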
.\models\splinter\tokenization_splinter_fast.py
"""Fast Tokenization classes for Splinter."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_splinter import SplinterTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/vocab.txt",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/vocab.txt",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/vocab.txt",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"tau/splinter-base": 512,
"tau/splinter-base-qass": 512,
"tau/splinter-large": 512,
"tau/splinter-large-qass": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"tau/splinter-base": {"do_lower_case": False},
"tau/splinter-base-qass": {"do_lower_case": False},
"tau/splinter-large": {"do_lower_case": False},
"tau/splinter-large-qass": {"do_lower_case": False},
}
class SplinterTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" Splinter tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
构建一个快速的 Splinter 分词器,基于 HuggingFace 的 tokenizers 库,基于 WordPiece。
This class inherits from PreTrainedTokenizerFast, which includes most of the primary methods. Users should refer to
the superclass for more information on those methods.
此类继承自 PreTrainedTokenizerFast,该类包含大多数主要方法。用户应参考超类以获取有关这些方法的更多信息。
```
# 定义函数参数和默认值的说明
Args:
vocab_file (`str`):
Vocabulary 文件的路径。
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在标记化时将输入转换为小写。
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
未知标记。如果标记不在词汇表中,则无法转换为 ID,并将其设置为此标记。
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
分隔符标记,用于从多个序列构建一个序列,例如用于序列分类或问答任务中的问题与文本的分隔。
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
填充标记,用于将不同长度的序列进行批处理时进行填充。
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
分类器标记,在序列分类任务中作为序列的第一个标记。
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
掩码标记,用于掩码语言建模任务中模型尝试预测的标记。
question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
构建问题表示时使用的标记。
clean_text (`bool`, *optional*, defaults to `True`):
是否在标记化前清理文本,例如删除控制字符并替换所有空格。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否标记化中文字符,对于日文可能需要禁用此选项。
strip_accents (`bool`, *optional*):
是否去除所有重音符号。如果未指定,则将根据 `lowercase` 的值来确定(与原始的 BERT 行为一致)。
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
子词的前缀。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = SplinterTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
question_token="[QUESTION]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
additional_special_tokens=(question_token,),
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
self.do_lower_case = do_lower_case
@property
def question_token_id(self):
"""
`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
representation.
"""
return self.convert_tokens_to_ids(self.question_token)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a pair of sequences for question answering tasks by concatenating and adding special
tokens. A Splinter sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if self.padding_side == "right":
return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
else:
return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
if self.padding_side == "right":
return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's model vocabulary to a directory.
Args:
save_directory (str): The directory where the vocabulary files will be saved.
filename_prefix (str, *optional*): Optional prefix for the saved vocabulary files.
Returns:
`Tuple[str]`: Tuple of filenames saved.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
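To make the special-token layout above concrete, here is a minimal, self-contained sketch (plain Python with made-up token IDs rather than a real vocabulary) of how `build_inputs_with_special_tokens` arranges a question/context pair when `padding_side == "right"`:
```
# A minimal sketch (not the library API): mirrors [CLS] question [QUESTION] . [SEP] context [SEP].
# All IDs below are hypothetical.
cls_id, sep_id, question_id, dot_id = 101, 102, 104, 119  # hypothetical special-token IDs
question_ids = [2054, 2003]   # hypothetical "what is"
context_ids = [7592, 2088]    # hypothetical "hello world"

def build_pair(token_ids_0, token_ids_1):
    question_suffix = [question_id, dot_id]
    return [cls_id] + token_ids_0 + question_suffix + [sep_id] + token_ids_1 + [sep_id]

print(build_pair(question_ids, context_ids))
# [101, 2054, 2003, 104, 119, 102, 7592, 2088, 102]
```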
.\models\splinter\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_splinter": ["SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SplinterConfig"],
"tokenization_splinter": ["SplinterTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_splinter_fast"] = ["SplinterTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_splinter"] = [
"SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST",
"SplinterForQuestionAnswering",
"SplinterForPreTraining",
"SplinterLayer",
"SplinterModel",
"SplinterPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_splinter import SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP, SplinterConfig
from .tokenization_splinter import SplinterTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_splinter_fast import SplinterTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_splinter import (
SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST,
SplinterForPreTraining,
SplinterForQuestionAnswering,
SplinterLayer,
SplinterModel,
SplinterPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
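The `_LazyModule` registration above defers the heavy imports until an attribute is actually requested. As a rough illustration only (this is not the actual `_LazyModule` implementation), the same effect can be sketched with a module-level `__getattr__` (PEP 562), meant to live inside a package `__init__.py`:
```
# Simplified sketch of deferred imports: a name is only resolved to a real import on first access.
import importlib

_import_structure = {"configuration_splinter": ["SplinterConfig"]}

def __getattr__(name):
    for module_name, symbols in _import_structure.items():
        if name in symbols:
            # Import the submodule lazily and fetch the requested symbol from it.
            module = importlib.import_module("." + module_name, __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```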
.\models\squeezebert\configuration_squeezebert.py
""" SqueezeBERT model configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/config.json"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/config.json",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/config.json"
),
}
class SqueezeBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SqueezeBertModel`]. It is used to instantiate a
SqueezeBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SqueezeBERT
[squeezebert/squeezebert-uncased](https://huggingface.co/squeezebert/squeezebert-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import SqueezeBertConfig, SqueezeBertModel
>>> # Initializing a SqueezeBERT configuration
>>> configuration = SqueezeBertConfig()
>>> # Initializing a model (with random weights) from the configuration above
>>> model = SqueezeBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
checkpoints.
"""
pretrained_config_archive_map = SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "squeezebert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
embedding_size=768,
q_groups=4,
k_groups=4,
v_groups=4,
post_attention_groups=1,
intermediate_groups=4,
output_groups=4,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.embedding_size = embedding_size
self.q_groups = q_groups
self.k_groups = k_groups
self.v_groups = v_groups
self.post_attention_groups = post_attention_groups
self.intermediate_groups = intermediate_groups
self.output_groups = output_groups
class SqueezeBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
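As a quick usage sketch (assuming `transformers` is installed), the grouped-convolution hyperparameters above can be overridden when constructing the configuration:
```
# Build a SqueezeBERT configuration with wider convolution groups than the defaults.
from transformers import SqueezeBertConfig

config = SqueezeBertConfig(q_groups=8, k_groups=8, v_groups=8, intermediate_groups=8)
print(config.q_groups, config.hidden_size)  # 8 768
```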
.\models\squeezebert\modeling_squeezebert.py
"""
PyTorch SqueezeBert model.
"""
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_squeezebert import SqueezeBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "squeezebert/squeezebert-uncased"
_CONFIG_FOR_DOC = "SqueezeBertConfig"
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"squeezebert/squeezebert-uncased",
"squeezebert/squeezebert-mnli",
"squeezebert/squeezebert-mnli-headless",
]
class SqueezeBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MatMulWrapper(nn.Module):
"""
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
"""
def __init__(self):
super().__init__()
def forward(self, mat1, mat2):
"""
Run the forward matrix multiplication.
:param mat1: the first torch tensor
:param mat2: the second torch tensor
:return: the matrix product of the two tensors
Typical tensor dimensions in BERT: mat1.shape: [B, <optional extra dims>, M, K]
mat2.shape: [B, <optional extra dims>, K, N]; output shape: [B, <optional extra dims>, M, N]
"""
return torch.matmul(mat1, mat2)
class SqueezeBertLayerNorm(nn.LayerNorm):
"""
This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.
N = batch C = channels W = sequence length
"""
def __init__(self, hidden_size, eps=1e-12):
nn.LayerNorm.__init__(self, normalized_shape=hidden_size, eps=eps)
def forward(self, x):
x = x.permute(0, 2, 1)
x = nn.LayerNorm.forward(self, x)
return x.permute(0, 2, 1)
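A small, standalone sketch of the same permute-normalize-permute trick using plain `torch` (all sizes are made up):
```
# Normalize the channel dimension of an NCW tensor: permute to NWC, LayerNorm over C, permute back.
import torch
from torch import nn

batch, channels, width = 2, 8, 5
x = torch.randn(batch, channels, width)        # NCW layout
ln = nn.LayerNorm(channels)
y = ln(x.permute(0, 2, 1)).permute(0, 2, 1)    # each position is normalized over its C channels
print(y.shape)                                 # torch.Size([2, 8, 5])
print(y.mean(dim=1).abs().max() < 1e-5)        # per-position mean over C is ~0
```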
class ConvDropoutLayerNorm(nn.Module):
"""
ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
"""
def __init__(self, cin, cout, groups, dropout_prob):
super().__init__()
self.conv1d = nn.Conv1d(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups)
self.layernorm = SqueezeBertLayerNorm(cout)
self.dropout = nn.Dropout(dropout_prob)
def forward(self, hidden_states, input_tensor):
x = self.conv1d(hidden_states)
x = self.dropout(x)
x = x + input_tensor
x = self.layernorm(x)
return x
class ConvActivation(nn.Module):
"""
ConvActivation: Conv, Activation
"""
def __init__(self, cin, cout, groups, act):
super().__init__()
self.conv1d = nn.Conv1d(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups)
self.act = ACT2FN[act]
def forward(self, x):
output = self.conv1d(x)
return self.act(output)
class SqueezeBertSelfAttention(nn.Module):
def __init__(self, config, cin, q_groups=1, k_groups=1, v_groups=1):
"""
config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
groups = number of groups to use in conv1d layers
"""
super().__init__()
if cin % config.num_attention_heads != 0:
raise ValueError(
f"cin ({cin}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(cin / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=q_groups)
self.key = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=k_groups)
self.value = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=v_groups)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
self.matmul_qk = MatMulWrapper()
self.matmul_qkv = MatMulWrapper()
def transpose_for_scores(self, x):
"""
- input: [N, C, W]
- output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
"""
new_x_shape = (x.size()[0], self.num_attention_heads, self.attention_head_size, x.size()[-1])
x = x.view(*new_x_shape)
return x.permute(0, 1, 3, 2)
def transpose_key_for_scores(self, x):
"""
- input: [N, C, W]
- output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
"""
new_x_shape = (x.size()[0], self.num_attention_heads, self.attention_head_size, x.size()[-1])
x = x.view(*new_x_shape)
return x
def transpose_output(self, x):
"""
- input: [N, C1, W, C2]
- output: [N, C, W]
"""
x = x.permute(0, 1, 3, 2).contiguous()
new_x_shape = (x.size()[0], self.all_head_size, x.size()[3])
x = x.view(*new_x_shape)
return x
def forward(self, hidden_states, attention_mask, output_attentions):
"""
Forward pass that computes the self-attention output.
hidden_states: input hidden-state tensor in [N, C, W] data layout.
attention_mask: attention-mask tensor in [N, W] data layout; no transposition is needed.
output_attentions: boolean indicating whether to return the attention scores.
Returns a dict containing the context layer and, optionally, the attention scores.
"""
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_key_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
attention_score = self.matmul_qk(query_layer, key_layer)
attention_score = attention_score / math.sqrt(self.attention_head_size)
attention_score = attention_score + attention_mask
attention_probs = self.softmax(attention_score)
attention_probs = self.dropout(attention_probs)
context_layer = self.matmul_qkv(attention_probs, value_layer)
context_layer = self.transpose_output(context_layer)
result = {"context_layer": context_layer}
if output_attentions:
result["attention_score"] = attention_score
return result
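To follow the [N, C, W] bookkeeping above, here is a plain-`torch` shape walkthrough of the `view`/`permute` done in `transpose_for_scores` (all sizes are made up):
```
# Shape walkthrough: N=2 examples, C=8 channels, W=5 positions, 2 heads of size 4.
import torch

N, C, W, heads = 2, 8, 5, 2
head_size = C // heads
x = torch.randn(N, C, W)               # [N, C, W]
x = x.view(N, heads, head_size, W)     # [N, C1, C2, W], C1 = head index, C2 = per-head channels
q = x.permute(0, 1, 3, 2)              # [N, C1, W, C2], the layout used for queries and values
print(q.shape)                         # torch.Size([2, 2, 5, 4])
```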
class SqueezeBertModule(nn.Module):
def __init__(self, config):
"""
Set up the layers of this module.
- hidden_size = input channels = output channels of Q, K, V (all the same) = output channels of the module
- intermediate_size = output channels of the intermediate layer
- groups = number of groups for all layers in a BertModule (the interface could later allow different groups per layer)
"""
super().__init__()
c0 = config.hidden_size
c1 = config.hidden_size
c2 = config.intermediate_size
c3 = config.hidden_size
self.attention = SqueezeBertSelfAttention(
config=config, cin=c0, q_groups=config.q_groups, k_groups=config.k_groups, v_groups=config.v_groups
)
self.post_attention = ConvDropoutLayerNorm(
cin=c0, cout=c1, groups=config.post_attention_groups, dropout_prob=config.hidden_dropout_prob
)
self.intermediate = ConvActivation(cin=c1, cout=c2, groups=config.intermediate_groups, act=config.hidden_act)
self.output = ConvDropoutLayerNorm(
cin=c2, cout=c3, groups=config.output_groups, dropout_prob=config.hidden_dropout_prob
)
def forward(self, hidden_states, attention_mask, output_attentions):
"""
Forward pass that computes this module's output.
Args:
- hidden_states: input hidden-state tensor
- attention_mask: attention-mask tensor
- output_attentions: whether to return the attention scores
Returns:
- output_dict: a dict with the module outputs, containing at least the "feature_map" key
"""
att = self.attention(hidden_states, attention_mask, output_attentions)
attention_output = att["context_layer"]
post_attention_output = self.post_attention(attention_output, hidden_states)
intermediate_output = self.intermediate(post_attention_output)
layer_output = self.output(intermediate_output, post_attention_output)
output_dict = {"feature_map": layer_output}
if output_attentions:
output_dict["attention_score"] = att["attention_score"]
return output_dict
class SqueezeBertEncoder(nn.Module):
def __init__(self, config):
"""
Set up the encoder layers.
Args:
- config: object holding the model configuration
"""
super().__init__()
assert config.embedding_size == config.hidden_size, (
"If you want embedding_size != intermediate hidden_size, "
"please insert a Conv1d layer to adjust the number of channels "
"before the first SqueezeBertModule."
)
self.layers = nn.ModuleList(SqueezeBertModule(config) for _ in range(config.num_hidden_layers))
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
"""
Forward pass that computes the encoder output.
Args:
- hidden_states: input hidden-state tensor
- attention_mask: attention-mask tensor (defaults to None)
- head_mask: attention-head mask tensor (defaults to None)
- output_attentions: whether to return attention scores (defaults to False)
- output_hidden_states: whether to return hidden states (defaults to False)
- return_dict: whether to return a dict-style output (defaults to True)
Returns:
- the encoder output; its type depends on the return_dict argument
"""
if head_mask is None:
head_mask_is_all_none = True
elif head_mask.count(None) == len(head_mask):
head_mask_is_all_none = True
else:
head_mask_is_all_none = False
assert head_mask_is_all_none is True, "head_mask is not yet supported in the SqueezeBert implementation."
hidden_states = hidden_states.permute(0, 2, 1)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for layer in self.layers:
if output_hidden_states:
hidden_states = hidden_states.permute(0, 2, 1)
all_hidden_states += (hidden_states,)
hidden_states = hidden_states.permute(0, 2, 1)
layer_output = layer.forward(hidden_states, attention_mask, output_attentions)
hidden_states = layer_output["feature_map"]
if output_attentions:
all_attentions += (layer_output["attention_score"],)
hidden_states = hidden_states.permute(0, 2, 1)
if output_hidden_states:
all_hidden_states += (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class SqueezeBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class SqueezeBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class SqueezeBertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = SqueezeBertPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class SqueezeBertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = SqueezeBertLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class SqueezeBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SqueezeBertConfig
base_model_prefix = "transformer"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, SqueezeBertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SQUEEZEBERT_START_DOCSTRING = r"""
The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural
networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W.
Keutzer
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
For best results finetuning SqueezeBERT on text classification tasks, it is recommended to use the
*squeezebert/squeezebert-mnli-headless* checkpoint as a starting point.
Parameters:
config ([`SqueezeBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
Hierarchy:
```
Internal class hierarchy:
SqueezeBertModel
SqueezeBertEncoder
SqueezeBertModule
SqueezeBertSelfAttention
ConvActivation
ConvDropoutLayerNorm
```
Data layouts:
```
Input data is in [batch, sequence_length, hidden_size] format.
Data inside the encoder is in [batch, hidden_size, sequence_length] format. But, if `output_hidden_states == True`, the data from inside the encoder is returned in [batch, sequence_length, hidden_size] format.
The final output of the encoder is in [batch, sequence_length, hidden_size] format.
```
"""
SQUEEZEBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate the first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids`, you can choose to pass an embedded representation directly. This is useful if you want more control
over how `input_ids` indices are converted into associated vectors than the model's internal embedding lookup matrix provides.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare SqueezeBERT Model transformer outputting raw hidden-states without any specific head on top.",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertModel(SqueezeBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embeddings = SqueezeBertEmbeddings(config)
self.encoder = SqueezeBertEncoder(config)
self.pooler = SqueezeBertPooler(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
# If output_attentions is not specified, fall back to the value in the config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# If output_hidden_states is not specified, fall back to the value in the config
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# If return_dict is not specified, fall back to the value in the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Raise an error if both input_ids and inputs_embeds are given
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
# If only input_ids is given, check for padding issues and record its shape
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
# If only inputs_embeds is given, record its shape (dropping the last dimension)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
# Neither input_ids nor inputs_embeds was provided
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Determine the device from input_ids or inputs_embeds
device = input_ids.device if input_ids is not None else inputs_embeds.device
# If no attention_mask is provided, create an all-ones mask with the same shape as input_shape
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
# If no token_type_ids is provided, create an all-zeros long tensor on the same device
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Build the extended attention mask so that shapes broadcast correctly
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
# Prepare the head mask if needed
# 1.0 in head_mask means keeping the head
# attention_probs has shape batch_size x num_heads x N x N
# the input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
# Pass the inputs through the embedding layer
embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
# Pass the embedding output through the encoder
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output (the last hidden states) from the encoder outputs
sequence_output = encoder_outputs[0]
# Run the sequence output through the pooler
pooled_output = self.pooler(sequence_output)
# If a dict was not requested, return a tuple
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
# Otherwise build and return a BaseModelOutputWithPooling object
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
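A minimal usage sketch of the bare model (weights are randomly initialized from a config, so no checkpoint download is required; `transformers` and `torch` are assumed to be installed):
```
# Run a small randomly initialized SqueezeBertModel on dummy input IDs.
import torch
from transformers import SqueezeBertConfig, SqueezeBertModel

config = SqueezeBertConfig(num_hidden_layers=2)      # small config just for the sketch
model = SqueezeBertModel(config).eval()
input_ids = torch.randint(0, config.vocab_size, (1, 8))
with torch.no_grad():
    out = model(input_ids)
print(out.last_hidden_state.shape, out.pooler_output.shape)  # (1, 8, 768) and (1, 768)
```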
# Attach the model docstring for a SqueezeBERT model with a language modeling head on top
@add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING)
# SqueezeBertForMaskedLM inherits from SqueezeBertPreTrainedModel
class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
# Keys whose weights are tied (shared) with the input embeddings
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# Create the SqueezeBertModel backbone and assign it to self.transformer
self.transformer = SqueezeBertModel(config)
# Create the SqueezeBertOnlyMLMHead and assign it to self.cls
self.cls = SqueezeBertOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embeddings, i.e. the prediction decoder
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Replace the output embeddings with new ones
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# Forward pass that takes multiple inputs and returns the model output
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# If return_dict is None, use the use_return_dict value from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the Transformer outputs
sequence_output = outputs[0]
# Run the sequence output through the MLM head to get the prediction scores
prediction_scores = self.cls(sequence_output)
# Initialize masked_lm_loss to None
masked_lm_loss = None
# If labels are provided, compute the masked language modeling loss
if labels is not None:
# Cross-entropy loss; the -100 index corresponds to padding tokens
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# If return_dict is False, return a tuple of the prediction scores and the remaining outputs
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# If return_dict is True, return a MaskedLMOutput with the loss, logits, hidden states and attentions
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
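A minimal sketch of the masked-LM head (random weights from a config; positions labelled `-100` are excluded from the loss, matching the `CrossEntropyLoss` default `ignore_index`):
```
# Compute a masked-LM loss on dummy inputs with a randomly initialized model.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForMaskedLM

config = SqueezeBertConfig(num_hidden_layers=2)
model = SqueezeBertForMaskedLM(config)
input_ids = torch.randint(0, config.vocab_size, (1, 8))
labels = input_ids.clone()
labels[:, :4] = -100                       # ignore the first four positions in the loss
out = model(input_ids, labels=labels)
print(out.loss.item(), out.logits.shape)   # scalar loss, (1, 8, vocab_size)
```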
@add_start_docstrings(
"""
SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
"""
SqueezeBERT model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output), e.g. for GLUE tasks. Inherits from SqueezeBertPreTrainedModel.
"""
def __init__(self, config):
"""
Set up the model configuration and layers.
Args:
config (:class:`~transformers.SqueezeBertConfig`): the configuration object for the model.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
# SqueezeBERT backbone used for feature extraction
self.transformer = SqueezeBertModel(config)
# Dropout layer to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear classifier mapping the pooled hidden state to the number of labels
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Decide whether to use the return_dict setting from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled representation from the Transformer outputs
pooled_output = outputs[1]
# Apply dropout to the pooled representation
pooled_output = self.dropout(pooled_output)
# Feed the pooled representation to the classifier to get the logits
logits = self.classifier(pooled_output)
# Initialize the loss to None
loss = None
# If labels are provided, compute the loss
if labels is not None:
# If the problem type is not set, infer it from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Choose the loss function according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# If return_dict is False, return a tuple of the logits and any remaining outputs
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return a SequenceClassifierOutput with the loss, logits, hidden states and attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
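A minimal sketch of how `problem_type` gets inferred above: integer labels with `num_labels > 1` select the single-label cross-entropy branch (random weights from a config):
```
# Observe the inferred problem_type and the logits shape on dummy inputs.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForSequenceClassification

config = SqueezeBertConfig(num_hidden_layers=2, num_labels=3)
model = SqueezeBertForSequenceClassification(config)
input_ids = torch.randint(0, config.vocab_size, (2, 8))
labels = torch.tensor([0, 2])                        # integer labels -> cross-entropy branch
out = model(input_ids, labels=labels)
print(model.config.problem_type, out.logits.shape)   # single_label_classification (2, 3)
```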
# Attach the model docstring: SqueezeBERT with a multiple choice classification head on top
@add_start_docstrings(
"""
SqueezeBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
# SqueezeBertForMultipleChoice inherits from SqueezeBertPreTrainedModel
class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# SqueezeBERT backbone
self.transformer = SqueezeBertModel(config)
# Dropout layer using the hidden dropout probability from the config
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Classifier: a linear layer mapping the pooled hidden state to a single score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
# Forward pass
@add_start_docstrings_to_model_forward(
SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# input_ids: input token IDs
# attention_mask: mask indicating which positions are padding
# token_type_ids: segment IDs, useful for single- vs. two-sentence inputs
# position_ids: position index of each token in the sequence
# head_mask: mask controlling which attention heads are active
# inputs_embeds: optional pre-computed input embeddings
# labels: optional labels used to compute the loss during training
# output_attentions: whether to return attention weights
# output_hidden_states: whether to return hidden states
# return_dict: whether to return a dict-style output
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
*input_ids* above)
"""
# If return_dict is None, fall back to the config setting
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# num_choices is the size of the second dimension of the input tensors
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten the (batch, num_choices, ...) inputs to (batch * num_choices, ...); keep None inputs as None
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the flattened inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled output from the Transformer outputs and apply dropout
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
# Feed the pooled output to the classifier to get one logit per choice
logits = self.classifier(pooled_output)
# Reshape the logits to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
# If labels are provided, compute the cross-entropy loss
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# If return_dict is False, return a tuple
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
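A minimal sketch of the flattening performed above: `(batch, num_choices, seq_len)` inputs are reshaped to `(batch * num_choices, seq_len)` before the backbone, and the per-choice logits are reshaped back to `(batch, num_choices)` (random weights from a config):
```
# Run the multiple-choice head on dummy inputs with two examples and four choices each.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForMultipleChoice

config = SqueezeBertConfig(num_hidden_layers=2)
model = SqueezeBertForMultipleChoice(config)
input_ids = torch.randint(0, config.vocab_size, (2, 4, 8))   # (batch, num_choices, seq_len)
labels = torch.tensor([1, 3])
out = model(input_ids, labels=labels)
print(out.logits.shape)                                      # torch.Size([2, 4])
```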
@add_start_docstrings(
"""
SqueezeBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
# SqueezeBertForTokenClassification inherits from SqueezeBertPreTrainedModel
class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
# Constructor taking a configuration object
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# SqueezeBERT backbone used as the transformer
self.transformer = SqueezeBertModel(config)
# Dropout layer that drops units with probability hidden_dropout_prob to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear layer mapping the hidden states to config.num_labels classes
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Forward pass that takes multiple inputs and returns the output or the loss
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Decide whether to return a dict based on self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the transformer outputs
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Map the sequence output to the label space with the linear classifier
logits = self.classifier(sequence_output)
# Initialize the loss to None
loss = None
# If labels are provided, compute the token classification loss
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten the logits to 2D before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# If a dict is not requested, build the output as a tuple
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Return a TokenClassifierOutput with the loss, logits, hidden states and attention weights
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
SqueezeBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD
(linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = SqueezeBertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask indicating which tokens take part in attention
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs used to distinguish segments
position_ids: Optional[torch.Tensor] = None,  # position index of each token
head_mask: Optional[torch.Tensor] = None,  # mask specifying which attention heads to mask out
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings passed instead of input_ids/token_type_ids
start_positions: Optional[torch.Tensor] = None,  # token indices of the span start positions
end_positions: Optional[torch.Tensor] = None,  # token indices of the span end positions
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Initialize return_dict; if not provided, use the setting from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output
sequence_output = outputs[0]
# Compute the question-answering logits with qa_outputs
logits = self.qa_outputs(sequence_output)
# Split the logits into start_logits and end_logits
start_logits, end_logits = logits.split(1, dim=-1)
# Remove the trailing dimension and make the tensors contiguous
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
# If both start and end positions are provided, compute the loss
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, squeeze the extra dimension so the positions are 1D
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Positions outside of the model's input range are clamped and then ignored
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores the clamped index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
# If a dict is not requested, return a tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Return a QuestionAnsweringModelOutput with the loss, start/end logits, hidden states and attentions
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
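A minimal sketch of the span head: `start_logits`/`end_logits` have shape `(batch, seq_len)`, and passing `start_positions`/`end_positions` yields the averaged cross-entropy loss (random weights from a config):
```
# Compute the QA span loss on dummy inputs with a randomly initialized model.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForQuestionAnswering

config = SqueezeBertConfig(num_hidden_layers=2, num_labels=2)
model = SqueezeBertForQuestionAnswering(config)
input_ids = torch.randint(0, config.vocab_size, (1, 12))
out = model(input_ids, start_positions=torch.tensor([3]), end_positions=torch.tensor([5]))
print(out.loss.item(), out.start_logits.shape)   # scalar loss, torch.Size([1, 12])
```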
.\models\squeezebert\tokenization_squeezebert.py
"""Tokenization classes for SqueezeBERT."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"squeezebert/squeezebert-uncased": 512,
"squeezebert/squeezebert-mnli": 512,
"squeezebert/squeezebert-mnli-headless": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"squeezebert/squeezebert-uncased": {"do_lower_case": True},
"squeezebert/squeezebert-mnli": {"do_lower_case": True},
"squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class SqueezeBertTokenizer(PreTrainedTokenizer):
r"""
Construct a SqueezeBERT tokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary is set to be this token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, used when building a sequence from multiple sequences, e.g. for sequence classification
or question answering. It is also the last token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token used for sequence classification tasks. It is the first token of a sequence built with
special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token the model tries to predict when training with masked
language modeling.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, it is determined by the value of
`lowercase` (as in the original SqueezeBERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = SqueezeBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) into a single string.
Args:
tokens (`List[str]`): List of tokens to be joined into a string.
Returns:
`str`: The concatenated string of tokens with "##" markers removed.
"""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Builds model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A SqueezeBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence IDs from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers indicating the presence of special tokens (1) or sequence tokens (0).
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""在文本上执行标点符号的分割。"""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""在CJK字符周围添加空格。"""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""检查CP是否是CJK字符的码位。"""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""对文本执行无效字符移除和空白字符清理。"""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
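For reference, a compact, self-contained re-implementation of the greedy longest-match-first loop above; the toy vocabulary and the word "unaffable" are only illustrative.
```python
def wordpiece(token: str, vocab: set, unk: str = "[UNK]") -> list:
    chars = list(token)
    start, pieces = 0, []
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub          # continuation pieces are prefixed, as above
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:                   # no piece matched: the whole word becomes [UNK]
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", {"un", "##aff", "##able"}))  # ['un', '##aff', '##able']
```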
.\models\squeezebert\tokenization_squeezebert_fast.py
"""SqueezeBERT 的标记化类。"""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_squeezebert import SqueezeBertTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json"
),
"squeezebert/squeezebert-mnli": (
"https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json"
),
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"squeezebert/squeezebert-uncased": 512,
"squeezebert/squeezebert-mnli": 512,
"squeezebert/squeezebert-mnli-headless": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"squeezebert/squeezebert-uncased": {"do_lower_case": True},
"squeezebert/squeezebert-mnli": {"do_lower_case": True},
"squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
}
class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" SqueezeBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original SqueezeBERT).
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
# Hook up the module-level constants defined above
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = SqueezeBertTokenizer
# Initializer; accepts both required and optional arguments
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent initializer with the vocab/tokenizer files, casing behavior, special tokens, Chinese-character handling and accent stripping
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Load the current normalizer state as JSON
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the serialized normalizer settings differ from the arguments passed here, rebuild the normalizer
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
# Look up the normalizer class and update its settings
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# Remember the lower-casing setting on the tokenizer object
self.do_lower_case = do_lower_case
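A minimal sketch of the normalizer swap performed above, assuming the backend tokenizer serializes a `BertNormalizer` (typical for WordPiece-based fast tokenizers); the argument values are illustrative.
```python
from tokenizers import normalizers

# Rebuild the normalizer with the settings passed to __init__.
new_normalizer = normalizers.BertNormalizer(
    lowercase=True,              # do_lower_case
    strip_accents=None,          # strip_accents
    handle_chinese_chars=True,   # tokenize_chinese_chars
)
# tokenizer.backend_tokenizer.normalizer = new_normalizer
```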
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A SqueezeBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# Build the input sequence with special tokens; append a second separator only when a second sequence is provided
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
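Illustrative only: with hypothetical IDs cls_id=2 and sep_id=3, a pair of sequences [5, 6] and [7] is assembled into `[CLS] A [SEP] B [SEP]`.
```python
cls_id, sep_id = 2, 3                      # hypothetical special-token IDs
token_ids_0, token_ids_1 = [5, 6], [7]
output = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]
assert output == [2, 5, 6, 3, 7, 3]
```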
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s). 0 represents the first sequence, and 1 represents the second sequence.
"""
# Define special tokens
sep = [self.sep_token_id] # Separator token ID
cls = [self.cls_token_id] # Classification token ID
# If token_ids_1 is not provided, return mask for token_ids_0 only
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return mask including both token_ids_0 and token_ids_1
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary associated with the tokenizer's model to a specified directory.
Args:
save_directory (str):
Directory where the vocabulary files will be saved.
filename_prefix (str, *optional*):
Optional prefix for the saved files.
Returns:
Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
"""
# Save the vocabulary files using the tokenizer's model
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\squeezebert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_squeezebert": [
"SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SqueezeBertConfig",
"SqueezeBertOnnxConfig",
],
"tokenization_squeezebert": ["SqueezeBertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_squeezebert_fast"] = ["SqueezeBertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_squeezebert"] = [
"SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SqueezeBertForMaskedLM",
"SqueezeBertForMultipleChoice",
"SqueezeBertForQuestionAnswering",
"SqueezeBertForSequenceClassification",
"SqueezeBertForTokenClassification",
"SqueezeBertModel",
"SqueezeBertModule",
"SqueezeBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_squeezebert import (
SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
SqueezeBertConfig,
SqueezeBertOnnxConfig,
)
from .tokenization_squeezebert import SqueezeBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_squeezebert import (
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
SqueezeBertForMaskedLM,
SqueezeBertForMultipleChoice,
SqueezeBertForQuestionAnswering,
SqueezeBertForSequenceClassification,
SqueezeBertForTokenClassification,
SqueezeBertModel,
SqueezeBertModule,
SqueezeBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\stablelm\configuration_stablelm.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"stabilityai/stablelm-3b-4e1t": "https://huggingface.co/stabilityai/stablelm-3b-4e1t/resolve/main/config.json",
}
class StableLmConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~StableLmModel`].
It is used to instantiate a StableLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the StableLM [stabilityai/stablelm-3b-4e1t](https://huggingface.co/stabilityai/stablelm-3b-4e1t) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used
to control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Example:
```
>>> from transformers import StableLmModel, StableLmConfig
>>> # Initializing a StableLM stablelm-3b style configuration
>>> configuration = StableLmConfig()
```
"""
model_type = "stablelm"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50304,
intermediate_size=6912,
hidden_size=2560,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=32,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
layer_norm_eps=1.0e-5,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10_000,
rope_scaling=None,
use_qkv_bias=False,
hidden_dropout=0.0,
attention_dropout=0.0,
partial_rotary_factor=0.25,
bos_token_id=0,
eos_token_id=0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.use_qkv_bias = use_qkv_bias
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\stablelm\modeling_stablelm.py
""" PyTorch StableLM 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_stablelm import StableLmConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "StableLmConfig"
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
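A worked example of the unpadding bookkeeping above (no flash-attn required): for a padding mask with row lengths 3 and 1, the cumulative sequence lengths are [0, 3, 4].
```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 0, 0, 0]])
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                      # tensor([3, 1])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
print(indices.tolist(), cu_seqlens.tolist(), seqlens.max().item())           # [0, 1, 2, 4] [0, 3, 4] 3
```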
class StableLmRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class StableLmLinearScalingRotaryEmbedding(StableLmRotaryEmbedding):
"""StableLmRotaryEmbedding扩展,带有线性缩放功能。由Reddit用户/u/kaiokendev贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class StableLmDynamicNTKScalingRotaryEmbedding(StableLmRotaryEmbedding):
"""StableLmRotaryEmbedding扩展,带有动态NTK缩放功能。由Reddit用户/u/bloc97和/u/emozilla贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""旋转输入的一半隐藏维度。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""
Args:
q (`torch.Tensor`): 查询张量。
k (`torch.Tensor`): 键张量。
cos (`torch.Tensor`): 旋转嵌入的余弦部分。
sin (`torch.Tensor`): 旋转嵌入的正弦部分。
position_ids (`torch.Tensor`):
与查询和键张量对应的位置索引。例如,当与KV缓存一起使用时,可以传递偏移的位置ID。
unsqueeze_dim (`int`, *可选*, 默认为 1):
指定沿着哪个维度展开 cos[position_ids] 和 sin[position_ids],以便它们可以正确地广播到 q 和 k 的维度。
例如,如果 cos[position_ids] 和 sin[position_ids] 的形状是 [batch_size, seq_len, head_dim],
当 q 和 k 的形状是 [batch_size, heads, seq_len, head_dim] 时,设置 unsqueeze_dim=1 可以使 cos[position_ids]
和 sin[position_ids] 能够广播到 q 和 k 的形状。类似地,如果 q 和 k 的形状是 [batch_size, seq_len, heads, head_dim],
则设置 unsqueeze_dim=2。
Returns:
`tuple(torch.Tensor)`: 旋转使用旋转位置嵌入后的查询和键张量。
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
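A minimal shape sketch of the rotary application above: indexing cos/sin with `position_ids` and unsqueezing along the heads dimension makes them broadcast against q and k.
```python
import torch

batch, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(batch, heads, seq_len, head_dim)
cos = torch.randn(seq_len, head_dim)                 # as returned by the rotary embedding
sin = torch.randn(seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)    # (batch, seq_len)

cos_b = cos[position_ids].unsqueeze(1)               # (batch, 1, seq_len, head_dim)
sin_b = sin[position_ids].unsqueeze(1)
x1, x2 = q[..., : head_dim // 2], q[..., head_dim // 2 :]
q_embed = q * cos_b + torch.cat((-x2, x1), dim=-1) * sin_b
print(q_embed.shape)                                 # torch.Size([1, 2, 4, 8])
```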
class StableLmMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
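The forward above is a SiLU-gated MLP ("SwiGLU"-style): the activated gate branch is multiplied element-wise with the up branch before the down projection. A standalone shape check with made-up sizes:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden, inter = 8, 16
gate = nn.Linear(hidden, inter, bias=False)
up = nn.Linear(hidden, inter, bias=False)
down = nn.Linear(inter, hidden, bias=False)

x = torch.randn(2, 3, hidden)
y = down(F.silu(gate(x)) * up(x))
print(y.shape)  # torch.Size([2, 3, 8])
```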
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from
(batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
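A quick check of `repeat_kv`: with 2 key/value heads and `n_rep=4`, the cache is expanded to 8 attention heads, and the first four heads all read from KV head 0.
```python
import torch

kv = torch.randn(1, 2, 5, 16)   # (batch, num_key_value_heads, seq_len, head_dim)
expanded = kv[:, :, None, :, :].expand(1, 2, 4, 5, 16).reshape(1, 8, 5, 16)
print(expanded.shape)                               # torch.Size([1, 8, 5, 16])
assert torch.equal(expanded[:, 0], expanded[:, 3])  # heads 0-3 share KV head 0
```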
class StableLmAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_qkv_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = StableLmRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = StableLmLinearScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = StableLmDynamicNTKScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
class StableLmSdpaAttention(StableLmAttention):
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
class StableLmFlashAttention2(StableLmAttention):
"""
StableLM flash attention module. This module inherits from `StableLmAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpads the input, then computes the attention scores and pads the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
ATTENTION_CLASSES = {
"eager": StableLmAttention,
"sdpa": StableLmSdpaAttention,
"flash_attention_2": StableLmFlashAttention2,
}
class StableLmDecoderLayer(nn.Module):
def __init__(self, config: StableLmConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.mlp = StableLmMLP(config)
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
`[0, config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
STABLELM_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`StableLmConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare StableLm Model outputting raw hidden-states without any specific head on top.",
STABLELM_START_DOCSTRING,
)
class StableLmPreTrainedModel(PreTrainedModel):
config_class = StableLmConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["StableLmDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_cache_class = True
_supports_sdpa = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
STABLELM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare StableLm Model outputting raw hidden-states without any specific head on top.",
STABLELM_START_DOCSTRING,
)
class StableLmModel(StableLmPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`StableLmDecoderLayer`]
Args:
config: StableLmConfig
"""
def __init__(self, config: StableLmConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList(
[StableLmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self._attn_implementation = config._attn_implementation
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
class StableLmForCausalLM(StableLmPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = StableLmModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
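A worked example of the `position_ids` derivation above: the cumulative sum of the attention mask minus one, with padded positions clamped to 1.
```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids.tolist())  # [[1, 1, 0, 1, 2]]
```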
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
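A small sketch of the cache reordering above: `beam_idx` selects which beams each layer's cached keys/values should follow after a beam-search step (the single-layer cache here is made up).
```python
import torch

layer_kv = torch.arange(4.0).view(4, 1, 1, 1)        # pretend cache for a batch of 4 beams
past_key_values = ((layer_kv, layer_kv),)             # one layer: (key, value)
beam_idx = torch.tensor([2, 2, 0, 1])

reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer) for layer in past_key_values
)
print(reordered[0][0].flatten().tolist())              # [2.0, 2.0, 0.0, 1.0]
```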
@add_start_docstrings(
"""
The StableLm transformer with a sequence classification head on top (linear layer).
[`StableLmForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
STABLELM_START_DOCSTRING,
)
class StableLmForSequenceClassification(StableLmPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = StableLmModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
@add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass