Transformers Source Code Analysis (88)
.\models\perceiver\tokenization_perceiver.py
""" Perceiver 的分词器类。"""
from typing import Dict, List, Optional, Tuple
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
class PerceiverTokenizer(PreTrainedTokenizer):
"""
Constructs a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
bos_token (`str`, *optional*, defaults to `"[BOS]"`):
The BOS token (reserved in the vocab, but not actually used).
eos_token (`str`, *optional*, defaults to `"[EOS]"`):
The end of sequence token (reserved in the vocab, but not actually used).
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token that is actually used to end a sequence is the `sep_token`.
</Tip>
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The MASK token, useful for masked language modeling.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The CLS token (reserved in the vocab, but not actually used).
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from two sequences.
"""
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
pad_token="[PAD]",
bos_token="[BOS]",
eos_token="[EOS]",
mask_token="[MASK]",
cls_token="[CLS]",
sep_token="[SEP]",
model_max_length=2048,
**kwargs,
) -> None:
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
self._utf_vocab_size = 2**8
self._added_tokens_decoder: Dict[int, AddedToken] = {
0: pad_token,
1: bos_token,
2: eos_token,
3: mask_token,
4: cls_token,
5: sep_token,
}
self._num_special_tokens = len(self._added_tokens_decoder)
super().__init__(
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
mask_token=mask_token,
cls_token=cls_token,
sep_token=sep_token,
model_max_length=model_max_length,
**kwargs,
)
def get_vocab(self) -> Dict[str, int]:
vocab = {}
for i in range(self._utf_vocab_size):
token = chr(i)
vocab[token] = i + self._num_special_tokens
vocab.update(self.added_tokens_encoder)
return vocab
@property
def vocab_size(self):
return self._utf_vocab_size
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + [0] * len(token_ids_0) + [1]
else:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the
following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
else:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
tokens = [chr(i) for i in text.encode("utf-8")]
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
if len(token) != 1:
token_id = self.unk_token_id
else:
token_id = ord(token) + self._num_special_tokens
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
token = chr(index - self._num_special_tokens)
return token
def convert_tokens_to_string(self, tokens):
bstring = b""
for token in tokens:
if token in self.added_tokens_encoder:
tok_string = str(token).encode("utf-8")
else:
tok_string = bytes([ord(token)])
bstring += tok_string
string = bstring.decode("utf-8", errors="replace")
return string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
return ()
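Since the whole scheme is just "byte value + 6 reserved ids", a quick sanity check is easy to write. A minimal sketch (the input string is arbitrary):

```python
from transformers import PerceiverTokenizer

# No vocab file is needed: ids 0-5 are the reserved special tokens and every
# UTF-8 byte b maps to b + 6.
tokenizer = PerceiverTokenizer()

enc = tokenizer("café")  # "café" is 5 UTF-8 bytes
print(enc["input_ids"])
# [4, 105, 103, 108, 201, 175, 5]  -> [CLS]=4, then ord(byte) + 6 per byte, then [SEP]=5
print(tokenizer.decode(enc["input_ids"], skip_special_tokens=True))  # café
```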
.\models\perceiver\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverOnnxConfig"],
"tokenization_perceiver": ["PerceiverTokenizer"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_perceiver"] = ["PerceiverFeatureExtractor"]
_import_structure["image_processing_perceiver"] = ["PerceiverImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_perceiver"] = [
"PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST",
"PerceiverForImageClassificationConvProcessing",
"PerceiverForImageClassificationFourier",
"PerceiverForImageClassificationLearned",
"PerceiverForMaskedLM",
"PerceiverForMultimodalAutoencoding",
"PerceiverForOpticalFlow",
"PerceiverForSequenceClassification",
"PerceiverLayer",
"PerceiverModel",
"PerceiverPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverOnnxConfig
from .tokenization_perceiver import PerceiverTokenizer
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_perceiver import PerceiverFeatureExtractor
from .image_processing_perceiver import PerceiverImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_perceiver import (
PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST,
PerceiverForImageClassificationConvProcessing,
PerceiverForImageClassificationFourier,
PerceiverForImageClassificationLearned,
PerceiverForMaskedLM,
PerceiverForMultimodalAutoencoding,
PerceiverForOpticalFlow,
PerceiverForSequenceClassification,
PerceiverLayer,
PerceiverModel,
PerceiverPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\persimmon\configuration_persimmon.py
""" Persimmon model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"adept/persimmon-8b-base": "https://huggingface.co/adept/persimmon-8b-base/resolve/main/config.json",
}
class PersimmonConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PersimmonModel`]. It is used to instantiate a
Persimmon model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the
[adept/persimmon-8b-base](https://huggingface.co/adept/persimmon-8b-base).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
```
>>> from transformers import PersimmonModel, PersimmonConfig
>>> # Initializing a Persimmon persimmon-7b style configuration
>>> configuration = PersimmonConfig()
```
"""
model_type = "persimmon"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=262144,
hidden_size=4096,
intermediate_size=16384,
num_hidden_layers=36,
num_attention_heads=64,
hidden_act="relu2",
max_position_embeddings=16384,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
tie_word_embeddings=False,
rope_theta=25000.0,
rope_scaling=None,
qk_layernorm=True,
hidden_dropout=0.0,
attention_dropout=0.0,
partial_rotary_factor=0.5,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.qk_layernorm = qk_layernorm
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
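To make the long list of defaults above concrete, here is a small, hedged sketch that builds a deliberately tiny configuration (the sizes are arbitrary and only for illustration; the defaults describe the 8B model) and turns on linear RoPE scaling, which `_rope_scaling_validation` checks:

```python
from transformers import PersimmonConfig

# Arbitrary small values, chosen only to keep the example light.
config = PersimmonConfig(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=2,
    num_attention_heads=4,
    rope_scaling={"type": "linear", "factor": 2.0},  # must be a dict with exactly `type` and `factor`
)
print(config.partial_rotary_factor)  # 0.5 -> only half of each head dim receives rotary embeddings
```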
.\models\persimmon\convert_persimmon_weights_to_hf.py
import argparse
import os
import warnings
import flatdict
import torch
from transformers import LlamaTokenizer, PersimmonConfig, PersimmonForCausalLM
try:
from transformers import LlamaTokenizerFast
tokenizer_class = LlamaTokenizerFast
except ImportError as e:
warnings.warn(e)
warnings.warn(
"The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
)
tokenizer_class = LlamaTokenizer
"""
Sample usage:
git clone https://github.com/persimmon-ai-labs/adept-inference
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
python src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py --input_dir /path/to/downloaded/persimmon/weights/ --output_dir /output/path
"""
KEYS_TO_MODIFY_MAPPING = {
"self_attention": "self_attn",
"language_model.encoder": "model",
"word_embeddings_for_head": "lm_head",
"language_model.embedding.word_embeddings": "model.embed_tokens",
}
KEYS_TO_REMOVE = "rotary_emb.inv_freq"
def rename_state_dict(state_dict):
model_state_dict = {}
for key, value in state_dict.items():
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
if KEYS_TO_REMOVE in key:
continue
model_state_dict[key] = value
return model_state_dict
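A tiny illustration of what `rename_state_dict` does to a single checkpoint key; the key below is made up, but follows the naming pattern that `KEYS_TO_MODIFY_MAPPING` expects:

```python
# Hypothetical original key, value omitted for brevity.
example = {"language_model.encoder.layers.0.self_attention.dense.weight": None}
print(rename_state_dict(example))
# {'model.layers.0.self_attn.dense.weight': None}
```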
def convert_persimmon_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False):
import sys
sys.path.insert(0, ada_lib_path)
model_state_dict_base = torch.load(pt_model_path, map_location="cpu")
state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".")
state_dict = rename_state_dict(state_dict)
transformers_config = PersimmonConfig()
model = PersimmonForCausalLM(transformers_config, eos_token_id=71013, bos_token_id=71013).to(torch.bfloat16)
model.load_state_dict(state_dict)
model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
transformers_config.save_pretrained(pytorch_dump_folder_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of Persimmon weights, which contains tokenizer.model and model folders",
)
parser.add_argument(
"--pt_model_path",
help="Location of Persimmon `model_optim_rng.pt`",
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--ada_lib_path",
help="Location to write HF model and tokenizer",
)
parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
args = parser.parse_args()
spm_path = os.path.join(args.input_dir, "adept_vocab.model")
convert_persimmon_checkpoint(
pytorch_dump_folder_path=args.output_dir,
pt_model_path=args.pt_model_path,
safe_serialization=args.safe_serialization,
ada_lib_path=args.ada_lib_path,
)
tokenizer = tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|")
tokenizer.save_pretrained(args.output_dir)
if __name__ == "__main__":
main()
.\models\persimmon\modeling_persimmon.py
""" PyTorch Persimmon model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_persimmon import PersimmonConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PersimmonConfig"
class PersimmonRotaryEmbedding(nn.Module):
"""
Rotary positional embedding for Persimmon model.
"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
"""
Initialize the PersimmonRotaryEmbedding module.
Args:
dim (int): Dimensionality of the embedding.
max_position_embeddings (int): Maximum number of positions to embed.
base (int): Base value for rotational frequencies.
device (Optional[torch.device]): Device to store the embeddings.
"""
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
"""
Precompute and store the cosine and sine caches.
Args:
seq_len (int): Length of the sequence to compute values for.
device (torch.device): Device to store the cache tensors.
dtype (torch.dtype): Data type of the cache tensors.
"""
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
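A short sketch of the shapes involved, assuming a toy rotary module (all sizes arbitrary); `x` is only used to pick the device and dtype of the returned tensors:

```python
import torch

rotary = PersimmonRotaryEmbedding(dim=8, max_position_embeddings=16)
x = torch.randn(1, 2, 16, 8)            # (batch, heads, seq_len, head_dim), dtype carrier only
cos, sin = rotary(x, seq_len=16)
print(cos.shape, sin.shape)              # torch.Size([16, 8]) torch.Size([16, 8])
# Row i holds cos/sin of i * inv_freq, duplicated once so the last dim matches `dim`.
```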
class PersimmonLinearScalingRotaryEmbedding(PersimmonRotaryEmbedding):
"""PersimmonRotaryEmbedding扩展了线性缩放。鸣谢Reddit用户/u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class PersimmonDynamicNTKScalingRotaryEmbedding(PersimmonRotaryEmbedding):
"""PersimmonRotaryEmbedding扩展了动态NTK缩放。鸣谢Reddit用户/u/bloc97和/u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""旋转输入张量一半的隐藏维度。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
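A minimal sketch of the two helpers in action on random tensors (shapes follow the `[batch, heads, seq_len, head_dim]` convention from the docstring). Because each pair of channels is rotated by an angle, per-position vector norms are preserved, which the last line checks:

```python
import torch

batch, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)

rotary = PersimmonRotaryEmbedding(dim=head_dim, max_position_embeddings=seq_len)
cos, sin = rotary(q, seq_len=seq_len)                 # each of shape (seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)     # (1, seq_len)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
print(q_rot.shape, k_rot.shape)                       # unchanged: (1, 2, 4, 8)
torch.testing.assert_close(q_rot.norm(dim=-1), q.norm(dim=-1))  # rotation preserves norms
```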
class PersimmonMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
hidden_states = self.dense_h_to_4h(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dense_4h_to_h(hidden_states)
return hidden_states
class PersimmonAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
self.qk_layernorm = config.qk_layernorm
if self.qk_layernorm:
self.q_layernorm = nn.LayerNorm(
config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
)
self.k_layernorm = nn.LayerNorm(
config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = PersimmonRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = PersimmonLinearScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = PersimmonDynamicNTKScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
value: [batch_size, seq_length, num_heads, head_dim]
"""
batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
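A shape-only sketch of the fused-QKV split, using a deliberately tiny (arbitrary) configuration:

```python
import torch

tiny = PersimmonConfig(vocab_size=100, hidden_size=64, intermediate_size=128,
                       num_hidden_layers=1, num_attention_heads=4)
attn = PersimmonAttention(tiny, layer_idx=0)

fused = torch.randn(2, 5, 3 * tiny.hidden_size)   # what query_key_value would produce
q, k, v = attn._split_heads(fused)
print(q.shape, k.shape, v.shape)                   # each torch.Size([2, 5, 4, 16])
```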
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
pass
class PersimmonDecoderLayer(nn.Module):
def __init__(self, config: PersimmonConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = PersimmonAttention(config=config, layer_idx=layer_idx)
self.mlp = PersimmonMLP(config)
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
`[0, config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
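A hedged end-to-end sketch of one decoder layer with a tiny, arbitrary configuration. Note the sequential residual layout above: attention first, then the MLP applied to the post-attention layernorm output. No causal mask is passed here, so this only demonstrates shapes; `PersimmonModel` normally supplies the mask:

```python
import torch

tiny = PersimmonConfig(vocab_size=100, hidden_size=64, intermediate_size=128,
                       num_hidden_layers=1, num_attention_heads=4)
layer = PersimmonDecoderLayer(tiny, layer_idx=0)

hidden = torch.randn(1, 6, tiny.hidden_size)
position_ids = torch.arange(6).unsqueeze(0)
(out,) = layer(hidden, position_ids=position_ids)
print(out.shape)  # torch.Size([1, 6, 64])
```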
PERSIMMON_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`PersimmonConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Persimmon Model outputting raw hidden-states without any specific head on top.",
PERSIMMON_START_DOCSTRING,
)
class PersimmonPreTrainedModel(PreTrainedModel):
config_class = PersimmonConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["PersimmonDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_cache_class = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
PERSIMMON_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Persimmon Model outputting raw hidden-states without any specific head on top.",
PERSIMMON_START_DOCSTRING,
)
class PersimmonModel(PersimmonPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]
Args:
config: PersimmonConfig
"""
def __init__(self, config: PersimmonConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList(
[PersimmonDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class PersimmonForCausalLM(PersimmonPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = PersimmonModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
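A hedged usage sketch of the causal LM head above. The checkpoint name is the one referenced in the configuration file earlier; it is an 8B-parameter model, so treat this purely as an illustration:

```python
from transformers import AutoTokenizer, PersimmonForCausalLM

tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")
model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")

inputs = tokenizer("human: Hey, what should I eat for dinner?", return_tensors="pt")
# generate() calls prepare_inputs_for_generation() above at each step, feeding only the
# newly generated token together with the cached past_key_values.
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```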
.\models\persimmon\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_persimmon": ["PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP", "PersimmonConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_persimmon"] = [
"PersimmonForCausalLM",
"PersimmonModel",
"PersimmonPreTrainedModel",
"PersimmonForSequenceClassification",
]
if TYPE_CHECKING:
from .configuration_persimmon import PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP, PersimmonConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_persimmon import (
PersimmonForCausalLM,
PersimmonForSequenceClassification,
PersimmonModel,
PersimmonPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\phi\configuration_phi.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PHI_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/phi-1": "https://huggingface.co/microsoft/phi-1/resolve/main/config.json",
"microsoft/phi-1_5": "https://huggingface.co/microsoft/phi-1_5/resolve/main/config.json",
"microsoft/phi-2": "https://huggingface.co/microsoft/phi-2/resolve/main/config.json",
}
class PhiConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate a Phi
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Phi
[microsoft/phi-1](https://huggingface.co/microsoft/phi-1).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import PhiModel, PhiConfig
>>> # Initializing a Phi-1 style configuration
>>> configuration = PhiConfig.from_pretrained("microsoft/phi-1")
>>> # Initializing a model from the configuration
>>> model = PhiModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "phi"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=51200,
hidden_size=2048,
intermediate_size=8192,
num_hidden_layers=24,
num_attention_heads=32,
num_key_value_heads=None,
resid_pdrop=0.0,
embd_pdrop=0.0,
attention_dropout=0.0,
hidden_act="gelu_new",
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
partial_rotary_factor=0.5,
qk_layernorm=False,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attention_dropout = attention_dropout
self.hidden_act = hidden_act
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.partial_rotary_factor = partial_rotary_factor
self.qk_layernorm = qk_layernorm
self._rope_scaling_validation()
super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\phi\convert_phi_weights_to_hf.py
import argparse
import gc
import os
import safetensors
import torch
from huggingface_hub import hf_hub_download
from transformers import PhiConfig, PhiForCausalLM
_MODELS = {
"microsoft/phi-1": ["https://huggingface.co/microsoft/phi-1/blob/main/pytorch_model.bin"],
"microsoft/phi-1_5": ["https://huggingface.co/microsoft/phi-1_5/blob/main/pytorch_model.bin"],
"microsoft/phi-2": [
"https://huggingface.co/microsoft/phi-2/blob/main/model-00001-of-00002.safetensors",
"https://huggingface.co/microsoft/phi-2/blob/main/model-00002-of-00002.safetensors",
],
}
PHI_MAPPING = {
"transformer.embd.wte.weight": "model.embed_tokens.weight",
"lm_head.linear": "lm_head",
"lm_head.ln": "model.final_layernorm",
"layers": "model.layers",
"transformer": "model",
".h.": ".layers.",
"ln": "input_layernorm",
"mixer": "self_attn",
"Wqkv": "query_key_value",
"out_proj": "dense",
}
def convert_weights(original_weights, mapping, config):
converted_weights = {}
original_weights_keys = sorted(original_weights.keys())
for original_weights_key in original_weights_keys:
new_key = original_weights_key
if "rotary_emb" in new_key:
continue
if "Wqkv" in new_key:
if "weight" in new_key:
weight = original_weights[new_key]
weights_shape = weight.shape
weight = (
weight.view(3, config.num_attention_heads, -1, config.hidden_size)
.transpose(0, 1)
.reshape(*weights_shape)
)
original_weights[new_key] = weight
elif "bias" in new_key:
bias = original_weights[new_key]
bias_shape = bias.shape
bias = bias.view(3, config.num_attention_heads, -1).transpose(0, 1).reshape(*bias_shape)
original_weights[new_key] = bias
for k, v in mapping.items():
if k in new_key:
new_key = new_key.replace(k, v)
converted_weights[new_key] = original_weights.pop(original_weights_key)
return converted_weights
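A shape-level sketch (toy, made-up sizes) of the `Wqkv` permutation above: the fused projection is reinterpreted as `(qkv, heads, head_dim, hidden)`, the first two axes are swapped, and the result is flattened back, so rows end up grouped per attention head instead of per q/k/v block:

```python
import torch

num_heads, hidden_size = 4, 32                   # arbitrary toy sizes
w = torch.randn(3 * hidden_size, hidden_size)    # original fused Wqkv weight
w_converted = (
    w.view(3, num_heads, -1, hidden_size)        # (qkv, heads, head_dim, hidden)
    .transpose(0, 1)                             # (heads, qkv, head_dim, hidden)
    .reshape(w.shape)                            # back to (3 * hidden, hidden)
)
print(w_converted.shape)                         # torch.Size([96, 32])
```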
repo_id = f"{url.split('/')[3]}/{url.split('/')[4]}"
filename = f"{url.split('/')[-1]}"
hf_hub_download(
repo_id=repo_id,
filename=filename,
force_filename=root,
local_dir_use_symlinks=False,
)
def convert_phi_weights(
model_name, checkpoint_path, pytorch_dump_folder_path, use_cuda, save_weights_directly, _MODELS
):
_MODELS = _MODELS if model_name not in _MODELS.keys() else {model_name: _MODELS.get(model_name)}
device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
for model_name, model_url in _MODELS.items():
converted_checkpoint = {}
model_checkpoint = {}
for model_each_url in model_url:
model_path = os.path.join(checkpoint_path, model_name + "_" + model_each_url.split("/")[-1])
if not os.path.exists(model_path):
print(f"\n{model_name} was not found! Downloading it to {model_path}")
_download(url=model_each_url, root=model_path)
if model_path.endswith("safetensors"):
loaded_weights = safetensors.torch.load_file(model_path, device=device)
else:
loaded_weights = torch.load(model_path, map_location=device)
model_checkpoint.update(**loaded_weights)
model_type = model_name.split("/")[1]
config = PhiConfig()
if model_type == "phi-2":
config.hidden_size = 2560
config.intermediate_size = 10240
config.num_hidden_layers = 32
config.resid_pdrop = 0.1
config.partial_rotary_factor = 0.4
config.torch_dtype = "float16"
converted_checkpoint.update(**convert_weights(model_checkpoint, PHI_MAPPING, config))
if save_weights_directly:
save_weights_path = os.path.join(pytorch_dump_folder_path, model_type + "_pytorch_model.bin")
torch.save(converted_checkpoint, save_weights_path)
print(f"Model weights saved at {save_weights_path}!")
else:
model = PhiForCausalLM(config).to(device)
model.load_state_dict(converted_checkpoint, strict=True)
save_model_path = os.path.join(pytorch_dump_folder_path, model_type)
model.save_pretrained(save_model_path)
print(f"Model saved at {save_model_path}!")
del config, model
del model_checkpoint, converted_checkpoint
if use_cuda:
torch.cuda.empty_cache()
gc.collect()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
help="要转换的模型名称。请选择其中之一:phi-1, phi-1_5, phi-2。如果未提供,则转换所有模型。",
default=None,
)
parser.add_argument(
"--checkpoint_path", type=str, help="已下载检查点文件夹的路径。(请输入完整路径)"
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="PyTorch 模型输出路径。(请输入完整路径)",
)
parser.add_argument(
"--use_cuda",
default=False,
type=bool,
help="在转换过程中是否将权重加载到 GPU 上,默认为 False",
)
parser.add_argument(
"--save_weights_directly",
default=True,
type=bool,
help="是否直接保存转换后的权重,或者将权重加载到 Phi 模型中再保存。默认为 True",
)
args = parser.parse_args()
convert_phi_weights(
args.model_name,
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.use_cuda,
args.save_weights_directly,
_MODELS,
)
.\models\phi\modeling_phi.py
""" PyTorch Phi model. """
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
get_torch_version,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_phi import PhiConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/phi-1"
_CONFIG_FOR_DOC = "PhiConfig"
PHI_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/phi-1",
"microsoft/phi-1_5",
"microsoft/phi-2",
]
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
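A tiny worked example of the helper above for a 2-sequence batch where the second sequence has one padding position:

```python
import torch

mask = torch.tensor([[1, 1, 1],
                     [1, 1, 0]])
indices, cu_seqlens, max_len = _get_unpad_data(mask)
print(indices)      # tensor([0, 1, 2, 3, 4]) -> flat positions of the non-padding tokens
print(cu_seqlens)   # tensor([0, 3, 5], dtype=torch.int32) -> cumulative sequence lengths
print(max_len)      # 3
```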
class PhiRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
"""PhiRotaryEmbedding扩展了线性缩放。感谢Reddit用户/u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
"""PhiRotaryEmbedding扩展了动态NTK缩放。感谢Reddit用户/u/bloc97和/u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""旋转输入张量一半的隐藏维度。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`): The position indices of the tokens corresponding to the query and key tensors.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, if
cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim] and q and k have
the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes them broadcastable
to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim],
then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)`: the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class PhiMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
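A one-line shape sketch of `repeat_kv` (arbitrary sizes), as used for grouped-query attention:

```python
import torch

kv = torch.randn(2, 4, 10, 16)        # (batch, num_key_value_heads, seq_len, head_dim)
print(repeat_kv(kv, n_rep=8).shape)   # torch.Size([2, 32, 10, 16]) -> 4 KV heads serve 32 query heads
```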
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
self.qk_layernorm = config.qk_layernorm
if self.qk_layernorm:
self.q_layernorm = nn.LayerNorm(
config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
)
self.k_layernorm = nn.LayerNorm(
config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = PhiRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = PhiLinearScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
pass
class PhiFlashAttention2(PhiAttention):
"""
Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def _flash_attention_forward(
self,
query_states,
key_states,
value_states,
attention_mask,
query_length,
dropout=0.0,
softmax_scale=None,
**kwargs,
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpads the input, then computes the attention scores and pads the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class PhiSdpaAttention(PhiAttention):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
"""
SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`PhiAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to the
SDPA API.
"""
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
pass
PHI_ATTENTION_CLASSES = {
"eager": PhiAttention,
"flash_attention_2": PhiFlashAttention2,
"sdpa": PhiSdpaAttention,
}
class PhiDecoderLayer(nn.Module):
def __init__(self, config: PhiConfig, layer_idx: int):
super().__init__()
self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.mlp = PhiMLP(config)
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
):
"""
Args:
hidden_states (`torch.FloatTensor`):
Input tensor to the layer of shape `(batch, seq_len, embed_dim)`.
attention_mask (`torch.FloatTensor`, *optional*):
Attention mask of shape `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of the position of each input sequence token in the position embeddings. Selected in the range `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attention tensors of all attention layers. See `attentions` under the returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
Cached past key and value projection states.
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
attn_outputs, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
attn_outputs = self.resid_dropout(attn_outputs)
feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
hidden_states = attn_outputs + feed_forward_hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
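Note that the layer applies attention and the MLP in parallel on the same layer-normed input and adds both to the residual, rather than chaining them sequentially. A toy sketch of that structure (stand-in linear modules, not the real PhiAttention/PhiMLP):

```python
import torch
import torch.nn as nn

hidden = torch.randn(1, 4, 8)          # (batch, seq_len, hidden_size)
ln = nn.LayerNorm(8)
attn = nn.Linear(8, 8)                 # stand-in for self-attention
mlp = nn.Linear(8, 8)                  # stand-in for PhiMLP
resid_dropout = nn.Dropout(0.0)

normed = ln(hidden)
out = resid_dropout(attn(normed)) + resid_dropout(mlp(normed)) + hidden   # attn + mlp + residual
print(out.shape)                       # torch.Size([1, 4, 8])
```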
PHI_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`PhiConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Phi Model outputting raw hidden-states without any specific head on top.",
PHI_START_DOCSTRING,
)
class PhiPreTrainedModel(PreTrainedModel):
config_class = PhiConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["PhiDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
PHI_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Phi Model outputting raw hidden-states without any specific head on top.",
PHI_START_DOCSTRING,
)
class PhiModel(PhiPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`]
Args:
config: PhiConfig
"""
def __init__(self, config: PhiConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.embed_dropout = nn.Dropout(config.embd_pdrop)
self.layers = nn.ModuleList(
[PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class PhiForCausalLM(PhiPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = PhiModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
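The cumsum trick above assigns consecutive positions to the real tokens while pinning padding positions to a dummy value; a small illustration with a left-padded sequence:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])        # left-padded sequence
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)                                      # tensor([[1, 1, 0, 1, 2]]): real tokens get 0, 1, 2
```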
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
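`index_select` along the batch dimension is what re-aligns the cached keys and values with the surviving beams during beam search; a minimal illustration with made-up cache contents:

```python
import torch

past_state = torch.arange(6).reshape(3, 2)   # pretend cache slice for num_beams=3
beam_idx = torch.tensor([2, 0, 0])           # beam 2 and two copies of beam 0 survive this step
print(past_state.index_select(0, beam_idx))  # tensor([[4, 5], [0, 1], [0, 1]])
```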
@add_start_docstrings(
"""
The PhiModel with a sequence classification head on top (linear layer).
[`PhiForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it needs to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
PHI_START_DOCSTRING,
)
class PhiForSequenceClassification(PhiPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = PhiModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
@add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@add_start_docstrings(
"""
PhiModel with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
PHI_START_DOCSTRING,
)
class PhiForTokenClassification(PhiPreTrainedModel):
def __init__(self, config: PhiConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.model = PhiModel(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
model_outputs = self.model(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = model_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
batch_size, seq_length = labels.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
)
if not return_dict:
output = (logits,) + model_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=model_outputs.hidden_states,
attentions=model_outputs.attentions,
)
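A hedged usage sketch for the token-classification head: a randomly initialized model built from a deliberately tiny `PhiConfig` (the hyper-parameter values here are purely illustrative, not a real checkpoint):

```python
import torch
from transformers import PhiConfig, PhiForTokenClassification

config = PhiConfig(
    vocab_size=100, hidden_size=32, intermediate_size=64,
    num_hidden_layers=2, num_attention_heads=4, num_labels=3,
)
model = PhiForTokenClassification(config)
input_ids = torch.randint(0, 100, (1, 6))
labels = torch.randint(0, 3, (1, 6))
outputs = model(input_ids=input_ids, labels=labels)
print(outputs.logits.shape)      # torch.Size([1, 6, 3]): one logit per label for every token
print(outputs.loss is not None)  # True: the cross-entropy loss above was computed
```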
.\models\phi\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_phi"] = [
"PHI_PRETRAINED_MODEL_ARCHIVE_LIST",
"PhiPreTrainedModel",
"PhiModel",
"PhiForCausalLM",
"PhiForSequenceClassification",
"PhiForTokenClassification",
]
if TYPE_CHECKING:
from .configuration_phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_phi import (
PHI_PRETRAINED_MODEL_ARCHIVE_LIST,
PhiForCausalLM,
PhiForSequenceClassification,
PhiForTokenClassification,
PhiModel,
PhiPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\phobert\tokenization_phobert.py
""" PhoBERT 的分词类 """
import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.txt",
"merges_file": "bpe.codes",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"vinai/phobert-base": "https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt",
"vinai/phobert-large": "https://huggingface.co/vinai/phobert-large/resolve/main/vocab.txt",
},
"merges_file": {
"vinai/phobert-base": "https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes",
"vinai/phobert-large": "https://huggingface.co/vinai/phobert-large/resolve/main/bpe.codes",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"vinai/phobert-base": 256,
"vinai/phobert-large": 256,
}
def get_pairs(word):
"""
返回单词中的符号对集合。
单词表示为符号元组(符号是长度可变的字符串)。
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
pairs = set(pairs)
return pairs
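For example, on a BPE-style symbol tuple the function yields every adjacent pair:

```python
word = ("l", "o", "w", "e", "r</w>")
print(get_pairs(word))
# {('l', 'o'), ('o', 'w'), ('w', 'e'), ('e', 'r</w>')}  (set order may vary)
```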
class PhobertTokenizer(PreTrainedTokenizer):
"""
构造一个 PhoBERT 分词器。基于字节对编码(Byte-Pair-Encoding)。
此分词器继承自 PreTrainedTokenizer,其中包含大多数主要方法。用户应参考这个超类以获取有关这些方法的更多信息。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs,
):
self.vocab_file = vocab_file
self.merges_file = merges_file
self.encoder = {}
self.encoder[str(bos_token)] = 0
self.encoder[str(pad_token)] = 1
self.encoder[str(eos_token)] = 2
self.encoder[str(unk_token)] = 3
self.add_from_file(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:-1]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A PhoBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
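With the vocabulary built in `__init__` (`<s>` = 0, `</s>` = 2), the two layouts look like this (the other token ids are made up for illustration):

```python
token_ids_0, token_ids_1 = [10, 11], [20]
cls_id, sep_id = 0, 2
print([cls_id] + token_ids_0 + [sep_id])                                   # [0, 10, 11, 2]           <s> A </s>
print([cls_id] + token_ids_0 + [sep_id, sep_id] + token_ids_1 + [sep_id])  # [0, 10, 11, 2, 2, 20, 2] <s> A </s></s> B </s>
```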
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
return word
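A standalone walk-through of the merge loop with a hand-made merge table (this toy version breaks as soon as no remaining pair is in the table, which matches the behavior of the `float("inf")` ranking above):

```python
bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1, ("e", "r</w>"): 2}
word = ("l", "o", "w", "e", "r</w>")
while True:
    pairs = get_pairs(word)
    candidates = [p for p in pairs if p in bpe_ranks]
    if not candidates:
        break
    first, second = min(candidates, key=lambda pair: bpe_ranks[pair])
    merged, i = [], 0
    while i < len(word):
        if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
            merged.append(first + second)   # apply the lowest-ranked merge
            i += 2
        else:
            merged.append(word[i])
            i += 1
    word = tuple(merged)
print(word)                   # ('low', 'er</w>')
print("@@ ".join(word)[:-4])  # 'low@@ er'
```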
def _tokenize(self, text):
"""Tokenize a string."""
split_tokens = []
words = re.findall(r"\S+\n?", text)
for token in words:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
out_merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
copyfile(self.merges_file, out_merge_file)
return out_vocab_file, out_merge_file
def add_from_file(self, f):
"""
从文本文件加载预先存在的字典,并将其符号添加到此实例中。
"""
if isinstance(f, str):
try:
with open(f, "r", encoding="utf-8") as fd:
self.add_from_file(fd)
except FileNotFoundError as fnfe:
raise fnfe
except UnicodeError:
raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
return
lines = f.readlines()
for lineTmp in lines:
line = lineTmp.strip()
idx = line.rfind(" ")
if idx == -1:
raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
word = line[:idx]
self.encoder[word] = len(self.encoder)
.\models\phobert\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
_import_structure = {"tokenization_phobert": ["PhobertTokenizer"]}
if TYPE_CHECKING:
from .tokenization_phobert import PhobertTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\pix2struct\configuration_pix2struct.py
""" Pix2Struct 模型配置 """
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/pix2struct-textcaps-base": (
"https://huggingface.co/google/pix2struct-textcaps-base/resolve/main/config.json"
),
}
class Pix2StructTextConfig(PretrainedConfig):
r"""
这是用于存储 [`Pix2StructTextModel`] 配置的配置类。它用于根据指定的参数实例化
Pix2Struct 文本模型,定义模型架构。使用默认值实例化配置将产生类似于
[google/pix2struct-base](https://huggingface.co/google/pix2struct-base) 架构使用的 Pix2Struct 文本解码器的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。有关更多信息,请阅读
[`PretrainedConfig`] 的文档。
# 定义模型类型为 "pix2struct_text_model"
model_type = "pix2struct_text_model"
python
# 在推断时要忽略的键列表
keys_to_ignore_at_inference = ["past_key_values"]
# 属性映射字典,将类参数名映射到配置文件中的属性名
attribute_map = {
"hidden_size": "hidden_size",
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
# 类的初始化方法,定义了模型配置的默认参数
def __init__(
self,
vocab_size=50244,
hidden_size=768,
d_kv=64,
d_ff=2048,
num_layers=12,
num_heads=12,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
dense_act_fn="gelu_new",
decoder_start_token_id=0,
use_cache=False,
pad_token_id=0,
eos_token_id=1,
tie_word_embeddings=False,
is_decoder=True,
**kwargs,
):
# Store the configuration parameters on the instance
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
self.use_cache = use_cache
self.eos_token_id = eos_token_id
self.decoder_start_token_id = decoder_start_token_id
# Kept for backward compatibility: the dense layer activation function
self.dense_act_fn = dense_act_fn
# Call the parent initializer with the remaining parameters
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
tie_word_embeddings=tie_word_embeddings,
is_decoder=is_decoder,
**kwargs,
)
# Class method: load a configuration from a pretrained model
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "PretrainedConfig":
# Put the token-related parameters into kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the configuration dict of the pretrained model together with the remaining kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the config dict describes a composite "pix2struct" model, pull out its text config
if config_dict.get("model_type") == "pix2struct":
config_dict = config_dict["text_config"]
# Warn if the model type in the config dict does not match the model type defined on this class
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build and return a class instance from the config dict and kwargs
return cls.from_dict(config_dict, **kwargs)
# Pix2StructVisionConfig class, inheriting from PretrainedConfig
class Pix2StructVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Pix2StructVisionModel`]. It is used to
instantiate a Pix2Struct vision model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the Pix2Struct-base
[google/pix2struct-base](https://huggingface.co/google/pix2struct-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "pix2struct_vision_model"
def __init__(
self,
hidden_size=768,
patch_embed_hidden_size=768,
d_ff=2048,
d_kv=64,
num_hidden_layers=12,
num_attention_heads=12,
dense_act_fn="gelu_new",
layer_norm_eps=1e-6,
dropout_rate=0.0,
attention_dropout=0.0,
initializer_range=1e-10,
initializer_factor=1.0,
seq_len=4096,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.patch_embed_hidden_size = patch_embed_hidden_size
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.dense_act_fn = dense_act_fn
self.seq_len = seq_len
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.d_kv = d_kv
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "pix2struct":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class Pix2StructConfig(PretrainedConfig):
r"""
[`Pix2StructConfig`] is the configuration class to store the configuration of a
[`Pix2StructForConditionalGeneration`]. It is used to instantiate a Pix2Struct model according to the specified
arguments, defining the text model and vision model configs. Instantiating a configuration with the defaults will
yield a similar configuration to that of the Pix2Struct-base
[google/pix2struct-base](https://huggingface.co/google/pix2struct-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Pix2StructTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Pix2StructVisionConfig`].
initializer_factor (`float`, *optional*, defaults to 1.0):
Factor to multiply the initialization range with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
is_vqa (`bool`, *optional*, defaults to `False`):
Whether the model has been fine-tuned for VQA or not.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the word embeddings between the text and vision models.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether the model follows an encoder-decoder architecture.
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import Pix2StructConfig, Pix2StructForConditionalGeneration
>>> # Initializing a Pix2StructConfig with google/pix2struct-base style configuration
>>> configuration = Pix2StructConfig()
>>> # Initializing a Pix2StructForConditionalGeneration (with random weights) from the google/pix2struct-base style configuration
>>> model = Pix2StructForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize a Pix2StructConfig from a Pix2StructTextConfig and a Pix2StructVisionConfig
>>> # Initializing a Pix2Struct text and Pix2Struct vision configuration
>>> config_text = Pix2StructTextConfig()
>>> config_vision = Pix2StructVisionConfig()
>>> config = Pix2StructConfig.from_text_vision_configs(config_text, config_vision)
```"""
model_type = "pix2struct"
def __init__(
self,
text_config=None,
vision_config=None,
initializer_factor=1.0,
initializer_range=0.02,
is_vqa=False,
tie_word_embeddings=False,
is_encoder_decoder=True,
**kwargs,
):
super().__init__(tie_word_embeddings=tie_word_embeddings, is_encoder_decoder=is_encoder_decoder, **kwargs)
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the Pix2StructTextConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. Initializing the Pix2StructVisionConfig with default values.")
self.text_config = Pix2StructTextConfig(**text_config)
self.vision_config = Pix2StructVisionConfig(**vision_config)
self.decoder_start_token_id = self.text_config.decoder_start_token_id
self.pad_token_id = self.text_config.pad_token_id
self.eos_token_id = self.text_config.eos_token_id
self.initializer_factor = initializer_factor
self.initializer_range = initializer_range
self.text_config.initializer_range = self.initializer_range
self.vision_config.initializer_range = self.initializer_range
self.is_vqa = is_vqa
@classmethod
def from_text_vision_configs(
cls, text_config: Pix2StructTextConfig, vision_config: Pix2StructVisionConfig, **kwargs
):
r"""
Instantiate a [`Pix2StructConfig`] (or a derived class) from pix2struct text model configuration and pix2struct
vision model configuration.
Returns:
[`Pix2StructConfig`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
.\models\pix2struct\convert_pix2struct_original_pytorch_to_hf.py
import argparse
import os
import re
import torch
from flax.traverse_util import flatten_dict
from t5x import checkpoints
from transformers import (
AutoTokenizer,
Pix2StructConfig,
Pix2StructForConditionalGeneration,
Pix2StructImageProcessor,
Pix2StructProcessor,
Pix2StructTextConfig,
Pix2StructVisionConfig,
)
def get_flax_param(t5x_checkpoint_path):
flax_params = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
flax_params = flatten_dict(flax_params)
return flax_params
def rename_and_convert_flax_params(flax_dict):
converted_dict = {}
CONVERSION_MAPPING = {
"token_embedder": "embeddings",
"encoder_norm": "layernorm",
"kernel": "weight",
".out": ".output",
"scale": "weight",
"embedders_0.pos_embedding": "row_embedder.weight",
"embedders_1.pos_embedding": "column_embedder.weight",
}
DECODER_CONVERSION_MAPPING = {
"query": "attention.query",
"key": "attention.key",
"value": "attention.value",
"output.dense": "output",
"encoder_decoder_attention.o": "encoder_decoder_attention.attention.o",
"pre_self_attention_layer_norm": "self_attention.layer_norm",
"pre_cross_attention_layer_norm": "encoder_decoder_attention.layer_norm",
"mlp.": "mlp.DenseReluDense.",
"pre_mlp_layer_norm": "mlp.layer_norm",
"self_attention.o": "self_attention.attention.o",
"decoder.embeddings.embedding": "decoder.embed_tokens.weight",
"decoder.relpos_bias.rel_embedding": "decoder.layer.0.self_attention.attention.relative_attention_bias.weight",
"decoder.decoder_norm.weight": "decoder.final_layer_norm.weight",
"decoder.logits_dense.weight": "decoder.lm_head.weight",
}
for key in flax_dict.keys():
if "target" in key:
new_key = ".".join(key[1:])
for old, new in CONVERSION_MAPPING.items():
new_key = new_key.replace(old, new)
if "decoder" in new_key:
for old, new in DECODER_CONVERSION_MAPPING.items():
new_key = new_key.replace(old, new)
if "layers" in new_key and "decoder" not in new_key:
new_key = re.sub(r"layers_(\d+)", r"layer.\1", new_key)
new_key = new_key.replace("encoder", "encoder.encoder")
elif "layers" in new_key and "decoder" in new_key:
new_key = re.sub(r"layers_(\d+)", r"layer.\1", new_key)
converted_dict[new_key] = flax_dict[key]
converted_torch_dict = {}
for key in converted_dict.keys():
if ("embed_tokens" not in key) and ("embedder" not in key):
converted_torch_dict[key] = torch.from_numpy(converted_dict[key].T)
else:
converted_torch_dict[key] = torch.from_numpy(converted_dict[key])
return converted_torch_dict
def convert_pix2struct_original_pytorch_checkpoint_to_hf(
t5x_checkpoint_path, pytorch_dump_folder_path, use_large=False, is_vqa=False
):
flax_params = get_flax_param(t5x_checkpoint_path)
if not use_large:
encoder_config = Pix2StructVisionConfig()
decoder_config = Pix2StructTextConfig()
else:
encoder_config = Pix2StructVisionConfig(
hidden_size=1536, d_ff=3968, num_attention_heads=24, num_hidden_layers=18
)
decoder_config = Pix2StructTextConfig(hidden_size=1536, d_ff=3968, num_heads=24, num_layers=18)
config = Pix2StructConfig(
vision_config=encoder_config.to_dict(), text_config=decoder_config.to_dict(), is_vqa=is_vqa
)
model = Pix2StructForConditionalGeneration(config)
torch_params = rename_and_convert_flax_params(flax_params)
model.load_state_dict(torch_params)
tok = AutoTokenizer.from_pretrained("ybelkada/test-pix2struct-tokenizer")
image_processor = Pix2StructImageProcessor()
processor = Pix2StructProcessor(image_processor=image_processor, tokenizer=tok)
if use_large:
processor.image_processor.max_patches = 4096
processor.image_processor.is_vqa = True
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
print("Model saved in {}".format(pytorch_dump_folder_path))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--t5x_checkpoint_path", default=None, type=str, help="Path to the original T5x checkpoint.")
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--use_large", action="store_true", help="Use large model.")
parser.add_argument("--is_vqa", action="store_true", help="Use large model.")
args = parser.parse_args()
convert_pix2struct_original_pytorch_checkpoint_to_hf(
args.t5x_checkpoint_path, args.pytorch_dump_folder_path, args.use_large
)
.\models\pix2struct\image_processing_pix2struct.py
"""Pix2Struct 的图像处理类"""
import io
import math
from typing import Dict, Optional, Union
import numpy as np
from huggingface_hub import hf_hub_download
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import convert_to_rgb, normalize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
ChannelDimension,
ImageInput,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available, is_vision_available, logging
from ...utils.import_utils import requires_backends
if is_vision_available():
import textwrap
from PIL import Image, ImageDraw, ImageFont
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
从给定的图像张量中提取补丁的实用函数。返回形状为 (1, `patch_height`, `patch_width`, `num_channels`x `patch_height` x `patch_width`) 的张量
Args:
image_tensor (torch.Tensor):
要从中提取补丁的图像张量。
patch_height (int):
要提取的补丁的高度。
patch_width (int):
要提取的补丁的宽度。
"""
requires_backends(torch_extract_patches, ["torch"])
image_tensor = image_tensor.unsqueeze(0)
patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
patches = patches.permute(0, 4, 2, 3, 1).reshape(
image_tensor.size(2) // patch_height,
image_tensor.size(3) // patch_width,
image_tensor.size(1) * patch_height * patch_width,
)
return patches.unsqueeze(0)
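Shape check on a dummy image: the grid ends up as `height // patch_height` rows by `width // patch_width` columns, each patch flattened to `channels * patch_height * patch_width` values.

```python
import torch

image = torch.randn(3, 32, 48)   # (channels, height, width)
patches = torch_extract_patches(image, patch_height=16, patch_width=16)
print(patches.shape)             # torch.Size([1, 2, 3, 768])
```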
def render_text(
text: str,
text_size: int = 36,
text_color: str = "black",
background_color: str = "white",
left_padding: int = 5,
right_padding: int = 5,
top_padding: int = 5,
bottom_padding: int = 5,
font_bytes: Optional[bytes] = None,
font_path: Optional[str] = None,
) -> "Image.Image":
"""
Utility function that renders a piece of text as a PIL image.
Args:
text (`str`):
The text to render.
text_size (`int`, *optional*, defaults to 36):
The font size of the rendered text.
text_color (`str`, *optional*, defaults to `"black"`):
The color of the text.
background_color (`str`, *optional*, defaults to `"white"`):
The color of the background.
left_padding / right_padding / top_padding / bottom_padding (`int`, *optional*, defaults to 5):
Padding, in pixels, added around the rendered text.
font_bytes (`bytes`, *optional*):
The raw bytes of the font to use; mutually exclusive with `font_path`.
font_path (`str`, *optional*):
The path of the font file to use.
"""
requires_backends(render_text, "vision")
# Wrap the text so that no line exceeds 80 characters.
wrapper = textwrap.TextWrapper(width=80)
lines = wrapper.wrap(text=text)
wrapped_text = "\n".join(lines)
if font_bytes is not None and font_path is None:
font = io.BytesIO(font_bytes)
elif font_path is not None:
font = font_path
else:
font = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
font = ImageFont.truetype(font, encoding="UTF-8", size=text_size)
# Use a temporary canvas to measure the width and height of the rendered text.
temp_draw = ImageDraw.Draw(Image.new("RGB", (1, 1), background_color))
_, _, text_width, text_height = temp_draw.textbbox((0, 0), wrapped_text, font)
# Create the actual image with some padding around the text.
image_width = text_width + left_padding + right_padding
image_height = text_height + top_padding + bottom_padding
image = Image.new("RGB", (image_width, image_height), background_color)
draw = ImageDraw.Draw(image)
draw.text(xy=(left_padding, top_padding), text=wrapped_text, fill=text_color, font=font)
return image
def render_header(
image: np.ndarray, header: str, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs
):
"""
Renders the input text as a header on the input image.
Adapted from:
https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L87
Args:
image (`np.ndarray`):
The image to render the header on.
header (`str`):
The header text.
input_data_format (`Union[ChannelDimension, str]`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
**kwargs:
Additional keyword arguments forwarded to `render_text`.
Returns:
`np.ndarray`: The image with the header rendered on top.
"""
requires_backends(render_header, "vision")
# Convert the input image to a PIL image if needed.
image = to_pil_image(image, input_data_format=input_data_format)
# Render the header text into its own image.
header_image = render_text(header, **kwargs)
# The new width is the wider of the header image and the original image.
new_width = max(header_image.width, image.width)
# Resize both images to the new width while preserving their aspect ratios.
new_height = int(image.height * (new_width / image.width))
new_header_height = int(header_image.height * (new_width / header_image.width))
# Create a white canvas tall enough for the header plus the original image.
new_image = Image.new("RGB", (new_width, new_height + new_header_height), "white")
# Paste the resized header at the top and the resized original image below it.
new_image.paste(header_image.resize((new_width, new_header_height)), (0, 0))
new_image.paste(image.resize((new_width, new_height)), (0, new_header_height))
# Convert back to a NumPy array, keeping a channels-last layout.
new_image = to_numpy_array(new_image)
if infer_channel_dimension_format(new_image) == ChannelDimension.LAST:
new_image = to_channel_dimension_format(new_image, ChannelDimension.LAST)
return new_image
r"""
Constructs a Pix2Struct image processor.
构造一个 Pix2Struct 图像处理器。
Args:
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
是否将图像转换为 RGB。
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
deviation.
是否对图像进行归一化。可以通过 `preprocess` 方法中的 `do_normalize` 参数进行覆盖。
根据 Pix2Struct 论文和代码,图像使用其自身的均值和标准差进行归一化。
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
图像使用的补丁大小。根据 Pix2Struct 论文和代码,补丁大小为 16x16。
max_patches (`int`, *optional*, defaults to 2048):
The maximum number of patches to extract from the image as per the [Pix2Struct
paper](https://arxiv.org/pdf/2210.03347.pdf).
从图像中提取的最大补丁数,根据 Pix2Struct 论文。
is_vqa (`bool`, *optional*, defaults to `False`):
Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
rendered onto the input images.
图像处理器是否用于 VQA 任务。如果为 `True` 并且传入了 `header_text`,则将文本渲染到输入图像上。
"""
model_input_names = ["flattened_patches"]
def __init__(
self,
do_convert_rgb: bool = True,
do_normalize: bool = True,
patch_size: Dict[str, int] = None,
max_patches: int = 2048,
is_vqa: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)  # Call the parent initializer with any remaining keyword arguments
self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}  # Default patch size is 16x16
self.do_normalize = do_normalize  # Whether to normalize the image
self.do_convert_rgb = do_convert_rgb  # Whether to convert the image to RGB
self.max_patches = max_patches  # Maximum number of extracted patches
self.is_vqa = is_vqa  # Whether the processor is used for the VQA task
def extract_flattened_patches(
self,
image: np.ndarray,
max_patches: int,
patch_size: dict,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Extract flattened patches from an image.
Args:
image (`np.ndarray`):
Image to extract flattened patches from.
max_patches (`int`):
Maximum number of patches to extract.
patch_size (`dict`):
Dictionary containing the patch height and width.
Returns:
result (`np.ndarray`):
A sequence of `max_patches` flattened patches.
"""
# This method requires the torch backend
requires_backends(self.extract_flattened_patches, "torch")
# Convert the image to a torch tensor in channels-first format
image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
image = torch.from_numpy(image)
# Get the patch height and width
patch_height, patch_width = patch_size["height"], patch_size["width"]
# Get the image height and width
image_height, image_width = get_image_size(image, ChannelDimension.FIRST)
# Maximize the scale so that the resized image fits within the given maximum number of patches
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
resized_height = max(num_feasible_rows * patch_height, 1)
resized_width = max(num_feasible_cols * patch_width, 1)
# Resize the image with bilinear interpolation
image = torch.nn.functional.interpolate(
image.unsqueeze(0),
size=(resized_height, resized_width),
mode="bilinear",
align_corners=False,
antialias=True,
).squeeze(0)
# Extract the patches from the image
# [1, rows, columns, patch_height * patch_width * image_channels]
patches = torch_extract_patches(image, patch_height, patch_width)
# Read off the patch grid shape
patches_shape = patches.shape
rows = patches_shape[1]
columns = patches_shape[2]
depth = patches_shape[3]
# Reshape the patch tensor for further processing
# [rows * columns, patch_height * patch_width * image_channels]
patches = patches.reshape([rows * columns, depth])
# Build the row and column index tensors
# [rows * columns, 1]
row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])
# Offset the indices by one so that zero can be reserved for padding
row_ids += 1
col_ids += 1
# Prepare the additional patch features
# [rows * columns, 1]
row_ids = row_ids.to(torch.float32)
col_ids = col_ids.to(torch.float32)
# Concatenate the row ids, column ids and patch data into the final result
# [rows * columns, 2 + patch_height * patch_width * image_channels]
result = torch.cat([row_ids, col_ids, patches], -1)
# Pad the result so that the number of output patches equals max_patches
# [max_patches, 2 + patch_height * patch_width * image_channels]
result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()
# Convert the result to a NumPy array
result = to_numpy_array(result)
return result
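The resize arithmetic above picks the largest patch grid that respects both the aspect ratio and `max_patches`; a worked example for a hypothetical 480x640 image with the default settings:

```python
import math

max_patches, patch_height, patch_width = 2048, 16, 16
image_height, image_width = 480, 640
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
num_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
num_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
print(round(scale, 3), num_rows, num_cols, num_rows * num_cols)
# 1.306 39 52 2028 -> 2028 patches <= max_patches, roughly preserving the 3:4 aspect ratio
```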
# Standardize the image so that its values have zero mean and unit standard deviation
def normalize(
self,
image: np.ndarray,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization
Args:
image (`np.ndarray`):
Image to normalize.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# If the image is uint8, convert it to float32
if image.dtype == np.uint8:
image = image.astype(np.float32)
# Compute the mean and standard deviation of the image
mean = np.mean(image)
std = np.std(image)
adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))
# Delegate to the shared normalize helper using the per-image statistics
return normalize(
image,
mean=mean,
std=adjusted_stddev,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
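A quick numeric check of the per-image standardization: the normalized image has (approximately) zero mean and unit standard deviation.

```python
import math
import numpy as np

image = np.arange(12, dtype=np.float32).reshape(3, 4)
mean, std = np.mean(image), np.std(image)
adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))
normalized = (image - mean) / adjusted_stddev
print(round(float(normalized.mean()), 6), round(float(normalized.std()), 6))  # ~0.0 1.0
```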
# Image preprocessing entry point: RGB conversion, normalization and patch extraction
def preprocess(
self,
images: ImageInput,
header_text: Optional[str] = None,
do_convert_rgb: bool = None,
do_normalize: Optional[bool] = None,
max_patches: Optional[int] = None,
patch_size: Optional[Dict[str, int]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\pix2struct\modeling_pix2struct.py
""" Pix2Struct modeling file"""
import math
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from .configuration_pix2struct import Pix2StructConfig, Pix2StructTextConfig, Pix2StructVisionConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Pix2StructConfig"
PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/pix2struct-textcaps-base",
"google/pix2struct-textcaps-large",
"google/pix2struct-base",
"google/pix2struct-large",
"google/pix2struct-ai2d-base",
"google/pix2struct-ai2d-large",
"google/pix2struct-widget-captioning-base",
"google/pix2struct-widget-captioning-large",
"google/pix2struct-screen2words-base",
"google/pix2struct-screen2words-large",
"google/pix2struct-docvqa-base",
"google/pix2struct-docvqa-large",
"google/pix2struct-ocrvqa-base",
"google/pix2struct-ocrvqa-large",
"google/pix2struct-chartqa-base",
"google/pix2struct-inforgraphics-vqa-base",
"google/pix2struct-inforgraphics-vqa-large",
]
class Pix2StructLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
try:
from apex.normalization import FusedRMSNorm
Pix2StructLayerNorm = FusedRMSNorm
logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNorm")
except ImportError:
pass
except Exception:
logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(Pix2StructLayerNorm)
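To make the T5-style normalization concrete: each hidden vector is divided by its root mean square (no mean subtraction, no bias), then scaled by the learned weight, which is all ones at initialization. A self-contained numeric check of that forward math:

```python
import torch

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
weight = torch.ones(4)                           # the layer's learned scale, ones at init
variance = x.pow(2).mean(-1, keepdim=True)       # mean of squares; the mean is never subtracted
out = weight * x * torch.rsqrt(variance + 1e-6)
print(out)  # tensor([[0.3651, 0.7303, 1.0954, 1.4606]]): each value divided by the RMS (~2.739)
```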
class Pix2StructVisionEmbeddings(nn.Module):
r"""
Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
is represented by a vector of `hidden_size` values.
"""
def __init__(self, config: Pix2StructConfig) -> None:
super().__init__()
self.patch_projection = nn.Linear(config.patch_embed_hidden_size, config.hidden_size)
self.row_embedder = nn.Embedding(config.seq_len, config.hidden_size)
self.column_embedder = nn.Embedding(config.seq_len, config.hidden_size)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, flattened_patches: torch.Tensor) -> torch.Tensor:
row_indices = flattened_patches[:, :, 0].long()
col_indices = flattened_patches[:, :, 1].long()
flattened_patches = flattened_patches[:, :, 2:]
embeddings = self.patch_projection(flattened_patches)
row_embeddings = self.row_embedder(row_indices)
col_embeddings = self.column_embedder(col_indices)
embeddings = embeddings + row_embeddings + col_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class Pix2StructVisionAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.hidden_size = config.hidden_size
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_attention_heads
self.dropout = config.attention_dropout
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.query = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
self.key = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
self.value = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
self.output = nn.Linear(self.inner_dim, self.hidden_size, bias=False)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
):
"""
Self-attention block
"""
batch_size, seq_length = hidden_states.shape[:2]
def to_projection_shape(states):
"""将输入状态调整为投影形状"""
return states.contiguous().view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
query_states = to_projection_shape(self.query(hidden_states))
key_states = to_projection_shape(self.key(hidden_states))
value_states = to_projection_shape(self.value(hidden_states))
scores = torch.matmul(query_states, key_states.transpose(3, 2))
if position_bias is None:
position_bias = torch.zeros(
(1, self.n_heads, seq_length, seq_length), device=scores.device, dtype=scores.dtype
)
if self.gradient_checkpointing and self.training:
position_bias.requires_grad = True
if attention_mask is None:
attention_mask = torch.ones((batch_size, seq_length), device=scores.device, dtype=scores.dtype)
if attention_mask.dim() == 2:
position_bias = position_bias + attention_mask[:, None, None, :].to(position_bias.device)
else:
position_bias = position_bias + attention_mask.to(position_bias.device)
position_bias = 1 - position_bias
position_bias_masked = position_bias.masked_fill(position_bias == 1, torch.finfo(scores.dtype).min)
scores += position_bias_masked
scores = torch.max(scores, torch.tensor(torch.finfo(scores.dtype).min))
attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)
attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
if layer_head_mask is not None:
attn_weights = attn_weights * layer_head_mask
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
attn_output = self.output(attn_output)
outputs = (attn_output,) + (position_bias,)
if output_attentions:
outputs = outputs + (attn_weights,)
return outputs
class Pix2StructVisionMlp(nn.Module):
def __init__(self, config: Pix2StructVisionConfig):
super().__init__()
self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class Pix2StructVisionLayer(nn.Module):
def __init__(self, config: Pix2StructConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = Pix2StructVisionAttention(config)
self.mlp = Pix2StructVisionMlp(config)
self.pre_mlp_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pre_attention_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
residual = hidden_states
hidden_states = self.pre_attention_layer_norm(hidden_states)
self_attention_outputs = self.attention(
hidden_states,
attention_mask=attention_mask,
layer_head_mask=head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + residual
layer_output = self.pre_mlp_layer_norm(hidden_states)
layer_output = self.mlp(layer_output) + hidden_states
outputs = (layer_output,) + outputs
return outputs
class Pix2StructVisionEncoder(nn.Module):
def __init__(self, config: Pix2StructConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([Pix2StructVisionLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class Pix2StructPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Pix2StructConfig
@property
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {
"decoder_input_ids": input_ids,
"input_ids": input_ids,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. "
"See Pix2Struct docs for more information."
)
if is_torch_fx_proxy(input_ids):
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
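# Worked example of `_shift_right`: labels become decoder inputs by prepending the
# decoder start token and dropping the last position; `-100` label padding is replaced
# with the pad token. Token ids below are made up; Pix2Struct usually uses the pad
# token id as the decoder start token id.
import torch

labels = torch.tensor([[42, 77, -100, -100]])    # -100 marks ignored label positions
decoder_start_token_id = pad_token_id = 0        # assumed ids for this example
shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = decoder_start_token_id
shifted.masked_fill_(shifted == -100, pad_token_id)
print(shifted)                                   # tensor([[ 0, 42, 77,  0]])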
@add_start_docstrings(
"The bare Pix2StructVision Model transformer outputting raw hidden-states without any specific head on top.",
PIX2STRUCT_VISION_START_DOCSTRING,
)
class Pix2StructVisionModel(Pix2StructPreTrainedModel):
config_class = Pix2StructVisionConfig
main_input_name = "flattened_patches"
supports_gradient_checkpointing = True
_no_split_modules = ["Pix2StructVisionLayer"]
def __init__(self, config: Pix2StructConfig):
super().__init__(config)
self.config = config
self.embeddings = Pix2StructVisionEmbeddings(config)
self.encoder = Pix2StructVisionEncoder(config)
self.layernorm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_projection
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
flattened_patches: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if flattened_patches is None:
raise ValueError("You have to specify flattened_patches")
if attention_mask is None:
attention_mask = (flattened_patches.sum(dim=-1) != 0).float()
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(flattened_patches)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
if not return_dict:
head_outputs = (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
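# How Pix2StructVisionModel derives the default attention mask when none is passed:
# padded patch rows are all zeros, so any row whose values sum to zero is masked out.
# Tiny illustrative tensor (a real flattened patch row is much wider).
import torch

flattened_patches = torch.tensor(
    [[[0.5, 0.2, 0.1],   # real patch
      [0.0, 0.0, 0.0],   # padding row
      [0.3, 0.0, 0.4]]]  # real patch
)
attention_mask = (flattened_patches.sum(dim=-1) != 0).float()
print(attention_mask)    # tensor([[1., 0., 1.]])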
class Pix2StructTextDenseGatedActDense(nn.Module):
def __init__(self, config: Pix2StructTextConfig):
super().__init__()
self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class Pix2StructTextLayerFF(nn.Module):
def __init__(self, config: Pix2StructTextConfig):
super().__init__()
self.DenseReluDense = Pix2StructTextDenseGatedActDense(config)
self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
forwarded_states = self.DenseReluDense(forwarded_states)
hidden_states = hidden_states + self.dropout(forwarded_states)
return hidden_states
class Pix2StructTextAttention(nn.Module):
    def __init__(self, config: Pix2StructTextConfig, has_relative_attention_bias=False):
super().__init__()
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.hidden_size = config.hidden_size
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.query = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.key = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.value = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.output = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - the relative position between memory and query
bidirectional: a boolean - whether the attention is bidirectional or not
num_buckets: an integer - number of buckets to categorize relative positions into
max_distance: an integer - maximum distance to consider for bucketing
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
if device is None:
device = self.relative_attention_bias.weight.device
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=False,
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
return values
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
        output_attentions=False,
    ):
        pass
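# Worked example of the relative-position bucketing above, in the causal setting used
# by `compute_bias` (bidirectional=False): distances smaller than max_exact
# (= num_buckets // 2 = 16) each get their own bucket, larger distances share
# logarithmically spaced buckets, and everything beyond max_distance is clamped into
# the last bucket. Illustrative snippet using the module path of this file.
import torch
from transformers.models.pix2struct.modeling_pix2struct import Pix2StructTextAttention

relative_position = torch.tensor([0, -1, -2, -15, -16, -64, -200])  # memory_pos - query_pos
buckets = Pix2StructTextAttention._relative_position_bucket(
    relative_position, bidirectional=False, num_buckets=32, max_distance=128
)
print(buckets)  # tensor([ 0,  1,  2, 15, 16, 26, 31])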
class Pix2StructTextLayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.attention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class Pix2StructTextLayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False)
self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.attention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
layer_output = hidden_states + self.dropout(attention_output[0])
outputs = (layer_output,) + attention_output[1:]
return outputs
class Pix2StructTextBlock(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.self_attention = Pix2StructTextLayerSelfAttention(
config, has_relative_attention_bias=has_relative_attention_bias
)
self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config)
self.mlp = Pix2StructTextLayerFF(config)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
):
pass
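# The forward body of Pix2StructTextBlock is omitted above (`pass`). A simplified,
# assumed sketch of its control flow, ignoring cache handling (the real implementation
# also splits `past_key_value` between self- and cross-attention, clamps float16
# activations, and returns present key/value states and attention weights):
#
#     self_attn_out = self.self_attention(
#         hidden_states, attention_mask=attention_mask, position_bias=position_bias,
#         layer_head_mask=layer_head_mask, past_key_value=past_key_value, use_cache=use_cache,
#     )
#     hidden_states = self_attn_out[0]
#     if encoder_hidden_states is not None:
#         cross_attn_out = self.encoder_decoder_attention(
#             hidden_states, encoder_hidden_states, attention_mask=encoder_attention_mask,
#             position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask,
#         )
#         hidden_states = cross_attn_out[0]
#     hidden_states = self.mlp(hidden_states)
#     return (hidden_states,) + self_attn_out[1:]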
PIX2STRUCT_START_DOCSTRING = r"""
The Pix2Struct model was proposed in [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language
Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu,
Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. It's an encoder-decoder
transformer pre-trained in an image-to-text setting.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config (Union[`Pix2StructConfig`, `Pix2StructTextConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PIX2STRUCT_TEXT_INPUTS_DOCSTRING = r"""
"""
PIX2STRUCT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The standalone text decoder of Pix2Struct",
PIX2STRUCT_START_DOCSTRING,
)
class Pix2StructTextModel(Pix2StructPreTrainedModel):
config_class = Pix2StructTextConfig
_no_split_modules = ["Pix2StructTextBlock"]
_tied_weights_keys = ["lm_head.weight"]
supports_gradient_checkpointing = True
def __init__(self, config):
super().__init__(config)
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
self.layer = nn.ModuleList(
[Pix2StructTextBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.gradient_checkpointing = False
def _reorder_cache(self, past_key_values, beam_idx):
if past_key_values is None:
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
reordered_decoder_past = ()
for layer_past_states in past_key_values:
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
raise ValueError(
f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
)
if len(reordered_layer_past_states) != len(layer_past_states):
raise ValueError(
f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
)
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
return reordered_decoder_past
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(PIX2STRUCT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
        **kwargs,
    ):
        pass
@add_start_docstrings(
"A conditional generation model with a language modeling head. Can be used for sequence generation tasks.",
PIX2STRUCT_START_DOCSTRING,
)
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel):
config_class = Pix2StructConfig
main_input_name = "flattened_patches"
_tied_weights_keys = ["decoder.lm_head.weight"]
def __init__(self, config: Pix2StructConfig):
super().__init__(config)
self.encoder = Pix2StructVisionModel(config.vision_config)
self.decoder = Pix2StructTextModel(config.text_config)
self.is_vqa = config.is_vqa
self.post_init()
def get_input_embeddings(self):
return self.decoder.get_input_embeddings()
def set_input_embeddings(self, new_embeddings):
self.decoder.set_input_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.decoder.set_output_embeddings(new_embeddings)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
model_embeds = self.decoder.resize_token_embeddings(new_num_tokens)
self.config.text_config.vocab_size = new_num_tokens
return model_embeds
def get_decoder(self):
return self.decoder
def get_encoder(self):
return self.encoder
@add_start_docstrings_to_model_forward(PIX2STRUCT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
flattened_patches: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
def prepare_inputs_for_generation(
self,
input_ids,
flattened_patches: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if decoder_attention_mask is None:
decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"flattened_patches": flattened_patches,
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
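# Hedged end-to-end usage sketch for Pix2StructForConditionalGeneration. The checkpoint
# name and image URL are assumptions taken from common examples, not from this file;
# the processor produces `flattened_patches` and `attention_mask` for the vision encoder.
import requests
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))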