Transformers Source Code Analysis (56)
.\models\gpt_neox\tokenization_gpt_neox_fast.py
"""GPTNeoX的标记类。"""
import json
from typing import Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"tokenizer_file": {
"EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt-neox-20b": 2048,
}
class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library), based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces as part of the tokens (similar to sentencepiece), so a word is encoded differently depending on whether or not it is at the beginning of the sentence (without a preceding space):
```
>>> from transformers import GPTNeoXTokenizerFast
>>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("openai-community/gpt2")
>>> tokenizer("Hello world")["input_ids"]
[15496, 995]
>>> tokenizer(" Hello world")["input_ids"]
[18435, 995]
```
You can get around this behavior by passing `add_prefix_space=True` when instantiating the tokenizer, but since the model was not pretrained this way, it may lead to degraded performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer should be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should refer to that superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPTNeoX tokenizer detects the beginning of words by the preceding space.)
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
# Names of the vocabulary files used by this tokenizer
vocab_files_names = VOCAB_FILES_NAMES
# Map of pretrained vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input sizes of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs expected by the model
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs,
):
# Call the parent class initializer with the tokenizer configuration
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
# Read the backend tokenizer's pre-tokenizer state and update its add_prefix_space option if needed
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
# Look up the pre-tokenizer class by its type name and rebuild it with the updated state
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
# Store whether a prefix space is added
self.add_prefix_space = add_prefix_space
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Save the underlying tokenizer model to the given directory and return the saved file names as a tuple
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
def default_chat_template(self):
    """
    A simple chat template that ignores role information and concatenates messages with the EOS token.
    """
logger.warning_once(
# Warn once that this tokenizer has no chat template and that the class default is being used.
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return a template string that appends the EOS token after every message.
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
.\models\gpt_neox\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
from ...utils import OptionalDependencyNotAvailable
_import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neox"] = [
"GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoXForCausalLM",
"GPTNeoXForQuestionAnswering",
"GPTNeoXForSequenceClassification",
"GPTNeoXForTokenClassification",
"GPTNeoXLayer",
"GPTNeoXModel",
"GPTNeoXPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neox import (
GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoXForCausalLM,
GPTNeoXForQuestionAnswering,
GPTNeoXForSequenceClassification,
GPTNeoXForTokenClassification,
GPTNeoXLayer,
GPTNeoXModel,
GPTNeoXPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
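A standalone sketch of the optional-dependency pattern used by this `__init__.py`; the module and symbol names below are hypothetical and only illustrate the idea:

```python
# Hypothetical, self-contained sketch of the guard-plus-lazy-import pattern above.
import importlib.util

def backend_available(name: str) -> bool:
    # Rough stand-in for is_torch_available() / is_tokenizers_available().
    return importlib.util.find_spec(name) is not None

_import_structure = {"configuration_demo": ["DemoConfig"]}

if backend_available("tokenizers"):
    # Only advertise tokenizer symbols when the `tokenizers` backend is importable.
    _import_structure["tokenization_demo_fast"] = ["DemoTokenizerFast"]
if backend_available("torch"):
    # Only advertise model symbols when torch is importable.
    _import_structure["modeling_demo"] = ["DemoModel"]

# _LazyModule then defers the actual submodule imports until one of the
# registered names is accessed for the first time.
print(_import_structure)
```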
.\models\gpt_neox_japanese\configuration_gpt_neox_japanese.py
""" GPTNeoX Japanese model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/config.json",
}
class GPTNeoXJapaneseConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GPTNeoXJapaneseModel`]. It is used to instantiate
a GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPTNeoXJapanese
[abeja/gpt-neox-japanese-2.7b](https://huggingface.co/abeja/gpt-neox-japanese-2.7b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. The default configuration corresponds to the 2.7B model.
"""
model_type = "gpt_neox_japanese"
def __init__(
self,
vocab_size=32000,
hidden_size=2560,
num_hidden_layers=32,
num_attention_heads=32,
intermediate_multiple_size=4,
hidden_act="gelu",
rotary_pct=1.00,
rotary_emb_base=10000,
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
bos_token_id=31996,
eos_token_id=31999,
attention_dropout=0.1,
hidden_dropout=0.0,
**kwargs,
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_multiple_size = intermediate_multiple_size
self.hidden_act = hidden_act
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
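A minimal sketch instantiating the configuration above; the default values follow the `__init__` arguments shown here:

```python
# Sketch: the default configuration approximates abeja/gpt-neox-japanese-2.7b.
from transformers import GPTNeoXJapaneseConfig

config = GPTNeoXJapaneseConfig()
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 2560 32 32

# Any keyword argument above can be overridden, e.g. for a tiny debug model.
tiny_config = GPTNeoXJapaneseConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
```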
.\models\gpt_neox_japanese\modeling_gpt_neox_japanese.py
""" PyTorch GPTNeoX model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_gpt_neox_japanese import GPTNeoXJapaneseConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "abeja/gpt-neox-japanese-2.7b"
_CONFIG_FOR_DOC = "GPTNeoXJapaneseConfig"
GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST = {
"https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/config.json",
}
class GPTNeoXJapanesePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTNeoXJapaneseConfig
base_model_prefix = "gpt_neox_japanese"
_no_split_modules = ["GPTNeoXJapaneseLayer"]
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GPTNeoXJapaneseAttention(nn.Module):
def __init__(self, config, use_bias=False):
super().__init__()
self.num_attention_heads = config.num_attention_heads
self.hidden_size = config.hidden_size
self.head_size = self.hidden_size // self.num_attention_heads
self.rotary_ndims = int(self.head_size * config.rotary_pct)
self.rotary_emb = RotaryEmbedding(
self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base
)
self.max_positions = config.max_position_embeddings
self.attention_dropout = nn.Dropout(config.attention_dropout)
self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype())
self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.use_bias = use_bias
self.dense_bias = nn.Parameter(torch.zeros(config.hidden_size)) if use_bias else None
def forward(
self,
hidden_states,
attention_mask,
head_mask=None,
layer_past=None,
use_cache=False,
output_attentions=False,
):
has_layer_past = layer_past is not None and layer_past[0].numel() > 0
qkv = self.query_key_value(hidden_states)
new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
qkv = qkv.view(*new_qkv_shape)
query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
query_rot = query[..., : self.rotary_ndims]
query_pass = query[..., self.rotary_ndims :]
key_rot = key[..., : self.rotary_ndims]
key_pass = key[..., self.rotary_ndims :]
seq_len = key.shape[-2]
offset = 0
if has_layer_past:
offset = layer_past[0].shape[-2]
seq_len += offset
cos, sin = self.rotary_emb(value, seq_len=seq_len)
query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, offset=offset)
query = torch.cat((query, query_pass), dim=-1)
key = torch.cat((key, key_pass), dim=-1)
if has_layer_past:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
present = (key, value) if use_cache else None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
attn_output = self.dense(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs, self.dense_bias
@classmethod
def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
将隐藏维度分割为 attn_head_size 和 num_attention_heads
"""
new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
tensor = tensor.view(new_shape)
tensor = tensor.permute(0, 2, 1, 3)
return tensor
@classmethod
def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
return tensor
def _create_causal_mask(self, key_length, query_length):
causal_mask = torch.tril(
torch.ones((self.max_positions, self.max_positions), dtype=torch.bool).view(
1, 1, self.max_positions, self.max_positions
)
)
return causal_mask[:, :, key_length - query_length : key_length, :key_length]
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
batch_size, num_attention_heads, query_length, attn_head_size = query.size()
key_length = key.size(-2)
causal_mask = self._create_causal_mask(key_length, query_length)
query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
attn_scores = torch.zeros(
batch_size * num_attention_heads,
query_length,
key_length,
dtype=query.dtype,
device=key.device,
)
attn_scores = torch.baddbmm(
attn_scores,
query,
key.transpose(1, 2),
beta=1.0,
alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor),
)
attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)
mask_value = torch.finfo(attn_scores.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device)
causal_mask = causal_mask.to(attn_scores.device)
attn_scores = torch.where(causal_mask, attn_scores, mask_value)
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_weights = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = self.attention_dropout(attn_weights)
attn_weights = attn_weights.to(value.dtype)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
class RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len],
self.sin_cached[:seq_len],
)
def rotate_half(x):
"""将输入的一半隐藏维度旋转"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
"""应用旋转位置嵌入到查询和键中"""
cos = cos[..., offset : q.shape[-2] + offset, :]
sin = sin[..., offset : q.shape[-2] + offset, :]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
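A small sketch exercising `RotaryEmbedding`, `rotate_half` and `apply_rotary_pos_emb` with toy shapes (assumes the definitions above are in scope; values are arbitrary, only shapes matter):

```python
# Toy check of the rotary helpers above.
import torch

batch, heads, seq_len, rotary_dim = 1, 2, 4, 8
rope = RotaryEmbedding(dim=rotary_dim, max_position_embeddings=16)

q = torch.randn(batch, heads, seq_len, rotary_dim)
k = torch.randn(batch, heads, seq_len, rotary_dim)

cos, sin = rope(q, seq_len=seq_len)          # each has shape [seq_len, rotary_dim]
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, offset=0)
assert q_rot.shape == q.shape and k_rot.shape == k.shape

# rotate_half pairs the two halves of the last dimension: (x1, x2) -> (-x2, x1).
print(rotate_half(torch.arange(4.0)))        # tensor([-2., -3.,  0.,  1.])
```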
def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor:
"""为输入添加偏置,应用 dropout 和残差连接
Args:
x (Tensor): 主路径的输出
bias (Tensor): 最后一个注意力层的 attn_bias 或者 None
residual (Optional[Tensor]): 残差值
prob (float): dropout 概率
training (bool): 是否处于训练模式
Returns:
Tensor: dropout(x + bias) + residual
"""
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
if residual is not None:
out = residual + out
return out
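A quick sketch showing that, with dropout disabled, `bias_dropout_add` reduces to the documented `dropout(x + bias) + residual` (assumes the function above is in scope):

```python
# Sanity check of bias_dropout_add with training=False (dropout is then a no-op).
import torch

x = torch.ones(2, 3)
bias = torch.full((2, 3), 0.5)
residual = torch.zeros(2, 3)

out = bias_dropout_add(x, bias=bias, residual=residual, prob=0.1, training=False)
assert torch.allclose(out, x + bias + residual)

# bias=None is the MLP branch of GPTNeoXJapaneseLayer: only dropout and the residual apply.
out_no_bias = bias_dropout_add(x, bias=None, residual=residual, prob=0.1, training=False)
assert torch.allclose(out_no_bias, x + residual)
```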
class GPTNeoXJapaneseMLP(nn.Module):
def __init__(self, config):
super().__init__()
intermediate_size = int(config.hidden_size * config.intermediate_multiple_size)
self.dense_h_to_4h = nn.Linear(config.hidden_size, intermediate_size, bias=False)
self.dense_4h_to_h = nn.Linear(intermediate_size, config.hidden_size, bias=False)
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
intermediate = self.act(intermediate)
output = self.dense_4h_to_h(intermediate)
return output
class GPTNeoXJapaneseLayer(nn.Module):
def __init__(self, config, layer_number):
super().__init__()
self.layer_number = layer_number
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = GPTNeoXJapaneseAttention(config=config, use_bias=layer_number == config.num_hidden_layers - 1)
self.mlp = GPTNeoXJapaneseMLP(config)
self.hidden_dropout = config.hidden_dropout
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
use_cache=False,
layer_past=None,
output_attentions=False,
):
residual = hidden_states
ln_out = self.input_layernorm(hidden_states)
attention_layer_outputs, attn_bias = self.attention(
ln_out,
attention_mask=attention_mask,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attention_layer_outputs[0]
outputs = attention_layer_outputs[1:]
attn_output = bias_dropout_add(
attn_output,
bias=attn_bias.expand_as(residual) if attn_bias is not None else attn_bias,
residual=residual,
prob=self.hidden_dropout,
training=self.training,
)
mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
attn_output = bias_dropout_add(
mlp_output, bias=None, residual=attn_output, prob=self.hidden_dropout, training=self.training
)
if use_cache:
outputs = (attn_output,) + outputs
else:
outputs = (attn_output,) + outputs[1:]
return outputs
GPT_NEOX_JAPANESE_INPUTS_DOCSTRING = r"""
    Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare GPTNeoXJapaneseForCausalLM Model transformer with a causal language modeling head on top.",
GPT_NEOX_JAPANESE_START_DOCSTRING,
)
"""
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):
"""
Initialize the GPTNeoXJapaneseForCausalLM model.
Args:
config (GPTNeoXJapaneseConfig): Configuration class for the model.
"""
super().__init__(config)
self.config = config
self.gpt_neox_japanese = GPTNeoXJapaneseModel(config)
self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
"""
Retrieve the output embeddings.
Returns:
nn.Linear: The output embeddings layer.
"""
return self.embed_out
def set_output_embeddings(self, new_embeddings):
"""
Set new output embeddings.
Args:
new_embeddings (nn.Linear): New embeddings to be set.
"""
self.embed_out = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEOX_JAPANESE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the GPTNeoXJapaneseForCausalLM model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs. Default: None
attention_mask (torch.FloatTensor, optional): Attention mask. Default: None
head_mask (torch.FloatTensor, optional): Head mask. Default: None
inputs_embeds (torch.FloatTensor, optional): Embedded inputs. Default: None
past_key_values (Tuple[Tuple[torch.FloatTensor]], optional): Past key values for autoregressive generation. Default: None
use_cache (bool, optional): Whether to use cache for autoregressive generation. Default: None
output_attentions (bool, optional): Whether to output attentions weights. Default: None
output_hidden_states (bool, optional): Whether to output hidden states. Default: None
return_dict (bool, optional): Whether to return a dictionary as output. Default: None
Returns:
output (CausalLMOutputWithPast): Model output for language modeling.
"""
return self.gpt_neox_japanese(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values and past_key_values[0] is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
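A small sketch of what `_reorder_cache` does during beam search; only the tensor re-indexing is shown, so no model instance is needed (shapes and the beam selection are made up for illustration):

```python
# Each cached key/value tensor is re-indexed along the batch dimension to follow the chosen beams.
import torch

num_layers, batch, heads, seq, head_size = 2, 4, 2, 3, 8
past_key_values = tuple(
    (torch.randn(batch, heads, seq, head_size), torch.randn(batch, heads, seq, head_size))
    for _ in range(num_layers)
)
beam_idx = torch.tensor([2, 2, 0, 1])  # hypothetical beam selection

reordered = tuple(
    tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
    for layer_past in past_key_values
)
# Row 0 of the reordered cache comes from original batch entry beam_idx[0] == 2.
assert torch.equal(reordered[0][0][0], past_key_values[0][0][2])
```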
.\models\gpt_neox_japanese\tokenization_gpt_neox_japanese.py
"""GPTNeoXJapanese 的标记化类。"""
import collections
import json
import os
import re
from typing import Optional, Tuple
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/vocab.txt",
},
"emoji_file": {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/emoji.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"abeja/gpt-neox-japanese-2.7b": 2048,
}
def load_vocab_and_emoji(vocab_file, emoji_file):
"""加载词汇文件和表情文件到字典中。"""
with open(emoji_file, "r", encoding="utf-8") as f:
emoji = json.loads(f.read())
vocab = collections.OrderedDict()
raw_vocab = collections.OrderedDict()
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
for wd in b:
vocab[wd] = idx
return vocab, raw_vocab, ids_to_tokens, emoji
class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
"""
This tokenizer inherits from [`PreTrainedTokenizer`] and is based on the Japanese special sub-word encoding that is
used in this repository (https://github.com/tanreinama/Japanese-BPEEncoder_V2). Check the repository for details.
Japanese has a relatively large vocabulary and there is no separation between words. Furthermore, the language is a
combination of hiragana, katakana, and kanji, and variants such as "1" and "①" are often used. To cope with these,
this tokenizer has the following features:
- Subword-by-subword segmentation, which is intermediate between byte strings and morphological analysis.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
emoji_file,
unk_token="<|endoftext|>",
pad_token="<|endoftext|>",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
do_clean_text=False,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
if not os.path.isfile(emoji_file):
raise ValueError(
f"Can't find an emoji file at path '{emoji_file}'. To load the emoji information from a Google"
" pretrained model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.do_clean_text = do_clean_text
self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
do_clean_text=do_clean_text,
**kwargs,
)
@property
def vocab_size(self):
return len(self.raw_vocab)
def get_vocab(self):
return dict(self.raw_vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.subword_tokenizer.convert_id_to_token(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = "".join(tokens).strip()
return out_string
@property
def default_chat_template(self):
"""
A simple chat template that just adds BOS/EOS tokens around messages while discarding role information.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{{ bos_token + eos_token + message.content + eos_token }}"
"{% endfor %}"
"{% if add_generation_prompt %} {{ bos_token + eos_token }} {% endif %}"
)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
)
else:
vocab_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
)
with open(vocab_file, "w", encoding="utf-8") as writer:
for token_index, token in self.ids_to_tokens.items():
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(",".join(token) + "\n")
index += 1
with open(emoji_file, "w", encoding="utf-8") as writer:
json.dump(self.emoji, writer)
return vocab_file, emoji_file
class SubWordJapaneseTokenizer(object):
"""
https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under the MIT License according to the
original repository.
MIT License
Copyright (c) 2020 tanreinama
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of
the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab
self.ids_to_tokens = ids_to_tokens
self.emoji = emoji
self.maxlen = np.max([len(w) for w in self.vocab.keys()])
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
self.content_repatter4 = re.compile(
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter5 = re.compile(
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter6 = re.compile(
r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*"
)
keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
def __len__(self):
return len(self.ids_to_tokens)
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)
content = self.content_repatter2.sub("<EMAIL>", content)
content = self.content_repatter3.sub("<TEL>", content)
content = self.content_repatter4.sub("<DATE>", content)
content = self.content_repatter5.sub("<DATE>", content)
content = self.content_repatter6.sub("<PRICE>", content)
content = content.translate(self.content_trans1)
while "<BLOCK><BLOCK>" in content:
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
return content
def tokenize(self, text, clean=False):
text = text.replace(" ", "<SP>")
text = text.replace(" ", "<SP>")
text = text.replace("\r\n", "<BR>")
text = text.replace("\n", "<BR>")
text = text.replace("\r", "<BR>")
text = text.replace("\t", "<TAB>")
text = text.replace("—", "ー")
text = text.replace("−", "ー")
for k, v in self.emoji["emoji"].items():
if k in text:
text = text.replace(k, v)
if clean:
text = self.clean_text(text)
def check_simbol(x):
e = x.encode()
if len(x) == 1 and len(e) == 2:
c = (int(e[0]) << 8) + int(e[1])
if (
(c >= 0xC2A1 and c <= 0xC2BF)
or (c >= 0xC780 and c <= 0xC783)
or (c >= 0xCAB9 and c <= 0xCBBF)
or (c >= 0xCC80 and c <= 0xCDA2)
):
return True
return False
def checku2e(x):
e = x.encode()
if len(x) == 1 and len(e) == 3:
c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
if c >= 0xE28080 and c <= 0xE2B07F:
return True
return False
pos = 0
result = []
while pos < len(text):
end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
candidates = []
for e in range(end, pos, -1):
wd = text[pos:e]
if wd in self.vocab:
if wd[0] == "<" and len(wd) > 2:
candidates = [(self.vocab[wd], wd, e)]
break
else:
candidates.append((self.vocab[wd], wd, e))
if len(candidates) > 0:
_, wd, e = sorted(candidates, key=lambda x: x[0])[0]
result.append(wd)
pos = e
else:
end = pos + 1
wd = text[pos:end]
if check_simbol(wd):
result.append("<KIGOU>")
elif checku2e(wd):
result.append("<U2000U2BFF>")
else:
for i in wd.encode("utf-8"):
result.append("<|byte%d|>" % i)
pos = end
return result
def convert_id_to_token(self, index, breakline="\n"):
words = []
byte_tokens = []
word = self.ids_to_tokens[index][0]
if word[:6] == "<|byte" and word[-2:] == "|>":
byte_tokens.append(int(word[6:-2]))
else:
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
byte_tokens = []
if word[:7] == "<|emoji" and word[-2:] == "|>":
words.append(self.emoji["emoji_inv"][word])
elif word == "<SP>":
words.append(" ")
elif word == "<BR>":
words.append(breakline)
elif word == "<TAB>":
words.append("\t")
elif word == "<BLOCK>":
words.append("▀")
elif word == "<KIGOU>":
words.append("ǀ")
elif word == "<U2000U2BFF>":
words.append("‖")
else:
words.append(word)
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
text = "".join(words)
return text
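A minimal usage sketch of `GPTNeoXJapaneseTokenizer` (assumes the `abeja/gpt-neox-japanese-2.7b` vocab and emoji files can be downloaded; the example sentence is arbitrary):

```python
# Minimal usage sketch of the Japanese sub-word tokenizer above.
from transformers import GPTNeoXJapaneseTokenizer

tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")

text = "こんにちは、世界。"
ids = tokenizer(text)["input_ids"]
print(ids)
print(tokenizer.decode(ids))
```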
.\models\gpt_neox_japanese\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_torch_available
from ...utils import OptionalDependencyNotAvailable
_import_structure = {
"configuration_gpt_neox_japanese": ["GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXJapaneseConfig"],
"tokenization_gpt_neox_japanese": ["GPTNeoXJapaneseTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neox_japanese"] = [
"GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoXJapaneseForCausalLM",
"GPTNeoXJapaneseLayer",
"GPTNeoXJapaneseModel",
"GPTNeoXJapanesePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neox_japanese import GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXJapaneseConfig
from .tokenization_gpt_neox_japanese import GPTNeoXJapaneseTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neox_japanese import (
GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoXJapaneseForCausalLM,
GPTNeoXJapaneseLayer,
GPTNeoXJapaneseModel,
GPTNeoXJapanesePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_sw3\convert_megatron_to_pytorch.py
""" 将 GPT-SW3 Megatron 检查点转换为 PyTorch 格式 """
import argparse
import os
from os.path import isfile
import torch
from transformers import GPT2Config
def recursive_print(name, val, spaces=0):
if name is None:
msg = None
else:
fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
msg = fmt.format(name)
if isinstance(val, dict):
if msg is not None:
print(msg)
for k in val.keys():
recursive_print(k, val[k], spaces + 2)
elif isinstance(val, torch.Tensor):
print(msg, ":", val.size())
else:
print(msg, ":", val)
def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size):
input_shape = param.size()
saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
param = param.view(*saved_shape)
param = param.transpose(0, 1).contiguous()
param = param.view(*input_shape)
return param
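A shape sketch of `fix_query_key_value_ordering`: Megatron stores the fused QKV parameters as per-head `[q | k | v]` blocks, while the HF GPT-2 `c_attn` layout expects all queries, then all keys, then all values. The toy bias below (values chosen so the reordering is visible) follows directly from the `view`/`transpose` logic above:

```python
# Toy check: per-head [q|k|v] interleaving -> [all-q | all-k | all-v] layout.
import torch

num_heads, head_size, num_splits = 2, 3, 3
hidden = num_heads * head_size

megatron_bias = torch.arange(num_heads * num_splits * head_size, dtype=torch.float32)
hf_bias = fix_query_key_value_ordering(megatron_bias, num_splits, num_heads, head_size)

# After reordering, the first `hidden` entries are all the query slices (head 0, then head 1).
print(hf_bias[:hidden])  # tensor([ 0.,  1.,  2.,  9., 10., 11.])
```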
def convert_megatron_checkpoint(sd_megatron, config):
"""
将 Megatron 检查点转换为 HuggingFace GPT-SW3 检查点。
"""
n_positions = config.n_positions
layers = config.n_layer
vocab_size = config.vocab_size
heads = config.n_head
hidden_size_per_head = config.n_embd // config.n_head
word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :]
sd_hf = {
"transformer.wte.weight": word_embeddings,
"transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"],
"transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"],
"transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"],
}
pf = "model.language_model.encoder.layers."
for i in range(layers):
causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool))
causal_mask = causal_mask.view(1, 1, n_positions, n_positions)
sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask
sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16)
sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"]
sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"]
val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"]
val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head)
sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous()
val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"]
val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head)
sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2
sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"]
sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"]
sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"]
sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"]
sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"]
sd_hf["lm_head.weight"] = word_embeddings
return sd_hf
def main(args):
print(args)
checkpoint_path = args.checkpoint_path
save_path = args.save_path
if not isfile(checkpoint_path):
raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}")
checkpoint = torch.load(checkpoint_path, map_location="cpu")
config_megatron = checkpoint["hyper_parameters"]["cfg"]
config_hf = GPT2Config()
config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron)
config_hf.architectures = ["GPT2LMHeadModel"]
sd_megatron = checkpoint["state_dict"]
print("Converting")
sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf)
if args.print_checkpoint_structure:
recursive_print(None, sd_hf)
config_hf.tokenizer_class = "GPTSw3Tokenizer"
print("Saving config")
config_hf.save_pretrained(save_path)
output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin")
print(f'Saving checkpoint to "{output_checkpoint_file}"')
torch.save(sd_hf, output_checkpoint_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path",
type=str,
required=True,
help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000",
)
parser.add_argument("--save_path", type=str, required=True, help="e.g. /home/user/gpt-sw3/hf")
parser.add_argument("--print-checkpoint-structure", action="store_true")
_args = parser.parse_args()
main(_args)
.\models\gpt_sw3\tokenization_gpt_sw3.py
"""The tokenizer used by the GPT-SW3 models."""
import os
import re
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"AI-Sweden-Models/gpt-sw3-126m": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-126m/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-356m": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-356m/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-1.3b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-1.3b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-6.7b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-6.7b-v2": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-20b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-40b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"AI-Sweden-Models/gpt-sw3-126m": 2048,
"AI-Sweden-Models/gpt-sw3-356m": 2048,
"AI-Sweden-Models/gpt-sw3-1.3b": 2048,
"AI-Sweden-Models/gpt-sw3-6.7b": 2048,
"AI-Sweden-Models/gpt-sw3-6.7b-v2": 2048,
"AI-Sweden-Models/gpt-sw3-20b": 2048,
"AI-Sweden-Models/gpt-sw3-40b": 2048,
}
class GPTSw3Tokenizer(PreTrainedTokenizer):
"""
Construct a GPTSw3 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Example usage:
```
>>> from transformers import GPTSw3Tokenizer
>>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")
>>> tokenizer("Svenska är kul!")["input_ids"]
[1814, 377, 3617, 63504]
```
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `False`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether or not to keep accents when tokenizing.
pad_token (`str`, *optional*):
The token used for padding, for example when batching sequences of different lengths. If not provided, will
default to '<pad>' or '<unk>' depending on model size.
unk_token (`str`, *optional*):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. If not provided, will default to '<unk>'.
eos_token (`str`, *optional*):
The end of sequence token seen during pretraining. If not provided, will default to '<|endoftext|>'
bos_token (`str`, *optional*):
The beginning of sequence token that can be used for downstream task, was not seen during pretraining. If
not provided, will default to '<s>' or '<|endoftext|>', depending on model size.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
whitespaces (`set`):
The whitespaces that are replaced in the whitespace normalization in preprocessing.
non_printing_characters_re (`Pattern`):
The compiled regular expression to remove non-printing characters in preprocessing.
"""
# List of vocabulary files' names
vocab_files_names = VOCAB_FILES_NAMES
# Map of pretrained vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Sizes of the positional embeddings of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs expected by the model
model_input_names = ["input_ids", "attention_mask"]
# Initializer: takes several arguments that configure the tokenizer
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=False,
keep_accents=False,
pad_token=None,
unk_token=None,
eos_token=None,
bos_token=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Default sp_model_kwargs to an empty dict when not provided
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# Read name_or_path from kwargs; warn and fall back to "None" if it is missing
name_or_path = kwargs.get("name_or_path")
if name_or_path is None:
logger.warning(
"name_or_path not provided, will work for all GPTSw3 models except gpt-sw3-7b,"
" you are testing the model, this can safely be ignored"
)
name_or_path = "None"
# Set defaults for eos_token and unk_token
eos_token = "<|endoftext|>" if eos_token is None else eos_token
unk_token = "<unk>" if unk_token is None else unk_token
# gpt-sw3-7b checkpoints use different defaults for pad_token and bos_token
if "gpt-sw3-7b" in name_or_path:
pad_token = unk_token if pad_token is None else pad_token
bos_token = eos_token if bos_token is None else bos_token
else:
pad_token = "<pad>" if pad_token is None else pad_token
bos_token = "<s>" if bos_token is None else bos_token
# Store the configuration on the instance
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
# Initialize self.sp_model with a SentencePieceProcessor and load the vocabulary file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
# Whitespace characters that are normalized in the input text
# fmt: off
self.whitespaces = {" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "", ""}
# fmt: on
# Regular expression that removes non-printing characters (e.g. certain Unicode control characters) during preprocessing
self.non_printing_characters_re = re.compile(
f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
)
# Call the parent class initializer
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
# Return the object's state with sp_model set to None (the SentencePiece processor is not picklable)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__setstate__
# Restore the object state from the given dict d by assigning it directly to __dict__
def __setstate__(self, d):
self.__dict__ = d
# For backward compatibility: ensure the sp_model_kwargs attribute exists
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# Re-create the SentencePieceProcessor with sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# Reload the vocabulary file into self.sp_model
self.sp_model.Load(self.vocab_file)
@property
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.vocab_size
# Return the size of the SentencePiece vocabulary (number of entries in the vocab)
def vocab_size(self) -> int:
return len(self.sp_model)
# Preprocess the given text and return the result
def preprocess_text(self, text: str) -> str:
"""
返回预处理后的文本。该过程与训练标记器时使用的过程相同。
"""
# Remove non-printing characters
text = self.non_printing_characters_re.sub("", text)
# Normalize whitespace characters
text = "".join([char if char not in self.whitespaces else " " for char in text])
# NFC Unicode normalization
text = unicodedata.normalize("NFC", text)
return text
# Tokenize the given text and return the list of tokens
def _tokenize(self, text: str, **kwargs) -> List[str]:
text = self.preprocess_text(text)
return self.sp_model.encode(text, out_type=str)
# Convert a token (str) to its id (int) using the vocabulary
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) into an id (int) using the vocab."""
return self.sp_model.PieceToId(token)
# Convert an id (int) to its token (str) using the vocabulary
def _convert_id_to_token(self, index: int) -> str:
"""Converts an id (int) into a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
@staticmethod
# Return the input string unchanged, overriding the default clean-up behavior
def clean_up_tokenization(out_string: str) -> str:
"""Returns the input string; this overrides and removes the default clean-up behavior."""
return out_string
# Convert a sequence of tokens (strings) into a single string, keeping special tokens intact
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings) into a single string. Special tokens remain intact."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# Make sure that special tokens are not decoded using the sentencepiece model
if token in self.all_special_tokens:
# TODO: Check if this is needed; it ensures that decode(encode(doc)) != doc by adding extra whitespace to the decoded document
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.get_vocab
# Return the vocabulary as a token-to-id mapping, including added tokens
def get_vocab(self) -> Dict[str, int]:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocab file differs from the target path and exists, copy it to the output path
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Otherwise, write the serialized sp_model contents to the output file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the output file path as a tuple
return (out_vocab_file,)
def encode_fast(
self, text: Union[str, List[str]], return_tensors: Union[str, bool] = False
) -> Union[List[int], List[List[int]], "torch.Tensor"]:
"""
Encodes a text or batch of texts to token ids using preprocessing and the raw SP tokenizer. This has reduced
functionality but is often much faster.
Does NOT handle special tokens correctly, these can manually be added as ids afterwards.
Does NOT support padding, these can manually be added as ids afterwards.
Use default HuggingFace tokenization methods for full functionality.
Args:
text (`str` or `List[str]`): One or several text(s) to convert to token ids.
return_tensors (`str` or `bool`): Returns PyTorch tensors if set to True or "pt"
Returns:
`List[int]`, `List[List[int]]`, or `torch.Tensor`: The encoded text(s) as token ids.
"""
# For a single string, preprocess the text and encode it with sp_model
if isinstance(text, str):
text = self.preprocess_text(text)
token_ids = self.sp_model.encode(text)
# For a list of strings, preprocess each text and encode them with sp_model
else:
text = [self.preprocess_text(t) for t in text]
token_ids = self.sp_model.encode(text)
# Convert the encoded result to a PyTorch tensor if requested
if return_tensors is True or return_tensors == "pt":
token_ids = torch.tensor(token_ids)
# Return the encoded token ids
return token_ids
def decode_fast(self, token_ids: Union[int, List[int]]) -> str:
"""
Decodes a text or batch of texts from token ids using preprocessing and the raw SP tokenizer. This has reduced
functionality but is often much faster.
Args:
token_ids (`int` or `List[int]`): Encoded token or text as token id(s).
Returns:
`str`: Decoded text
"""
# Decode token_ids with sp_model and return the resulting text
return self.sp_model.decode(token_ids)
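A usage sketch of the fast paths above (assumes the `AI-Sweden-Models/gpt-sw3-126m` files can be downloaded and that sentencepiece and torch are installed):

```python
# Sketch of encode_fast/decode_fast: no special-token handling, no padding, but fast.
from transformers import GPTSw3Tokenizer

tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")

ids = tokenizer.encode_fast("Svenska är kul!")
batch_ids = tokenizer.encode_fast(["Svenska är kul!", "Hej!"])
print(tokenizer.decode_fast(ids))

# return_tensors="pt" yields a torch.Tensor instead of a Python list
# (best used with a single text, since ragged batches cannot be stacked without padding).
tensor_ids = tokenizer.encode_fast("Svenska är kul!", return_tensors="pt")
```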
@property
# Default chat template that formats messages like an instant-messenger chat log, with "User:" and "Bot:"
# prefixes before each message and BOS tokens separating the messages.
def default_chat_template(self):
"""
This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings
preceding messages. BOS tokens are added between all messages.
"""
# Warn once that no chat template is defined for this tokenizer and that the class default template is used.
# See https://huggingface.co/docs/transformers/main/chat_templating for more information.
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return the chat template string: a BOS token between messages, a role prefix added to each message,
# and a trailing "Bot:" to prompt the next response.
return (
"{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
"{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
"{{ message['text'] }}{{ bos_token }}"
"{% endfor %}"
"Bot:" # 最后追加一个固定的 "Bot:" 字符串,表示消息结束
)
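A hedged sketch of rendering this template (assumes the tokenizer files can be downloaded and a transformers version that still falls back to `default_chat_template`; since the template also references `message['text']`, each message dict below carries an empty `'text'` key):

```python
# Sketch: rendering the default GPT-SW3 chat template as plain text.
from transformers import GPTSw3Tokenizer

tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")

messages = [
    {"role": "user", "content": "Hej!", "text": ""},
    {"role": "assistant", "content": "Hej hej!", "text": ""},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# Expected shape of the output: "User: ..." / "Bot: ..." turns separated by BOS tokens,
# ending with a trailing "Bot:" prompt.
print(prompt)
```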
.\models\gpt_sw3\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_gpt_sw3"] = ["GPTSw3Tokenizer"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_gpt_sw3 import GPTSw3Tokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\graphormer\collating_graphormer.py
from typing import Any, Dict, List, Mapping
import numpy as np
import torch
from ...utils import is_cython_available, requires_backends
if is_cython_available():
import pyximport
pyximport.install(setup_args={"include_dirs": np.get_include()})
from . import algos_graphormer
def convert_to_single_emb(x, offset: int = 512):
feature_num = x.shape[1] if len(x.shape) > 1 else 1
feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
x = x + feature_offset
return x
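A quick numeric sketch of `convert_to_single_emb`: each feature column is shifted into its own id range so that distinct columns never share embedding indices (values below follow directly from the offsets in the function):

```python
# Toy check of convert_to_single_emb with two nodes and two categorical features.
import numpy as np

x = np.array([[3, 7], [4, 8]], dtype=np.int64)
shifted = convert_to_single_emb(x, offset=512)
print(shifted)
# [[  4 520]
#  [  5 521]]   column 0 is shifted by +1, column 1 by +513
```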
def preprocess_item(item, keep_features=True):
requires_backends(preprocess_item, ["cython"])
if keep_features and "edge_attr" in item.keys():
edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
else:
edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64)
if keep_features and "node_feat" in item.keys():
node_feature = np.asarray(item["node_feat"], dtype=np.int64)
else:
node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64)
edge_index = np.asarray(item["edge_index"], dtype=np.int64)
input_nodes = convert_to_single_emb(node_feature) + 1
num_nodes = item["num_nodes"]
if len(edge_attr.shape) == 1:
edge_attr = edge_attr[:, None]
attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1
adj = np.zeros([num_nodes, num_nodes], dtype=bool)
adj[edge_index[0], edge_index[1]] = True
shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
max_dist = np.amax(shortest_path_result)
input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single)
item["input_nodes"] = input_nodes + 1
item["attn_bias"] = attn_bias
item["attn_edge_type"] = attn_edge_type
item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1
item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1
item["out_degree"] = item["in_degree"]
item["input_edges"] = input_edges + 1
if "labels" not in item:
item["labels"] = item["y"]
return item
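For readers without the Cython extension, here is a rough pure-numpy sketch of the shortest-path part of `algos_graphormer.floyd_warshall` on a tiny path graph. This is an approximation for illustration only: the real helper is Cython, it also returns the intermediate-node `path` matrix consumed by `gen_edge_input`, and the sentinel used for unreachable pairs here is an assumption.
```
import numpy as np

UNREACHABLE = 510  # assumed large sentinel for disconnected node pairs

def floyd_warshall_dist(adj: np.ndarray) -> np.ndarray:
    n = adj.shape[0]
    dist = np.where(adj, 1, UNREACHABLE).astype(np.int64)
    np.fill_diagonal(dist, 0)
    for k in range(n):
        # relax all pairs through intermediate node k
        dist = np.minimum(dist, dist[:, [k]] + dist[[k], :])
    return dist

adj = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=bool)  # path graph 0-1-2
print(floyd_warshall_dist(adj))
# [[0 1 2]
#  [1 0 1]
#  [2 1 0]]
```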
class GraphormerDataCollator:
def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
if not is_cython_available():
raise ImportError("Graphormer preprocessing needs Cython (pyximport)")
self.spatial_pos_max = spatial_pos_max
self.on_the_fly_processing = on_the_fly_processing
.\models\graphormer\configuration_graphormer.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"graphormer-base": "https://huggingface.co/clefourrier/graphormer-base-pcqm4mv2/resolve/main/config.json",
}
class GraphormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Graphormer
[graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "graphormer"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
num_classes: int = 1,
num_atoms: int = 512 * 9,
num_edges: int = 512 * 3,
num_in_degree: int = 512,
num_out_degree: int = 512,
num_spatial: int = 512,
num_edge_dis: int = 128,
multi_hop_max_dist: int = 5,
spatial_pos_max: int = 1024,
edge_type: str = "multi_hop",
max_nodes: int = 512,
share_input_output_embed: bool = False,
num_hidden_layers: int = 12,
embedding_dim: int = 768,
ffn_embedding_dim: int = 768,
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
apply_graphormer_init: bool = False,
activation_fn: str = "gelu",
embed_scale: float = None,
freeze_embeddings: bool = False,
num_trans_layers_to_freeze: int = 0,
traceable: bool = False,
q_noise: float = 0.0,
qn_block_size: int = 8,
kdim: int = None,
vdim: int = None,
bias: bool = True,
self_attention: bool = True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.num_classes = num_classes
self.num_atoms = num_atoms
self.num_in_degree = num_in_degree
self.num_out_degree = num_out_degree
self.num_edges = num_edges
self.num_spatial = num_spatial
self.num_edge_dis = num_edge_dis
self.edge_type = edge_type
self.multi_hop_max_dist = multi_hop_max_dist
self.spatial_pos_max = spatial_pos_max
self.max_nodes = max_nodes
self.num_hidden_layers = num_hidden_layers
self.embedding_dim = embedding_dim
self.hidden_size = embedding_dim
self.ffn_embedding_dim = ffn_embedding_dim
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
self.apply_graphormer_init = apply_graphormer_init
self.activation_fn = activation_fn
self.embed_scale = embed_scale
self.freeze_embeddings = freeze_embeddings
self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
self.share_input_output_embed = share_input_output_embed
self.traceable = traceable
self.q_noise = q_noise
self.qn_block_size = qn_block_size
self.kdim = kdim
self.vdim = vdim
self.self_attention = self_attention
self.bias = bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
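A small sanity-check sketch of the constructor above (assuming a transformers version that still ships Graphormer): `hidden_size` mirrors `embedding_dim`, and the token ids default to pad=0 / bos=1 / eos=2.
```
from transformers import GraphormerConfig

config = GraphormerConfig(embedding_dim=128, num_hidden_layers=4)
print(config.hidden_size, config.num_hidden_layers, config.pad_token_id)
# 128 4 0
```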
.\models\graphormer\modeling_graphormer.py
""" PyTorch Graphormer model."""
import math
from typing import Iterable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithNoAttention,
SequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_graphormer import GraphormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1"
_CONFIG_FOR_DOC = "GraphormerConfig"
GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"clefourrier/graphormer-base-pcqm4mv1",
"clefourrier/graphormer-base-pcqm4mv2",
]
def quant_noise(module: nn.Module, p: float, block_size: int):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py
Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product
Quantization as described in "Training with Quantization Noise for Extreme Model Compression"
Args:
- module: nn.Module
- p: amount of Quantization Noise
- block_size: size of the blocks for subsequent quantization with iPQ
Remarks:
- Module weights must have the right sizes wrt the block size
- Only Linear, Embedding and Conv2d modules are supported for the moment
- For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down:
Revisiting the Quantization of Neural Networks"
- We implement the simplest form of noise here as stated in the paper which consists in randomly dropping
blocks
"""
if p <= 0:
return module
if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)):
raise NotImplementedError("Module unsupported for quant_noise.")
is_conv = module.weight.ndim == 4
if not is_conv:
if module.weight.size(1) % block_size != 0:
raise AssertionError("Input features must be a multiple of block sizes")
else:
if module.kernel_size == (1, 1):
if module.in_channels % block_size != 0:
raise AssertionError("Input channels must be a multiple of block sizes")
else:
k = module.kernel_size[0] * module.kernel_size[1]
if k % block_size != 0:
raise AssertionError("Kernel size must be a multiple of block size")
def _forward_pre_hook(mod, input):
if mod.training:
if not is_conv:
weight = mod.weight
in_features = weight.size(1)
out_features = weight.size(0)
mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
else:
weight = mod.weight
in_channels = mod.in_channels
out_channels = mod.out_channels
if mod.kernel_size == (1, 1):
mask = torch.zeros(
int(in_channels // block_size * out_channels),
device=weight.device,
)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
else:
mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
mask.bernoulli_(p)
mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
mask = mask.to(torch.bool)
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
module.register_forward_pre_hook(_forward_pre_hook)
return module
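Minimal usage sketch of `quant_noise` as defined above: wrap a Linear layer whose `in_features` is a multiple of `block_size`; in training mode the forward pre-hook zeroes random weight blocks and rescales the survivors by 1/(1-p).
```
import torch
import torch.nn as nn

layer = quant_noise(nn.Linear(16, 8), p=0.25, block_size=4)  # 16 % 4 == 0
layer.train()
_ = layer(torch.randn(2, 16))              # the pre-hook fires here and masks whole blocks
print((layer.weight == 0).float().mean())  # roughly 0.25 of the weights are now zero
```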
class LayerDropModuleList(nn.ModuleList):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py
A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in
https://arxiv.org/abs/1909.11556.
We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During
evaluation we always iterate over all layers.
Usage:
```
layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3])
for layer in layers: # this might iterate over layers 1 and 3
x = layer(x)
for layer in layers: # this might iterate over all layers
x = layer(x)
for layer in layers: # this might not iterate over any layers
x = layer(x)
```
Args:
p (float): probability of dropping out each layer
modules (iterable, optional): an iterable of modules to add
"""
def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None):
super().__init__(modules)
self.p = p
def __iter__(self) -> Iterator[nn.Module]:
dropout_probs = torch.empty(len(self)).uniform_()
for i, m in enumerate(super().__iter__()):
if not self.training or (dropout_probs[i] > self.p):
yield m
class GraphormerGraphNodeFeature(nn.Module):
"""
Compute node features for each node in the graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.num_atoms = config.num_atoms
self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id)
self.in_degree_encoder = nn.Embedding(
config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.out_degree_encoder = nn.Embedding(
config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.graph_token = nn.Embedding(1, config.hidden_size)
def forward(
self,
input_nodes: torch.LongTensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
node_feature = (
self.atom_encoder(input_nodes).sum(dim=-2)
+ self.in_degree_encoder(in_degree)
+ self.out_degree_encoder(out_degree)
)
graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1)
graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1)
return graph_node_feature
class GraphormerGraphAttnBias(nn.Module):
"""
Compute attention bias for each head.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.multi_hop_max_dist = config.multi_hop_max_dist
self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0)
self.edge_type = config.edge_type
if self.edge_type == "multi_hop":
self.edge_dis_encoder = nn.Embedding(
config.num_edge_dis * config.num_attention_heads * config.num_attention_heads,
1,
)
self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0)
self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads)
def forward(
self,
input_nodes: torch.LongTensor,
attn_bias: torch.Tensor,
spatial_pos: torch.LongTensor,
input_edges: torch.LongTensor,
attn_edge_type: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
graph_attn_bias = attn_bias.clone()
graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(
1, self.num_heads, 1, 1
)
spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias
t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t
if self.edge_type == "multi_hop":
spatial_pos_ = spatial_pos.clone()
spatial_pos_[spatial_pos_ == 0] = 1
spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_)
if self.multi_hop_max_dist > 0:
spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist)
input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :]
input_edges = self.edge_encoder(input_edges).mean(-2)
max_dist = input_edges.size(-2)
edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads)
edge_input_flat = torch.bmm(
edge_input_flat,
self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :],
)
input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute(
1, 2, 3, 0, 4
)
input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2)
else:
input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges
graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1)
return graph_attn_bias
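# Note: the following methods belong to GraphormerMultiheadAttention; its constructor (which builds the q/k/v and output projections and sets `qkv_same_dim`) is not reproduced in this excerpt.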
def reset_parameters(self):
if self.qkv_same_dim:
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
nn.init.xavier_uniform_(self.k_proj.weight)
nn.init.xavier_uniform_(self.v_proj.weight)
nn.init.xavier_uniform_(self.q_proj.weight)
nn.init.xavier_uniform_(self.out_proj.weight)
if self.out_proj.bias is not None:
nn.init.constant_(self.out_proj.bias, 0.0)
def forward(
self,
query: torch.LongTensor,
key: Optional[torch.Tensor],
value: Optional[torch.Tensor],
attn_bias: Optional[torch.Tensor],
key_padding_mask: Optional[torch.Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[torch.Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
):
def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor:
return attn_weights
class GraphormerGraphEncoderLayer(nn.Module):
def __init__(self, config: GraphormerConfig) -> None:
super().__init__()
self.embedding_dim = config.embedding_dim
self.num_attention_heads = config.num_attention_heads
self.q_noise = config.q_noise
self.qn_block_size = config.qn_block_size
self.pre_layernorm = config.pre_layernorm
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False)
self.activation_fn = ACT2FN[config.activation_fn]
self.self_attn = GraphormerMultiheadAttention(config)
self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim)
self.fc1 = self.build_fc(
self.embedding_dim,
config.ffn_embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
self.fc2 = self.build_fc(
config.ffn_embedding_dim,
self.embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
def build_fc(
self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int
) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]:
return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
def forward(
self,
input_nodes: torch.Tensor,
self_attn_bias: Optional[torch.Tensor] = None,
self_attn_mask: Optional[torch.Tensor] = None,
self_attn_padding_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Returns a tuple whose first element is the hidden states (torch.Tensor) and whose second element is the optional attention weights (torch.Tensor or None).
"""
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
input_nodes, attn = self.self_attn(
query=input_nodes,
key=input_nodes,
value=input_nodes,
attn_bias=self_attn_bias,
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
input_nodes = self.activation_fn(self.fc1(input_nodes))
input_nodes = self.activation_dropout_module(input_nodes)
input_nodes = self.fc2(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
return input_nodes, attn
class GraphormerGraphEncoder(nn.Module):
def __init__(self, config: GraphormerConfig):
super().__init__()
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.layerdrop = config.layerdrop
self.embedding_dim = config.embedding_dim
self.apply_graphormer_init = config.apply_graphormer_init
self.traceable = config.traceable
self.graph_node_feature = GraphormerGraphNodeFeature(config)
self.graph_attn_bias = GraphormerGraphAttnBias(config)
self.embed_scale = config.embed_scale
if config.q_noise > 0:
self.quant_noise = quant_noise(
nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
config.q_noise,
config.qn_block_size,
)
else:
self.quant_noise = None
if config.encoder_normalize_before:
self.emb_layer_norm = nn.LayerNorm(self.embedding_dim)
else:
self.emb_layer_norm = None
if config.pre_layernorm:
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
if self.layerdrop > 0.0:
self.layers = LayerDropModuleList(p=self.layerdrop)
else:
self.layers = nn.ModuleList([])
self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)])
if config.freeze_embeddings:
raise NotImplementedError("Freezing embeddings is not implemented yet.")
for layer in range(config.num_trans_layers_to_freeze):
m = self.layers[layer]
if m is not None:
for p in m.parameters():
p.requires_grad = False
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb=None,
last_state_only: bool = False,
token_embeddings: Optional[torch.Tensor] = None,
attn_mask: Optional[torch.Tensor] = None,
) -> Tuple[Union[torch.Tensor, List[torch.LongTensor]], torch.Tensor]:
data_x = input_nodes
n_graph, n_node = data_x.size()[:2]
padding_mask = (data_x[:, :, 0]).eq(0)
padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype)
padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1)
attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type)
if token_embeddings is not None:
input_nodes = token_embeddings
else:
input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree)
if perturb is not None:
input_nodes[:, 1:, :] += perturb
if self.embed_scale is not None:
input_nodes = input_nodes * self.embed_scale
if self.quant_noise is not None:
input_nodes = self.quant_noise(input_nodes)
if self.emb_layer_norm is not None:
input_nodes = self.emb_layer_norm(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = input_nodes.transpose(0, 1)
inner_states = []
if not last_state_only:
inner_states.append(input_nodes)
for layer in self.layers:
input_nodes, _ = layer(
input_nodes,
self_attn_padding_mask=padding_mask,
self_attn_mask=attn_mask,
self_attn_bias=attn_bias,
)
if not last_state_only:
inner_states.append(input_nodes)
graph_rep = input_nodes[0, :, :]
if last_state_only:
inner_states = [input_nodes]
if self.traceable:
return torch.stack(inner_states), graph_rep
else:
return inner_states, graph_rep
class GraphormerDecoderHead(nn.Module):
def __init__(self, embedding_dim: int, num_classes: int):
super().__init__()
"""num_classes should be 1 for regression, or the number of classes for classification"""
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
self.classifier = nn.Linear(embedding_dim, num_classes, bias=False)
self.num_classes = num_classes
def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor:
input_nodes = self.classifier(input_nodes)
input_nodes = input_nodes + self.lm_output_learned_bias
return input_nodes
class GraphormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GraphormerConfig
base_model_prefix = "graphormer"
main_input_name_nodes = "input_nodes"
def normal_(self, data: torch.Tensor):
data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]):
"""
Initialize the weights specific to the Graphormer Model.
"""
if isinstance(module, nn.Linear):
self.normal_(module.weight.data)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
self.normal_(module.weight.data)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, GraphormerMultiheadAttention):
self.normal_(module.q_proj.weight.data)
self.normal_(module.k_proj.weight.data)
self.normal_(module.v_proj.weight.data)
def _init_weights(
self,
module: Union[
nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder
],
"""
初始化模型的权重
"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, GraphormerMultiheadAttention):
module.q_proj.weight.data.normal_(mean=0.0, std=0.02)
module.k_proj.weight.data.normal_(mean=0.0, std=0.02)
module.v_proj.weight.data.normal_(mean=0.0, std=0.02)
module.reset_parameters()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, GraphormerGraphEncoder):
if module.apply_graphormer_init:
module.apply(self.init_graphormer_params)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GraphormerModel(GraphormerPreTrainedModel):
"""The Graphormer model is a graph-encoder model.
It goes from a graph to its representation. If you want to use the model for a downstream classification task, use
GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine
this model with a downstream model of your choice, following the example in GraphormerForGraphClassification.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.max_nodes = config.max_nodes
self.graph_encoder = GraphormerGraphEncoder(config)
self.share_input_output_embed = config.share_input_output_embed
self.lm_output_learned_bias = None
self.load_softmax = not getattr(config, "remove_head", False)
self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim)
self.activation_fn = ACT2FN[config.activation_fn]
self.layer_norm = nn.LayerNorm(config.embedding_dim)
self.post_init()
def reset_output_layer_parameters(self):
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb: Optional[torch.FloatTensor] = None,
masked_tokens: None = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[Tuple[torch.LongTensor], BaseModelOutputWithNoAttention]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
inner_states, graph_rep = self.graph_encoder(
input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb
)
input_nodes = inner_states[-1].transpose(0, 1)
if masked_tokens is not None:
raise NotImplementedError
input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes)))
if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"):
input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight)
if not return_dict:
return tuple(x for x in [input_nodes, inner_states] if x is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states)
def max_nodes(self):
"""Maximum output length supported by the encoder."""
return self.max_nodes
class GraphormerForGraphClassification(GraphormerPreTrainedModel):
"""
This model can be used for graph-level classification or regression tasks.
It can be trained on
- regression (by setting config.num_classes to 1); there should be one float-type label per graph
- one task classification (by setting config.num_classes to the number of classes); there should be one integer
label per graph
- binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list
of integer labels for each graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.encoder = GraphormerModel(config)
self.embedding_dim = config.embedding_dim
self.num_classes = config.num_classes
self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes)
self.is_encoder_decoder = True
self.post_init()
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
input_nodes,
input_edges,
attn_bias,
in_degree,
out_degree,
spatial_pos,
attn_edge_type,
return_dict=True,
)
outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"]
head_outputs = self.classifier(outputs)
logits = head_outputs[:, 0, :].contiguous()
loss = None
if labels is not None:
mask = ~torch.isnan(labels)
if self.num_classes == 1:
loss_fct = MSELoss()
loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float())
elif self.num_classes > 1 and len(labels.shape) == 1:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1))
else:
loss_fct = BCEWithLogitsLoss(reduction="sum")
loss = loss_fct(logits[mask], labels[mask])
if not return_dict:
return tuple(x for x in [loss, logits, hidden_states] if x is not None)
return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None)
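The three labelling regimes described in the class docstring map onto the three loss branches above. A shape-only sketch with dummy tensors (not a real Graphormer forward pass):
```
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

logits = torch.randn(4, 3)  # pretend batch of 4 graphs with 3 output "classes"

# regression (num_classes == 1): one float label per graph -> MSE
print(MSELoss()(torch.randn(4), torch.randn(4)))

# single-task classification: one integer label per graph -> cross-entropy
print(CrossEntropyLoss()(logits, torch.tensor([0, 2, 1, 1])))

# binary multi-task classification: one 0/1 label per task and graph -> BCE-with-logits (summed)
print(BCEWithLogitsLoss(reduction="sum")(logits, torch.randint(0, 2, (4, 3)).float()))
```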
.\models\graphormer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_graphormer"] = [
"GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"GraphormerForGraphClassification",
"GraphormerModel",
"GraphormerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_graphormer import (
GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
GraphormerForGraphClassification,
GraphormerModel,
GraphormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\groupvit\configuration_groupvit.py
""" GroupViT model configuration"""
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
logger = logging.get_logger(__name__)
GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"nvidia/groupvit-gcc-yfcc": "https://huggingface.co/nvidia/groupvit-gcc-yfcc/resolve/main/config.json",
}
class GroupViTTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GroupViTTextModel`]. It is used to instantiate an
GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the GroupViT
[nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "groupvit_text_model"
def __init__(
self,
vocab_size=49408,
hidden_size=256,
intermediate_size=1024,
num_hidden_layers=12,
num_attention_heads=4,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "groupvit":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
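A hedged usage sketch of the classmethod above: loading from the composite `groupvit` checkpoint extracts the nested `text_config` dict. This requires network access, and the printed hidden size assumes the released checkpoint keeps the default text width of 256.
```
from transformers import GroupViTTextConfig

text_config = GroupViTTextConfig.from_pretrained("nvidia/groupvit-gcc-yfcc")
print(text_config.model_type, text_config.hidden_size)  # expected: groupvit_text_model 256
```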
class GroupViTVisionConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GroupViTVisionModel`]. It is used to instantiate a
GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the GroupViT [nvidia/groupvit-gcc-yfcc] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 384):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 1536):
Dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
depths (`List[int]`, *optional*, defaults to [6, 3, 3]):
The number of layers in each encoder block.
num_group_tokens (`List[int]`, *optional*, defaults to [64, 8, 0]):
The number of group tokens for each stage.
num_output_groups (`List[int]`, *optional*, defaults to [64, 8, 8]):
The number of output groups for each stage, 0 means no group.
num_attention_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. Supported strings are
`"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"`.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated-normal initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing).
Example:
```
>>> from transformers import GroupViTVisionConfig, GroupViTVisionModel

>>> # Initializing a GroupViTVisionConfig with the nvidia/groupvit-gcc-yfcc style configuration
>>> configuration = GroupViTVisionConfig()

>>> # Initializing a GroupViTVisionModel from the configuration defined above
>>> model = GroupViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""
# Define the GroupViTConfig class, which inherits from PretrainedConfig
class GroupViTConfig(PretrainedConfig):
r"""
[`GroupViTConfig`] is the configuration class to store the configuration of a [`GroupViTModel`]. It is used to
instantiate a GroupViT model according to the specified arguments, defining the text model and vision model
configurations. Instantiating a configuration with the defaults will yield a similar configuration to that of the
GroupViT [nvidia/groupvit-gcc-yfcc] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
projection_dim (`int`, *optional*, defaults to 256):
Dimensionality of the text and vision projection layers.
projection_intermediate_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the intermediate layer of the text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. The default matches the original GroupViT implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
# The model type is "groupvit"
model_type = "groupvit"
# Constructor: initialize a GroupViTConfig instance
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=256,
projection_intermediate_dim=4096,
logit_scale_init_value=2.6592,
**kwargs,
):
# Call the parent constructor to set the shared configuration options
super().__init__(**kwargs)
@classmethod
def from_text_vision_configs(cls, text_config: GroupViTTextConfig, vision_config: GroupViTVisionConfig, **kwargs):
r"""
Instantiate a [`GroupViTConfig`] (or a derived class) from a GroupViT text model configuration and a GroupViT
vision model configuration.
Returns:
[`GroupViTConfig`]: An instance of a configuration object
"""
# Create a new GroupViTConfig instance from the given text_config and vision_config
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
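Usage sketch for `from_text_vision_configs` (the constructor body that consumes the two config dicts is elided in this excerpt, but the classmethod is used as follows):
```
from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig

config = GroupViTConfig.from_text_vision_configs(GroupViTTextConfig(), GroupViTVisionConfig())
print(config.model_type)  # groupvit
```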
class GroupViTOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# Define the input names and their dynamic-axis mappings
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("attention_mask", {0: "batch", 1: "sequence"}),
]
)
@property
# Return an ordered dict mapping each output name to its dynamic axes (batch dimension only)
def outputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("logits_per_image", {0: "batch"}), # 输出标识"logits_per_image"对应值为{0: "batch"}
("logits_per_text", {0: "batch"}), # 输出标识"logits_per_text"对应值为{0: "batch"}
("text_embeds", {0: "batch"}), # 输出标识"text_embeds"对应值为{0: "batch"}
("image_embeds", {0: "batch"}), # 输出标识"image_embeds"对应值为{0: "batch"}
]
)
# Absolute error tolerance used when validating the exported ONNX model
@property
def atol_for_validation(self) -> float:
return 1e-4
# Generate a dummy input dict by combining dummy text inputs and dummy image inputs
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
# Call the parent method to generate the dummy text inputs
text_input_dict = super().generate_dummy_inputs(
processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
)
# Call the parent method to generate the dummy image inputs
image_input_dict = super().generate_dummy_inputs(
processor.image_processor, batch_size=batch_size, framework=framework
)
# Merge the text and image input dicts and return the result
return {**text_input_dict, **image_input_dict}
# Default ONNX opset version
@property
def default_onnx_opset(self) -> int:
return 14
.\models\groupvit\convert_groupvit_nvlab_to_hf.py
import argparse
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel
def rename_key(name):
if "img_encoder.pos_embed" in name:
name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings")
if "img_encoder.patch_embed.proj" in name:
name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection")
if "img_encoder.patch_embed.norm" in name:
name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm")
if "img_encoder.layers" in name:
name = name.replace("img_encoder.layers", "vision_model.encoder.stages")
if "blocks" in name and "res" not in name:
name = name.replace("blocks", "layers")
if "attn" in name and "pre_assign" not in name:
name = name.replace("attn", "self_attn")
if "proj" in name and "self_attn" in name and "text" not in name:
name = name.replace("proj", "out_proj")
if "pre_assign_attn.attn.proj" in name:
name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj")
if "norm1" in name:
name = name.replace("norm1", "layer_norm1")
if "norm2" in name and "pre_assign" not in name:
name = name.replace("norm2", "layer_norm2")
if "img_encoder.norm" in name:
name = name.replace("img_encoder.norm", "vision_model.layernorm")
if "text_encoder.token_embedding" in name:
name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding")
if "text_encoder.positional_embedding" in name:
name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight")
if "text_encoder.transformer.resblocks." in name:
name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.")
if "ln_1" in name:
name = name.replace("ln_1", "layer_norm1")
if "ln_2" in name:
name = name.replace("ln_2", "layer_norm2")
if "c_fc" in name:
name = name.replace("c_fc", "fc1")
if "c_proj" in name:
name = name.replace("c_proj", "fc2")
if "text_encoder" in name:
name = name.replace("text_encoder", "text_model")
if "ln_final" in name:
name = name.replace("ln_final", "final_layer_norm")
if "img_projector.linear_hidden." in name:
name = name.replace("img_projector.linear_hidden.", "visual_projection.")
if "img_projector.linear_out." in name:
name = name.replace("img_projector.linear_out.", "visual_projection.3.")
if "text_projector.linear_hidden" in name:
name = name.replace("text_projector.linear_hidden", "text_projection")
if "text_projector.linear_out" in name:
name = name.replace("text_projector.linear_out", "text_projection.3")
return name
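Two sample keys run through `rename_key` as defined above, to make the mapping rules concrete:
```
print(rename_key("img_encoder.patch_embed.proj.weight"))
# vision_model.embeddings.patch_embeddings.projection.weight
print(rename_key("text_encoder.transformer.resblocks.0.ln_1.weight"))
# text_model.encoder.layers.0.layer_norm1.weight
```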
def convert_state_dict(orig_state_dict, config):
return orig_state_dict
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_groupvit_checkpoint(
checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False
):
"""
Copy/paste/tweak the model's weights to fit the Transformers design.
"""
config = GroupViTConfig()
model = GroupViTModel(config).eval()
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
new_state_dict = convert_state_dict(state_dict, config)
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
assert missing_keys == ["text_model.embeddings.position_ids"]
assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = prepare_img()
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
if model_name == "groupvit-gcc-yfcc":
expected_logits = torch.tensor([[13.3523, 6.3629]])
elif model_name == "groupvit-gcc-redcaps":
expected_logits = torch.tensor([[16.1873, 8.6230]])
else:
raise ValueError(f"Model name {model_name} not supported.")
assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)
processor.save_pretrained(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
print("Successfully saved processor and model to", pytorch_dump_folder_path)
if push_to_hub:
print("Pushing to the hub...")
processor.push_to_hub(model_name, organization="nielsr")
model.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model."
)
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint")
parser.add_argument(
"--model_name",
default="groupvit-gccy-fcc",
type=str,
help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub using the provided `model_name`.",
)
args = parser.parse_args()
convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\groupvit\modeling_groupvit.py
import collections.abc
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/groupvit-gcc-yfcc",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
"""
Compute the contrastive (cross-entropy over matched pairs) loss.
Args:
logits (torch.Tensor): the similarity scores produced by the model
Returns:
torch.Tensor: the contrastive loss value
"""
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
"""
Compute the GroupViT loss as the average of the caption-to-image and image-to-caption contrastive losses.
Args:
similarity (torch.Tensor): the image-text similarity matrix
Returns:
torch.Tensor: the GroupViT loss value
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
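A worked example of the symmetric contrastive loss above: with a strongly diagonal similarity matrix (well-matched image/text pairs) the loss is near zero, while a completely flat matrix gives roughly log(N):
```
import torch

print(groupvit_loss(10.0 * torch.eye(4)))  # close to 0
print(groupvit_loss(torch.zeros(4, 4)))    # about log(4) ≈ 1.386
```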
def hard_softmax(logits: torch.Tensor, dim: int):
"""
Hard (straight-through) softmax: one-hot in the forward pass, with gradients taken from the soft softmax.
Args:
logits (torch.Tensor): the logits produced by the model
dim (int): the dimension along which softmax is applied
Returns:
torch.Tensor: the straight-through one-hot tensor
"""
y_soft = logits.softmax(dim)
index = y_soft.max(dim, keepdim=True)[1]
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
return ret
def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
"""
Gumbel-Softmax sampling.
Args:
logits (torch.Tensor): the logits produced by the model
tau (float): the temperature of the Gumbel distribution, defaults to 1
hard (bool): whether to use the hard (straight-through) variant, defaults to False
dim (int): the dimension along which softmax is applied, defaults to -1
Returns:
torch.Tensor: the (relaxed or straight-through) sample
"""
gumbel_dist = torch.distributions.gumbel.Gumbel(
torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
)
gumbels = gumbel_dist.sample(logits.shape)
gumbels = (logits + gumbels) / tau
y_soft = gumbels.softmax(dim)
if hard:
index = y_soft.max(dim, keepdim=True)[1]
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
ret = y_soft
return ret
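A short sketch of the straight-through behaviour of `gumbel_softmax` as defined above: with `hard=True` the forward output is one-hot, yet gradients still reach the logits through the soft relaxation (the weighting vector below is arbitrary, just to make the gradient non-trivial):
```
import torch

logits = torch.randn(2, 5, requires_grad=True)
values = torch.arange(5.0)
sample = gumbel_softmax(logits, tau=0.5, hard=True, dim=-1)
print(sample)                       # each row is one-hot
(sample * values).sum().backward()
print(logits.grad.abs().sum() > 0)  # True: gradient flows via the soft sample
```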
def resize_attention_map(attentions, height, width, align_corners=False):
"""
Args:
attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
height (`int`): height of the output attention map
width (`int`): width of the output attention map
align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.
Returns:
`torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
"""
scale = (height * width // attentions.shape[2]) ** 0.5
if height > width:
feat_width = int(np.round(width / scale))
feat_height = attentions.shape[2] // feat_width
else:
feat_height = int(np.round(height / scale))
feat_width = attentions.shape[2] // feat_height
batch_size = attentions.shape[0]
groups = attentions.shape[1]
attentions = attentions.reshape(batch_size, groups, feat_height, feat_width)
attentions = nn.functional.interpolate(
attentions, size=(height, width), mode="bilinear", align_corners=align_corners
)
return attentions
def get_grouping_from_attentions(attentions, hw_shape):
"""
Args:
attentions (`tuple(torch.FloatTensor)`): tuple of attention maps returned by `GroupViTVisionTransformer`
hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
`torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
"""
attn_maps = []
with torch.no_grad():
prev_attn_masks = None
for attn_masks in attentions:
attn_masks = attn_masks.permute(0, 2, 1).contiguous()
if prev_attn_masks is None:
prev_attn_masks = attn_masks
else:
prev_attn_masks = prev_attn_masks @ attn_masks
cur_attn_map = resize_attention_map(prev_attn_masks.permute(0, 2, 1).contiguous(), *hw_shape)
attn_maps.append(cur_attn_map)
final_grouping = attn_maps[-1]
return final_grouping
class GroupViTCrossAttentionLayer(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.attn = GroupViTAttention(config)
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp = GroupViTMLP(config)
self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, query, key):
x = query
x = x + self.attn(query, encoder_hidden_states=key)[0]
x = x + self.mlp(self.norm2(x))
x = self.norm_post(x)
return x
class GroupViTAssignAttention(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.scale = config.hidden_size**-0.5
self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.proj = nn.Linear(config.hidden_size, config.hidden_size)
self.assign_eps = config.assign_eps
def get_attn(self, attn, gumbel=True, hard=True):
if gumbel and self.training:
attn = gumbel_softmax(attn, dim=-2, hard=hard)
else:
if hard:
attn = hard_softmax(attn, dim=-2)
else:
attn = nn.functional.softmax(attn, dim=-2)
return attn
def forward(self, query, key):
value = key
query = self.q_proj(query)
key = self.k_proj(key)
value = self.v_proj(value)
raw_attn = (query @ key.transpose(-2, -1)) * self.scale
attn = self.get_attn(raw_attn)
soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)
attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)
out = attn @ value
out = self.proj(out)
return out, soft_attn
class GroupViTTokenAssign(nn.Module):
def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
super().__init__()
self.num_output_group = num_output_group
self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
assign_mlp_ratio = (
config.assign_mlp_ratio
if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
else (config.assign_mlp_ratio, config.assign_mlp_ratio)
)
tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pre_assign_attn = GroupViTCrossAttentionLayer(config)
self.assign = GroupViTAssignAttention(config)
self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim, config.hidden_size)
def project_group_token(self, group_tokens):
"""
Args:
group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]
Returns:
projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
"""
projected_group_tokens = self.mlp_inter(group_tokens)
projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
return projected_group_tokens
def forward(self, image_tokens, group_tokens):
"""
Args:
image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
"""
group_tokens = self.norm_tokens(group_tokens)
image_tokens = self.norm_x(image_tokens)
projected_group_tokens = self.project_group_token(group_tokens)
projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
new_image_tokens += projected_group_tokens
new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
return new_image_tokens, attention
@dataclass
class GroupViTModelOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTVisionModel`].
"""
loss: Optional[torch.FloatTensor] = None
logits_per_image: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
segmentation_logits: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class GroupViTPatchEmbeddings(nn.Module):
"""
Image to Patch Embedding.
"""
def __init__(
self,
image_size: int = 224,
patch_size: Union[int, Tuple[int, int]] = 16,
num_channels: int = 3,
embed_dim: int = 768,
):
super().__init__()
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if not interpolate_pos_encoding:
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model"
f" ({self.image_size[0]}*{self.image_size[1]})."
)
x = self.projection(pixel_values).flatten(2).transpose(1, 2)
return x
class GroupViTVisionEmbeddings(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.patch_embeddings = GroupViTPatchEmbeddings(
image_size=config.image_size,
patch_size=config.patch_size,
num_channels=config.num_channels,
embed_dim=config.hidden_size,
)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches, config.hidden_size))
self.dropout = nn.Dropout(config.dropout)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.config = config
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
"""
npatch = embeddings.shape[1]
if npatch == self.position_embeddings.shape[1] and height == width:
return self.position_embeddings
patch_pos_embed = self.position_embeddings
num_original_pos_embed = patch_pos_embed.shape[1]
dim = embeddings.shape[-1]
feat_height = height // self.config.patch_size
feat_width = width // self.config.patch_size
feat_height, feat_width = feat_height + 0.1, feat_width + 0.1
original_height = original_width = math.sqrt(num_original_pos_embed)
reshaped_patch_pos_embed = patch_pos_embed.reshape(1, int(original_height), int(original_width), dim).permute(
0, 3, 1, 2
)
scale_factor = (feat_height / original_height, feat_width / original_width)
patch_pos_embed = nn.functional.interpolate(
reshaped_patch_pos_embed,
scale_factor=scale_factor,
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return patch_pos_embed
def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
embeddings = self.layernorm(embeddings)
batch_size, seq_len, _ = embeddings.size()
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class GroupViTTextEmbeddings(nn.Module):
def __init__(self, config: GroupViTTextConfig):
super().__init__()
embed_dim = config.hidden_size
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
class GroupViTStage(nn.Module):
"""这对应于GroupViT实现中的`GroupingLayer`类。"""
def __init__(
self,
config: GroupViTVisionConfig,
depth: int,
num_prev_group_token: int,
num_group_token: int,
num_output_group: int,
):
super().__init__()
self.depth = depth
self.num_group_token = num_group_token
if num_group_token > 0:
self.group_token = nn.Parameter(torch.zeros(1, num_group_token, config.hidden_size))
else:
self.group_token = None
self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(depth)])
if num_group_token > 0:
self.downsample = GroupViTTokenAssign(
config=config,
num_group_token=num_group_token,
num_output_group=num_output_group,
)
else:
self.downsample = None
if num_prev_group_token > 0 and num_group_token > 0:
self.group_projector = nn.Sequential(
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
GroupViTMixerMLP(config, num_prev_group_token, config.hidden_size // 2, num_group_token),
)
else:
self.group_projector = None
@property
def with_group_token(self):
return self.group_token is not None
def split_x(self, x):
if self.with_group_token:
return x[:, :-self.num_group_token], x[:, -self.num_group_token:]
else:
return x, None
def concat_x(self, x: torch.Tensor, group_token: Optional[torch.Tensor] = None) -> torch.Tensor:
if group_token is None:
return x
return torch.cat([x, group_token], dim=1)
def forward(
self,
hidden_states: torch.Tensor,
prev_group_token: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer, of shape `(batch, seq_len, embed_dim)`
prev_group_token (`torch.FloatTensor`, *optional*): group tokens from the previous stage, of shape `(batch, 1, embed_dim)`
output_attentions (`bool`, *optional*):
Whether or not to return the grouping (attention) tensors of the Grouping block.
"""
if self.with_group_token:
group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
if self.group_projector is not None:
group_token = group_token + self.group_projector(prev_group_token)
else:
group_token = None
x = hidden_states
cat_x = self.concat_x(x, group_token)
for layer in self.layers:
layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
cat_x = layer_out[0]
x, group_token = self.split_x(cat_x)
attention = None
if self.downsample is not None:
x, attention = self.downsample(x, group_token)
outputs = (x, group_token)
if output_attentions:
outputs = outputs + (attention,)
return outputs
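The stage's `concat_x`/`split_x` pair appends the learnable group tokens to the patch sequence before the transformer layers and peels them off again afterwards. A minimal shape sketch with hypothetical sizes (batch 2, 196 patch tokens, 64 group tokens, hidden size 8):
```
import torch

num_group_token = 64
x = torch.randn(2, 196, 8)                             # (batch, patch tokens, hidden)
group_token = torch.zeros(1, num_group_token, 8).expand(2, -1, -1)

cat_x = torch.cat([x, group_token], dim=1)             # concat_x -> (2, 260, 8)
patches = cat_x[:, :-num_group_token]                  # split_x  -> (2, 196, 8)
groups = cat_x[:, -num_group_token:]                   #             (2, 64, 8)
print(cat_x.shape, patches.shape, groups.shape)
```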
class GroupViTMLP(nn.Module):
def __init__(
self,
config: GroupViTVisionConfig,
hidden_size: Optional[int] = None,
intermediate_size: Optional[int] = None,
output_size: Optional[int] = None,
):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
hidden_size = hidden_size if hidden_size is not None else config.hidden_size
intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
output_size = output_size if output_size is not None else hidden_size
self.fc1 = nn.Linear(hidden_size, intermediate_size)
self.fc2 = nn.Linear(intermediate_size, output_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class GroupViTMixerMLP(GroupViTMLP):
def forward(self, x):
x = super().forward(x.transpose(1, 2))
return x.transpose(1, 2)
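`GroupViTMixerMLP` reuses `GroupViTMLP` but transposes the token and channel axes, so the linear layers mix information across tokens (mapping `num_prev_group_token` inputs to `num_group_token` outputs) rather than across channels. A rough stand-in with hypothetical sizes, not the config-driven module itself:
```
import torch
from torch import nn

num_prev, num_new, hidden = 64, 8, 16                  # hypothetical token counts and hidden size
mixer = nn.Sequential(nn.Linear(num_prev, 32), nn.GELU(), nn.Linear(32, num_new))

tokens = torch.randn(2, num_prev, hidden)              # (batch, tokens, channels)
mixed = mixer(tokens.transpose(1, 2)).transpose(1, 2)  # mix along the token axis
print(mixed.shape)                                     # torch.Size([2, 8, 16])
```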
class GroupViTAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
):
# For cross-attention (used in the grouping block), keys/values come from encoder_hidden_states.
is_cross_attention = encoder_hidden_states is not None
kv_states = encoder_hidden_states if is_cross_attention else hidden_states
bsz, tgt_len, embed_dim = hidden_states.size()
query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
key_states = self._shape(self.k_proj(kv_states), -1, bsz)
value_states = self._shape(self.v_proj(kv_states), -1, bsz)
# Scaled dot-product attention scores of shape (batch, num_heads, tgt_len, src_len).
attn_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale
# Masks are additive: masked positions hold very large negative values (see the encoder-layer docstring).
if causal_attention_mask is not None:
attn_scores = attn_scores + causal_attention_mask
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_probs = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = attn_probs if output_attentions else None
attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training)
attn_output = torch.matmul(attn_probs, value_states)
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
# Callers such as GroupViTEncoderLayer unpack (attn_output, attn_weights).
return attn_output, attn_weights
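A quick shape walk-through of the multi-head reshape performed by `_shape` and the scaled dot-product that follows, using hypothetical sizes (batch 2, sequence 5, `embed_dim` 16, 4 heads):
```
import torch

bsz, seq, embed_dim, num_heads = 2, 5, 16, 4
head_dim = embed_dim // num_heads

states = torch.randn(bsz, seq, embed_dim)
per_head = states.view(bsz, seq, num_heads, head_dim).transpose(1, 2)          # (2, 4, 5, 4)

scores = torch.matmul(per_head, per_head.transpose(-1, -2)) * head_dim**-0.5   # (2, 4, 5, 5)
probs = scores.softmax(dim=-1)
out = torch.matmul(probs, per_head).transpose(1, 2).reshape(bsz, seq, embed_dim)
print(per_head.shape, scores.shape, out.shape)
```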
class GroupViTEncoderLayer(nn.Module):
def __init__(self, config: GroupViTConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = GroupViTAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = GroupViTMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class GroupViTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GroupViTConfig
base_model_prefix = "groupvit"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
init_range = self.config.initializer_range
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=init_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
factor = self.config.initializer_factor
if isinstance(module, GroupViTTextEmbeddings):
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, GroupViTAttention):
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, GroupViTMLP):
in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
nn.init.normal_(module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
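For reference, the depth-scaled standard deviations used above shrink as the network gets deeper. A small arithmetic check with hypothetical values (`hidden_size`/`embed_dim` 384, 12 hidden layers, `initializer_factor` 1.0):
```
embed_dim, num_hidden_layers, factor = 384, 12, 1.0

in_proj_std = (embed_dim**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor
out_proj_std = (embed_dim**-0.5) * factor
fc_std = (2 * embed_dim) ** -0.5 * factor
print(round(in_proj_std, 5), round(out_proj_std, 5), round(fc_std, 5))  # 0.01042 0.05103 0.03608
```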
GROUPVIT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it. Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
GROUPVIT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class GroupViTVisionEncoder(nn.Module):
"""
Vision encoder module for GroupViT.
"""
def __init__(self, config: GroupViTVisionConfig) -> None:
super().__init__()
self.config = config
self.stages = nn.ModuleList(
[
GroupViTStage(
config=config,
depth=config.depths[i],
num_group_token=config.num_group_tokens[i],
num_output_group=config.num_output_groups[i],
num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
)
for i in range(len(config.depths))
]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
all_hidden_states = () if output_hidden_states else None
all_groupings = () if output_attentions else None
group_tokens = None
for i, stage in enumerate(self.stages):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = stage(hidden_states, group_tokens, output_attentions)
hidden_states = layer_outputs[0]
group_tokens = layer_outputs[1]
if output_attentions and layer_outputs[2] is not None:
all_groupings = all_groupings + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
)
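The encoder's hierarchy is driven entirely by `config.depths`, `config.num_group_tokens`, and `config.num_output_groups`; each stage receives the previous stage's output groups as `num_prev_group_token`. A small sketch that just prints this wiring for the default `GroupViTVisionConfig` (the printed numbers are whatever defaults your installed version defines):
```
from transformers import GroupViTVisionConfig

config = GroupViTVisionConfig()
for i in range(len(config.depths)):
    prev = config.num_output_groups[i - 1] if i > 0 else 0
    print(
        f"stage {i}: depth={config.depths[i]}, "
        f"group_tokens={config.num_group_tokens[i]}, "
        f"output_groups={config.num_output_groups[i]}, "
        f"prev_group_tokens={prev}"
    )
```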
class GroupViTTextEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
[`GroupViTEncoderLayer`].
Args:
config: GroupViTTextConfig
"""
def __init__(self, config: GroupViTTextConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the encoder.
Args:
inputs_embeds: Embedded input tokens.
attention_mask: Mask to avoid attention on padding tokens.
causal_attention_mask: Mask to apply causal masking in attention layers.
output_attentions: Whether to output attentions weights.
output_hidden_states: Whether to output hidden states.
return_dict: Whether to return a dictionary instead of a tuple.
Returns:
BaseModelOutput or tuple: the encoder's last hidden state, plus optional hidden states and attentions.
"""
pass
class GroupViTTextTransformer(nn.Module):
def __init__(self, config: GroupViTTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = GroupViTTextEmbeddings(config)
self.encoder = GroupViTTextEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.eos_token_id = config.eos_token_id
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the model.
Args:
input_ids: Input token IDs.
attention_mask: Mask to avoid attention on padding tokens.
position_ids: IDs indicating the position of each token in the sequence.
output_attentions: Whether to output attentions weights.
output_hidden_states: Whether to output hidden states.
return_dict: Whether to return a dictionary instead of a tuple.
Returns:
BaseModelOutputWithPooling: Output with pooled representation and optionally attentions and hidden states.
"""
pass
class GroupViTTextModel(GroupViTPreTrainedModel):
config_class = GroupViTTextConfig
def __init__(self, config: GroupViTTextConfig):
super().__init__(config)
self.text_model = GroupViTTextTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Defines the forward pass of the text model.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`: the model outputs, depending on `return_dict`.
Examples:
```
>>> from transformers import CLIPTokenizer, GroupViTTextModel
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```
"""
# Delegate to the text model's forward pass, forwarding all arguments.
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class GroupViTVisionTransformer(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = GroupViTVisionEmbeddings(config)
self.encoder = GroupViTVisionEncoder(config)
self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Forward pass of the vision transformer.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`, depending on `return_dict`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
hidden_states=hidden_states,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.layernorm(last_hidden_state)
pooled_output = last_hidden_state.mean(dim=1)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class GroupViTVisionModel(GroupViTPreTrainedModel):
config_class = GroupViTVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: GroupViTVisionConfig):
super().__init__(config)
self.vision_model = GroupViTVisionTransformer(config)
self.post_init()
def get_input_embeddings(self) -> GroupViTPatchEmbeddings:
return self.vision_model.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Forward pass of the vision model.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`: the last hidden state and pooled output, plus optional hidden states and attentions.
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTVisionModel
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state  # last-layer hidden states
>>> pooled_output = outputs.pooler_output  # pooled output (mean over the final hidden states)
```
"""
# Delegate to the vision model's forward pass, forwarding all arguments.
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@add_start_docstrings(GROUPVIT_START_DOCSTRING)
class GroupViTModel(GroupViTPreTrainedModel):
config_class = GroupViTConfig
def __init__(self, config: GroupViTConfig):
super().__init__(config)
if not isinstance(config.text_config, GroupViTTextConfig):
raise ValueError(
"config.text_config is expected to be of type GroupViTTextConfig but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.vision_config, GroupViTVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
f" {type(config.vision_config)}."
)
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.projection_intermediate_dim = config.projection_intermediate_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
self.text_model = GroupViTTextTransformer(text_config)
self.vision_model = GroupViTVisionTransformer(vision_config)
self.visual_projection = nn.Sequential(
nn.Linear(self.vision_embed_dim, self.projection_intermediate_dim, bias=True),
nn.BatchNorm1d(self.projection_intermediate_dim),
nn.ReLU(inplace=True),
nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
)
self.text_projection = nn.Sequential(
nn.Linear(self.text_embed_dim, self.projection_intermediate_dim, bias=True),
nn.BatchNorm1d(self.projection_intermediate_dim),
nn.ReLU(inplace=True),
nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
)
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
self.post_init()
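Both projection heads follow the same `Linear -> BatchNorm1d -> ReLU -> Linear` pattern and operate on a 2-D pooled representation. A minimal sketch with hypothetical dimensions (hidden 384, intermediate 2048, projection 256):
```
import torch
from torch import nn

hidden, intermediate, projection = 384, 2048, 256      # hypothetical dims
head = nn.Sequential(
    nn.Linear(hidden, intermediate, bias=True),
    nn.BatchNorm1d(intermediate),                      # expects (batch, features), i.e. a pooled input
    nn.ReLU(inplace=True),
    nn.Linear(intermediate, projection, bias=True),
)
pooled = torch.randn(4, hidden)
print(head(pooled).shape)                              # torch.Size([4, 256])
```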
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`GroupViTTextModel`].
Examples:
```
>>> from transformers import CLIPTokenizer, GroupViTModel
>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = text_outputs[1]
text_features = self.text_projection(pooled_output)
return text_features
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
applying the projection layer to the pooled output of [`GroupViTVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTModel
>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = vision_outputs[1]
image_features = self.visual_projection(pooled_output)
return image_features
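Putting `get_text_features` and `get_image_features` together gives a CLIP-style zero-shot similarity score. A sketch using the checkpoint from the docstrings above; the explicit L2 normalization here is the usual contrastive comparison done with the standalone feature methods, while the model's own `forward` computes contrastive logits itself:
```
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPTokenizer, GroupViTModel

model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)
    image_features = model.get_image_features(**image_inputs)

# L2-normalize and compare (cosine similarity between the image and each prompt).
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
print(image_features @ text_features.T)
```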