Transformers Source Code Analysis (55)
.\models\gptsan_japanese\tokenization_gptsan_japanese.py
"""GPTSANJapanese 的标记化类"""
import collections
import json
import os
import re
from typing import List, Optional, Tuple, Union
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import (
BatchEncoding,
PreTokenizedInput,
PreTokenizedInputPair,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"Tanrei/GPTSAN-japanese": "https://huggingface.co/Tanrei/GPTSAN-japanese/blob/main/vocab.txt",
},
"emoji_file": {
"Tanrei/GPTSAN-japanese": "https://huggingface.co/Tanrei/GPTSAN-japanese/blob/main/emoji.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"Tanrei/GPTSAN-japanese": 1280,
}
def load_vocab_and_emoji(vocab_file, emoji_file):
"""加载词汇文件和表情符号文件到字典中。"""
with open(emoji_file, "r", encoding="utf-8") as f:
emoji = json.loads(f.read())
vocab = collections.OrderedDict()
raw_vocab = collections.OrderedDict()
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
for wd in b:
vocab[wd] = idx
return vocab, raw_vocab, ids_to_tokens, emoji
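The parsing above groups comma-separated spelling variants on one vocab line under a single id, while a line consisting of only "," is kept as the literal comma token. A minimal standalone sketch of that behaviour, using hypothetical vocab lines rather than the real vocab.txt shipped with Tanrei/GPTSAN-japanese:

```python
import collections

# Hypothetical vocab lines; the real file ships with the "Tanrei/GPTSAN-japanese" checkpoint.
lines = ["こんにちは\n", "丼,どんぶり\n", ",\n"]  # the last line is the literal "," token

token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in lines]
vocab = collections.OrderedDict()
for idx, variants in enumerate(token):
    for wd in variants:
        vocab[wd] = idx  # every spelling variant maps to the same id

print(vocab)  # OrderedDict([('こんにちは', 0), ('丼', 1), ('どんぶり', 1), (',', 2)])
```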
class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
- Decodes byte0~byte255 tokens correctly
- Adds bagofword token handling
- Returns token_type_ids for Prefix-LM models
The bagofword token represents a repetition of the previous token and is converted into three consecutive tokens when
decoding. In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
(https://github.com/tanreinama/Japanese-BPEEncoder_V2). token_type_ids is a mask indicating the prefix input positions.
Example:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>>
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
[35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>>
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
'吾輩は猫である🐯。実は慶応(慶応)大学出身'
Example for Prefix-LM:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
[35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>>
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Example for batch encode:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
[[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]]
>>>
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
[[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]]
>>>
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
[[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]
Args:
vocab_file (`str`):
File containing the vocabulary.
emoji_file (`str`):
File containing the emoji.
unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
The token used for unknown characters.
pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
The token used for padding.
bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
A special token to separate tokens into prefix and general input parts.
do_clean_text (`bool`, *optional*, defaults to `False`):
Whether or not to clean text for URLs, emails, telephone numbers, Japanese dates, and Japanese prices.
"""
# Define constants for files related to vocabulary and model configurations
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
def __init__(
self,
vocab_file,
emoji_file,
unk_token="<|nottoken|>",
pad_token="<|separator|>",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
sep_token="<|segmenter|>",
do_clean_text=False,
**kwargs,
):
# Check if vocabulary file exists; raise an error if not found
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Check if emoji file exists; raise an error if not found
if not os.path.isfile(emoji_file):
raise ValueError(
f"Can't find an emoji file at path '{emoji_file}'. To load the emoji information from a Google"
" pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Initialize the tokenizer with the provided parameters
self.do_clean_text = do_clean_text
self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
# Initialize the superclass (PreTrainedTokenizer) with the tokenizer-specific parameters
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
do_clean_text=do_clean_text,
**kwargs,
)
@property
# Property to get the size of the vocabulary
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
def vocab_size(self):
# The vocab_size property returns the length of the raw_vocab, which contains character variations unique to Japanese
return len(self.raw_vocab)
# Build and return the full vocabulary dict from raw_vocab plus any added tokens
def get_vocab(self):
return dict(self.raw_vocab, **self.added_tokens_encoder)
# Tokenize the text with the subword_tokenizer, optionally cleaning it first
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
# Look up a token's id in the vocab, falling back to the id of unk_token
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Look up the token string for a given id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.subword_tokenizer.convert_id_to_token(index)
# Convert a sequence of tokens back into a single string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
words = []
byte_tokens = []
for word in tokens:
if word[:6] == "<|byte" and word[-2:] == "|>":
byte_tokens.append(int(word[6:-2]))
else:
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
byte_tokens = []
if word[:7] == "<|emoji" and word[-2:] == "|>":
words.append(self.emoji["emoji_inv"][word])
elif word == "<SP>":
words.append(" ")
elif word == "<BR>":
words.append("\n")
elif word == "<TAB>":
words.append("\t")
elif word == "<BLOCK>":
words.append("▀")
elif word == "<KIGOU>":
words.append("ǀ")
elif word == "<U2000U2BFF>":
words.append("‖")
elif word == "<|bagoftoken|>":
if len(words) > 0:
words.append(words[-1])
words.append(words[-1])
words.append(words[-1])
elif word.startswith("<|") and word.endswith("|>"):
words.append("")
else:
words.append(word)
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
text = "".join(words)
return text
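The byte fallback above is what lets arbitrary UTF-8 sequences survive a decode round trip: consecutive `<|byteNN|>` tokens are buffered and decoded together. A tiny standalone illustration (the token strings are made up here, but they follow the format handled above):

```python
# Collect the numeric payload of consecutive "<|byteNN|>" tokens and decode them as UTF-8.
byte_words = ["<|byte240|>", "<|byte159|>", "<|byte144|>", "<|byte175|>"]
byte_tokens = [int(w[6:-2]) for w in byte_words]
print(bytearray(byte_tokens).decode("utf-8", errors="replace"))  # 🐯
```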
# Default chat template that adds the standard BOS, SEP and EOS tokens between messages and discards role information
def default_chat_template(self):
"""
A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role
information.
"""
# Warn that no chat template is defined for this tokenizer and that the class default is being used
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return the Jinja chat template string
return (
"{% for message in messages %}"
"{% if not loop.first %}{{ bos_token}}{% endif %}"
"{{ sep_token }}{{ message.content }} {{ eos_token }}"
"{% endfor %}"
)
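To see what this template produces, it can be rendered directly; the following is a rough sketch (assuming `jinja2` is installed, with placeholder messages) of the string that `tokenizer.apply_chat_template` would build from it:

```python
from jinja2 import Template

template = Template(
    "{% for message in messages %}"
    "{% if not loop.first %}{{ bos_token}}{% endif %}"
    "{{ sep_token }}{{ message.content }} {{ eos_token }}"
    "{% endfor %}"
)
print(template.render(
    messages=[{"content": "こんにちは"}, {"content": "元気ですか"}],
    bos_token="<|startoftext|>", sep_token="<|segmenter|>", eos_token="<|endoftext|>",
))
# <|segmenter|>こんにちは <|endoftext|><|startoftext|><|segmenter|>元気ですか <|endoftext|>
```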
# Copied from GPTNeoXJapaneseTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Running index used to detect gaps in the vocabulary
index = 0
# Check whether the save target is an existing directory
if os.path.isdir(save_directory):
# Build the vocabulary file path and the emoji file path inside the directory
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
)
else:
# Otherwise treat save_directory as a filename prefix rather than a directory
vocab_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
)
# Write the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as writer:
# Iterate over the id-to-token mapping, writing one line per id
for token_index, token in self.ids_to_tokens.items():
if index != token_index:
# Warn if the vocabulary indices are not consecutive
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
# Join the spelling variants of this id with commas and write them as one line
writer.write(",".join(token) + "\n")
index += 1
# Write the emoji file as JSON
with open(emoji_file, "w", encoding="utf-8") as writer:
json.dump(self.emoji, writer)
# Return the paths of the saved vocabulary and emoji files
return vocab_file, emoji_file
# Build token_type_ids from token_ids_0 and token_ids_1
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
# docstyle-ignore
"""
The tokenizer returns token_type_ids as separators between the Prefix part and the rest.
token_type_ids is 1 for the Prefix part and 0 for the rest of the token.
Example:
```
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> x_token = tokenizer("アイウエ")
>>>
>>>
>>> x_token = tokenizer("", prefix_text="アイウエ")
>>>
>>>
>>> x_token = tokenizer("ウエ", prefix_text="アイ")
>>>
>>>
```"""
# The prefix length defaults to 0 (no prefix part)
prefix_len = 0
# Only look for the separator if it exists in the vocab
if self.sep_token in self.vocab:
# Id of the separator token
segid = self.vocab[self.sep_token]
# If the separator appears in token_ids_0
if segid in token_ids_0:
# everything before it belongs to the prefix
prefix_len = token_ids_0.index(segid)
# The total length is len(token_ids_0) when there is no second sequence
if token_ids_1 is None:
total_len = len(token_ids_0)
else:
# otherwise it is the combined length of both sequences
total_len = len(token_ids_0 + token_ids_1)
# Return prefix_len ones followed by (total_len - prefix_len) zeros
return prefix_len * [1] + (total_len - prefix_len) * [0]
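In other words, the mask is just `prefix_len` ones followed by zeros, where `prefix_len` is the position of the first separator id. A toy illustration (the ids are made up; 35998 stands in for the separator here):

```python
token_ids = [35993, 111, 222, 35998, 333, 444]  # 35998 plays the role of the sep_token id
prefix_len = token_ids.index(35998)
print(prefix_len * [1] + (len(token_ids) - prefix_len) * [0])  # [1, 1, 1, 0, 0, 0]
```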
def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
# On top of the SOT token inserted for text generation, GPTSAN inserts a SEP token for Prefix-LM:
# SOT at the beginning of the text, and SEP between the prefix part and the rest.
if add_sep_token is None:
# insert SEP unless the text already contains one at a non-prefix position
add_sep_token = self.sep_token not in text
# Start the prepared text with the BOS token (or an empty string if it is not in the vocab)
prepared = self.bos_token if self.bos_token in self.vocab else ""
# Append the prefix text, if any
prepared += prefix_text if prefix_text is not None else ""
# Append the SEP token if requested
if add_sep_token:
prepared += self.sep_token if self.sep_token in self.vocab else ""
# Finally append the text itself
prepared += text
# Return the prepared text together with the remaining keyword arguments
return (prepared, kwargs)
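So for a Prefix-LM call the string handed to the subword tokenizer is simply BOS + prefix_text + SEP + text. A small sketch of the assembled string (assuming the special tokens are in the vocab, as they are for the released checkpoint):

```python
bos, sep = "<|startoftext|>", "<|segmenter|>"
prefix_text, text = "吾輩は猫である。", "実は慶応大学出身"
print(bos + prefix_text + sep + text)
# <|startoftext|>吾輩は猫である。<|segmenter|>実は慶応大学出身
```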
# _batch_encode_plus encodes a batch of texts or text pairs
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
# This tokenizer converts an input text pair into a prefix input and a subsequent input
if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list):
# For text pairs, merge each pair into a single text: prefix + sep_token + text
batch_prefix_texts = []
for pref, txt in batch_text_or_text_pairs:
batch_prefix_texts.append(pref + self.sep_token + txt)
batch_text_or_text_pairs = batch_prefix_texts
# Delegate to the parent class's _batch_encode_plus with all arguments
return super()._batch_encode_plus(
batch_text_or_text_pairs,
add_special_tokens,
padding_strategy,
truncation_strategy,
max_length,
stride,
is_split_into_words,
pad_to_multiple_of,
return_tensors,
return_token_type_ids,
return_attention_mask,
return_overflowing_tokens,
return_special_tokens_mask,
return_offsets_mapping,
return_length,
verbose,
)
# SubWordJapaneseTokenizer performs the Japanese subword tokenization; it is based on GPTNeoXJapaneseTokenizer with the modifications listed below
class SubWordJapaneseTokenizer(object):
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
- Decoding byte0~byte255 tokens correctly
- Added bagofword token handling
https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
original repository.
MIT License
Copyright (c) 2020 tanreinama
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of
the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__init__
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab  # same as "swe" in the original repository
self.ids_to_tokens = ids_to_tokens  # same as "bpe" in the original repository
self.emoji = emoji  # emoji dictionary
self.maxlen = np.max([len(w) for w in self.vocab.keys()])  # length of the longest vocabulary entry
# Regular expressions used to match URLs, e-mails, phone numbers, dates and prices in the text
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
self.content_repatter4 = re.compile(
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter5 = re.compile(
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter6 = re.compile(
r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*"
)
keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})  # translation table mapping box-drawing and block characters to "<BLOCK>"
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__len__
def __len__(self):
return len(self.ids_to_tokens)  # the number of ids defines the length of the tokenizer
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.clean_text
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)  # replace URLs with "<URL>"
content = self.content_repatter2.sub("<EMAIL>", content)  # replace e-mail addresses with "<EMAIL>"
content = self.content_repatter3.sub("<TEL>", content)  # replace telephone numbers with "<TEL>"
content = self.content_repatter4.sub("<DATE>", content)  # replace dates with "<DATE>"
content = self.content_repatter5.sub("<DATE>", content)  # replace Japanese-era dates with "<DATE>"
content = self.content_repatter6.sub("<PRICE>", content)  # replace prices with "<PRICE>"
content = content.translate(self.content_trans1)  # map box-drawing/block characters to "<BLOCK>"
while "<BLOCK><BLOCK>" in content:
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")  # collapse consecutive "<BLOCK>" markers into one
return content
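A quick standalone check of the kind of substitution `clean_text` performs; the URL and e-mail patterns below are simplified re-creations of `content_repatter1`/`content_repatter2` for illustration, not the exact regexes above:

```python
import re

url_re = re.compile(r"(https?|ftp)(://[-_.!~*'()a-zA-Z0-9;/?:@&=+$,%#]+)")
mail_re = re.compile(r"[A-Za-z0-9._+]*@[-_0-9A-Za-z]+(\.[A-Za-z]+)*")

content = "詳細は https://example.com まで。連絡先: foo@example.com"
content = url_re.sub("<URL>", content)
content = mail_re.sub("<EMAIL>", content)
print(content)  # 詳細は <URL> まで。連絡先: <EMAIL>
```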
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.tokenize
def tokenize(self, text, clean=False):
# Replace half-width spaces with "<SP>"
text = text.replace(" ", "<SP>")
# Replace full-width spaces with "<SP>"
text = text.replace("　", "<SP>")
# Replace Windows line breaks "\r\n" with "<BR>"
text = text.replace("\r\n", "<BR>")
# Replace line feeds "\n" with "<BR>"
text = text.replace("\n", "<BR>")
# Replace carriage returns "\r" with "<BR>"
text = text.replace("\r", "<BR>")
# Replace tabs "\t" with "<TAB>"
text = text.replace("\t", "<TAB>")
# Normalize "—" to "ー"
text = text.replace("—", "ー")
# Normalize "−" to "ー"
text = text.replace("−", "ー")
# Replace every emoji found in the text with its mapped value from the emoji dictionary
for k, v in self.emoji["emoji"].items():
if k in text:
text = text.replace(k, v)
# Optionally run the clean_text substitutions defined above
if clean:
text = self.clean_text(text)
# Helper that checks whether a single character is a symbol in certain 2-byte UTF-8 ranges
def check_simbol(x):
e = x.encode()
# Only consider single characters whose UTF-8 encoding is 2 bytes long
if len(x) == 1 and len(e) == 2:
c = (int(e[0]) << 8) + int(e[1])
# Check whether the encoded value falls into the symbol ranges below
if (
(c >= 0xC2A1 and c <= 0xC2BF)
or (c >= 0xC780 and c <= 0xC783)
or (c >= 0xCAB9 and c <= 0xCBBF)
or (c >= 0xCC80 and c <= 0xCDA2)
):
return True
return False
# Helper that checks whether a single character falls into the U+2000-U+2BFF block (3-byte UTF-8)
def checku2e(x):
e = x.encode()
# Only consider single characters whose UTF-8 encoding is 3 bytes long
if len(x) == 1 and len(e) == 3:
c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
# Check whether the encoded value falls into the U+2000-U+2BFF range
if c >= 0xE28080 and c <= 0xE2B07F:
return True
return False
# Start scanning from the beginning of the text
pos = 0
# Collected output tokens
result = []
# Walk through the text until it is fully consumed
while pos < len(text):
# For "<" the search window extends to maxlen + 1 characters (special tokens); otherwise it is 3 characters
end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
# Candidate matches found in the window
candidates = []  # (token_id, token, pos)
# Try the longest substring first, shrinking towards the current position
for e in range(end, pos, -1):
# Substring from the current position to the candidate end
wd = text[pos:e]
# Only substrings that exist in the vocab are candidates
if wd in self.vocab:
# A special token like "<...>" is taken immediately as the only candidate
if wd[0] == "<" and len(wd) > 2:
candidates = [(self.vocab[wd], wd, e)]
break
else:
candidates.append((self.vocab[wd], wd, e))
# If at least one candidate was found
if len(candidates) > 0:
# pick the candidate with the smallest token id
_, wd, e = sorted(candidates, key=lambda x: x[0])[0]
# append it to the result
result.append(wd)
# and continue scanning right after it
pos = e
else:
# No vocab match: handle the single character at the current position
end = pos + 1
wd = text[pos:end]
# Symbol characters become "<KIGOU>"
if check_simbol(wd):
result.append("<KIGOU>")
# Characters in the U+2000-U+2BFF block become "<U2000U2BFF>"
elif checku2e(wd):
result.append("<U2000U2BFF>")
else:
# Anything else is emitted byte by byte as "<|byte%d|>" tokens
for i in wd.encode("utf-8"):
result.append("<|byte%d|>" % i)
# Advance past the character
pos = end
# Return the list of tokens
return result
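The loop above is a greedy longest-match search: every substring starting at `pos` (up to three characters, or up to `maxlen + 1` for `<`-prefixed special tokens) is looked up in the vocab, ties are broken by the smallest token id, and anything unmatched falls back to per-byte tokens. A miniature re-run of that idea on a toy vocab (not the real one):

```python
vocab = {"猫": 10, "猫で": 5, "で": 20}  # toy entries with made-up ids
text = "猫で🐯"
pos, result = 0, []
while pos < len(text):
    end = min(pos + 3, len(text))
    candidates = [(vocab[text[pos:e]], text[pos:e], e) for e in range(end, pos, -1) if text[pos:e] in vocab]
    if candidates:
        _, wd, e = sorted(candidates)[0]  # smallest token id wins
        result.append(wd)
        pos = e
    else:
        result.extend("<|byte%d|>" % b for b in text[pos].encode("utf-8"))  # byte fallback
        pos += 1
print(result)  # ['猫で', '<|byte240|>', '<|byte159|>', '<|byte144|>', '<|byte175|>']
```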
.\models\gptsan_japanese\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_gptsan_japanese": ["GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTSanJapaneseConfig"],
"tokenization_gptsan_japanese": ["GPTSanJapaneseTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gptsan_japanese"] = [
"GPTSAN_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTSanJapaneseForConditionalGeneration",
"GPTSanJapaneseModel",
"GPTSanJapanesePreTrainedModel",
]
_import_structure["tokenization_gptsan_japanese"] = [
"GPTSanJapaneseTokenizer",
]
if TYPE_CHECKING:
from .configuration_gptsan_japanese import GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTSanJapaneseConfig
from .tokenization_gptsan_japanese import GPTSanJapaneseTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gptsan_japanese import (
GPTSAN_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTSanJapaneseForConditionalGeneration,
GPTSanJapaneseModel,
GPTSanJapanesePreTrainedModel,
)
from .tokenization_gptsan_japanese import GPTSanJapaneseTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_bigcode\configuration_gpt_bigcode.py
"""
GPTBigCode configuration
This module contains the configuration for the GPTBigCode model, specifying how to instantiate and customize it.
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bigcode/gpt_bigcode-santacoder": "https://huggingface.co/bigcode/gpt_bigcode-santacoder/resolve/main/config.json",
}
class GPTBigCodeConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GPTBigCodeModel`]. It is used to instantiate a
GPTBigCode model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPTBigCode
[gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50257):
Vocabulary size of the GPT_BIGCODE model, defining the number of different tokens that can be represented.
n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large, e.g. 512, 1024 or 2048.
n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (`int`, *optional*, defaults to None):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd.
activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new", "gelu_pytorch_tanh"]`.
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
Whether to call the fused softmax in float32.
scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
Whether to scale the attention softmax in float32.
attention_type (`bool`, *optional*, defaults to `True`):
Whether to use Multi-Query Attention (`True`) or Multi-Head Attention (`False`).
Example:
```
>>> from transformers import GPTBigCodeConfig, GPTBigCodeModel
>>> # Initializing a GPTBigCode configuration
>>> configuration = GPTBigCodeConfig()
>>>
>>> # Initializing a model (with random weights) from the configuration
>>> model = GPTBigCodeModel(configuration)
>>>
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "gpt_bigcode"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_pytorch_tanh",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
attention_softmax_in_fp32=True,
scale_attention_softmax_in_fp32=True,
multi_query=True,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
self.multi_query = multi_query
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
.\models\gpt_bigcode\modeling_gpt_bigcode.py
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_2_2
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bigcode/gpt_bigcode-santacoder"
_CONFIG_FOR_DOC = "GPTBigCodeConfig"
GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bigcode/gpt_bigcode-santacoder",
]
@torch.jit.script
def upcast_masked_softmax(
x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
):
input_dtype = x.dtype
x = x.to(softmax_dtype) * scale
x = torch.where(mask, x, mask_value)
x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
return x
@torch.jit.script
def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
input_dtype = x.dtype
x = x.to(softmax_dtype) * scale
x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
return x
@torch.jit.script
def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
x = torch.where(mask, x, mask_value)
x = torch.nn.functional.softmax(x, dim=-1)
return x
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
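A toy check (requires `torch`, run in the context of this module) of what `_get_unpad_data` returns for a padded batch: the flat indices of the real tokens, the cumulative sequence lengths that flash-attn's varlen kernels expect, and the longest sequence length in the batch:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = _get_unpad_data(attention_mask)
print(indices)     # tensor([0, 1, 2, 4, 5])
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32)
print(max_len)     # 3
```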
class GPTBigCodeAttention(nn.Module):
def __init__(self, config, is_cross_attention=False, layer_idx=None):
super().__init__()
self.config = config
self.mask_value = None
self.multi_query = config.multi_query
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.kv_heads = 1 if self.multi_query else self.num_heads
self.kv_dim = self.kv_heads * self.head_dim
self.split_size = self.embed_dim
self.is_causal = True
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale_attn_weights = config.scale_attn_weights
self.is_cross_attention = is_cross_attention
self.layer_idx = layer_idx
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = (
config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
)
self.attn_pdrop = config.attn_pdrop
if self.is_cross_attention:
if self.multi_query:
raise NotImplementedError("Multi-Query Attention not supported for cross_attention")
self.c_attn = nn.Linear(self.embed_dim, 2 * self.embed_dim)
self.q_attn = nn.Linear(self.embed_dim, self.embed_dim)
else:
self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def _get_mask_value(self, device, dtype):
if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
return self.mask_value
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn") or not self.is_cross_attention:
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
)
query = self.q_attn(hidden_states)
key_value = self.c_attn(encoder_hidden_states)
attention_mask = encoder_attention_mask
elif self.multi_query:
query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
else:
query, key_value = (
self.c_attn(hidden_states)
.view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
.transpose(1, 2)
.split((self.head_dim, 2 * self.head_dim), dim=3)
)
if layer_past is not None:
key_value = torch.cat((layer_past, key_value), dim=-2)
present = key_value if use_cache else None
key, value = key_value.split((self.head_dim, self.head_dim), dim=-1)
attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask)
if not self.multi_query:
attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape)
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
if self.multi_query:
attn_weights = attn_weights.transpose(1, 2)
outputs += (attn_weights,)
return outputs
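A shape sketch (hypothetical sizes, standalone) of the multi-query split performed in `forward` above: a single `c_attn` projection yields full-width queries plus one shared key/value head, which is what keeps this model's KV cache small:

```python
import torch

batch, seq, embed_dim, num_heads = 2, 5, 768, 12
head_dim = embed_dim // num_heads      # 64
kv_dim = 1 * head_dim                  # multi_query -> a single KV head
c_attn = torch.nn.Linear(embed_dim, embed_dim + 2 * kv_dim)

hidden_states = torch.randn(batch, seq, embed_dim)
query, key_value = c_attn(hidden_states).split((embed_dim, 2 * kv_dim), dim=2)
print(query.shape, key_value.shape)  # torch.Size([2, 5, 768]) torch.Size([2, 5, 128])
```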
class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
"""
GPTBigCode flash attention module. This module inherits from `GPTBigCodeAttention` as the weights of the module
stays untouched. The only required change would be on the forward pass where it needs to correctly call the public
API of flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class GPTBigCodeSdpaAttention(GPTBigCodeAttention):
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
pass
class GPTBigCodeMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = nn.Linear(embed_dim, intermediate_size)
self.c_proj = nn.Linear(intermediate_size, embed_dim)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
GPTBIGCODE_ATTENTION_CLASSES = {
"eager": GPTBigCodeAttention,
"flash_attention_2": GPTBigCodeFlashAttention2,
"sdpa": GPTBigCodeSdpaAttention,
}
class GPTBigCodeBlock(nn.Module):
def __init__(self, config, layer_idx=None):
super().__init__()
hidden_size = config.hidden_size
self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
if config.add_cross_attention:
if config.multi_query:
raise NotImplementedError("Cross-attention not implemented for MQA")
self.crossattention = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](
config, is_cross_attention=True, layer_idx=layer_idx
)
self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPTBigCodeMLP(self.inner_dim, config)
def forward(
self,
hidden_states: Optional[Tuple[torch.Tensor]],
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states
hidden_states = self.ln_cross_attn(hidden_states)
cross_attn_outputs = self.crossattention(
hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
attn_output = cross_attn_outputs[0]
hidden_states = residual + attn_output
outputs = outputs + cross_attn_outputs[2:]
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
class GPTBigCodePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTBigCodeConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTBigCodeBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (GPTBigCodeMLP, GPTBigCodeAttention)):
module.c_proj.weight.data.normal_(
mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
)
module.c_proj._is_hf_initialized = True
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GPT_BIGCODE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Parameters:
config ([`GPTBigCodeConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_BIGCODE_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT_BIGCODE Model transformer outputting raw hidden-states without any specific head on top.",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeModel(GPTBigCodePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.multi_query = config.multi_query  # whether multi-query attention is used
self.embed_dim = config.hidden_size  # embedding dimension
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)  # token embedding layer
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)  # position embedding layer
self.drop = nn.Dropout(config.embd_pdrop)  # embedding dropout
self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])  # stack of GPTBigCodeBlock layers
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)  # final layer normalization
max_positions = config.max_position_embeddings
self.register_buffer(
"bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
)  # lower-triangular boolean causal mask registered as a non-persistent buffer
self.gradient_checkpointing = False  # gradient checkpointing is off by default
self._use_sdpa = config._attn_implementation == "sdpa"  # whether the SDPA attention implementation is selected
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"  # whether FlashAttention-2 is selected
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.wte # 返回输入嵌入层对象
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings # 设置新的输入嵌入层对象
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# The detailed forward implementation is omitted here
pass
@add_start_docstrings(
"""
The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]  # weights tied to the input embeddings
def __init__(self, config):
# Initialize the parent class with the configuration
super().__init__(config)
# Backbone GPTBigCodeModel built from the configuration
self.transformer = GPTBigCodeModel(config)
# Linear layer producing the language-modeling logits
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Return the language modeling head (the linear layer)
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# Replace the output embeddings
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
# If past_key_values are provided, drop the tokens already covered by the cache
if past_key_values:
if self.config.multi_query:
past_length = past_key_values[0].shape[1]
else:
past_length = past_key_values[0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
# Keep only the tokens that still need to be processed
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
# If an attention_mask is given but position_ids are not, create position_ids on the fly for batched generation
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
# If inputs_embeds are passed, only use them for the first generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Assemble the remaining model inputs
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
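The cumulative-sum trick above derives position ids that ignore left padding. A standalone illustration (requires `torch`):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1], [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```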
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass of the causal language model, used for both inference and training
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token ids
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,  # cached key/value states from previous steps
attention_mask: Optional[torch.Tensor] = None,  # mask marking which positions should be ignored
token_type_ids: Optional[torch.Tensor] = None,  # token type ids, e.g. to distinguish sentence A from sentence B
position_ids: Optional[torch.Tensor] = None,  # position ids for each token
head_mask: Optional[torch.Tensor] = None,  # mask selecting which attention heads to disable
inputs_embeds: Optional[torch.Tensor] = None,  # precomputed embeddings used instead of input_ids
encoder_hidden_states: Optional[torch.Tensor] = None,  # encoder hidden states for encoder-decoder setups
encoder_attention_mask: Optional[torch.Tensor] = None,  # attention mask for the encoder states
labels: Optional[torch.Tensor] = None,  # target labels used to compute the loss
use_cache: Optional[bool] = None,  # whether to cache key/value states to speed up decoding
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# Use the passed return_dict if given, otherwise fall back to self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer backbone on the inputs
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states produced by the transformer
hidden_states = transformer_outputs[0]
# Project the hidden states to vocabulary logits with lm_head
lm_logits = self.lm_head(hidden_states)
# Loss defaults to None
loss = None
# Compute the loss if labels are provided
if labels is not None:
# Shift so that logits at position i predict the token at position i + 1
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
# Flatten predictions and labels and compute the cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# If return_dict is False, return a plain tuple
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a CausalLMOutputWithCrossAttentions object
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
cross_attentions=transformer_outputs.cross_attentions,
)
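A minimal illustration (requires `torch`) of the label shifting used in the loss above: position `i` of the logits is trained to predict token `i + 1`, so the last logit and the first label are dropped before the cross entropy:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 7
lm_logits = torch.randn(1, 4, vocab_size)  # (batch, seq, vocab)
labels = torch.tensor([[3, 5, 1, 6]])      # labels == input_ids

shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions for positions 0..2
shift_labels = labels[..., 1:].contiguous()         # targets are tokens 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss)  # a scalar tensor
```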
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
# Reorder the past_key_values cache according to beam_idx so each generation step sees the right beams
return tuple(layer_past.index_select(0, beam_idx.to(layer_past.device)) for layer_past in past_key_values)
"""
The GPTBigCode Model transformer with a sequence classification head on top (linear layer).
[`GPTBigCodeForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
GPT_BIGCODE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTBigCodeModel(config)
# Determine the dropout rate for the classifier based on the configuration
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
# Define the linear classifier layer for token classification
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the config default when return_dict is not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer backbone with all arguments
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states returned by the transformer
hidden_states = transformer_outputs[0]
# Apply dropout to the hidden states
hidden_states = self.dropout(hidden_states)
# Feed the hidden states through the classifier to get per-token logits
logits = self.classifier(hidden_states)
# Loss defaults to None
loss = None
# Compute the loss if labels are provided
if labels is not None:
# Cross-entropy loss for token classification
loss_fct = CrossEntropyLoss()
# Flatten logits and labels before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1).to(logits.device))
# If a plain tuple is requested, assemble and return it
if not return_dict:
output = (logits,) + transformer_outputs[2:]  # prepend the logits to the remaining outputs
return ((loss,) + output) if loss is not None else output
# Otherwise return a TokenClassifierOutput with loss, logits, hidden states and attentions
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
.\models\gpt_bigcode\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_bigcode"] = [
"GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTBigCodeForSequenceClassification",
"GPTBigCodeForTokenClassification",
"GPTBigCodeForCausalLM",
"GPTBigCodeModel",
"GPTBigCodePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_bigcode import (
GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTBigCodeForCausalLM,
GPTBigCodeForSequenceClassification,
GPTBigCodeForTokenClassification,
GPTBigCodeModel,
GPTBigCodePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_neo\configuration_gpt_neo.py
class GPTNeoConfig(PretrainedConfig):
model_type = "gpt_neo"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=50257,
max_position_embeddings=2048,
hidden_size=2048,
num_layers=24,
attention_types=[[["global", "local"], 12]],
num_heads=16,
intermediate_size=None,
window_size=256,
activation_function="gelu_new",
resid_dropout=0.0,
embed_dropout=0.0,
attention_dropout=0.0,
classifier_dropout=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_heads = num_heads
self.intermediate_size = intermediate_size
self.window_size = window_size
self.activation_function = activation_function
self.resid_dropout = resid_dropout
self.embed_dropout = embed_dropout
self.attention_dropout = attention_dropout
self.classifier_dropout = classifier_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.attention_types = attention_types
self.attention_layers = self.expand_attention_types_params(attention_types)
if len(self.attention_layers) != self.num_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.attention_layers)` == `config.num_layers` "
f"but is `len(config.attention_layers) = {len(self.attention_layers)}`, "
f"`config.num_layers = {self.num_layers}`. "
"`config.attention_layers` is prepared using `config.attention_types`. "
"Please verify the value of `config.attention_types` argument."
)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@staticmethod
def expand_attention_types_params(attention_types):
attentions = []
for item in attention_types:
for _ in range(item[1]):
attentions.extend(item[0])
return attentions
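The interaction between `attention_types` and `attention_layers` is easiest to see by calling the static helper directly. With the default `[[["global", "local"], 12]]`, the pattern `("global", "local")` is repeated 12 times, giving exactly one entry per layer for the 24 default layers:

```python
from transformers import GPTNeoConfig

layers = GPTNeoConfig.expand_attention_types_params([[["global", "local"], 12]])
print(len(layers))    # 24 == default num_layers
print(layers[:4])     # ['global', 'local', 'global', 'local']
```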
def custom_unfold(input, dimension, size, step):
"""Custom torch.Tensor.unfold implementation to enable the export to ONNX."""
import torch
shape = input.size()
rank = len(shape)
sizedim = shape[dimension]
low_indices = torch.arange(0, sizedim, step)
min_length = torch.div(sizedim - size, step, rounding_mode="floor") + 1
indices = torch.arange(size) + low_indices[:min_length][:, None]
s = [slice(None)] * rank
s[dimension] = indices
sliced = input[s]
perm = list(range(0, rank + 1))
perm.append(perm.pop(dimension + 1))
return sliced.permute(perm)
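For reference, this is the behavior that `custom_unfold` reproduces, shown here with the built-in `torch.Tensor.unfold` (the custom version exists only because the built-in op cannot be exported to ONNX): sliding windows of length `size` taken every `step` positions along the chosen dimension.

```python
import torch

x = torch.arange(12).reshape(2, 6)
windows = x.unfold(1, 3, 2)        # dimension=1, size=3, step=2
print(windows.shape)               # torch.Size([2, 2, 3]): two windows of length 3 per row
print(windows[0])                  # tensor([[0, 1, 2], [2, 3, 4]])
```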
def custom_get_block_length_and_num_blocks(seq_length, window_size):
"""
Custom implementation for GPTNeoAttentionMixin._get_block_length_and_num_blocks to enable the export to ONNX as
original implementation uses Python variables and control flow.
"""
import torch
candidates = torch.arange(1, window_size)
remainders = torch.remainder(seq_length, candidates)
divisor_indices = remainders == 0
divisors = candidates[divisor_indices]
largest_divisor = torch.max(divisors)
return largest_divisor, torch.div(seq_length, largest_divisor, rounding_mode="floor")
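A quick sanity check of the helper (passing `seq_length` as a tensor so every `torch` op receives tensor arguments): the block length is the largest divisor of the sequence length that is smaller than the window size, and the number of blocks is `seq_length // block_length`. The import path below follows the file shown above.

```python
import torch
from transformers.models.gpt_neo.configuration_gpt_neo import custom_get_block_length_and_num_blocks

block_length, num_blocks = custom_get_block_length_and_num_blocks(torch.tensor(12), window_size=5)
print(block_length, num_blocks)    # tensor(4) tensor(3): 12 = 4 * 3 and 4 < 5
```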
class GPTNeoOnnxConfig(OnnxConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_attention_heads(self) -> int:
return self._config.num_heads
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
past_shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
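A standalone sketch (plain PyTorch, hypothetical sizes) of the dummy `past_key_values` layout produced by `generate_dummy_inputs` above: one `(key, value)` pair per layer, each of shape `(batch, num_heads, past_sequence_length, head_dim)`, with the attention mask extended to also cover the past positions.

```python
import torch

batch, seq_len = 2, 8
num_heads, hidden_size, num_layers = 16, 2048, 24            # GPT-Neo 1.3B-like sizes, for illustration
past_len = seq_len + 2                                        # same "+ 2" offset as generate_dummy_inputs
past_shape = (batch, num_heads, past_len, hidden_size // num_heads)
past_key_values = [(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_layers)]

attention_mask = torch.ones(batch, seq_len, dtype=torch.int64)
attention_mask = torch.cat([attention_mask, torch.ones(batch, past_len, dtype=torch.int64)], dim=1)
print(past_key_values[0][0].shape, attention_mask.shape)      # torch.Size([2, 16, 10, 128]) torch.Size([2, 18])
```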
.\models\gpt_neo\convert_gpt_neo_mesh_tf_to_pytorch.py
import argparse
import json
from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config_json = json.load(open(config_file, "r"))
config = GPTNeoConfig(
hidden_size=config_json["n_embd"],
num_layers=config_json["n_layer"],
num_heads=config_json["n_head"],
attention_types=config_json["attention_types"],
max_position_embeddings=config_json["n_positions"],
resid_dropout=config_json["res_dropout"],
embed_dropout=config_json["embed_dropout"],
attention_dropout=config_json["attn_dropout"],
)
print(f"Building PyTorch model from configuration: {config}")
model = GPTNeoForCausalLM(config)
load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained mesh-tf model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
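The script is normally invoked from the command line with the three required arguments; equivalently, the function defined above can be called directly. The paths below are placeholders, not real files:

```python
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="/path/to/mesh_tf_checkpoint",     # hypothetical Mesh-TF checkpoint
    config_file="/path/to/gpt_neo_config.json",           # hypothetical mesh-tf config json
    pytorch_dump_path="/path/to/output_dir",              # hypothetical output directory
)
```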
.\models\gpt_neo\modeling_flax_gpt_neo.py
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_gpt_neo import GPTNeoConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GPTNeoConfig"
_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
GPT_NEO_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
    Parameters:
        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
            inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
            **Note that this only specifies the dtype of the computation and does not influence the dtype of the
            model parameters.** If you wish to change the dtype of the model parameters, see
            [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
输入序列标记的索引数组,形状为 `(batch_size, input_ids_length)`。
可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 的详细说明。
[什么是输入 ID?](../glossary
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
注意力遮罩,用于避免对填充标记索引进行注意力计算。遮罩值在 `[0, 1]` 范围内:
- 对于不被遮罩的标记,值为 1,
- 对于被遮罩的标记,值为 0。
[什么是注意力遮罩?](../glossary
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
输入序列标记在位置嵌入中的位置索引数组。取值范围是 `[0, config.max_position_embeddings - 1]`。
past_key_values (`Dict[str, np.ndarray]`, *optional*, 由 `init_cache` 返回或传入先前的 `past_key_values`):
预先计算的隐藏状态字典(用于注意力模块中的键和值)。预计算的键和值的形状为 `[batch_size, max_length]`。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。更多细节参见返回张量中的 `attentions` 字段。
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态。更多细节参见返回张量中的 `hidden_states` 字段。
return_dict (`bool`, *optional*):
是否返回 `~utils.ModelOutput` 而不是普通的元组。
"""
class FlaxGPTNeoSelfAttention(nn.Module):
# FlaxGPTNeoSelfAttention 类,继承自 nn.Module
config: GPTNeoConfig
# GPTNeoConfig 类型的 config 属性
attention_type: str
# 注意力类型字符串属性
dtype: jnp.dtype = jnp.float32
# 数据类型,默认为 jnp.float32
def setup(self):
# 从配置中获取参数
config = self.config
# 设置嵌入维度为隐藏大小
self.embed_dim = config.hidden_size
# 设置注意力头的数量
self.num_heads = config.num_attention_heads
# 计算每个注意力头的维度
self.head_dim = self.embed_dim // self.num_heads
# 检查 embed_dim 是否能被 num_heads 整除
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and "
f"`num_heads`: {self.num_heads})."
)
# 初始化注意力和残差的 dropout 层
self.attn_dropout = nn.Dropout(config.attention_dropout)
self.resid_dropout = nn.Dropout(config.resid_dropout)
# 部分应用带有特定初始化的 Dense 层函数
dense = partial(
nn.Dense,
self.embed_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 初始化 query、key、value 的投影层
self.q_proj, self.k_proj, self.v_proj = dense(use_bias=False), dense(use_bias=False), dense(use_bias=False)
# 初始化输出投影层
self.out_proj = dense()
# 创建因果遮蔽掩码
self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
# 如果注意力类型为局部注意力,则修改因果遮蔽掩码
if self.attention_type == "local":
self.causal_mask = self.causal_mask ^ jnp.tril(self.causal_mask, -config.window_size)
# 将隐藏状态按头分割
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# 将分割的头合并为隐藏状态
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# 使用 nn.compact 装饰器定义紧凑模块
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slighly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# 检测是否正在初始化,通过检查缓存数据是否存在来判断
is_initialized = self.has_variable("cache", "cached_key")
# 获取或创建缓存的键(key)和值(value)变量,如果不存在则初始化为全零数组
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# 获取或创建缓存的索引(index)变量,如果不存在则初始化为整数 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# 获取缓存键(key)的形状,并提取批次维度、最大长度、头数、每头深度
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# 使用新的一维空间切片更新键(key)和值(value)的缓存
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# 更新缓存中的键(key)和值(value)
cached_key.value = key
cached_value.value = value
# 计算更新的缓存向量数,并更新缓存索引(index)
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# 用于缓存解码器自注意力的因果掩码:我们的单个查询位置应该仅注意到已生成并缓存的键位置,而不是剩余的零元素。
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# 组合并更新注意力掩码(mask)
attention_mask = combine_masks(pad_mask, attention_mask)
# 返回更新后的键(key)、值(value)和注意力掩码(mask)
return key, value, attention_mask
    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
# 计算查询向量,乘以 sqrt(head_dim),并转换为指定数据类型
query = self.q_proj(hidden_states) * jnp.sqrt(self.head_dim).astype(self.dtype)
# 计算键向量
key = self.k_proj(hidden_states)
# 计算值向量
value = self.v_proj(hidden_states)
# 将查询向量拆分成多个头部
query = self._split_heads(query)
# 将键向量拆分成多个头部
key = self._split_heads(key)
# 将值向量拆分成多个头部
value = self._split_heads(value)
# 获取查询向量和键向量的长度
query_length, key_length = query.shape[1], key.shape[1]
# 如果存在缓存的键,则创建一个因果遮罩
if self.has_variable("cache", "cached_key"):
mask_shift = self.variables["cache"]["cache_index"]
max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
# 从因果遮罩中动态切片出部分用于当前查询和键的长度
causal_mask = lax.dynamic_slice(
self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
)
else:
# 否则使用整个因果遮罩
causal_mask = self.causal_mask[:, :, :query_length, :key_length]
# 获取批次大小
batch_size = hidden_states.shape[0]
# 将因果遮罩广播到与注意力头部匹配的形状
causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
# 将注意力遮罩广播到与因果遮罩相同的形状
attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
# 合并注意力遮罩和因果遮罩
attention_mask = combine_masks(attention_mask, causal_mask)
dropout_rng = None
if not deterministic and self.config.attention_dropout > 0.0:
# 如果不是确定性的且注意力 dropout 大于 0,则创建一个随机数生成器用于 dropout
dropout_rng = self.make_rng("dropout")
# 在快速自回归解码期间,我们逐步输入一个位置,并逐步缓存键和值
if self.has_variable("cache", "cached_key") or init_cache:
# 如果存在缓存的键或者需要初始化缓存,则将键、值和注意力遮罩连接到缓存中
key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
# 将布尔类型的注意力遮罩转换为浮点数类型的注意力偏置
attention_bias = lax.select(
attention_mask > 0,
jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
)
# 常规的点积注意力计算
attn_weights = dot_product_attention_weights(
query,
key,
bias=attention_bias,
dropout_rng=dropout_rng,
dropout_rate=self.config.attention_dropout,
deterministic=deterministic,
dtype=self.dtype,
precision=None,
)
# 使用 einsum 执行加权求和得到注意力输出
attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
# 合并多头注意力的输出
attn_output = self._merge_heads(attn_output)
# 对输出应用输出投影
attn_output = self.out_proj(attn_output)
# 应用残差连接中的 dropout
attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
# 根据需要返回注意力输出和注意力权重
outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
return outputs
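The "local" variant of the causal mask built in `setup()` is easiest to understand on a tiny example: start from the full lower-triangular mask and XOR away everything more than `window_size` positions in the past, leaving a banded mask. Sizes below are chosen purely for display.

```python
import jax.numpy as jnp
from flax.linen import make_causal_mask

window_size = 2
causal_mask = make_causal_mask(jnp.ones((1, 6), dtype="bool"), dtype="bool")   # shape (1, 1, 6, 6)
local_mask = causal_mask ^ jnp.tril(causal_mask, -window_size)
print(local_mask[0, 0].astype(int))
# [[1 0 0 0 0 0]
#  [1 1 0 0 0 0]
#  [0 1 1 0 0 0]
#  [0 0 1 1 0 0]
#  [0 0 0 1 1 0]
#  [0 0 0 0 1 1]]
```

Each position attends to itself and to at most `window_size - 1` previous positions.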
class FlaxGPTNeoAttention(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
layer_id: int = 0 # 类变量,表示当前层的索引,默认为0
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
attention_type = self.config.attention_layers[self.layer_id]
self.attention = FlaxGPTNeoSelfAttention(self.config, attention_type, dtype=self.dtype)
# 根据配置和注意力类型创建自注意力层对象
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
return self.attention(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 调用自注意力层对象处理输入隐藏状态,并返回处理后的结果
class FlaxGPTNeoMLP(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
intermediate_size: int # 类变量,表示中间隐藏层的大小
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
embed_dim = self.config.hidden_size
kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
self.c_fc = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
# 创建全连接层,用于变换隐藏状态的维度
self.c_proj = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)
# 创建全连接层,将变换后的隐藏状态映射回原始维度
self.act = ACT2FN[self.config.activation_function]
# 根据配置选择激活函数
self.dropout = nn.Dropout(rate=self.config.resid_dropout)
# 创建Dropout层,用于随机置零输入张量的元素,以防止过拟合
def __call__(self, hidden_states, deterministic: bool = True):
hidden_states = self.c_fc(hidden_states)
# 使用全连接层变换隐藏状态
hidden_states = self.act(hidden_states)
# 应用激活函数
hidden_states = self.c_proj(hidden_states)
# 使用全连接层将变换后的隐藏状态映射回原始维度
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 应用Dropout层
return hidden_states
# 返回处理后的隐藏状态
class FlaxGPTNeoBlock(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
layer_id: int = 0 # 类变量,表示当前层的索引,默认为0
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
hidden_size = self.config.hidden_size
inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
# 根据配置确定内部维度大小
self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 创建LayerNorm层,用于归一化隐藏状态
self.attn = FlaxGPTNeoAttention(self.config, layer_id=self.layer_id, dtype=self.dtype)
# 创建注意力层对象
self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 创建LayerNorm层,用于归一化注意力输出
self.mlp = FlaxGPTNeoMLP(self.config, inner_dim, dtype=self.dtype)
# 创建MLP对象,用于处理注意力输出
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
# 通过注意力层和MLP层处理输入
# 保存原始的隐藏状态,用于残差连接
residual = hidden_states
# LayerNormalization层,对隐藏状态进行归一化处理
hidden_states = self.ln_1(hidden_states)
# 注意力机制,处理隐藏状态并返回输出
outputs = self.attn(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 从注意力输出中获取注意力层的输出
attn_output = outputs[0]
# 执行残差连接,更新隐藏状态
hidden_states = attn_output + residual
# 保存当前隐藏状态作为残差
residual = hidden_states
# LayerNormalization层,再次对隐藏状态进行归一化处理
hidden_states = self.ln_2(hidden_states)
# 多层感知机(MLP),对隐藏状态进行前馈神经网络处理
feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
# 执行残差连接,更新隐藏状态
hidden_states = residual + feed_forward_hidden_states
# 返回更新后的隐藏状态以及可能的额外输出
return (hidden_states,) + outputs[1:]
# FlaxGPTNeoPreTrainedModel 类定义,继承自 FlaxPreTrainedModel,用于处理权重初始化以及预训练模型下载和加载的抽象类
class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 指定配置类为 GPTNeoConfig
config_class = GPTNeoConfig
# 指定基础模型前缀为 "transformer"
base_model_prefix = "transformer"
# 模块类变量初始化为 None
module_class: nn.Module = None
def __init__(
self,
config: GPTNeoConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用给定的配置和参数初始化模块对象
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类构造函数初始化模型
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
attention_mask = jnp.ones_like(input_ids)
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# 划分随机数生成器
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 使用初始化的参数生成随机参数
random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
# 如果存在预定义的参数,将随机参数与预定义参数合并
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length))
attention_mask = jnp.ones_like(input_ids)
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用初始化的变量生成缓存
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
return unfreeze(init_variables["cache"])
# 添加模型输入文档字符串到模型前向方法
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
def __call__(
self,
input_ids,
attention_mask=None,
position_ids=None,
params: dict = None,
past_key_values: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 设置输出注意力权重的选项,如果未指定,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态的选项,如果未指定,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置返回字典的选项,如果未指定,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 获取输入张量的批次大小和序列长度
batch_size, sequence_length = input_ids.shape
# 如果未提供位置编码,则根据序列长度和批次大小创建默认位置编码
if position_ids is None:
if past_key_values is not None:
raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 如果未提供注意力掩码,则创建一个全为1的默认注意力掩码
if attention_mask is None:
attention_mask = jnp.ones((batch_size, sequence_length))
# 处理任何需要的伪随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
inputs = {"params": params or self.params}
# 如果传递了过去的键值,则初始化缓存,并确保缓存是可变的
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 调用模块的应用方法来处理输入,并传递必要的参数和选项
outputs = self.module.apply(
inputs,
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(position_ids, dtype="i4"),
not train,
False,
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
mutable=mutable,
)
# 如果传递了过去的键值并且需要返回字典,则将更新后的缓存添加到模型输出中
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# 如果传递了过去的键值但不需要返回字典,则更新缓存后将其添加到模型输出中
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# 返回最终的模型输出
return outputs
class FlaxGPTNeoBlockCollection(nn.Module):
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 初始化方法,设置模块内的各个子块
def setup(self):
# 创建一个由多个 FlaxGPTNeoBlock 实例组成的列表,每个块对应模型的一个隐藏层
self.blocks = [
FlaxGPTNeoBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
# 调用方法,接受输入并依次经过每个块的处理
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果需要输出注意力矩阵,则初始化空的元组用于存储每个块的注意力矩阵
all_attentions = () if output_attentions else None
# 如果需要输出隐藏状态,则初始化空的元组用于存储每个块的隐藏状态
all_hidden_states = () if output_hidden_states else None
# 对每个块进行迭代处理
for block in self.blocks:
# 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前块的处理方法,得到该块的输出
layer_outputs = block(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 更新当前隐藏状态为当前块的输出的第一个元素(通常是下一层的隐藏状态)
hidden_states = layer_outputs[0]
# 如果需要输出注意力矩阵,则将当前块的注意力矩阵添加到 all_attentions 中
if output_attentions:
all_attentions += (layer_outputs[1],)
# 组装最终输出元组,包含最终的隐藏状态、所有块的隐藏状态序列和所有块的注意力矩阵序列
outputs = (hidden_states, all_hidden_states, all_attentions)
return outputs
class FlaxGPTNeoModule(nn.Module):
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 初始化方法,设置模块内的各个子模块和变量
def setup(self):
# 设置词嵌入维度
self.embed_dim = self.config.hidden_size
# 使用正态分布初始化词嵌入矩阵
embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
# 创建词嵌入层
self.wte = nn.Embed(
self.config.vocab_size,
self.embed_dim,
embedding_init=embedding_init,
)
# 创建位置嵌入层
self.wpe = nn.Embed(
self.config.max_position_embeddings,
self.embed_dim,
embedding_init=embedding_init,
)
# 创建 Dropout 层,用于随机断开输入的连接,防止过拟合
self.dropout = nn.Dropout(rate=self.config.embed_dropout)
# 创建 FlaxGPTNeoBlockCollection 实例,用于处理模型的多层块
self.h = FlaxGPTNeoBlockCollection(self.config, dtype=self.dtype)
# 创建 Layer Normalization 层,用于对最后的隐藏状态进行归一化处理
self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 调用方法,接受输入并依次经过模型内的各个子模块处理
def __call__(
self,
input_ids,
attention_mask,
position_ids,
deterministic=True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 将输入的词嵌入转换为特定数据类型的张量,并传递给词嵌入层处理
input_embeds = self.wte(input_ids.astype("i4"))
# 将位置编码转换为特定数据类型的张量,并传递给位置编码层处理
position_embeds = self.wpe(position_ids.astype("i4"))
# 将输入的词嵌入张量和位置编码张量相加得到隐藏状态张量
hidden_states = input_embeds + position_embeds
# 对隐藏状态张量应用dropout操作,用于模型训练中的随机失活
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 调用Transformer模型中的H层进行前向传播计算
outputs = self.h(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取H层输出的第一个张量,即隐藏状态张量
hidden_states = outputs[0]
# 对隐藏状态张量应用LN_F层,进行Layer Normalization处理
hidden_states = self.ln_f(hidden_states)
# 如果需要输出所有隐藏状态,则将当前隐藏状态张量添加到所有隐藏状态的列表中
if output_hidden_states:
all_hidden_states = outputs[1] + (hidden_states,)
# 更新输出元组,将所有隐藏状态列表添加到输出元组中
outputs = (hidden_states, all_hidden_states) + outputs[2:]
else:
# 更新输出元组,将当前隐藏状态张量添加到输出元组中
outputs = (hidden_states,) + outputs[1:]
# 如果不需要返回字典类型结果,则返回所有非空元素的元组
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 返回模型输出结果的FlaxBaseModelOutput对象,包括最终隐藏状态、所有隐藏状态和注意力分数
return FlaxBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=outputs[1],
attentions=outputs[-1],
)
# 添加起始文档字符串,描述该类是一个不带特定输出头部的原始隐藏状态的 GPTNeo 模型转换器。
@add_start_docstrings(
"The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.",
GPT_NEO_START_DOCSTRING,
)
class FlaxGPTNeoModel(FlaxGPTNeoPreTrainedModel):
# 模块类属性指定为 FlaxGPTNeoModule
module_class = FlaxGPTNeoModule
# 添加调用样本文档字符串的方法,指定 FlaxGPTNeoModel 的一些文档化信息
append_call_sample_docstring(FlaxGPTNeoModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
# 定义用于有因果语言建模的 GPTNeo 模型的模块
class FlaxGPTNeoForCausalLMModule(nn.Module):
# 使用 GPTNeoConfig 作为配置,并指定默认数据类型为 jnp.float32
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 设置方法,在模块初始化时调用,初始化 transformer 和 lm_head
def setup(self):
self.transformer = FlaxGPTNeoModule(self.config, dtype=self.dtype)
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
# 使用正态分布初始化器初始化权重
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 调用方法,实现模块的前向传播
def __call__(
self,
input_ids,
attention_mask,
position_ids,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 使用 transformer 模型进行前向传播
outputs = self.transformer(
input_ids,
attention_mask,
position_ids,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# 如果配置要求词嵌入权重共享,则共享 wte 参数的嵌入权重,并应用到 lm_head
if self.config.tie_word_embeddings:
shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
else:
# 否则直接将隐藏状态传递给 lm_head
lm_logits = self.lm_head(hidden_states)
# 如果不返回字典,则返回元组形式的结果
if not return_dict:
return (lm_logits,) + outputs[1:]
# 返回具有自定义输出的 FlaxCausalLMOutput 对象
return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
# 添加起始文档字符串,描述带有语言建模头部的 GPTNeo 模型转换器
@add_start_docstrings(
"""
The GPTNeo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_NEO_START_DOCSTRING,
)
class FlaxGPTNeoForCausalLM(FlaxGPTNeoPreTrainedModel):
module_class = FlaxGPTNeoForCausalLMModule
# 为生成准备输入数据,接受输入的token ids、最大长度和可选的注意力掩码
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# 初始化缓存
batch_size, seq_length = input_ids.shape
# 使用输入的batch_size和max_length初始化缓存
past_key_values = self.init_cache(batch_size, max_length)
# 注意:通常需要在attention_mask的超出input_ids.shape[-1]和小于cache_length的位置放置0。
# 但由于GPTNeo使用因果掩码,这些位置已经被掩盖了。
# 因此,我们可以在这里创建一个静态的attention_mask,对编译更有效。
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# 如果提供了attention_mask,则根据它计算position_ids
if attention_mask is not None:
position_ids = attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# 否则,使用默认的位置ids:从0到seq_length的广播
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# 返回一个包含past_key_values、extended_attention_mask和position_ids的字典
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# 更新生成时的输入数据,接受模型输出和模型参数字典
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# 将模型输出中的past_key_values更新到模型参数字典中
model_kwargs["past_key_values"] = model_outputs.past_key_values
# 更新position_ids,只保留最后一个位置并加1
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
# 返回更新后的模型参数字典
return model_kwargs
# 导入自动生成文档字符串所需的示例函数的模块,并添加到 FlaxGPTNeoForCausalLM 类的文档中
# _CHECKPOINT_FOR_DOC 是用于文档的检查点
# FlaxCausalLMOutput 是 FlaxGPTNeoForCausalLM 的输出类
# _CONFIG_FOR_DOC 是 FlaxGPTNeoForCausalLM 的配置类
append_call_sample_docstring(FlaxGPTNeoForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
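An end-to-end usage sketch for the Flax causal-LM head, assuming Flax weights are published for the `EleutherAI/gpt-neo-125M` checkpoint (if only PyTorch weights exist, `from_pt=True` can convert them on the fly). This downloads the model, so it is illustrative rather than something this file runs:

```python
from transformers import AutoTokenizer, FlaxGPTNeoForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

inputs = tokenizer("GPT-Neo is", return_tensors="np")
outputs = model.generate(inputs["input_ids"], max_length=20, do_sample=False)
print(tokenizer.decode(outputs.sequences[0], skip_special_tokens=True))
```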
.\models\gpt_neo\modeling_gpt_neo.py
import os
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
is_torch_fx_available,
logging,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
if is_torch_fx_available():
if not is_torch_greater_or_equal_than_1_13:
import torch.fx
_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GPTNeoConfig"
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"EleutherAI/gpt-neo-1.3B",
]
_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
    )
def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path):
try:
import re
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt_neo_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
if "global_step" not in name and "adam" not in name:
array = tf.train.load_variable(tf_path, name)
array = tf.dtypes.cast(array.squeeze(), tf.float32).numpy()
name = name.replace("attn/q", "attn/attention/q_proj/w")
name = name.replace("attn/k", "attn/attention/k_proj/w")
name = name.replace("attn/v", "attn/attention/v_proj/w")
name = name.replace("attn/o", "attn/attention/out_proj/w")
name = name.replace("norm_1", "ln_1")
name = name.replace("norm_2", "ln_2")
name = name.replace("attn/compute_output_bias/o_b", "attn/attention/out_proj/b")
name = name.replace("conv1d_main/c_fc/kernel", "c_fc/w")
name = name.replace("conv1d_main/c_fc/bias", "c_fc/b")
name = name.replace("conv1d_main/c_proj/kernel", "c_proj/w")
name = name.replace("conv1d_main/c_proj/bias", "c_proj/b")
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name[5:]
name = name.split("/")
pointer = model.transformer
for m_name in name:
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if name[-1] == "w" and name[-2] in ["out_proj", "k_proj", "q_proj", "v_proj", "c_proj", "c_fc"]:
array = array.transpose()
if name == ["wte"]:
array = array[: config.vocab_size]
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}")
print(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
embs = model.transformer.wte.weight
lin = nn.Linear(embs.size()[1], embs.size()[0], bias=False)
lin.weight = embs
model.set_output_embeddings(lin)
return model
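The scope-name handling in the loading loop above hinges on two small regex calls; their behavior on typical Mesh-TF variable scopes is shown below (names like `h0` carry a layer index, names like `ln_1` or `w` do not):

```python
import re

for m_name in ["h0", "ln_1", "w"]:
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)   # keep the digits as a separate group
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)
# h0   -> ['h', '0', '']
# ln_1 -> ['ln_1']
# w    -> ['w']
```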
class GPTNeoSelfAttention(nn.Module):
def __init__(self, config, attention_type):
super().__init__()
self.config = config
max_positions = config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=bool)).view(
1, 1, max_positions, max_positions
)
if attention_type == "local":
bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
self.register_buffer("bias", bias, persistent=False)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self.attn_dropout = nn.Dropout(float(config.attention_dropout))
self.resid_dropout = nn.Dropout(float(config.resid_dropout))
self.is_causal = True
self.embed_dim = config.hidden_size
self.num_heads = config.num_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
f"{self.num_heads})."
)
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
return tensor.view(new_shape)
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
query = query.to(torch.float32)
key = key.to(torch.float32)
attn_weights = torch.matmul(query, key.transpose(-1, -2))
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def forward(
self,
hidden_states,
attention_mask=None,
layer_past=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
query = self.q_proj(hidden_states)
key = self.k_proj(hidden_states)
value = self.v_proj(hidden_states)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
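A shape walk-through of `_split_heads` / `_merge_heads` as used in the forward pass above, written with standalone tensors and hypothetical sizes: `(batch, seq, hidden)` is reshaped to `(batch, num_heads, seq, head_dim)` for attention and then merged back losslessly.

```python
import torch

batch, seq_len, num_heads, head_dim = 2, 5, 4, 8
hidden = torch.randn(batch, seq_len, num_heads * head_dim)

split = hidden.view(batch, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)                 # _split_heads
merged = split.permute(0, 2, 1, 3).contiguous().view(batch, seq_len, num_heads * head_dim)   # _merge_heads
print(split.shape)                   # torch.Size([2, 4, 5, 8])
print(torch.equal(merged, hidden))   # True
```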
class GPTNeoFlashAttention2(GPTNeoSelfAttention):
"""
GPTNeo flash attention module. This module inherits from `GPTNeoSelfAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states,
attention_mask=None,
layer_past=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
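Concrete values make the unpadding bookkeeping easier to follow. The snippet below recomputes what `_get_unpad_data` returns for a tiny, hypothetical padding mask: per-row lengths, the flat indices of non-padding tokens, and the cumulative sequence lengths that the varlen flash-attention kernel expects.

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 0], [1, 1, 1]])   # row 0 ends with one padding position
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(seqlens_in_batch)   # tensor([2, 3], dtype=torch.int32)
print(indices)            # tensor([0, 1, 3, 4, 5])
print(cu_seqlens)         # tensor([0, 2, 5], dtype=torch.int32)
```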
GPT_NEO_ATTENTION_CLASSES = {
"eager": GPTNeoSelfAttention,
"flash_attention_2": GPTNeoFlashAttention2,
}
class GPTNeoAttention(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.layer_id = layer_id
self.attention_layers = config.attention_layers
self.attention_type = self.attention_layers[layer_id]
if self.attention_type in ["global", "local"]:
self.attention = GPT_NEO_ATTENTION_CLASSES[config._attn_implementation](config, self.attention_type)
else:
raise NotImplementedError(
"Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
f"{config.attention_layers}. Select attn layer types from ['global', 'local'] only."
)
def forward(
self,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
return self.attention(
hidden_states,
attention_mask=attention_mask,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
class GPTNeoMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = nn.Linear(embed_dim, intermediate_size)
self.c_proj = nn.Linear(intermediate_size, embed_dim)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(float(config.resid_dropout))
def forward(self, hidden_states):
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GPTNeoBlock(nn.Module):
def __init__(self, config, layer_id):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTNeoAttention(config, layer_id)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPTNeoMLP(inner_dim, config)
def forward(
self,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
class GPTNeoPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTNeoConfig
load_tf_weights = load_tf_weights_in_gpt_neo
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTNeoBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear,)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GPT_NEO_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT Neo Model transformer outputting raw hidden-states without any specific head on top.",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoModel(GPTNeoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embed_dim = config.hidden_size
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
self.drop = nn.Dropout(float(config.embed_dropout))
self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
@add_start_docstrings(
"""
The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = GPTNeoModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
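The `position_ids` trick above (cumulative sum of the attention mask, minus one, with padded slots filled by a dummy value) is worth seeing on concrete numbers, here for a left-padded row:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])       # two left-padding tokens
position_ids = attention_mask.long().cumsum(-1) - 1    # tensor([[-1, -1, 0, 1, 2]])
position_ids.masked_fill_(attention_mask == 0, 1)      # padded positions get a dummy value of 1
print(position_ids)                                    # tensor([[1, 1, 0, 1, 2]])
```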
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
    def forward(
        self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
lm_logits = lm_logits.to(torch.float32)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
lm_logits = lm_logits.to(hidden_states.dtype)
loss = loss.to(hidden_states.dtype)
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
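The note in the docstring that labels "are shifted inside the model" corresponds to the two `contiguous()` slices above; on toy tensors the alignment looks like this (logits at position `i` are scored against the token at position `i + 1`):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 10, 4
lm_logits = torch.randn(1, seq_len, vocab_size)
labels = torch.tensor([[2, 5, 7, 1]])

shift_logits = lm_logits[..., :-1, :].contiguous()    # predictions for positions 0..2
shift_labels = labels[..., 1:].contiguous()           # targets are tokens 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```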
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
"""
This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
[`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"""
The GPTNeo Model transformer with a sequence classification head on top (linear layer).
[`GPTNeoForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
"""
GPTNeo model for sequence classification tasks.
Inherits from `GPTNeoPreTrainedModel` and adds a linear classification layer for sequence classification.
"""
def __init__(self, config):
"""
Initializes the GPTNeoForSequenceClassification model.
Args:
config (:class:`~transformers.GPTNeoConfig`):
The configuration object that defines the model architecture.
Attributes:
num_labels (int):
Number of labels for sequence classification.
transformer (:class:`~transformers.GPTNeoModel`):
The GPTNeoModel transformer instance.
score (:class:`~torch.nn.Linear`):
Linear layer for computing scores for each label.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass of the GPTNeoForSequenceClassification model.
Args:
input_ids (torch.Tensor, optional):
The input token IDs. Shape [batch_size, sequence_length].
            past_key_values (Tuple[torch.FloatTensor], optional):
                Tuple with one entry per layer, each containing the cached key and value tensors from previous
                forward passes (used to speed up sequential decoding).
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens. Shape [batch_size, sequence_length].
token_type_ids (torch.Tensor, optional):
Segment token indices to differentiate sequences in batch. Shape [batch_size, sequence_length].
position_ids (torch.Tensor, optional):
Indices of positions of each input token in the sequence. Shape [batch_size, sequence_length].
head_mask (torch.Tensor, optional):
Mask to nullify selected heads of the attention modules. Shape [num_heads] or [num_layers, num_heads].
inputs_embeds (torch.Tensor, optional):
Optionally provided embeddings instead of input_ids. Shape [batch_size, sequence_length, hidden_size].
labels (torch.Tensor, optional):
Labels for computing the sequence classification loss. Shape [batch_size].
use_cache (bool, optional):
Whether to use cache for the attention mechanism.
output_attentions (bool, optional):
Whether to output the attentions tensors.
output_hidden_states (bool, optional):
Whether to output the hidden states tensors.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.SequenceClassifierOutputWithPast`.
Returns:
:class:`~transformers.file_utils.SequenceClassifierOutputWithPast`:
Sequence classifier output consisting of loss, logits, past key values, attentions, and hidden states.
"""
return super().forward(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
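The docstring above says classification is done on the last non-padding token of each row. As a minimal, hypothetical sketch (the `pad_token_id` and tensors below are made up for illustration, not taken from the library), the index of that token can be recovered from `input_ids` like this:

```python
import torch

# Hypothetical toy batch: pad_token_id and tensors are illustrative only.
pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0, 0], [3, 4, 6, 8, 2]])
logits = torch.randn(2, 5, 3)  # [batch_size, seq_len, num_labels]

# Index of the first pad token minus one; rows without padding wrap to the last position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

# Pick the logits of the last real token in each row.
pooled_logits = logits[torch.arange(logits.shape[0]), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```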
@add_start_docstrings(
"""
GPT Neo model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForTokenClassification(GPTNeoPreTrainedModel):
"""
GPTNeo model for token classification tasks.
Inherits from `GPTNeoPreTrainedModel` and adds a linear classification layer for token classification.
"""
def __init__(self, config):
"""
Initializes the GPTNeoForTokenClassification model.
Args:
config (:class:`~transformers.GPTNeoConfig`):
The configuration object that defines the model architecture.
Attributes:
num_labels (int):
Number of labels for token classification.
transformer (:class:`~transformers.GPTNeoModel`):
The GPTNeoModel transformer instance.
dropout (:class:`~torch.nn.Dropout`):
Dropout layer for regularization.
classifier (:class:`~torch.nn.Linear`):
Linear layer for computing scores for each token label.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.dropout = nn.Dropout(config.classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="EleutherAI/gpt-neo-125m",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass of the GPTNeoForTokenClassification model.
Args:
input_ids (torch.Tensor, optional):
The input token IDs. Shape [batch_size, sequence_length].
past_key_values (Tuple[torch.FloatTensor], optional):
                Tuple of tuples (one per layer) containing the cached key and value tensors from earlier decoding steps.
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens. Shape [batch_size, sequence_length].
token_type_ids (torch.Tensor, optional):
Segment token indices to differentiate sequences in batch. Shape [batch_size, sequence_length].
position_ids (torch.Tensor, optional):
Indices of positions of each input token in the sequence. Shape [batch_size, sequence_length].
head_mask (torch.Tensor, optional):
Mask to nullify selected heads of the attention modules. Shape [num_heads] or [num_layers, num_heads].
inputs_embeds (torch.Tensor, optional):
Optionally provided embeddings instead of input_ids. Shape [batch_size, sequence_length, hidden_size].
labels (torch.Tensor, optional):
Labels for computing the token classification loss. Shape [batch_size, sequence_length].
use_cache (bool, optional):
Whether to use cache for the attention mechanism.
output_attentions (bool, optional):
Whether to output the attentions tensors.
output_hidden_states (bool, optional):
Whether to output the hidden states tensors.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.TokenClassifierOutput`.
Returns:
:class:`~transformers.file_utils.TokenClassifierOutput`:
Token classifier output consisting of loss, logits, attentions, and hidden states.
"""
return super().forward(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
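For reference, a hedged usage sketch of the token-classification head (the checkpoint name matches the `add_code_sample_docstrings` decorator above; the classifier layer itself is freshly initialized unless fine-tuned, so the predictions are arbitrary):

```python
import torch
from transformers import AutoTokenizer, GPTNeoForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = GPTNeoForTokenClassification.from_pretrained("EleutherAI/gpt-neo-125m")

inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # [batch_size, seq_len, num_labels]

# One predicted label id per input token.
print(logits.argmax(-1))
```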
@add_start_docstrings(
"""
The GPT-Neo Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_CHECKPOINT_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
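A hedged end-to-end sketch of how the `start_logits`/`end_logits` produced above are typically turned into an answer span. The checkpoint name is only an assumption for illustration, and the QA head is randomly initialized unless fine-tuned:

```python
import torch
from transformers import AutoTokenizer, GPTNeoForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = GPTNeoForQuestionAnswering.from_pretrained("EleutherAI/gpt-neo-125m")

question, context = "Who wrote the book?", "The book was written by Ada."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Greedy span decoding: argmax over the start and end logits.
start = outputs.start_logits.argmax(-1).item()
end = outputs.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```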
.\models\gpt_neo\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {
"configuration_gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig", "GPTNeoOnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neo"] = [
"GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoForCausalLM",
"GPTNeoForQuestionAnswering",
"GPTNeoForSequenceClassification",
"GPTNeoForTokenClassification",
"GPTNeoModel",
"GPTNeoPreTrainedModel",
"load_tf_weights_in_gpt_neo",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_gpt_neo"] = [
"FlaxGPTNeoForCausalLM",
"FlaxGPTNeoModel",
"FlaxGPTNeoPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig, GPTNeoOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neo import (
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoForCausalLM,
GPTNeoForQuestionAnswering,
GPTNeoForSequenceClassification,
GPTNeoForTokenClassification,
GPTNeoModel,
GPTNeoPreTrainedModel,
load_tf_weights_in_gpt_neo,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
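The `_LazyModule` indirection above means nothing from `modeling_gpt_neo` (and therefore nothing from torch) is imported until an attribute is actually requested. A minimal sketch of what this enables:

```python
# Importing the config alone only pulls in configuration_gpt_neo; the heavy
# modeling submodules are resolved lazily, on first attribute access.
from transformers.models.gpt_neo import GPTNeoConfig

config = GPTNeoConfig()
print(config.model_type, config.num_layers)  # gpt_neo 24
```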
.\models\gpt_neox\configuration_gpt_neox.py
""" GPTNeoX 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
}
class GPTNeoXConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate a
    GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the default arguments will yield a configuration similar to that of the GPTNeoX
    [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    ```
    >>> from transformers import GPTNeoXConfig, GPTNeoXModel
    >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
    >>> configuration = GPTNeoXConfig()
    >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
    >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
    >>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
"""
model_type = "gpt_neox"
def __init__(
self,
vocab_size=50432,
hidden_size=6144,
num_hidden_layers=44,
num_attention_heads=64,
intermediate_size=24576,
hidden_act="gelu",
rotary_pct=0.25,
rotary_emb_base=10000,
attention_dropout=0.0,
hidden_dropout=0.0,
classifier_dropout=0.1,
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
bos_token_id=0,
eos_token_id=2,
tie_word_embeddings=False,
use_parallel_residual=True,
rope_scaling=None,
attention_bias=True,
**kwargs,
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
self.classifier_dropout = classifier_dropout
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.tie_word_embeddings = tie_word_embeddings
self.use_parallel_residual = use_parallel_residual
        self.attention_bias = attention_bias
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
)
def _rope_scaling_validation(self):
"""
        Validate that the `rope_scaling` configuration is well formed.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\gpt_neox\modeling_gpt_neox.py
""" PyTorch GPTNeoX 模型。"""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from ...activations import ACT2FN
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from .configuration_gpt_neox import GPTNeoXConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "trl-internal-testing/tiny-random-GPTNeoXForCausalLM"
_REAL_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neox-20b"
_CONFIG_FOR_DOC = "GPTNeoXConfig"
GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
"EleutherAI/gpt-neox-20b",
]
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
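A small worked example of what `_get_unpad_data` computes for a padded batch (the toy mask below is chosen purely for illustration):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 4, 5]) -> flat positions of the real (non-pad) tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> cumulative boundaries for flash_attn_varlen_func
```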
class GPTNeoXPreTrainedModel(PreTrainedModel):
"""
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
"""
config_class = GPTNeoXConfig
base_model_prefix = "gpt_neox"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTNeoXLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GPTNeoXAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.num_attention_heads = config.num_attention_heads
self.hidden_size = config.hidden_size
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size is not divisble by the number of attention heads! Make sure to update them"
)
self.head_size = self.hidden_size // self.num_attention_heads
self.rotary_ndims = int(self.head_size * config.rotary_pct)
self._init_bias(config.max_position_embeddings)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self._init_rope()
self.norm_factor = self.head_size ** -0.5
self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.attention_bias)
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self.is_causal = True
def _init_bias(self, max_positions, device=None):
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
if device is not None:
self.bias = self.bias.to(device)
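The `bias` buffer registered above is nothing more than a lower-triangular boolean matrix marking which key positions each query may attend to; for example, with `max_positions=4`:

```python
import torch

bias = torch.tril(torch.ones((4, 4), dtype=torch.bool)).view(1, 1, 4, 4)
print(bias[0, 0])
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])
```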
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = GPTNeoXRotaryEmbedding(
self.rotary_ndims, self.config.max_position_embeddings, base=self.config.rotary_emb_base
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = GPTNeoXLinearScalingRotaryEmbedding(
self.rotary_ndims,
self.config.max_position_embeddings,
base=self.config.rotary_emb_base,
scaling_factor=scaling_factor,
)
elif scaling_type == "dynamic":
self.rotary_emb = GPTNeoXDynamicNTKScalingRotaryEmbedding(
self.rotary_ndims,
self.config.max_position_embeddings,
base=self.config.rotary_emb_base,
scaling_factor=scaling_factor,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
position_ids: torch.LongTensor,
head_mask: Optional[torch.FloatTensor] = None,
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
padding_mask: Optional[torch.Tensor] = None,
):
has_layer_past = layer_past is not None
qkv = self.query_key_value(hidden_states)
new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
qkv = qkv.view(*new_qkv_shape)
query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
query_rot = query[..., : self.rotary_ndims]
query_pass = query[..., self.rotary_ndims :]
key_rot = key[..., : self.rotary_ndims]
key_pass = key[..., self.rotary_ndims :]
seq_len = key.shape[-2]
if has_layer_past:
seq_len += layer_past[0].shape[-2]
cos, sin = self.rotary_emb(value, seq_len=seq_len)
query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
query = torch.cat((query, query_pass), dim=-1)
key = torch.cat((key, key_pass), dim=-1)
if has_layer_past:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
present = (key, value) if use_cache else None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
attn_output = self.dense(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
@classmethod
def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Splits hidden dim into attn_head_size and num_attention_heads
"""
new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
tensor = tensor.view(new_shape)
tensor = tensor.permute(0, 2, 1, 3)
return tensor
@classmethod
def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
return tensor
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
batch_size, num_attention_heads, query_length, attn_head_size = query.size()
key_length = key.size(-2)
if key_length > self.bias.shape[-1]:
self._init_bias(key_length, device=key.device)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
attn_scores = torch.zeros(
batch_size * num_attention_heads,
query_length,
key_length,
dtype=query.dtype,
device=key.device,
)
attn_scores = torch.baddbmm(
attn_scores,
query,
key.transpose(1, 2),
beta=1.0,
alpha=self.norm_factor,
)
attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)
mask_value = torch.finfo(attn_scores.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device)
attn_scores = torch.where(causal_mask, attn_scores, mask_value)
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_weights = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = attn_weights.to(value.dtype)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_weights = self.attention_dropout(attn_weights)
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
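Stripping away the reshaping and caching, the score computation in `_attn` boils down to a scaled dot product whose future (upper-triangular) entries are pushed to the dtype minimum before the softmax. A self-contained sketch with toy shapes:

```python
import torch

q = torch.randn(1, 1, 4, 8)  # [batch, heads, query_len, head_size]
k = torch.randn(1, 1, 4, 8)
v = torch.randn(1, 1, 4, 8)
norm_factor = 8 ** -0.5      # 1 / sqrt(head_size)

scores = torch.matmul(q, k.transpose(-1, -2)) * norm_factor
causal_mask = torch.tril(torch.ones(4, 4, dtype=torch.bool))
scores = scores.masked_fill(~causal_mask, torch.finfo(scores.dtype).min)

weights = torch.softmax(scores, dim=-1)
out = torch.matmul(weights, v)
print(weights[0, 0])  # row i only attends to positions <= i
```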
class GPTNeoXFlashAttention2(GPTNeoXAttention):
"""
    GPTNeoX flash attention module. This module inherits from `GPTNeoXAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
position_ids: torch.LongTensor,
head_mask: Optional[torch.FloatTensor] = None,
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
def attention_mask_func(attention_scores, ltor_mask):
attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min)
return attention_scores
class GPTNeoXRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len],
self.sin_cached[:seq_len],
)
class GPTNeoXLinearScalingRotaryEmbedding(GPTNeoXRotaryEmbedding):
"""GPTNeoXRotaryEmbedding扩展,添加了线性缩放功能。鸣谢Reddit用户/u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
class GPTNeoXDynamicNTKScalingRotaryEmbedding(GPTNeoXRotaryEmbedding):
"""GPTNeoXRotaryEmbedding扩展,增加了动态NTK缩放功能。由Reddit用户/u/bloc97和/u/emozilla贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def rotate_half(x):
"""将输入的一半隐藏维度进行旋转。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""将旋转位置嵌入应用到查询和键张量上。"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
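A tiny numeric check of `rotate_half` and of the rotary update `q' = q*cos + rotate_half(q)*sin` (the helper is re-declared only so the snippet is self-contained):

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(rotate_half(q))  # tensor([-3., -4.,  1.,  2.])

theta = torch.tensor(0.5)
q_rot = q * torch.cos(theta) + rotate_half(q) * torch.sin(theta)
print(q_rot)  # the pairs (q0, q2) and (q1, q3) are each rotated by the angle theta
```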
class GPTNeoXMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
hidden_states = self.dense_h_to_4h(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dense_4h_to_h(hidden_states)
return hidden_states
class GPTNeoXLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.use_parallel_residual = config.use_parallel_residual
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
self.attention = GPT_NEOX_ATTENTION_CLASSES[config._attn_implementation](config)
self.mlp = GPTNeoXMLP(config)
def forward(
self,
hidden_states: Optional[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
layer_past: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
):
attention_layer_outputs = self.attention(
self.input_layernorm(hidden_states),
attention_mask=attention_mask,
position_ids=position_ids,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attention_layer_outputs[0]
attn_output = self.post_attention_dropout(attn_output)
outputs = attention_layer_outputs[1:]
if self.use_parallel_residual:
mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
mlp_output = self.post_mlp_dropout(mlp_output)
hidden_states = mlp_output + attn_output + hidden_states
else:
attn_output = attn_output + hidden_states
mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
mlp_output = self.post_mlp_dropout(mlp_output)
hidden_states = mlp_output + attn_output
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
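Schematically, the two branches of `GPTNeoXLayer.forward` differ only in where the residual stream is read. A minimal sketch, with plain callables standing in for the sub-modules above:

```python
def parallel_residual(h, attn, mlp, ln1, ln2):
    # use_parallel_residual=True: attention and MLP both read the *same* input h.
    return h + attn(ln1(h)) + mlp(ln2(h))

def sequential_residual(h, attn, mlp, ln1, ln2):
    # use_parallel_residual=False: classic pre-LN, the MLP sees the attention output.
    a = h + attn(ln1(h))
    return a + mlp(ln2(a))
```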
GPT_NEOX_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~GPTNeoXConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_NEOX_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare GPTNeoX Model transformer outputting raw hidden-states without any specific head on top.",
    GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXModel(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
self.emb_dropout = nn.Dropout(config.hidden_dropout)
self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_in
def set_input_embeddings(self, value):
self.embed_in = value
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        pass
@add_start_docstrings(
"""GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
)
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):
super().__init__(config)
self.gpt_neox = GPTNeoXModel(config)
self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.embed_out
def set_output_embeddings(self, new_embeddings):
self.embed_out = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Model forward method for transformer-like models.
Args:
input_ids (Optional[torch.LongTensor]): Input token IDs.
attention_mask (Optional[torch.FloatTensor]): Mask to avoid performing attention on padding tokens.
position_ids (Optional[torch.LongTensor]): Position IDs for positional embeddings.
inputs_embeds (Optional[torch.FloatTensor]): Optional input embeddings directly provided instead of input_ids.
head_mask (Optional[torch.FloatTensor]): Mask for attention heads.
past_key_values (Optional[Tuple[Tuple[torch.FloatTensor]]]): Cached key-value states for fast autoregressive decoding.
labels (Optional[torch.LongTensor]): Target labels for training.
use_cache (Optional[bool]): Whether to use the cached past key-values for generation.
output_attentions (Optional[bool]): Whether to return attention weights.
output_hidden_states (Optional[bool]): Whether to return hidden states.
return_dict (Optional[bool]): Whether to return a dictionary.
        Returns:
            `CausalLMOutputWithPast` or `tuple`: the language-modeling loss (when `labels` is provided), the logits,
            past key values, hidden states, and attentions, depending on `return_dict`.
        """
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
"""
Prepares inputs for generation by adjusting input_ids and other necessary tensors.
Args:
input_ids (torch.Tensor): Input token IDs.
past_key_values (Optional[Tuple[Tuple[torch.FloatTensor]]]): Cached key-value states from previous decoding steps.
attention_mask (torch.Tensor): Mask to avoid attending to padding tokens.
inputs_embeds (torch.Tensor): Optional input embeddings.
**kwargs: Additional keyword arguments.
Returns:
model_inputs (Dict[str, torch.Tensor]): Dictionary containing prepared model inputs.
"""
input_shape = input_ids.shape
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"position_ids": position_ids,
}
)
return model_inputs
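A worked example of the `position_ids` bookkeeping above for a left-padded prompt (toy mask for illustration):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])  # two pad tokens on the left

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)  # tensor([[1, 1, 0, 1, 2]]) -> the real tokens get positions 0, 1, 2
```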
def _reorder_cache(self, past_key_values, beam_idx):
"""
Reorders cached past key-values according to beam search index.
Args:
past_key_values (Tuple[Tuple[torch.FloatTensor]]): Cached key-value states.
beam_idx (torch.Tensor): Index tensor for reordering.
Returns:
reordered_past (Tuple[Tuple[torch.FloatTensor]]): Reordered past key-values.
"""
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
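What `_reorder_cache` does per cached tensor is a plain `index_select` on the batch/beam dimension; a toy illustration:

```python
import torch

past_key = torch.arange(4).view(4, 1, 1, 1).float()  # pretend [num_beams, heads, seq, head_dim]
beam_idx = torch.tensor([2, 2, 0, 1])                 # beams surviving this decoding step

print(past_key.index_select(0, beam_idx).flatten())   # tensor([2., 2., 0., 1.])
```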
"""
The GPTNeoX Model transformer with a sequence classification head on top (linear layer).
[`GPTNeoXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
The GPTNeoX Model transformer with a token classification head on top (linear layer).
This model uses the GPTNeoX architecture and adds a linear layer on top for token classification tasks.
It includes dropout and a linear classifier for the token classification layer.
    Since the classifier is applied to the hidden state of every position independently, no last-token pooling or
    `pad_token_id` lookup is needed, and `inputs_embeds` can be used in place of `input_ids` without restriction.
    Note that the documented code sample below uses the "LarsJonasson/pythia-410m-deduped-sft-swedish" checkpoint.
""",
GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.gpt_neox = GPTNeoXModel(config)
self.dropout = nn.Dropout(config.classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="LarsJonasson/pythia-410m-deduped-sft-swedish",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.gpt_neox(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
The GPT-NeoX Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
)
class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.gpt_neox = GPTNeoXModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.gpt_neox(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1).to(start_logits.device)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1).to(end_logits.device)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
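Finally, a small illustration of the clamping trick used in the span loss above: gold positions that fall outside the sequence are clamped to `ignored_index` (the sequence length) and then skipped by `CrossEntropyLoss(ignore_index=...)`. The tensors below are toy values for illustration only:

```python
import torch
from torch.nn import CrossEntropyLoss

start_logits = torch.randn(2, 6)         # [batch_size, seq_len]
start_positions = torch.tensor([3, 50])  # the second target lies outside the 6-token sequence

ignored_index = start_logits.size(1)                       # 6
start_positions = start_positions.clamp(0, ignored_index)  # tensor([3, 6])

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
print(loss_fct(start_logits, start_positions))  # only the first example contributes to the loss
```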