Transformers 源码解析（一百二十五）

`.\models\whisper\tokenization_whisper_fast.py`

# coding=utf-8
# 设置文件编码为UTF-8，确保可以正确处理中文等特殊字符

# 版权声明，指明此代码的版权归The HuggingFace Inc.团队所有，保留所有权利

# 导入需要的模块和库
import json  # 导入用于处理JSON格式的模块
import os  # 导入操作系统功能的模块
import re  # 导入正则表达式模块，用于文本匹配和处理
import warnings  # 导入警告处理模块，用于处理警告信息
from functools import lru_cache  # 导入缓存函数装饰器，用于缓存函数的返回值
from typing import List, Optional, Tuple  # 导入类型提示相关的模块

import numpy as np  # 导入数值计算库Numpy
from tokenizers import AddedToken, pre_tokenizers, processors  # 从tokenizers库中导入特定的类和函数

from ...tokenization_utils_base import BatchEncoding  # 导入来自tokenization_utils_base模块的BatchEncoding类
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入来自tokenization_utils_fast模块的PreTrainedTokenizerFast类
from ...utils import logging  # 从utils模块中导入logging函数
from .english_normalizer import BasicTextNormalizer, EnglishTextNormalizer  # 导入本地english_normalizer模块中的文本规范化类
from .tokenization_whisper import LANGUAGES, TASK_IDS, TO_LANGUAGE_CODE, WhisperTokenizer, _decode_asr  # 导入本地tokenization_whisper模块中的特定内容

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器对象

# 定义常量，指明文件名称和其对应的内容类型
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",  # 词汇表文件的名称为vocab.json
    "tokenizer_file": "tokenizer.json",  # 分词器文件的名称为tokenizer.json
    "merges_file": "merges.txt",  # 合并文件的名称为merges.txt
    "normalizer_file": "normalizer.json",  # 规范化器文件的名称为normalizer.json
}

# 定义预训练模型的文件映射，包含不同模型及其对应的词汇表文件URL
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/vocab.json",
        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
        "openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/vocab.json",
        "openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/vocab.json",
        "openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/vocab.json",
        "openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/vocab.json",
        "openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/vocab.json",
        "openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/vocab.json",
        "openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/vocab.json",
    },
}
    "merges_file": {
        # merges_file 是一个字典，包含了各种不同模型的名称和对应的 merges.txt 文件的 URL
        "openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/merges.txt",
        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges.txt",
        "openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/merges.txt",
        "openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/merges.txt",
        "openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/merges.txt",
        "openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/merges.txt",
        "openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/merges.txt",
        "openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/merges.txt",
        "openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        # tokenizer_file 是一个字典，包含了各种不同模型的名称和对应的 tokenizer.json 文件的 URL
        "openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/tokenizer.json",
        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/tokenizer.json",
        "openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json",
        "openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/tokenizer.json",
        "openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/tokenizer.json",
        "openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/tokenizer.json",
        "openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/tokenizer.json",
        "openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/tokenizer.json",
        "openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/tokenizer.json",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    # 定义预训练位置嵌入的大小，对应不同模型的键值对
    "openai/whisper-tiny": 1500,
    "openai/whisper-base": 1500,
    "openai/whisper-small": 1500,
    "openai/whisper-medium": 1500,
    "openai/whisper-large": 1500,
    "openai/whisper-tiny.en": 1500,
    "openai/whisper-base.en": 1500,
    "openai/whisper-small.en": 1500,
    "openai/whisper-medium.en": 1500,
}

class WhisperTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Whisper tokenizer (backed by HuggingFace's *tokenizers* library).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        normalizer_file (`str`, *optional*):
            Path to the normalizer_file file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
            `"<|startoftranscript|>"` when generating.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Whisper tokenizer detect beginning of words by the preceding space).
        language (`str`, *optional*):
            The language of the transcription text. The corresponding language id token is appended to the start of the
            sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
            `"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
        task (`str`, *optional*):
            Task identifier to append at the start of sequence (if any). This should be used for mulitlingual
            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
        predict_timestamps (`bool`, *optional*, defaults to `False`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 使用预先定义的全局变量来初始化最大模型输入尺寸
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 定义模型输入的名称列表
    model_input_names = ["input_ids", "attention_mask"]
    # 设置慢速标记器的类为WhisperTokenizer
    slow_tokenizer_class = WhisperTokenizer

    # 初始化方法，接受多个可选参数和关键字参数
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        normalizer_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        language=None,
        task=None,
        predict_timestamps=False,
        **kwargs,
    ):
        # 根据传入的参数初始化起始标记（bos_token）、结束标记（eos_token）和未知标记（unk_token）
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )

        # 调用父类的初始化方法，传入相关参数和关键字参数
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        # 从关键字参数中移除并设置add_bos_token属性，默认为False
        self.add_bos_token = kwargs.pop("add_bos_token", False)

        # 获取前处理器的状态，确保前缀空格的一致性，并更新到后端标记器的前处理器中
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        # 如果提供了正常化文件，则加载英语拼写规范化器
        if normalizer_file is not None:
            with open(normalizer_file, encoding="utf-8") as vocab_handle:
                self.english_spelling_normalizer = json.load(vocab_handle)
        else:
            self.english_spelling_normalizer = None

        # 设置是否添加前缀空格的属性
        self.add_prefix_space = add_prefix_space
        # 编译用于匹配时间戳的正则表达式模式
        self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")

        # 设置语言、任务和是否预测时间戳的属性
        self.language = language
        self.task = task
        self.predict_timestamps = predict_timestamps

    # 从transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus复制而来
    # 批量编码方法，返回BatchEncoding对象
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # 检查是否已分割为单词，确保如果使用预分词的输入则需要设置add_prefix_space=True
        is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        # 调用父类的_batch_encode_plus方法进行编码处理
        return super()._batch_encode_plus(*args, **kwargs)

    # 从transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus复制而来
    # 检查是否传入了参数 is_split_into_words，并获取其值，默认为 False
    is_split_into_words = kwargs.get("is_split_into_words", False)

    # 断言条件：如果 add_prefix_space 为 True 或者 is_split_into_words 为 False，否则抛出错误信息
    assert self.add_prefix_space or not is_split_into_words, (
        f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
        "to use it with pretokenized inputs."
    )

    # 调用父类的 _encode_plus 方法，并返回其结果
    return super()._encode_plus(*args, **kwargs)

# 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode_with_timestamps 复制而来
def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_precision=0.02) -> str:
    """
    Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
    given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
    """
    # 计算时间戳的起始位置，即特殊标记的最后一个 ID 加一
    timestamp_begin = self.all_special_ids[-1] + 1
    # 初始化输出列表
    outputs = [[]]

    # 当前最大时间戳和前一个段落的长度
    cur_max_timestamp = 0.0
    prev_segments_len = 0.0

    # 遍历 token_ids 中的每个 token
    for token in token_ids:
        # 如果 token 大于等于 timestamp_begin，表示是时间戳的 token
        if token >= timestamp_begin:
            # 计算时间戳值，根据时间精度 time_precision 进行计算
            timestamp = float((token - timestamp_begin) * time_precision)

            # 如果时间戳小于当前最大时间戳
            if timestamp < cur_max_timestamp:
                # 下一个段落已开始，更新前一个段落的长度
                prev_segments_len += cur_max_timestamp

            # 更新当前最大时间戳
            cur_max_timestamp = timestamp

            # 将时间戳标记添加到输出列表中
            outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
            # 添加一个空列表，用于存储下一个段落的 token
            outputs.append([])
        else:
            # 如果不是时间戳 token，直接添加到当前段落的列表中
            outputs[-1].append(token)

    # 对 outputs 列表中的每个子列表进行处理，如果是字符串则保持不变，否则调用 decode 方法解码
    outputs = [
        s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
    ]
    # 将所有子列表合并为一个字符串并返回
    return "".join(outputs)

# 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets 复制而来
    def _compute_offsets(self, token_ids, time_precision=0.02):
        """
        Compute offsets for a given tokenized input

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            time_precision (`float`, `optional`, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        offsets = []
        
        # ensure torch tensor of token ids is placed on cpu
        if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
            token_ids = token_ids.cpu()
        
        token_ids = np.array(token_ids)  # Convert token_ids to a numpy array
        
        # Check if token_ids contains more than one input or is multi-dimensional
        if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
            raise ValueError("Can only process a single input at a time")
        
        # Define the beginning token index for timestamps
        timestamp_begin = self.all_special_ids[-1] + 1
        
        # Identify tokens that represent timestamps
        timestamp_tokens = token_ids >= timestamp_begin
        
        # Find consecutive timestamp tokens
        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        
        # Handle cases where there are no timestamps or no consecutive timestamps
        if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
            return []  # Return an empty list if no valid offsets are found
        elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
            consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)
        
        # Initialize variables to track slices of token_ids
        last_slice = np.where(timestamp_tokens)[0][0]
        
        # Iterate over consecutive timestamp segments
        for current_slice in consecutive:
            sliced_tokens = token_ids[last_slice:current_slice]
            
            # Compute start and end positions of timestamps
            start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
            end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin
            
            # Preprocess token ids to strip timestamp tokens from text output
            sliced_tokens = self._preprocess_token_ids(sliced_tokens)
            
            # Decode token ids into text
            text = self._decode(sliced_tokens)
            
            # Filter out timestamp identifiers from text
            text = self._filter_timestamp_ids(text)
            
            # Calculate and store offset information
            offsets.append(
                {
                    "text": text,
                    "timestamp": (
                        start_timestamp_position * time_precision,
                        end_timestamp_position * time_precision,
                    ),
                }
            )
            
            last_slice = current_slice  # Update last slice index
        
        return offsets  # Return computed offsets
    # 计算给定精度的时间戳标记 ID，并保存到最近最少使用 (LRU) 缓存中
    def timestamp_ids(self, time_precision=0.02):
        return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._preprocess_token_ids 复制而来
    # 预处理令牌 ID，通过移除提示令牌 ID 和时间戳令牌 ID 来为解码做准备
    def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
        if skip_special_tokens:
            prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
            decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
            token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)

        return token_ids

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._filter_timestamp_ids 复制而来
    # 过滤时间戳令牌 ID
    def _filter_timestamp_ids(self, token_ids):
        return re.sub(self.timestamp_pat, "", token_ids)

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.decode 复制而来
    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_offsets: bool = False,
        time_precision: float = 0.02,
        decode_with_timestamps: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ):
        """
        解码令牌 ID 为文本。

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                Tokenized input ids list.
            skip_special_tokens (`bool`, optional, defaults to `False`):
                Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
                removed.
            clean_up_tokenization_spaces (`bool`, optional):
                Whether or not to clean up tokenization spaces in the output text.
            output_offsets (`bool`, optional):
                Whether to return the token-level offsets in the original input text.
            time_precision (`float`, optional, defaults to 0.02):
                The time precision used for decoding timestamps.
            decode_with_timestamps (`bool`, optional):
                Whether to decode timestamps along with the tokens.
            normalize (`bool`, optional):
                Whether to normalize the decoded text.
            basic_normalize (`bool`, optional):
                Whether to perform basic normalization on the decoded text.
            remove_diacritics (`bool`, optional):
                Whether to remove diacritics from the decoded text.

        Returns:
            Decoded text as a string.
        """
        if normalize:
            clean_text = self._normalize(text)
            return clean_text
        elif basic_normalize:
            clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics)
            return clean_text
        else:
            return text

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode 复制而来
    def _decode(
        self, *args, normalize: bool = False, basic_normalize: bool = False, remove_diacritics: bool = False, **kwargs
    ) -> str:
        """
        解码操作的内部实现。

        Args:
            *args: 传递给超类的参数。
            normalize (`bool`, optional):
                是否对解码后的文本进行规范化处理。
            basic_normalize (`bool`, optional):
                是否对解码后的文本进行基础规范化处理。
            remove_diacritics (`bool`, optional):
                是否移除解码后文本中的变音符号。

        Returns:
            解码后的字符串。
        """
        text = super()._decode(*args, **kwargs)

        if normalize:
            clean_text = self._normalize(text)
            return clean_text
        elif basic_normalize:
            clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics)
            return clean_text
        else:
            return text

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize 复制而来
    # 对文本进行规范化处理
    # 对输入的文本进行规范化处理，已被废弃，在 Transformers 的 v5 版本中将会移除
    def _normalize(self, text):
        warnings.warn(
            "The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper English normalizer using the `normalize` method."
        )
        return self.normalize(text)

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize 复制而来
    # 对输入的文本进行基本规范化处理，已被废弃，在 Transformers 的 v5 版本中将会移除
    def _basic_normalize(self, text, remove_diacritics=False):
        warnings.warn(
            "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
        )
        return self.basic_normalize(text, remove_diacritics=remove_diacritics)

    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize 复制而来
    # 使用 `EnglishTextNormalizer` 类对给定的字符串进行规范化处理，执行常见的英文文本转换
    def normalize(self, text):
        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
        return normalizer(text)

    @staticmethod
    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize 复制而来
    # 使用 `BasicTextNormalizer` 类对给定的字符串进行规范化处理，执行常见的多语言文本转换
    def basic_normalize(text, remove_diacritics=False):
        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
        return normalizer(text)

    # 保存词汇表到指定的目录，返回保存的文件列表和规范化文件
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 调用 tokenizer 对象的 model 的 save 方法，将词汇保存到指定目录中
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)

        # 初始化规范化文件路径
        normalizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
        )

        # 如果存在英文拼写规范化器，则将其保存到规范化文件中
        if self.english_spelling_normalizer is not None:
            with open(normalizer_file, "w", encoding="utf-8") as f:
                f.write(
                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                )

        # 返回保存的文件列表和规范化文件路径
        return tuple(files) + (normalizer_file,)
    def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
        """
        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
        update the prefix tokens as required when fine-tuning. Example:

        ```
        >>> # instantiate the tokenizer and set the prefix token to Spanish
        >>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
        >>> # now switch the prefix token from Spanish to French
        >>> tokenizer.set_prefix_tokens(language="french")
        ```

        Args:
            language (`str`, *optional*, defaults to `None`):
                The language of the transcription text.
            task (`str`, *optional*, defaults to `None`):
                Task identifier to append at the start of sequence (if any).
            predict_timestamps (`bool`, *optional*, defaults to `None`):
                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
        """
        # 设置语言参数，如果未提供则使用当前实例的语言
        self.language = language if language is not None else self.language
        # 设置任务参数，如果未提供则使用当前实例的任务
        self.task = task if task is not None else self.task
        # 设置预测时间戳参数，如果未提供则使用当前实例的预测时间戳设置
        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

        # 获取当前实例的前缀 token IDs
        prefix_token_ids = self.prefix_tokens
        # 将前缀 token IDs 转换为 token 列表
        prefixes = self.convert_ids_to_tokens(prefix_token_ids)
        # 获取结束符号（EOS）
        eos = self.eos_token
        # 获取结束符号（EOS）的 token ID
        eos_token_id = self.eos_token_id
        # 构建前缀模板，用于后续的处理
        prefix_template = " ".join([f"{token}:0" for token in prefixes])
        # 设置后处理器，使用模板处理器，定义单个和对称序列的格式
        self.backend_tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{prefix_template} $A:0 {eos}:0",
            pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
            special_tokens=[
                (eos, eos_token_id),  # 添加结束符号（EOS）和其 token ID 到特殊 token 列表
                *zip(prefixes, prefix_token_ids),  # 添加前缀 token 和对应的 token ID 到特殊 token 列表
            ],
        )

    @property
    # 从 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens 复制而来
    # 返回一个列表，包含特定的起始标记的 ID
    def prefix_tokens(self) -> List[int]:
        # 将 "<|startoftranscript|>" 转换为对应的 ID
        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
        # 将 "<|translate|>" 转换为对应的 ID
        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
        # 将 "<|transcribe|>" 转换为对应的 ID
        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
        # 将 "<|notimestamps|>" 转换为对应的 ID
        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
        # 获取所有语言的键值对的元组
        langs = tuple(LANGUAGES.keys())

        if self.language is not None:
            # 将语言名称转换为小写
            self.language = self.language.lower()
            # 检查语言是否在 TO_LANGUAGE_CODE 中存在，如果是则获取对应的语言 ID
            if self.language in TO_LANGUAGE_CODE:
                language_id = TO_LANGUAGE_CODE[self.language]
            # 如果语言名称不在 TO_LANGUAGE_CODE 中，则检查是否是语言 ID，如果是则直接使用
            elif self.language in TO_LANGUAGE_CODE.values():
                language_id = self.language
            else:
                # 如果语言既不是名称也不是 ID，则抛出错误
                is_language_code = len(self.language) == 2
                raise ValueError(
                    f"Unsupported language: {self.language}. Language should be one of:"
                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                )

        if self.task is not None:
            # 检查任务是否在支持的任务列表 TASK_IDS 中，如果不是则抛出错误
            if self.task not in TASK_IDS:
                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")

        bos_sequence = [bos_token_id]
        if self.language is not None:
            # 如果有指定语言，则将其对应的特定 ID 添加到序列中
            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
        if self.task is not None:
            # 如果有指定任务，则根据任务类型添加对应的特定 ID 到序列中
            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
        if not self.predict_timestamps:
            # 如果不需要预测时间戳，则添加对应的特定 ID 到序列中
            bos_sequence.append(notimestamps_token_id)
        return bos_sequence

    # 从给定的 token_ids_0 和 token_ids_1 构建模型输入，同时添加结束标记 eos_token_id
    # 来自 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            # 如果没有第二个序列，则只需将 token_ids_0 与前缀序列和结束标记连接起来
            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
        # 对于存在第二个序列的情况，按照API的一般性保留了对成对逻辑的处理
        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]

    # 获取特殊 token 的掩码，来自 transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ):
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # Check if special tokens are already added; if so, delegate to superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Initialize prefix tokens as all 1s (special tokens)
        prefix_ones = [1] * len(self.prefix_tokens)
        # Suffix token is always 1 (special token)
        suffix_ones = [1]

        # If token_ids_1 is None, return concatenated list of prefix, sequence tokens, and suffix
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        
        # If token_ids_1 exists, return concatenated list of prefix, sequence tokens from both lists, and suffix
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    @property
    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
    def default_chat_template(self):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
        # Issue a warning if no specific chat template is defined
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # Return the default chat template string
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        # Set prefix tokens based on task, language, and timestamp settings
        self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
        
        # Prefix tokens format: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
        # Exclude the first token (`<|startoftranscript|>`) as it is the starting token for generation
        # Extract the remaining prefix tokens for decoder prompt IDs
        forced_tokens = self.prefix_tokens[1:]
        # Pair each token with its rank shifted by one (rank + 1) for decoder prompt IDs
        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
        
        # Return the list of forced decoder prompt IDs
        return forced_decoder_ids
    # 将指定的 ASR 模型输出解码为文本，根据参数选择是否返回时间戳、语言和时间精度
    def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
        return _decode_asr(
            self,
            model_outputs,
            return_timestamps=return_timestamps,
            return_language=return_language,
            time_precision=time_precision,
        )

    # 从WhisperTokenizer类中复制的方法，用于获取文本的提示词ID
    def get_prompt_ids(self, text: str, return_tensors="np"):
        """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
        # 使用WhisperTokenizer实例的编码方法将特殊标记加入到输入文本前，并进行批处理编码
        batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

        # 检查特殊标记
        prompt_text_ids = batch_encoding["input_ids"][1:]  # 移除首个特殊标记
        special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
        if special_token_id is not None:
            token = self.convert_ids_to_tokens(special_token_id)
            raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

        # 将批处理编码转换为指定类型的张量（如numpy）
        batch_encoding.convert_to_tensors(tensor_type=return_tensors)
        return batch_encoding["input_ids"]  # 返回输入文本对应的ID列表

    @staticmethod
    # 从WhisperTokenizer类中复制的方法，用于从标记ID列表中删除提示标记
    def _strip_prompt(token_ids: List[int], prompt_token_id: int, decoder_start_token_id: int):
        has_prompt = isinstance(token_ids, list) and token_ids and token_ids[0] == prompt_token_id
        if has_prompt:
            if decoder_start_token_id in token_ids:
                return token_ids[token_ids.index(decoder_start_token_id) :]  # 返回从解码开始标记到结尾的标记ID列表
            else:
                return []  # 如果没有解码开始标记，则返回空列表

        return token_ids  # 如果没有提示标记，则直接返回标记ID列表

`.\models\whisper\init.py`

# 导入所需的模块和函数
from typing import TYPE_CHECKING

# 导入可能的异常处理类和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义模块的导入结构，每个模块对应其所需的类或函数列表
_import_structure = {
    "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig", "WhisperOnnxConfig"],
    "feature_extraction_whisper": ["WhisperFeatureExtractor"],
    "processing_whisper": ["WhisperProcessor"],
    "tokenization_whisper": ["WhisperTokenizer"],
}

# 检查是否存在 tokenizers 库，如果不存在则抛出异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果库可用，则将 tokenization_whisper_fast 模块添加到导入结构中
    _import_structure["tokenization_whisper_fast"] = ["WhisperTokenizerFast"]

# 检查是否存在 torch 库，如果不存在则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果库可用，则将 modeling_whisper 模块添加到导入结构中
    _import_structure["modeling_whisper"] = [
        "WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "WhisperForCausalLM",
        "WhisperForConditionalGeneration",
        "WhisperModel",
        "WhisperPreTrainedModel",
        "WhisperForAudioClassification",
    ]

# 检查是否存在 tensorflow 库，如果不存在则抛出异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果库可用，则将 modeling_tf_whisper 模块添加到导入结构中
    _import_structure["modeling_tf_whisper"] = [
        "TF_WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFWhisperForConditionalGeneration",
        "TFWhisperModel",
        "TFWhisperPreTrainedModel",
    ]

# 检查是否存在 flax 库，如果不存在则抛出异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果库可用，则将 modeling_flax_whisper 模块添加到导入结构中
    _import_structure["modeling_flax_whisper"] = [
        "FlaxWhisperForConditionalGeneration",
        "FlaxWhisperModel",
        "FlaxWhisperPreTrainedModel",
        "FlaxWhisperForAudioClassification",
    ]

# 如果是类型检查模式，导入相关的类型定义
if TYPE_CHECKING:
    from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig, WhisperOnnxConfig
    from .feature_extraction_whisper import WhisperFeatureExtractor
    from .processing_whisper import WhisperProcessor
    from .tokenization_whisper import WhisperTokenizer

    # 检查是否存在 tokenizers 库，如果可用，则导入 tokenization_whisper_fast 模块
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_whisper_fast import WhisperTokenizerFast
    # 检查是否安装了 Torch 库，如果没有则引发 OptionalDependencyNotAvailable 异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    # 捕获 OptionalDependencyNotAvailable 异常，如果引发则不做任何操作
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 导入 Torch 版本的 Whisper 模型相关内容
        from .modeling_whisper import (
            WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
            WhisperForAudioClassification,
            WhisperForCausalLM,
            WhisperForConditionalGeneration,
            WhisperModel,
            WhisperPreTrainedModel,
        )

    # 检查是否安装了 TensorFlow 库，如果没有则引发 OptionalDependencyNotAvailable 异常
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    # 捕获 OptionalDependencyNotAvailable 异常，如果引发则不做任何操作
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 导入 TensorFlow 版本的 Whisper 模型相关内容
        from .modeling_tf_whisper import (
            TF_WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFWhisperForConditionalGeneration,
            TFWhisperModel,
            TFWhisperPreTrainedModel,
        )

    # 检查是否安装了 Flax 库，如果没有则引发 OptionalDependencyNotAvailable 异常
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    # 捕获 OptionalDependencyNotAvailable 异常，如果引发则不做任何操作
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 导入 Flax 版本的 Whisper 模型相关内容
        from .modeling_flax_whisper import (
            FlaxWhisperForAudioClassification,
            FlaxWhisperForConditionalGeneration,
            FlaxWhisperModel,
            FlaxWhisperPreTrainedModel,
        )
else:
    # 导入 sys 模块，用于动态修改当前模块的属性
    import sys

    # 将当前模块注册到 sys.modules[__name__] 中，使用 _LazyModule 进行延迟加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\xglm\configuration_xglm.py`

# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XGLM model configuration"""

# 导入预训练配置基类和日志工具
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 预训练配置文件映射，指定模型名称到配置文件的映射
XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/config.json",
    # 查看所有 XGLM 模型信息，请访问 https://huggingface.co/models?filter=xglm
}

# XGLM 模型配置类，继承自预训练配置基类
class XGLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`XGLMModel`]. It is used to instantiate an XGLM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the XGLM
    [facebook/xglm-564M](https://huggingface.co/facebook/xglm-564M) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """
    # 模型类型设定为 "xglm"
    model_type = "xglm"
    # 在推断时忽略的键列表，这些键不会在推断时使用
    keys_to_ignore_at_inference = ["past_key_values"]
    # 定义一个映射，将模型参数的外部名称映射到内部名称
    attribute_map = {
        "num_attention_heads": "attention_heads",  # 将外部参数 "num_attention_heads" 映射为内部参数 "attention_heads"
        "hidden_size": "d_model",  # 将外部参数 "hidden_size" 映射为内部参数 "d_model"
        "num_hidden_layers": "num_layers",  # 将外部参数 "num_hidden_layers" 映射为内部参数 "num_layers"
    }
    
    # 初始化方法，设置模型的各种参数
    def __init__(
        self,
        vocab_size=256008,  # 词汇表大小，默认为 256008
        max_position_embeddings=2048,  # 最大位置编码，默认为 2048
        d_model=1024,  # 隐藏层大小，默认为 1024
        ffn_dim=4096,  # Feedforward 层的维度，默认为 4096
        num_layers=24,  # 网络层数，默认为 24
        attention_heads=16,  # 注意力头的数量，默认为 16
        activation_function="gelu",  # 激活函数，默认为 "gelu"
        dropout=0.1,  # 普通 dropout 的比例，默认为 0.1
        attention_dropout=0.1,  # 注意力层的 dropout 比例，默认为 0.1
        activation_dropout=0.0,  # 激活函数的 dropout 比例，默认为 0.0
        layerdrop=0.0,  # 层级 dropout 的比例，默认为 0.0
        init_std=0.02,  # 参数初始化的标准差，默认为 0.02
        scale_embedding=True,  # 是否对嵌入进行缩放，默认为 True，如果是，则缩放因子为 sqrt(d_model)
        use_cache=True,  # 是否使用缓存，默认为 True
        decoder_start_token_id=2,  # 解码器起始标记的 id，默认为 2
        pad_token_id=1,  # 填充标记的 id，默认为 1
        bos_token_id=0,  # 起始标记的 id，默认为 0
        eos_token_id=2,  # 结束标记的 id，默认为 2
        **kwargs,  # 其他可变关键字参数
    ):
        self.vocab_size = vocab_size  # 设置词汇表大小
        self.max_position_embeddings = max_position_embeddings  # 设置最大位置编码
        self.d_model = d_model  # 设置隐藏层大小
        self.ffn_dim = ffn_dim  # 设置 Feedforward 层的维度
        self.num_layers = num_layers  # 设置网络层数
        self.attention_heads = attention_heads  # 设置注意力头的数量
        self.activation_function = activation_function  # 设置激活函数
        self.dropout = dropout  # 设置普通 dropout 比例
        self.attention_dropout = attention_dropout  # 设置注意力层的 dropout 比例
        self.activation_dropout = activation_dropout  # 设置激活函数的 dropout 比例
        self.layerdrop = layerdrop  # 设置层级 dropout 比例
        self.init_std = init_std  # 设置参数初始化的标准差
        self.scale_embedding = scale_embedding  # 设置是否缩放嵌入
        self.use_cache = use_cache  # 设置是否使用缓存
    
        # 调用父类的初始化方法，传入特殊的 token id 参数和其他可变关键字参数
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )

`.\models\xglm\convert_xglm_original_ckpt_to_trfms.py`

import argparse                            # 导入argparse库，用于处理命令行参数
from argparse import Namespace             # 导入Namespace类，用于创建命名空间

import torch                               # 导入PyTorch库
from torch import nn                       # 导入神经网络模块

from transformers import XGLMConfig, XGLMForCausalLM   # 导入transformers库中的XGLMConfig和XGLMForCausalLM类


def remove_ignore_keys_(state_dict):
    # 定义函数，用于从状态字典中移除特定的键
    ignore_keys = [
        "decoder.version",                             # 忽略的键1
        "decoder.output_projection.weight",             # 忽略的键2
        "_float_tensor",                               # 忽略的键3
        "decoder.embed_positions._float_tensor",       # 忽略的键4
    ]
    for k in ignore_keys:
        state_dict.pop(k, None)


def make_linear_from_emb(emb):
    # 定义函数，从给定的嵌入矩阵创建一个线性层
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)   # 创建线性层，无偏置
    lin_layer.weight.data = emb.weight.data                   # 将权重数据设置为输入嵌入的权重数据
    return lin_layer


def convert_fairseq_xglm_checkpoint_from_disk(checkpoint_path):
    # 定义函数，从Fairseq的检查点文件中加载模型并转换为XGLM模型

    # 加载检查点文件
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    args = Namespace(**checkpoint["cfg"]["model"])   # 从检查点中读取模型参数并创建命名空间对象
    state_dict = checkpoint["model"]                 # 从检查点中读取模型的状态字典
    remove_ignore_keys_(state_dict)                  # 调用函数移除状态字典中的特定键

    vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]   # 获取词汇表大小

    # 重命名状态字典中的键，将"decoder"替换为"model"
    state_dict = {key.replace("decoder", "model"): val for key, val in state_dict.items()}

    # 根据配置创建XGLMConfig对象
    config = XGLMConfig(
        vocab_size=vocab_size,
        max_position_embeddings=args.max_target_positions,
        num_layers=args.decoder_layers,
        attention_heads=args.decoder_attention_heads,
        ffn_dim=args.decoder_ffn_embed_dim,
        d_model=args.decoder_embed_dim,
        layerdrop=args.decoder_layerdrop,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        activation_function="gelu",
        scale_embedding=not args.no_scale_embedding,
        tie_word_embeddings=args.share_decoder_input_output_embed,
    )

    model = XGLMForCausalLM(config)     # 创建XGLM模型对象
    missing = model.load_state_dict(state_dict, strict=False)   # 加载状态字典到模型，允许不严格匹配
    print(missing)                     # 打印加载时缺失的键信息
    model.lm_head = make_linear_from_emb(model.model.embed_tokens)   # 根据嵌入矩阵创建线性头部

    return model                        # 返回转换后的XGLM模型对象


if __name__ == "__main__":
    parser = argparse.ArgumentParser()   # 创建参数解析器对象
    # 添加必需的命令行参数
    parser.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.")
    parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    args = parser.parse_args()           # 解析命令行参数
    model = convert_fairseq_xglm_checkpoint_from_disk(args.fairseq_path)   # 转换Fairseq检查点文件为XGLM模型
    model.save_pretrained(args.pytorch_dump_folder_path)   # 将模型保存到指定路径

`.\models\xglm\modeling_flax_xglm.py`

# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Flax XGLM model."""

# 导入需要的库和模块
import math
import random
from functools import partial
from typing import Optional, Tuple

import flax.linen as nn  # 导入Flax的linen模块，通常用来定义神经网络模型
import jax  # 导入JAX，用于自动求导和数组计算
import jax.numpy as jnp  # 导入JAX的NumPy接口，用于数组操作
import numpy as np  # 导入NumPy，通用的数值计算库
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze  # 导入Flax的FrozenDict，用于不可变字典的操作
from flax.linen import combine_masks, make_causal_mask  # 导入Flax的函数和类
from flax.linen.attention import dot_product_attention_weights  # 导入Flax的注意力机制函数
from flax.traverse_util import flatten_dict, unflatten_dict  # 导入Flax的工具函数，用于字典扁平化和反扁平化
from jax import lax  # 导入JAX的lax模块，用于定义和执行JAX原语
from jax.random import PRNGKey  # 导入JAX的随机数生成器

from ...modeling_flax_outputs import (
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxCausalLMOutputWithCrossAttentions,
)  # 导入自定义的Flax模型输出类
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring  # 导入自定义的Flax模型和工具函数
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging  # 导入自定义的工具函数和日志模块
from .configuration_xglm import XGLMConfig  # 导入XGLM模型的配置文件

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"  # 文档中的预训练模型名
_CONFIG_FOR_DOC = "XGLMConfig"  # 文档中的配置文件名

XGLM_START_DOCSTRING = r"""
    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
    # 定义一个函数，接受以下参数:
    #   config (`XGLMConfig`)：包含模型所有参数的配置类。
    #       使用配置文件初始化不会加载与模型关联的权重，只加载配置。
    #       可以查看 [`~FlaxPreTrainedModel.from_pretrained`] 方法来加载模型权重。
    #   dtype (`jax.numpy.dtype`, *可选*, 默认为 `jax.numpy.float32`)：
    #       计算的数据类型。可以是 `jax.numpy.float32`、`jax.numpy.float16`（在GPU上）、`jax.numpy.bfloat16`（在TPU上）之一。
    #
    #       这可以用于在GPU或TPU上启用混合精度训练或半精度推断。如果指定，则所有计算将使用给定的 `dtype` 执行。
    #
    #       **注意，这仅指定计算的dtype，不影响模型参数的dtype。**
    #
    #       如果要更改模型参数的dtype，请参阅 [`~FlaxPreTrainedModel.to_fp16`] 和 [`~FlaxPreTrainedModel.to_bf16`]。
"""
XGLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


def create_sinusoidal_positions(n_pos, dim, padding_idx=1):
    # Calculate half of the dimension for sinusoidal embedding
    half_dim = dim // 2
    # Compute the exponential term for sinusoidal embedding
    emb = math.log(10000) / (half_dim - 1)
    emb = np.exp(np.arange(half_dim) * -emb)
    # Expand dimensions to perform element-wise multiplication
    emb = np.expand_dims(np.arange(n_pos), 1) * np.expand_dims(emb, 0)
    # Concatenate sine and cosine transformations of embeddings
    emb = np.concatenate([np.sin(emb), np.cos(emb)], 1)
    # Reshape the embedding to match desired dimensions
    emb = np.reshape(emb, (n_pos, dim))

    # If padding index is specified, zero out its embedding
    if padding_idx is not None:
        emb[padding_idx, :] = 0

    # Convert embedding to JAX array
    return jnp.array(emb)


class FlaxXGLMAttention(nn.Module):
    config: XGLMConfig
    embed_dim: int
    num_heads: int
    dropout: float = 0.0
    causal: bool = False
    bias: bool = True
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    def setup(self) -> None:
        # 计算每个头部的维度
        self.head_dim = self.embed_dim // self.num_heads

        # 检查 embed_dim 是否能被 num_heads 整除，否则抛出数值错误
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} "
                f"and `num_heads`: {self.num_heads})."
            )

        # 定义部分应用了部分参数的 Dense 层构造函数
        dense = partial(
            nn.Dense,
            self.embed_dim,
            use_bias=self.bias,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )

        # 初始化查询、键、值、输出投影层
        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
        self.out_proj = dense()

        # 初始化 Dropout 层
        self.dropout_layer = nn.Dropout(rate=self.dropout)

        # 如果需要引入因果注意力机制，则创建对应的因果掩码
        if self.causal:
            self.causal_mask = make_causal_mask(
                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
            )

    def _split_heads(self, hidden_states):
        # 将隐藏状态张量按头部分割
        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        # 将分割后的头部重新合并
        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))

    @nn.compact
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        """
        # 检测是否正在初始化，通过检查是否存在缓存数据来判断
        is_initialized = self.has_variable("cache", "cached_key")
        # 获取或创建缓存的键值对应的变量，如果不存在则创建一个全零数组
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        # 获取或创建缓存的值对应的变量，如果不存在则创建一个全零数组
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        # 获取或创建缓存索引对应的变量，如果不存在则创建一个值为0的整数数组
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            # 提取当前缓存的维度信息，包括批次维度、最大长度、头数、每头深度
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # 使用新的一维空间切片更新键和值的缓存
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)
            # 更新缓存中的键和值
            cached_key.value = key
            cached_value.value = value
            # 更新缓存索引，增加已更新的缓存向量数目
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # 用于缓存的因果掩码：我们的单个查询位置应该只关注已生成和缓存的键位置，而不是剩余的零元素。
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            # 将因果掩码和传入的注意力掩码结合起来
            attention_mask = combine_masks(pad_mask, attention_mask)
        # 返回更新后的键、值和注意力掩码
        return key, value, attention_mask
    # 定义一个 FlaxXGLMDecoderLayer 类，继承自 nn.Module
    class FlaxXGLMDecoderLayer(nn.Module):
        # 类变量：XGLMConfig 类型的 config 变量
        config: XGLMConfig
        # 类变量：jnp.float32 类型的 dtype，默认为 jnp.float32
        dtype: jnp.dtype = jnp.float32

        # 初始化方法，无返回值
        def setup(self) -> None:
            # 实例变量：self.embed_dim 等于 config.d_model
            self.embed_dim = self.config.d_model
            # 实例变量：self.self_attn 是一个 FlaxXGLMAttention 实例
            # 根据给定的 config 参数进行初始化
            self.self_attn = FlaxXGLMAttention(
                config=self.config,
                embed_dim=self.embed_dim,
                num_heads=self.config.attention_heads,
                dropout=self.config.attention_dropout,
                causal=True,
                dtype=self.dtype,
            )
            # 实例变量：self.self_attn_layer_norm 是一个 LayerNorm 实例
            # 根据 dtype 参数进行初始化
            self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
            # 实例变量：self.dropout_layer 是一个 Dropout 层实例
            # 根据 config.dropout 参数进行初始化
            self.dropout_layer = nn.Dropout(rate=self.config.dropout)
            # 实例变量：self.activation_fn 是一个激活函数，根据 config.activation_function 选择
            self.activation_fn = ACT2FN[self.config.activation_function]
            # 实例变量：self.activation_dropout_layer 是一个 Dropout 层实例
            # 根据 config.activation_dropout 参数进行初始化
            self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)

            # 如果 config.add_cross_attention 为 True，则初始化下面的变量
            if self.config.add_cross_attention:
                # 实例变量：self.encoder_attn 是一个 FlaxXGLMAttention 实例
                # 根据给定的 config 参数进行初始化
                self.encoder_attn = FlaxXGLMAttention(
                    config=self.config,
                    embed_dim=self.embed_dim,
                    num_heads=self.config.decoder_attention_heads,
                    dropout=self.config.attention_dropout,
                    dtype=self.dtype,
                )
                # 实例变量：self.encoder_attn_layer_norm 是一个 LayerNorm 实例
                # 根据 dtype 参数进行初始化
                self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

            # 实例变量：self.fc1 是一个全连接层实例
            # 输入维度为 self.config.ffn_dim，输出维度为 self.embed_dim
            # 根据 dtype 参数和 self.config.init_std 进行初始化
            self.fc1 = nn.Dense(
                self.config.ffn_dim,
                dtype=self.dtype,
                kernel_init=jax.nn.initializers.normal(self.config.init_std),
            )
            # 实例变量：self.fc2 是一个全连接层实例
            # 输入维度为 self.embed_dim，输出维度为 self.embed_dim
            # 根据 dtype 参数和 self.config.init_std 进行初始化
            self.fc2 = nn.Dense(
                self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
            )
            # 实例变量：self.final_layer_norm 是一个 LayerNorm 实例
            # 根据 dtype 参数进行初始化
            self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

        # 重写 __call__ 方法，用于实例调用时的行为
        # 可以接收多种输入参数并处理
        # 来自 transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer.__call__
        def __call__(
            self,
            hidden_states: jnp.ndarray,  # 输入的隐藏状态，类型为 jnp.ndarray
            attention_mask: jnp.ndarray,  # 注意力掩码，类型为 jnp.ndarray
            encoder_hidden_states: Optional[jnp.ndarray] = None,  # 编码器的隐藏状态，可选参数，默认为 None
            encoder_attention_mask: Optional[jnp.ndarray] = None,  # 编码器的注意力掩码，可选参数，默认为 None
            init_cache: bool = False,  # 是否初始化缓存，类型为布尔值，默认为 False
            output_attentions: bool = True,  # 是否输出注意力权重，类型为布尔值，默认为 True
            deterministic: bool = True,  # 是否确定性计算，类型为布尔值，默认为 True
            # 返回值类型为 Tuple[jnp.ndarray, Optional[jnp.ndarray]]
            # 其中第一个元素为输出的隐藏状态，第二个元素为注意力权重，可选
            ) -> Tuple[jnp.ndarray, Optional[jnp.ndarray]]:
        ) -> Tuple[jnp.ndarray]:
        # 保存残差连接（Residual Connection）的输入隐藏状态
        residual = hidden_states
        # 应用自注意力机制前的层归一化
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # 自注意力机制
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
        )
        # 应用 dropout
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
        # 添加残差连接
        hidden_states = residual + hidden_states

        # 交叉注意力块
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            # 保存残差连接
            residual = hidden_states

            # 应用编码器注意力块前的层归一化
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            # 应用编码器注意力机制
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            # 应用 dropout
            hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
            # 添加残差连接
            hidden_states = residual + hidden_states

        # 全连接层
        residual = hidden_states
        # 应用最终层归一化
        hidden_states = self.final_layer_norm(hidden_states)
        # 应用激活函数
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        # 应用激活函数后的 dropout
        hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
        # 应用最后的线性变换
        hidden_states = self.fc2(hidden_states)
        # 应用 dropout
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
        # 添加残差连接
        hidden_states = residual + hidden_states

        # 准备输出
        outputs = (hidden_states,)

        # 如果需要输出注意力权重，则添加到输出中
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs
class FlaxXGLMDecoderLayerCollection(nn.Module):
    config: XGLMConfig
    dtype: jnp.dtype = jnp.float32  # 计算时使用的数据类型

    def setup(self):
        # 初始化所有的解码器层，并根据配置添加到层列表中
        self.layers = [
            FlaxXGLMDecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_layers)
        ]
        # 设置层间隔概率（LayerDrop）
        self.layerdrop = self.config.layerdrop

    def __call__(
        self,
        hidden_states,
        attention_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 如果需要输出隐藏状态，则初始化存储所有隐藏状态的元组
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力权重，则初始化存储所有自注意力权重的元组
        all_self_attns = () if output_attentions else None
        # 如果需要输出交叉注意力权重且存在编码器隐藏状态，则初始化存储所有交叉注意力权重的元组
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # 遍历所有解码器层
        for decoder_layer in self.layers:
            if output_hidden_states:
                # 如果需要输出隐藏状态，则将当前隐藏状态添加到存储所有隐藏状态的元组中
                all_hidden_states += (hidden_states,)
                # 添加层间隔概率（LayerDrop），详见论文 https://arxiv.org/abs/1909.11556
            dropout_probability = random.uniform(0, 1)
            if not deterministic and (dropout_probability < self.layerdrop):
                # 如果不是确定性计算且随机丢弃概率小于层间隔概率，则设置层输出为None
                layer_outputs = (None, None, None)
            else:
                # 否则，调用当前解码器层进行前向计算
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    init_cache=init_cache,
                    output_attentions=output_attentions,
                    deterministic=deterministic,
                )

            # 更新当前隐藏状态为解码器层的输出的第一个元素
            hidden_states = layer_outputs[0]
            if output_attentions:
                # 如果需要输出注意力权重，则将当前解码器层的自注意力权重添加到存储所有自注意力权重的元组中
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    # 如果存在编码器隐藏状态，则将当前解码器层的交叉注意力权重添加到存储所有交叉注意力权重的元组中
                    all_cross_attentions += (layer_outputs[2],)

        # 添加来自最后一个解码器层的隐藏状态
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        # 构建模型输出，根据需要返回不同的数据结构
        outputs = (hidden_states, all_hidden_states, all_self_attns, all_cross_attentions)

        if not return_dict:
            # 如果不需要返回字典形式的输出，则只返回非空的元组元素
            return tuple(v for v in outputs if v is not None)

        # 否则，返回包含各类注意力权重和隐藏状态的字典形式的输出
        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


class FlaxXGLMModule(nn.Module):
    config: XGLMConfig
    dtype: jnp.dtype = jnp.float32  # 计算时使用的数据类型
    # 设置模型的初始配置
    def setup(self):
        # 初始化 dropout 层
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)

        # 获取嵌入维度、填充索引、最大目标位置和嵌入缩放因子的配置信息
        embed_dim = self.config.d_model
        self.padding_idx = self.config.pad_token_id
        self.max_target_positions = self.config.max_position_embeddings
        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0

        # 创建词嵌入矩阵，指定词汇表大小和嵌入维度，使用正态分布初始化
        self.embed_tokens = nn.Embed(
            self.config.vocab_size,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
        )

        # XGLM 模型的特殊设置：如果指定了填充索引，将嵌入 id 偏移 2，并相应调整 num_embeddings
        # 其他模型不需要此调整
        self.offset = 2
        # 创建 sinusoidal 位置嵌入，考虑偏移量和嵌入维度
        self.embed_positions = create_sinusoidal_positions(
            self.config.max_position_embeddings + self.offset, embed_dim
        )
        
        # 初始化 XGLM 解码器层集合
        self.layers = FlaxXGLMDecoderLayerCollection(self.config, self.dtype)
        # 初始化 LayerNorm 层，设置类型和 epsilon 值
        self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    # 定义模型调用方法
    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
        ):
            # 获取输入张量的形状
            input_shape = input_ids.shape
            # 将输入张量重新整形为二维张量，保留最后一个维度不变
            input_ids = input_ids.reshape(-1, input_shape[-1])

            # 使用模型的词嵌入层对输入张量进行嵌入，并乘以嵌入缩放因子
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

            # 嵌入位置信息
            position_ids = position_ids + self.offset
            positions = jnp.take(self.embed_positions, position_ids, axis=0)

            # 将词嵌入和位置嵌入相加得到隐藏状态
            hidden_states = inputs_embeds + positions
            # 使用 dropout 层对隐藏状态进行处理，根据 deterministic 参数确定是否使用确定性的 dropout
            hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)

            # 将隐藏状态传入模型的层中进行处理
            outputs = self.layers(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                deterministic=deterministic,
                init_cache=init_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            # 获取模型输出中的最后一个隐藏状态，并进行层归一化处理
            last_hidden_states = outputs[0]
            last_hidden_states = self.layer_norm(last_hidden_states)

            hidden_states = None
            # 如果需要输出所有隐藏状态，则将其从模型输出中提取并添加最后一个隐藏状态
            if output_hidden_states:
                hidden_states = outputs[1]
                hidden_states = hidden_states[:-1] + (last_hidden_states,)

            # 根据 return_dict 决定如何返回模型输出
            if not return_dict:
                # 如果不需要返回字典形式的结果，则根据需要组合输出
                outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
                # 过滤掉空值并返回元组形式的结果
                return tuple(v for v in outputs if v is not None)

            # 如果需要返回字典形式的结果，则构建 FlaxBaseModelOutputWithPastAndCrossAttentions 对象
            return FlaxBaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=last_hidden_states,
                hidden_states=hidden_states,
                attentions=outputs.attentions,
                cross_attentions=outputs.cross_attentions,
            )
    # 定义 FlaxXGLMPreTrainedModel 类，继承自 FlaxPreTrainedModel 类
    class FlaxXGLMPreTrainedModel(FlaxPreTrainedModel):
        # 指定配置类为 XGLMConfig
        config_class = XGLMConfig
        # 指定基础模型前缀为 "model"
        base_model_prefix: str = "model"
        # 模块类默认为空
        module_class: nn.Module = None

        # 初始化方法，接受配置、输入形状、种子、数据类型等参数
        def __init__(
            self,
            config: XGLMConfig,
            input_shape: Tuple[int] = (1, 1),
            seed: int = 0,
            dtype: jnp.dtype = jnp.float32,
            _do_init: bool = True,
            **kwargs,
        ):
            # 使用模块类和其他参数初始化模块
            module = self.module_class(config=config, dtype=dtype, **kwargs)
            # 调用父类的初始化方法，传入配置、模块、输入形状、种子、数据类型等参数
            super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

        # 初始化权重方法，接受随机数种子、输入形状和参数字典等参数
        def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
            # 初始化输入张量
            input_ids = jnp.zeros(input_shape, dtype="i4")
            # 创建与 input_ids 类型相同的全1张量作为 attention_mask
            attention_mask = jnp.ones_like(input_ids)
            # 根据 input_ids 的形状广播生成位置编码张量
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
            # 切分随机数种子为 params_rng 和 dropout_rng
            params_rng, dropout_rng = jax.random.split(rng)
            # 创建随机数字典 rngs，用于参数和 dropout
            rngs = {"params": params_rng, "dropout": dropout_rng}

            # 如果配置中包含跨注意力机制
            if self.config.add_cross_attention:
                # 创建与 input_shape 和配置的嵌入维度大小相同的全0隐藏状态张量
                encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
                # 将 attention_mask 用作编码器的注意力掩码
                encoder_attention_mask = attention_mask
                # 使用模块的初始化方法进行初始化，传入随机数字典、input_ids、attention_mask、position_ids、隐藏状态张量及其注意力掩码
                module_init_outputs = self.module.init(
                    rngs,
                    input_ids,
                    attention_mask,
                    position_ids,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    return_dict=False,
                )
            else:
                # 否则，只使用 input_ids、attention_mask、position_ids 进行模块的初始化
                module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)

            # 获取随机初始化的模型参数
            random_params = module_init_outputs["params"]

            # 如果提供了预定义的参数，将随机参数与已有参数进行合并
            if params is not None:
                # 展平并解冻随机参数和已有参数
                random_params = flatten_dict(unfreeze(random_params))
                params = flatten_dict(unfreeze(params))
                # 将随机参数中缺失的键加入已有参数中
                for missing_key in self._missing_keys:
                    params[missing_key] = random_params[missing_key]
                self._missing_keys = set()
                # 冻结并重新构造参数字典
                return freeze(unflatten_dict(params))
            else:
                # 否则，直接返回随机初始化的参数
                return random_params

        # 初始化缓存方法，用于快速自回归解码
        def init_cache(self, batch_size, max_length):
            """
            Args:
                batch_size (`int`):
                    用于快速自回归解码的批处理大小。定义初始化缓存的批处理大小。
                max_length (`int`):
                    自回归解码的最大可能长度。定义初始化缓存的序列长度。
            """
            # 初始化用于检索缓存的输入变量
            input_ids = jnp.ones((batch_size, max_length), dtype="i4")
            # 创建与 input_ids 类型相同的全1张量作为 attention_mask
            attention_mask = jnp.ones_like(input_ids, dtype="i4")
            # 根据 input_ids 的形状广播生成位置编码张量
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

            # 使用模块的初始化方法初始化变量，包括 input_ids、attention_mask、position_ids，并请求返回缓存
            init_variables = self.module.init(
                jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
            )
            # 返回解冻后的初始化缓存
            return unfreeze(init_variables["cache"])
    # 将模型的前向传播方法装饰为添加文档字符串，用于模型输入参数的说明
    @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
    # 定义模型的调用方法，接受多个参数作为输入
    def __call__(
        self,
        input_ids: jnp.ndarray,  # 输入的token IDs，作为模型的输入
        attention_mask: Optional[jnp.ndarray] = None,  # 可选的注意力掩码，指示哪些token需要注意
        position_ids: Optional[jnp.ndarray] = None,  # 可选的位置IDs，用于指示token的位置信息
        encoder_hidden_states: Optional[jnp.ndarray] = None,  # 可选的编码器隐藏状态，用于encoder-decoder模型
        encoder_attention_mask: Optional[jnp.ndarray] = None,  # 可选的编码器注意力掩码
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出所有层的隐藏状态
        return_dict: Optional[bool] = None,  # 是否以字典形式返回结果
        train: bool = False,  # 是否处于训练模式
        params: dict = None,  # 模型参数字典
        past_key_values: dict = None,  # 过去的键值，用于存储前一次的状态信息
        dropout_rng: PRNGKey = None,  # 随机数生成器，用于Dropout层的随机掩码
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if encoder_hidden_states is not None and encoder_attention_mask is None:
            batch_size, sequence_length = encoder_hidden_states.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))

        # 准备编码器的输入
        # 如果 attention_mask 为空，则使用与 input_ids 相同形状的全 1 数组
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        # 如果 position_ids 为空，则广播形状为 (batch_size, sequence_length) 的序列长度数组
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # 如果需要处理任何伪随机数生成器 (PRNG)，则构建相应的字典
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        inputs = {"params": params or self.params}

        # 如果 past_key_values 被传递，则初始化了缓存，并传递一个私有标志 init_cache 以确保使用缓存。
        # 必须确保缓存被标记为可变，以便 FlaxXGLMAttention 模块可以更改它。
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        # 调用模块的 apply 方法，传递输入参数
        outputs = self.module.apply(
            inputs,
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
        )

        # 将更新后的缓存添加到模型输出中
        if past_key_values is not None and return_dict:
            outputs, past_key_values = outputs
            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs, past_key_values = outputs
            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]

        # 返回模型输出
        return outputs
# 为了给 FlaxXGLMModel 类添加文档字符串，指定它输出原始隐藏状态而没有特定的顶部头部。
@add_start_docstrings(
    "The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
    XGLM_START_DOCSTRING,
)
class FlaxXGLMModel(FlaxXGLMPreTrainedModel):
    module_class = FlaxXGLMModule


# 添加调用示例的文档字符串给 FlaxXGLMModel 类
append_call_sample_docstring(
    FlaxXGLMModel,
    _CHECKPOINT_FOR_DOC,
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    _CONFIG_FOR_DOC,
)


class FlaxXGLMForCausalLMModule(nn.Module):
    config: XGLMConfig
    dtype: jnp.dtype = jnp.float32  # 计算的数据类型

    def setup(self):
        # 使用配置和数据类型初始化 FlaxXGLMModule 模型
        self.model = FlaxXGLMModule(self.config, self.dtype)
        # 初始化语言模型头部，是一个全连接层，不使用偏置
        self.lm_head = nn.Dense(
            self.config.vocab_size,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        # 调用模型进行前向传播
        outputs = self.model(
            input_ids,
            attention_mask,
            position_ids,
            encoder_hidden_states,
            encoder_attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        # 如果配置要求词嵌入共享，则使用共享的嵌入层参数进行计算
        if self.config.tie_word_embeddings:
            shared_embedding = self.model.variables["params"]["embed_tokens"]["embedding"]
            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            # 否则直接使用语言模型头部进行计算
            lm_logits = self.lm_head(hidden_states)

        # 如果不需要返回字典格式，则返回元组
        if not return_dict:
            return (lm_logits,) + outputs[1:]

        # 返回带有交叉注意力输出的 FlaxCausalLMOutputWithCrossAttentions 对象
        return FlaxCausalLMOutputWithCrossAttentions(
            logits=lm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


# 为 FlaxXGLMForCausalLM 类添加文档字符串，描述其为带有语言建模头部的 XGLM 模型变换器
@add_start_docstrings(
    """
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XGLM_START_DOCSTRING,
)
class FlaxXGLMForCausalLM(FlaxXGLMPreTrainedModel):
    module_class = FlaxXGLMForCausalLMModule
    # 为生成准备输入数据，初始化缓存
    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
        # 获取输入张量的批量大小和序列长度
        batch_size, seq_length = input_ids.shape

        # 使用 self.init_cache 方法初始化过去键值对
        past_key_values = self.init_cache(batch_size, max_length)
        
        # 创建一个扩展的注意力掩码，初始化为全1数组
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
        
        # 如果给定了 attention_mask，则根据其累积和更新位置 ID，并将 attention_mask 的值复制到扩展的注意力掩码中对应位置
        if attention_mask is not None:
            position_ids = attention_mask.cumsum(axis=-1) - 1
            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
        else:
            # 否则，根据序列长度广播生成位置 ID
            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))

        # 返回包含过去键值对、扩展注意力掩码和位置 ID 的字典
        return {
            "past_key_values": past_key_values,
            "attention_mask": extended_attention_mask,
            "position_ids": position_ids,
        }

    # 更新生成的输入数据，将模型输出的过去键值对和更新后的位置 ID 存入 model_kwargs 中
    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
        return model_kwargs
# 调用函数 `append_call_sample_docstring`，将以下参数传递给它：
# - FlaxXGLMForCausalLM: 作为第一个参数传递的类或函数
# - _CHECKPOINT_FOR_DOC: 作为第二个参数传递的变量或值
# - FlaxCausalLMOutputWithCrossAttentions: 作为第三个参数传递的类或函数
# - _CONFIG_FOR_DOC: 作为第四个参数传递的变量或值
append_call_sample_docstring(
    FlaxXGLMForCausalLM,  # 第一个参数，传递类或函数 FlaxXGLMForCausalLM
    _CHECKPOINT_FOR_DOC,   # 第二个参数，传递变量或值 _CHECKPOINT_FOR_DOC
    FlaxCausalLMOutputWithCrossAttentions,  # 第三个参数，传递类或函数 FlaxCausalLMOutputWithCrossAttentions
    _CONFIG_FOR_DOC,  # 第四个参数，传递变量或值 _CONFIG_FOR_DOC
)

`.\models\xglm\modeling_tf_xglm.py`

# coding=utf-8
# Copyright 2021 The Fairseq Authors The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 XGLM model."""

# 导入所需的模块和库
from __future__ import annotations

import math
import random
from typing import Any, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation

# Public API
from ...file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions, TFCausalLMOutputWithCrossAttentions
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFModelInputType,
    TFPreTrainedModel,
    TFSharedEmbeddings,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import logging
from .configuration_xglm import XGLMConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的模型检查点和配置
_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"
_CONFIG_FOR_DOC = "XGLMConfig"

# 预训练模型的存档列表
TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/xglm-564M",
    # See all XGLM models at https://huggingface.co/models?filter=xglm
]

# 定义一个大的负数常量
LARGE_NEGATIVE = -1e8

# 创建正弦位置编码
def create_sinusoidal_positions(num_positions: int, embedding_dim: int, padding_idx: Optional[int]) -> tf.Tensor:
    half_dim = embedding_dim // 2
    emb = math.log(10000) / (half_dim - 1)
    emb = tf.exp(tf.range(half_dim, dtype=tf.float32) * -emb)
    emb = tf.expand_dims(tf.range(num_positions, dtype=tf.float32), axis=1) * tf.expand_dims(emb, axis=0)
    emb = tf.reshape(tf.concat([tf.sin(emb), tf.cos(emb)], axis=1), (num_positions, -1))
    if embedding_dim % 2 == 1:
        # 如果embedding_dim是奇数，需要在末尾补零
        emb = tf.concat([emb, tf.zeros((num_positions, 1))], axis=1)
    if padding_idx is not None:
        # 创建用于填充位置的掩码，确保填充位置的位置编码为零
        _padding_mask = tf.concat(
            [
                tf.ones((padding_idx, shape_list(emb)[1])),
                tf.zeros((1, shape_list(emb)[1])),
                tf.ones((shape_list(emb)[0] - padding_idx - 1, shape_list(emb)[1])),
            ],
            axis=0,
        )
        emb *= _padding_mask

    return tf.constant(emb, name="embed_positions")


# 从输入ID创建位置ID
def _create_position_ids_from_input_ids(
    input_ids: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int]
) -> tf.Tensor:
    """
    根据输入的token IDs创建位置 IDs

    Args:
        input_ids (tf.Tensor): 输入的token IDs
        past_key_values_length (int): 过去key values的长度
        padding_idx (Optional[int]): 填充的索引位置

    Returns:
        tf.Tensor: 对应的位置 IDs
    """
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.
    """
    # 使用 TensorFlow 的 where 函数创建一个掩码，标记非填充符号位置为1，填充符号位置为0
    mask = tf.where(input_ids != padding_idx, 1, 0)
    # 计算增量索引，累积非填充位置的数量，并加上过去键值长度，乘以掩码确保只对非填充符号操作
    incremental_indices = (tf.cast(tf.cumsum(mask, axis=1), dtype=mask.dtype) + past_key_values_length) * mask
    # 将增量索引转换为 int64 类型，并加上填充索引，以得到最终的位置编码
    return tf.cast(incremental_indices, dtype=tf.int64) + padding_idx
# 定义一个函数，根据输入的嵌入向量和过去的键值对长度，生成位置ID张量
def _create_position_ids_from_inputs_embeds(
    inputs_embeds: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int]
) -> tf.Tensor:
    """
    Args:
        inputs_embeds: 直接提供的嵌入向量张量
    Returns:
        tf.Tensor: 生成的位置ID张量
    """
    # 获取输入嵌入向量的形状
    input_shape = shape_list(inputs_embeds)[:-1]
    # 获取序列长度
    sequence_length = input_shape[1]

    # 生成从padding_idx + 1到sequence_length + padding_idx + 1的序列，数据类型为tf.int64
    position_ids = tf.range(padding_idx + 1, sequence_length + padding_idx + 1, dtype=tf.int64)

    # 将位置ID张量扩展为与输入形状相同的广播形式，并加上过去键值对长度
    return tf.broadcast_to(tf.expand_dims(position_ids, axis=0), input_shape) + past_key_values_length


# 从transformers.models.bart.modeling_tf_bart._make_causal_mask复制而来
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    创建用于双向自注意力的因果掩码。
    """
    # 获取批量大小
    bsz = input_ids_shape[0]
    # 目标长度为输入形状的第二维度长度
    tgt_len = input_ids_shape[1]
    # 创建一个初始化为-LARGE_NEGATIVE的全1掩码
    mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
    mask_cond = tf.range(shape_list(mask)[-1])

    # 将掩码中对角线以下的元素设为0
    mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)

    # 如果过去的键值对长度大于0，则在掩码的左侧填充0
    if past_key_values_length > 0:
        mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)

    return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))


# 从transformers.models.bart.modeling_tf_bart._expand_mask复制而来
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    将注意力掩码从 `[bsz, seq_len]` 扩展到 `[bsz, 1, tgt_seq_len, src_seq_len]`。
    """
    # 获取源序列长度
    src_len = shape_list(mask)[1]
    # 如果未提供目标序列长度，则默认为源序列长度
    tgt_len = tgt_len if tgt_len is not None else src_len
    one_cst = tf.constant(1.0)
    mask = tf.cast(mask, dtype=one_cst.dtype)
    # 在第二维度上扩展掩码
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE


# 从transformers.models.bart.modeling_tf_bart.TFXGLMAttention（将Bart更改为XGLM）复制而来
class TFXGLMAttention(keras.layers.Layer):
    """来自"Attention Is All You Need"的多头注意力"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
        ):
            # 调用父类初始化函数，并传递所有关键字参数
            super().__init__(**kwargs)
            # 设置嵌入维度
            self.embed_dim = embed_dim

            # 设置注意力头数
            self.num_heads = num_heads
            # 创建一个丢弃层，用于在训练时随机丢弃输入单元
            self.dropout = keras.layers.Dropout(dropout)
            # 计算每个注意力头的维度
            self.head_dim = embed_dim // num_heads
            # 如果 embed_dim 不能被 num_heads 整除，抛出数值错误
            if (self.head_dim * num_heads) != self.embed_dim:
                raise ValueError(
                    f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                    f" and `num_heads`: {num_heads})."
                )
            # 缩放因子，用于缩放注意力分数
            self.scaling = self.head_dim**-0.5
            # 是否为解码器的标志
            self.is_decoder = is_decoder

            # 创建用于键、查询、值和输出的全连接层
            self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
            self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
            self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
            self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")

        # 将张量重塑为指定形状的私有方法
        def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
            return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))

        # 前向传播函数，接收多个输入张量，并返回输出张量
        def call(
            self,
            hidden_states: tf.Tensor,
            key_value_states: tf.Tensor | None = None,
            past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
            attention_mask: tf.Tensor | None = None,
            layer_head_mask: tf.Tensor | None = None,
            training: Optional[bool] = False,
        ):
            # 如果已经构建过，则直接返回
            if self.built:
                return
            # 标记为已构建
            self.built = True
            # 如果存在 k_proj 属性，则构建 k_proj 层
            if getattr(self, "k_proj", None) is not None:
                with tf.name_scope(self.k_proj.name):
                    self.k_proj.build([None, None, self.embed_dim])
            # 如果存在 q_proj 属性，则构建 q_proj 层
            if getattr(self, "q_proj", None) is not None:
                with tf.name_scope(self.q_proj.name):
                    self.q_proj.build([None, None, self.embed_dim])
            # 如果存在 v_proj 属性，则构建 v_proj 层
            if getattr(self, "v_proj", None) is not None:
                with tf.name_scope(self.v_proj.name):
                    self.v_proj.build([None, None, self.embed_dim])
            # 如果存在 out_proj 属性，则构建 out_proj 层
            if getattr(self, "out_proj", None) is not None:
                with tf.name_scope(self.out_proj.name):
                    self.out_proj.build([None, None, self.embed_dim])
# TFXGLMDecoderLayer 类的构造函数，初始化一个解码器层
class TFXGLMDecoderLayer(keras.layers.Layer):
    def __init__(self, config: XGLMConfig, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # 设置嵌入维度为配置中的模型维度
        self.embed_dim = config.d_model
        # 初始化自注意力层对象，用于处理解码器自注意力机制
        self.self_attn = TFXGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            name="self_attn",
        )
        # 初始化 dropout 层，用于在训练过程中进行随机失活
        self.dropout = keras.layers.Dropout(config.dropout)
        # 根据配置获取激活函数，并设置激活函数对象
        self.activation_fn = get_tf_activation(config.activation_function)
        # 初始化激活函数后的dropout层，用于激活函数后进行随机失活
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)

        # 如果配置要求添加跨注意力，初始化编码器注意力层对象
        if config.add_cross_attention:
            self.encoder_attn = TFXGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                name="encoder_attn",
            )
            # 初始化编码器注意力层后的 LayerNormalization 层
            self.encoder_attn_layer_norm = keras.layers.LayerNormalization(
                epsilon=1e-5, name="encoder_attn_layer_norm"
            )

        # 初始化自注意力层后的 LayerNormalization 层
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        # 初始化全连接层1，用于前馈神经网络的第一层
        self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1")
        # 初始化全连接层2，用于前馈神经网络的第二层，输出维度为嵌入维度
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        # 初始化最终的 LayerNormalization 层
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        # 保存配置对象到实例变量中
        self.config = config

    # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call
    # 定义层的调用方法，接受多个输入和参数
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        encoder_hidden_states: tf.Tensor | None = None,
        encoder_attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        cross_attn_layer_head_mask: tf.Tensor | None = None,
        past_key_value: Tuple[tf.Tensor] | None = None,
        training: Optional[bool] = False,
        # 返回层的调用结果

        # 隐藏状态：当前层的输入张量
        hidden_states: tf.Tensor,
        # 注意力掩码：用于指定哪些位置的元素需要被注意，哪些不需要
        attention_mask: tf.Tensor | None = None,
        # 编码器隐藏状态：编码器层的输出张量
        encoder_hidden_states: tf.Tensor | None = None,
        # 编码器注意力掩码：编码器层的注意力掩码张量
        encoder_attention_mask: tf.Tensor | None = None,
        # 层头掩码：指定每个注意力头的掩码张量
        layer_head_mask: tf.Tensor | None = None,
        # 跨注意力层头掩码：用于跨注意力的每个注意力头的掩码张量
        cross_attn_layer_head_mask: tf.Tensor | None = None,
        # 过去的键值对：用于缓存的过去键值对元组
        past_key_value: Tuple[tf.Tensor] | None = None,
        # 训练标志：指定是否处于训练模式
        training: Optional[bool] = False,
        # 返回：当前层的调用结果

        # 隐藏状态：当前层的输入张量
        hidden_states: tf.Tensor,
        # 注意力掩码：用于指定哪些位置的元素需要被注意，哪些不需要
        attention_mask: tf.Tensor | None = None,
        # 编码器隐藏状态：编码器层的输出张量
        encoder_hidden_states: tf.Tensor | None = None,
        # 编码器注意力掩码：编码器层的注意力掩码张量
        encoder_attention_mask: tf.Tensor | None = None,
        # 层头掩码：指定每个注意力头的掩码张量
        layer_head_mask: tf.Tensor | None = None,
        # 跨注意力层头掩码：用于跨注意力的每个注意力头的掩码张量
        cross_attn_layer_head_mask: tf.Tensor | None = None,
        # 过去的键值对：用于缓存的过去键值对元组
        past_key_value: Tuple[tf.Tensor] | None = None,
        # 训练标志：指定是否处于训练模式，默认为 False
        training: Optional[bool] = False,
    ) -> tf.Tensor:
        # 返回当前层的调用结果，后续逻辑在具体模型调用时处理
        pass


注释：
    # 如果模型已经构建，则直接返回，避免重复构建
    def build(self, input_shape=None):
        if self.built:
            return
        
        # 标记模型已经构建
        self.built = True
        
        # 如果存在 self_attn 属性，则构建 self_attn 层，并设置作用域名称
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)
        
        # 如果存在 self_attn_layer_norm 属性，则构建 self_attn_layer_norm 层，并设置作用域名称
        if getattr(self, "self_attn_layer_norm", None) is not None:
            with tf.name_scope(self.self_attn_layer_norm.name):
                self.self_attn_layer_norm.build([None, None, self.embed_dim])
        
        # 如果存在 fc1 属性，则构建 fc1 层，并设置作用域名称
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.embed_dim])
        
        # 如果存在 fc2 属性，则构建 fc2 层，并设置作用域名称
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.ffn_dim])
        
        # 如果存在 final_layer_norm 属性，则构建 final_layer_norm 层，并设置作用域名称
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])
        
        # 如果存在 encoder_attn 属性，则构建 encoder_attn 层，并设置作用域名称
        if getattr(self, "encoder_attn", None) is not None:
            with tf.name_scope(self.encoder_attn.name):
                self.encoder_attn.build(None)
        
        # 如果存在 encoder_attn_layer_norm 属性，则构建 encoder_attn_layer_norm 层，并设置作用域名称
        if getattr(self, "encoder_attn_layer_norm", None) is not None:
            with tf.name_scope(self.encoder_attn_layer_norm.name):
                self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
# 使用 keras_serializable 装饰器将类标记为可序列化，以便可以序列化和反序列化
@keras_serializable
class TFXGLMMainLayer(keras.layers.Layer):
    # 设置配置类，用于该层的配置信息
    config_class = XGLMConfig

    # 初始化方法，接受配置对象和其他参数，继承父类的初始化方法
    def __init__(
        self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs, **kwargs: Any
    ) -> None:
        super().__init__(*inputs, **kwargs)

        # 将配置对象保存为类属性
        self.config = config
        # 设置填充标记的索引
        self.padding_idx = config.pad_token_id
        # 设置最大目标位置
        self.max_target_positions = config.max_position_embeddings
        # 如果配置为缩放嵌入，则计算嵌入比例
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        # 如果提供了嵌入标记，则使用提供的；否则创建一个新的共享嵌入层
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = TFSharedEmbeddings(
                config.vocab_size, config.d_model, self.padding_idx, name="embed_tokens"
            )

        # 设置偏移量为2，用于嵌入位置
        self.offset = 2
        # 创建正弦位置嵌入的权重矩阵
        self._embed_positions_weights = create_sinusoidal_positions(
            num_positions=config.max_position_embeddings + self.offset,
            embedding_dim=config.d_model,
            padding_idx=config.pad_token_id,
        )

        # 设置丢弃层，用于模型训练时的随机丢弃
        self.dropout = keras.layers.Dropout(config.dropout)
        # 创建多层解码器层的列表
        self.layers = [TFXGLMDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_layers)]
        # 设置层丢弃率
        self.layerdrop = config.layerdrop
        # 创建层归一化层，用于归一化层输出
        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    # 获取输入嵌入层对象
    def get_input_embeddings(self) -> TFSharedEmbeddings:
        return self.embed_tokens

    # 设置输入嵌入层对象
    def set_input_embeddings(self, value: TFSharedEmbeddings) -> None:
        self.embed_tokens = value

    # 准备解码器的注意力掩码
    def _prepare_decoder_attention_mask(
        self,
        attention_mask: tf.Tensor | None,
        input_shape: tf.TensorShape,
        past_key_values_length: int,
    ) -> tf.Tensor:
        # 创建因果掩码
        combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length)
        # 如果输入序列长度大于1，则使用创建的掩码；否则创建一个全1的掩码
        combined_attention_mask = tf.cond(
            input_shape[-1] > 1, lambda: combined_attention_mask, lambda: tf.ones_like(combined_attention_mask)
        )
        # 如果没有提供额外的注意力掩码，则直接返回组合的注意力掩码
        if attention_mask is None:
            return combined_attention_mask
        # 否则，根据目标序列长度扩展提供的注意力掩码，并与组合的注意力掩码相加
        expand_attention_mask = _expand_mask(attention_mask, tgt_len=input_shape[-1])
        return expand_attention_mask + combined_attention_mask

    # 嵌入位置信息到输入中
    def embed_positions(self, position_ids: np.ndarray | tf.Tensor | None = None) -> tf.Tensor:
        # 将位置 IDs 偏移量加到输入中
        position_ids += self.offset
        # 从位置权重矩阵中根据位置 IDs 获取对应的位置嵌入
        positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
        return positions

    # 解包输入的装饰器，用于解析传入的输入参数
    @unpack_inputs
    # 定义一个方法 `call`，用于执行模型推断或训练的操作，接受多个输入参数
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # 输入的 token IDs，可以是空值
        attention_mask: np.ndarray | tf.Tensor | None = None,  # 注意力掩码，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        position_ids: np.ndarray | tf.Tensor | None = None,  # 位置 IDs，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,  # 编码器隐藏状态，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,  # 编码器注意力掩码，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        head_mask: np.ndarray | tf.Tensor | None = None,  # 头部掩码，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,  # 交叉注意力头部掩码，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,  # 过去的键值对，可以是空值或包含 NumPy 数组或 TensorFlow 张量的元组
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # 输入嵌入向量，可以是 NumPy 数组或 TensorFlow 张量，也可以是空值
        use_cache: Optional[bool] = None,  # 是否使用缓存，可以是空值
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可以是空值
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可以是空值
        return_dict: Optional[bool] = None,  # 是否返回字典形式的结果，可以是空值
        training: Optional[bool] = False,  # 是否处于训练模式，默认为 False
        **kwargs: Any,  # 其它未指定参数，以字典形式收集
    ):
        # 模型建造方法，如果已经建造过则直接返回
        def build(self, input_shape=None):
            if self.built:
                return
            self.built = True
            # 如果存在层归一化，则进行层归一化的建造
            if getattr(self, "layer_norm", None) is not None:
                with tf.name_scope(self.layer_norm.name):
                    self.layer_norm.build([None, None, self.config.d_model])
            # 如果存在嵌入标记，则进行嵌入标记的建造
            if getattr(self, "embed_tokens", None) is not None:
                with tf.name_scope(self.embed_tokens.name):
                    self.embed_tokens.build(None)
            # 如果存在多层，则遍历每一层并建造
            if getattr(self, "layers", None) is not None:
                for layer in self.layers:
                    with tf.name_scope(layer.name):
                        layer.build(None)
# 导入必要的库和模块
class TFXGLMPreTrainedModel(TFPreTrainedModel):
    # 设置配置类，用于此模型的配置参数
    config_class = XGLMConfig
    # 模型基础名称前缀，通常是 "model"
    base_model_prefix = "model"


# XGLM_START_DOCSTRING 是一个原始字符串，用于文档化模型的基本信息和使用方法
XGLM_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`XGLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""

# XGLM_INPUTS_DOCSTRING 是一个原始字符串，用于文档化模型输入的信息，但在提供的代码中没有内容

# 使用装饰器 add_start_docstrings 将类的文档字符串与特定描述组合起来
@add_start_docstrings(
    "The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
    XGLM_START_DOCSTRING,
)
# TFXGLMModel 类继承自 TFXGLMPreTrainedModel 类，表示一个 Transformer 解码器模型
class TFXGLMModel(TFXGLMPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`TFXGLMDecoderLayer`]
    """
    """
    初始化函数，设置模型的配置和嵌入层参数，继承父类的初始化方法。

    Args:
        config: XGLMConfig 类型的配置对象
        embed_tokens: 可选的 TFSharedEmbeddings 类型的嵌入层参数
        *inputs: 可变数量的输入参数
        **kwargs: 可变数量的关键字参数
    """
    super().__init__(config, *inputs, **kwargs)

    # 使用给定的配置和嵌入层参数创建 TF 模型的主层
    self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model")

@unpack_inputs
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=TFBaseModelOutputWithPastAndCrossAttentions,
    config_class=_CONFIG_FOR_DOC,
)
def call(
    self,
    input_ids: TFModelInputType | None = None,
    attention_mask: np.ndarray | tf.Tensor | None = None,
    position_ids: np.ndarray | tf.Tensor | None = None,
    encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
    encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
    head_mask: np.ndarray | tf.Tensor | None = None,
    cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
    past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
    inputs_embeds: np.ndarray | tf.Tensor | None = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    training: Optional[bool] = False,
    **kwargs: Any,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
    """
    调用方法，用于模型的前向推断。

    Args:
        input_ids: TFModelInputType 类型或 None，输入的 token IDs
        attention_mask: np.ndarray 或 tf.Tensor 或 None，注意力遮罩
        position_ids: np.ndarray 或 tf.Tensor 或 None，位置 IDs
        encoder_hidden_states: np.ndarray 或 tf.Tensor 或 None，编码器隐藏状态
        encoder_attention_mask: np.ndarray 或 tf.Tensor 或 None，编码器注意力遮罩
        head_mask: np.ndarray 或 tf.Tensor 或 None，注意力头部遮罩
        cross_attn_head_mask: np.ndarray 或 tf.Tensor 或 None，跨注意力头部遮罩
        past_key_values: 可选的 Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]，过去的键值
        inputs_embeds: np.ndarray 或 tf.Tensor 或 None，输入的嵌入
        use_cache: 可选的 bool 类型，是否使用缓存
        output_attentions: 可选的 bool 类型，是否输出注意力权重
        output_hidden_states: 可选的 bool 类型，是否输出隐藏状态
        return_dict: 可选的 bool 类型，是否返回字典格式的输出
        training: 可选的 bool 类型，默认为 False，是否处于训练模式
        **kwargs: 其他关键字参数

    Returns:
        模型输出，可以是 TFBaseModelOutputWithPastAndCrossAttentions 类型或 tf.Tensor 的元组
    """
    # 调用模型的前向计算，传递所有参数
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        head_mask=head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )

    return outputs

def build(self, input_shape=None):
    """
    构建方法，用于建立模型的层次结构。

    Args:
        input_shape: 可选的输入形状信息
    """
    if self.built:
        return
    self.built = True
    # 如果存在模型对象，则在其命名空间下建立模型
    if getattr(self, "model", None) is not None:
        with tf.name_scope(self.model.name):
            self.model.build(None)
# 使用装饰器给类添加文档字符串，描述其作为带语言建模头部的 XGLM 模型转换器的特性
@add_start_docstrings(
    """
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XGLM_START_DOCSTRING,
)
class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
    # 模型在加载时忽略的键列表，用于处理缺失的情况
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = [
        r"model.embed_positions.weights",
        r"lm_head.weight",
    ]
    # 模型在保存时忽略的键列表，用于避免保存不必要的参数
    _keys_to_ignore_on_save = [
        r"model.embed_positions.weights",
    ]

    def __init__(
        self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs: Any, **kwargs: Any
    ) -> None:
        # 调用父类的初始化方法，传递配置和其他参数
        super().__init__(config, *inputs, **kwargs)

        # 创建模型主体层，并命名为 "model"
        self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model")
        
        # 创建语言建模头部，使用 Dense 层，不使用偏置，使用指定的初始化器初始化权重
        self.lm_head = keras.layers.Dense(
            config.vocab_size,
            use_bias=False,
            kernel_initializer=get_initializer(config.init_std),
            name="lm_head",
        )
        
        # 保存配置对象
        self.config = config

    # 返回语言建模头部
    def get_output_embeddings(self):
        return self.lm_head

    # 设置新的输出嵌入
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # 准备生成时的输入，根据参数和过去的键值决定输入的形式
    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
        # 如果有过去的键值，只使用输入的最后一个标记
        if past_key_values:
            inputs = tf.expand_dims(inputs[:, -1], -1)

        # 获取位置标识和注意力掩码
        position_ids = kwargs.get("position_ids", None)
        attention_mask = kwargs.get("attention_mask", None)

        # 如果有注意力掩码但没有位置标识，根据掩码计算位置标识
        if attention_mask is not None and position_ids is None:
            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
            if past_key_values:
                position_ids = tf.expand_dims(position_ids[:, -1], -1)

        # 返回准备好的输入字典
        return {
            "input_ids": inputs,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    # 使用装饰器来解包输入，并添加模型前向传播的文档字符串
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义一个方法 `call`，用于调用当前类的实例
    def call(
        # 输入模型的标识符，可以是 TensorFlow 模型输入类型或 None
        self,
        input_ids: TFModelInputType | None = None,
        # 注意力掩码，可以是 NumPy 数组或 TensorFlow 张量或 None
        attention_mask: np.ndarray | tf.Tensor | None = None,
        # 位置编码，可以是 NumPy 数组或 TensorFlow 张量或 None
        position_ids: np.ndarray | tf.Tensor | None = None,
        # 编码器隐藏状态，可以是 NumPy 数组或 TensorFlow 张量或 None
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        # 编码器注意力掩码，可以是 NumPy 数组或 TensorFlow 张量或 None
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        # 头部掩码，可以是 NumPy 数组或 TensorFlow 张量或 None
        head_mask: np.ndarray | tf.Tensor | None = None,
        # 跨注意力头部掩码，可以是 NumPy 数组或 TensorFlow 张量或 None
        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
        # 过去键值对，类型为可选的元组，每个元组包含 NumPy 数组或 TensorFlow 张量
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        # 输入嵌入，可以是 NumPy 数组或 TensorFlow 张量或 None
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        # 标签，可以是 NumPy 数组或 TensorFlow 张量或 None
        labels: np.ndarray | tf.Tensor | None = None,
        # 是否使用缓存，可以是布尔值或 None
        use_cache: Optional[bool] = None,
        # 是否输出注意力，可以是布尔值或 None
        output_attentions: Optional[bool] = None,
        # 是否输出隐藏状态，可以是布尔值或 None
        output_hidden_states: Optional[bool] = None,
        # 是否返回字典形式的结果，可以是布尔值或 None
        return_dict: Optional[bool] = None,
        # 是否处于训练模式，可以是布尔值，默认为 False
        training: Optional[bool] = False,
        # 其他参数，类型为任意
        **kwargs: Any,
    ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """

        # 调用模型进行前向传播，生成输出结果
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从模型输出的第一个元素中获取隐藏状态
        hidden_states = outputs[0]
        # 使用语言模型头部生成语言模型的逻辑(logits)
        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # 将标签向左移动一位，并且截断最后一个逻辑(token)
            labels = tf.concat(
                [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))],
                axis=-1,
            )
            # 计算损失
            loss = self.hf_compute_loss(labels, lm_logits)

        if not return_dict:
            # 如果不返回字典，按顺序返回结果
            output = (lm_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 返回带有交叉注意力的 TF 语言模型输出对象
        return TFCausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        # 标记模型已构建
        self.built = True
        if getattr(self, "model", None) is not None:
            # 构建模型
            with tf.name_scope(self.model.name):
                self.model.build(None)
        if getattr(self, "lm_head", None) is not None:
            # 构建语言模型头部
            with tf.name_scope(self.lm_head.name):
                self.lm_head.build([None, None, self.config.hidden_size])

    def tf_to_pt_weight_rename(self, tf_weight):
        if tf_weight == "lm_head.weight":
            # 重命名权重，将 tf 的 lm_head.weight 映射到 PyTorch 的 model.embed_tokens.weight
            return tf_weight, "model.embed_tokens.weight"
        else:
            return (tf_weight,)

`.\models\xglm\modeling_xglm.py`

# 设置文件编码为UTF-8
# 版权声明和许可协议，指定了代码的使用条款
# 导入必要的库和模块
# 导入了一些特定的类和函数用于模型定义和训练
""" PyTorch XGLM model."""

# 导入数学库
import math
# 导入类型提示工具
from typing import List, Optional, Tuple, Union

# 导入PyTorch库
import torch
# 导入PyTorch中的checkpoint功能
import torch.utils.checkpoint
# 导入PyTorch中的神经网络模块
from torch import nn
# 导入PyTorch中的交叉熵损失函数
from torch.nn import CrossEntropyLoss

# 导入自定义模块和函数
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
# 导入XGLM模型的配置类
from .configuration_xglm import XGLMConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 预设的模型检查点和配置信息
_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"
_CONFIG_FOR_DOC = "XGLMConfig"

# 预设的预训练模型列表
XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/xglm-564M",
    # 查看所有XGLM模型：https://huggingface.co/models?filter=xglm
]

# XGLM模型的开始文档字符串，描述了模型的继承和参数
XGLM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`XGLMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# XGLM模型的输入文档字符串，目前为空
XGLM_INPUTS_DOCSTRING = r"""
"""

# 定义一个用于生成任意长度正弦位置嵌入的模块
class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        # 初始偏移量
        self.offset = 2
        # 嵌入维度
        self.embedding_dim = embedding_dim
        # 填充索引，可选
        self.padding_idx = padding_idx
        # 生成位置权重
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
    # 定义一个方法，用于创建权重矩阵，用于位置编码或其他嵌入操作
    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        # 调用get_embedding方法获取嵌入权重
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        # 如果对象已有weights属性，将新创建的权重矩阵类型和设备与该属性相匹配
        if hasattr(self, "weights"):
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        # 注册权重为缓冲区，不会被视为模型的参数
        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        构建正弦嵌入。

        这与tensor2tensor中的实现相匹配，但与"Attention Is All You Need"第3.5节的描述略有不同。
        """
        # 计算正弦周期的半长度
        half_dim = embedding_dim // 2
        # 计算正弦函数的周期
        emb = math.log(10000) / (half_dim - 1)
        # 计算正弦嵌入
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        # 如果embedding_dim是奇数，进行零填充
        if embedding_dim % 2 == 1:
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        # 如果有padding_idx，则将对应位置的嵌入设置为零
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
        # 获取位置编码的批大小和序列长度
        bsz, seq_len = position_ids.size()
        # 将位置编码偏移量加到输入的位置编码上
        position_ids += self.offset

        # 扩展嵌入权重，如果需要的话。不使用`position_ids.max()`是为了保持torch.fx的兼容性。
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)

        # 根据位置编码选择对应的权重，并调整形状以匹配输入的bsz和seq_len，并返回不可变版本
        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
class XGLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        # 检查 embed_dim 必须能被 num_heads 整除，否则抛出 ValueError
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        
        # 缩放因子，用于缩放 Q、K、V 矩阵的值
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        # 线性变换层，用于计算 Q、K、V 的投影
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 将输入的 tensor 重塑为多头注意力所需的形状
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 此处定义注意力层的前向传播过程
        pass  # 实际的实现应当包括 Q、K、V 的计算、注意力分数的计算以及输出的组装


class XGLMDecoderLayer(nn.Module):
    def __init__(self, config: XGLMConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # 自注意力层，使用 XGLMAttention 类定义
        self.self_attn = XGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        if config.add_cross_attention:
            # 如果配置需要跨注意力，则定义一个额外的注意力层
            self.encoder_attn = XGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # 自注意力层和全连接层后的 LayerNorm
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    # 此处缺少 forward 方法的实现，应当在复制的代码中找到并补充其实现部分
    # 定义神经网络模型的前向传播方法，接收以下参数：
    # - hidden_states: 隐藏状态的张量，通常是当前层的输出
    # - attention_mask: 可选参数，用于指定哪些位置需要被屏蔽，以避免注意力机制处理这些位置
    # - encoder_hidden_states: 可选参数，编码器的隐藏状态张量，用于注意力机制中的计算
    # - encoder_attention_mask: 可选参数，编码器的注意力掩码张量，用于编码器-解码器注意力
    # - layer_head_mask: 可选参数，多头注意力机制中每个头部的掩码，以允许或禁止特定头部的计算
    # - cross_attn_layer_head_mask: 可选参数，用于跨层注意力的头部掩码，控制不同层之间的注意力计算
    # - past_key_value: 可选参数，包含过去键值状态的元组，用于在递归解码器中重用先前计算的键值
    # - output_attentions: 可选参数，布尔值，指示是否输出注意力权重
    # - use_cache: 可选参数，布尔值，指示是否使用缓存加速解码器的计算
class XGLMPreTrainedModel(PreTrainedModel):
    # 设置配置类为 XGLMConfig
    config_class = XGLMConfig
    # 模型基本名称前缀为 "model"
    base_model_prefix = "model"
    # 支持梯度检查点
    supports_gradient_checkpointing = True
    # 不分割的模块名称列表，包括 "XGLMDecoderLayer"
    _no_split_modules = ["XGLMDecoderLayer"]

    def _init_weights(self, module):
        # 初始化权重函数
        std = self.config.init_std
        # 如果模块是线性层
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            # 如果有偏置，则初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果模块是嵌入层
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            # 如果设置了填充索引，则对应位置初始化为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


@add_start_docstrings(
    "The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
    XGLM_START_DOCSTRING,
)
class XGLMModel(XGLMPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`XGLMDecoderLayer`]

    Args:
        config: XGLMConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: XGLMConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        # 丢弃率
        self.dropout = config.dropout
        # 层丢弃率
        self.layerdrop = config.layerdrop
        # 填充索引
        self.padding_idx = config.pad_token_id
        # 最大目标位置
        self.max_target_positions = config.max_position_embeddings
        # 嵌入尺度
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        # 如果提供了嵌入令牌，则使用提供的；否则初始化一个新的嵌入层
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        # 初始化位置编码
        self.embed_positions = XGLMSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            config.pad_token_id,
        )
        # 创建一系列解码层
        self.layers = nn.ModuleList([XGLMDecoderLayer(config) for _ in range(config.num_layers)])
        # 层归一化
        self.layer_norm = nn.LayerNorm(config.d_model)

        # 梯度检查点设为假
        self.gradient_checkpointing = False
        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self):
        # 获取输入嵌入层
        return self.embed_tokens

    def set_input_embeddings(self, value):
        # 设置输入嵌入层
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义模型的前向传播方法，接受多个输入参数
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的 token IDs，可以是 None 或者 torch.Tensor 类型
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码，用于指示哪些元素是 padding，可以是 None 或者 torch.Tensor 类型
        position_ids: Optional[torch.Tensor] = None,  # 位置编码，用于指示每个 token 的位置信息，可以是 None 或者 torch.Tensor 类型
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器的隐藏状态，可以是 None 或者 torch.Tensor 类型
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器的注意力掩码，可以是 None 或者 torch.Tensor 类型
        head_mask: Optional[torch.Tensor] = None,  # 多头注意力的掩码，可以是 None 或者 torch.Tensor 类型
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 跨注意力头的掩码，可以是 None 或者 torch.Tensor 类型
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 缓存的键值对，可以是 None 或者 List[torch.FloatTensor] 类型
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入向量，可以是 None 或者 torch.Tensor 类型
        use_cache: Optional[bool] = None,  # 是否使用缓存，可以是 None 或者 bool 类型
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可以是 None 或者 bool 类型
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可以是 None 或者 bool 类型
        return_dict: Optional[bool] = None,  # 是否返回字典格式的输出，可以是 None 或者 bool 类型
# 使用装饰器添加文档字符串，描述了 XGLM 模型转换器，带有一个在顶部的语言建模头部的线性层（其权重与输入嵌入层相绑定）。
@add_start_docstrings(
    """
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XGLM_START_DOCSTRING,
)
# 声明 XGLMForCausalLM 类，继承自 XGLMPreTrainedModel 类
class XGLMForCausalLM(XGLMPreTrainedModel):
    # 指定模型的前缀字符串
    base_model_prefix = "model"
    # 定义被绑定权重的键名列表
    _tied_weights_keys = ["lm_head.weight"]

    # 初始化方法，接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)
        # 创建 XGLMModel 类的实例并赋值给 self.model
        self.model = XGLMModel(config)
        # 创建一个线性层用于语言建模头部，输出大小为 config.vocab_size，无偏置
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # 调用后续初始化方法
        self.post_init()

    # 获取输入嵌入层的方法
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # 设置输入嵌入层的方法
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    # 获取输出嵌入层（语言建模头部）的方法
    def get_output_embeddings(self):
        return self.lm_head

    # 设置输出嵌入层（语言建模头部）的方法
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # 前向传播方法，接受多个输入参数
    @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 函数声明的参数列表未完，需要继续
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """

        # 设置是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 设置是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 设置是否返回字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        # 将输入传递给模型进行前向传播
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 通过语言模型头部生成逻辑回归结果
        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            # 调整标签并在末尾添加一个填充标记
            shift_labels = labels.new_zeros(labels.shape)
            shift_labels[:, :-1] = labels[:, 1:].clone()
            shift_labels[:, -1] = self.config.pad_token_id

            # 计算交叉熵损失
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            # 如果不返回字典格式的输出，则按顺序返回元组
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        # 如果返回字典格式的输出，则构建并返回带有交叉注意力的因果语言模型输出对象
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
        ):
            # 如果传入的过去键值不为None，则获取第一个键值对应的形状的第三个元素，即长度
            if past_key_values is not None:
                past_length = past_key_values[0][0].shape[2]

                # 如果输入的input_ids的第二个维度大于过去长度，则设定要移除的前缀长度为过去长度
                if input_ids.shape[1] > past_length:
                    remove_prefix_length = past_length
                else:
                    # 否则，默认保留最后一个ID，设定要移除的前缀长度为input_ids的第二个维度减1
                    remove_prefix_length = input_ids.shape[1] - 1

                # 重新设定input_ids为去除前缀后的部分
                input_ids = input_ids[:, remove_prefix_length:]

            position_ids = kwargs.get("position_ids", None)
            # 如果存在attention_mask且position_ids为None，则动态创建position_ids用于批量生成
            if attention_mask is not None and position_ids is None:
                position_ids = attention_mask.long().cumsum(-1) - 1
                position_ids.masked_fill_(attention_mask == 0, 1)
                # 如果存在过去键值，则截取最后input_ids.shape[1]列
                if past_key_values:
                    position_ids = position_ids[:, -input_ids.shape[1] :]
            else:
                position_ids = None
                # 如果模型作为编码器-解码器模型中的解码器使用，则动态创建解码器的attention_mask
                if attention_mask is None:
                    attention_mask = input_ids.new_ones(input_ids.shape)

            # 第一步，decoder_cached_states为空
            return {
                "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
            }

        @staticmethod
        def _reorder_cache(past_key_values, beam_idx):
            reordered_past = ()
            # 对过去的键值进行重新排序，根据beam_idx
            for layer_past in past_key_values:
                reordered_past += (
                    # 将每一层的过去状态按照beam_idx重新排序，并且将结果组合成一个元组
                    tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
                )
            return reordered_past

`.\models\xglm\tokenization_xglm.py`

# coding=utf-8
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for ."""

# 导入标准库 os 和 shutil 中的 copyfile 函数
import os
from shutil import copyfile
# 导入类型提示模块中的相关对象
from typing import Any, Dict, List, Optional, Tuple

# 导入 sentencepiece 库，用于处理基于 SentencePiece 的 tokenization
import sentencepiece as spm

# 导入父类 PreTrainedTokenizer 和 logging 工具
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging

# 获取当前模块的 logger 对象
logger = logging.get_logger(__name__)

# 定义 SentencePiece 使用的特殊 token
SPIECE_UNDERLINE = "▁"

# 定义 vocab 文件的名称映射，包含一个 vocab 文件的标准名称
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

# 定义预训练模型中的 vocab 文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/sentencepiece.bpe.model",
    }
}

# 定义预训练模型的位置编码大小映射
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/xglm-564M": 2048,
}

# 定义 XGLMTokenizer 类，继承自 PreTrainedTokenizer 类
class XGLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
    """
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    # 从全局常量中获取词汇文件名列表
    vocab_files_names = VOCAB_FILES_NAMES
    # 预训练词汇文件映射，指定了每个特殊词汇的文件路径
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 将预训练的位置嵌入大小赋值给max_model_input_sizes变量
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 定义模型输入的名称列表
    model_input_names = ["input_ids", "attention_mask"]

    # 初始化函数，用于创建一个新的实例对象
    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        # 如果sp_model_kwargs为None，则设为一个空字典
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # 兼容性处理，与原始分词器的兼容
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        # 获取kwargs中的additional_special_tokens列表，如果不存在则创建一个空列表
        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        # 将madeup_words中未在additional_special_tokens中的单词添加到additional_special_tokens中
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        # 使用指定的参数初始化SentencePieceProcessor对象
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        # 加载词汇文件到self.sp_model中
        self.sp_model.Load(str(vocab_file))
        # 将vocab_file保存到self.vocab_file中
        self.vocab_file = vocab_file

        # 原始fairseq词汇表和spm词汇表必须是“对齐”的：
        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

        # 在原始fairseq词汇表和spm词汇表之间进行对齐，第一个“真实”标记“,”在fairseq词汇表中位置为4，在spm词汇表中位置为3
        self.fairseq_offset = 1

        # 模仿fairseq的token-to-id对齐，对前4个token进行映射
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # 计算spm词汇表的大小
        sp_size = len(self.sp_model)
        # 创建一个字典，将madeup_words映射到fairseq词汇表之后的位置
        madeup_words = {f"<madeupword{i}>": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
        self.fairseq_tokens_to_ids.update(madeup_words)

        # 创建一个反向映射，从token id到token的映射
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        # 调用父类的初始化方法，传入相应的参数
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    # 序列化对象时调用的方法，返回对象的状态信息
    def __getstate__(self):
        state = self.__dict__.copy()
        # 将self.sp_model设置为None，因为它不能直接被序列化
        state["sp_model"] = None
        # 将self.sp_model的序列化模型信息保存到state中
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    # 反序列化对象时调用的方法，用于恢复对象的状态信息
    def __setstate__(self, d):
        # 恢复对象的状态信息
        self.__dict__ = d

        # 向后兼容性处理
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # 使用保存的sp_model_proto信息重新初始化self.sp_model
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        从序列或序列对构建用于序列分类任务的模型输入，通过连接并添加特殊标记。XLM-RoBERTa 序列的格式如下：

        - 单序列： `<s> X </s>`
        - 序列对： `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                要添加特殊标记的 ID 列表。
            token_ids_1 (`List[int]`, *optional*):
                可选的第二个 ID 列表，用于序列对。

        Returns:
            `List[int]`: 带有适当特殊标记的输入 ID 列表。
        """

        if token_ids_1 is None:
            # 如果只有一个序列，返回带有 SEP 特殊标记的 token_ids_0
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        # 如果有两个序列，返回连接的序列，每个序列末尾带有两个 SEP 特殊标记
        return sep + token_ids_0 + sep + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        从没有添加特殊标记的 token 列表中检索序列 ID。在使用 tokenizer 的 `prepare_for_model` 方法添加特殊标记时调用此方法。

        Args:
            token_ids_0 (`List[int]`):
                ID 列表。
            token_ids_1 (`List[int]`, *optional*):
                可选的第二个 ID 列表，用于序列对。
            already_has_special_tokens (`bool`, *optional*, 默认为 `False`):
                token 列表是否已经使用特殊标记格式化为模型。

        Returns:
            `List[int]`: 整数列表，范围为 [0, 1]：1 表示特殊标记，0 表示序列标记。
        """

        if already_has_special_tokens:
            # 如果已经有特殊标记，调用父类方法获取特殊标记掩码
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            # 如果只有一个序列，返回一个序列首部带有特殊标记的掩码
            return [1] + ([0] * len(token_ids_0))
        # 如果有两个序列，返回连接的序列，每个序列首尾带有特殊标记的掩码
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ):
        """
        从序列或序列对创建用于区分 token 类型的 token 类型 ID。

        Args:
            token_ids_0 (`List[int]`):
                第一个序列的 ID 列表。
            token_ids_1 (`List[int]`, *optional*):
                可选的第二个序列的 ID 列表，用于序列对。

        Returns:
            无返回值，该方法会生成用于区分 token 类型的 token 类型 ID。
        """
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """

        # Define a separator token list containing `self.sep_token_id`
        sep = [self.sep_token_id]

        # Check if token_ids_1 is None; if so, return a list of zeros based on the length of `sep + token_ids_0`
        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        
        # If token_ids_1 is provided, return a list of zeros based on the extended length of tokens including separators
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    @property
    def vocab_size(self):
        # Calculate and return the total vocabulary size, including fairseq offsets and made-up words
        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words

    def get_vocab(self):
        # Create a dictionary mapping from token strings to their corresponding IDs within the vocabulary
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)  # Update with additional tokens from `added_tokens_encoder`
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        # Tokenize the input `text` using `sp_model` and return a list of token strings
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an ID using the vocabulary."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Return the offset ID for unknown tokens if SP model returns 0 (indicating unknown token)
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocabulary."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        # Concatenate tokens into a single string, replacing SPIECE_UNDERLINE with spaces and stripping leading/trailing spaces
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Ensure `save_directory` exists; if not, log an error and return None
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        
        # Define the output vocabulary file path based on `save_directory` and `filename_prefix`
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # If the current `vocab_file` path is different from `out_vocab_file` and exists, copy `vocab_file` to `out_vocab_file`
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # If `vocab_file` does not exist, write `sp_model.serialized_model_proto()` content to `out_vocab_file`
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

`.\models\xglm\tokenization_xglm_fast.py`

# coding=utf-8
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for XGLM."""

import os  # 导入操作系统功能
from shutil import copyfile  # 导入复制文件功能
from typing import List, Optional, Tuple  # 导入类型提示相关的模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入速度快的预训练分词器类
from ...utils import is_sentencepiece_available, logging  # 导入判断是否安装了 sentencepiece 和日志记录相关的工具

if is_sentencepiece_available():
    from .tokenization_xglm import XGLMTokenizer  # 如果安装了 sentencepiece，则导入 XGLMTokenizer 类
else:
    XGLMTokenizer = None  # 否则将 XGLMTokenizer 设置为 None

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}  # 定义词汇文件和分词器文件的名称

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/sentencepiece.bpe.model",
    },
    "tokenizer_file": {
        "facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/tokenizer.json",
    },
}  # 预训练模型对应的词汇文件和分词器文件的映射

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/xglm-564M": 2048,
}  # 预训练模型对应的位置嵌入大小

class XGLMTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
    and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """
    """
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
    """

    # 映射不同文件名与对应的词汇表文件名
    vocab_files_names = VOCAB_FILES_NAMES
    # 预训练模型的词汇文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 预训练位置嵌入的最大输入尺寸
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 模型输入的名称列表
    model_input_names = ["input_ids", "attention_mask"]
    # 慢速分词器的类
    slow_tokenizer_class = XGLMTokenizer

    # 初始化函数，设置各种默认参数和额外参数
    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        **kwargs,
    """
    ):
        """
        Compatibility with the original tokenizer.
        Set the number of made-up words to 7 and generate a list of made-up words.
        Append any new made-up words to the 'additional_special_tokens' in kwargs.
        Initialize the superclass with various parameters including vocab_file and additional_special_tokens.
        """

        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )

        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        Check if the vocab_file exists to determine if the slow tokenizer can be saved.
        Returns True if vocab_file exists, False otherwise.
        """
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs for sequence classification tasks by adding special tokens.
        Formats sequences according to XLM-RoBERTa standards.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of IDs for the second sequence (optional).

        Returns:
            `List[int]`: List of input IDs with special tokens added.
        """

        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs for sequence-pair classification tasks.
        Always returns a list of zeros as XLM-RoBERTa does not use token type IDs.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of IDs for the second sequence (optional).

        Returns:
            `List[int]`: List of zeros (indicating no distinction in token types).
        """
        
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]
    # 定义一个方法用于保存词汇表到指定目录下的文件中，方法签名指定了参数和返回类型
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 如果当前的快速分词器没有保存慢速分词器所需的信息，则抛出数值错误异常
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # 如果保存词汇表的目录不存在，则记录错误日志并返回
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return

        # 构建输出词汇表文件的路径，包括可选的文件名前缀和文件名
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # 如果当前词汇表文件的绝对路径与输出文件的绝对路径不同，则复制当前词汇表文件到输出路径
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # 返回一个包含输出词汇表文件路径的元组
        return (out_vocab_file,)

`.\models\xglm\init.py`

# 导入必要的模块和函数
from typing import TYPE_CHECKING
# 从工具包中导入相关依赖项
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_sentencepiece_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义模块的导入结构，包含了所需的模块和函数
_import_structure = {"configuration_xglm": ["XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XGLMConfig"]}

# 检查是否存在 sentencepiece 库，如果不存在则抛出异常
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，则将 XGLMTokenizer 加入到导入结构中
    _import_structure["tokenization_xglm"] = ["XGLMTokenizer"]

# 检查是否存在 tokenizers 库，如果不存在则抛出异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，则将 XGLMTokenizerFast 加入到导入结构中
    _import_structure["tokenization_xglm_fast"] = ["XGLMTokenizerFast"]

# 检查是否存在 torch 库，如果不存在则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，则将 modeling_xglm 相关模块加入到导入结构中
    _import_structure["modeling_xglm"] = [
        "XGLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "XGLMForCausalLM",
        "XGLMModel",
        "XGLMPreTrainedModel",
    ]

# 检查是否存在 flax 库，如果不存在则抛出异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，则将 modeling_flax_xglm 相关模块加入到导入结构中
    _import_structure["modeling_flax_xglm"] = [
        "FlaxXGLMForCausalLM",
        "FlaxXGLMModel",
        "FlaxXGLMPreTrainedModel",
    ]

# 检查是否存在 tensorflow 库，如果不存在则抛出异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，则将 modeling_tf_xglm 相关模块加入到导入结构中
    _import_structure["modeling_tf_xglm"] = [
        "TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFXGLMForCausalLM",
        "TFXGLMModel",
        "TFXGLMPreTrainedModel",
    ]

# 如果是类型检查阶段，导入额外的类型定义和模块
if TYPE_CHECKING:
    from .configuration_xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig

    # 检查是否存在 sentencepiece 库，如果可用则导入 XGLMTokenizer
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_xglm import XGLMTokenizer

    # 检查是否存在 tokenizers 库，如果可用则导入 XGLMTokenizerFast
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_xglm_fast import XGLMTokenizerFast

    # 检查是否存在 torch 库，如果可用则导入相关模块
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    # 如果OptionalDependencyNotAvailable异常被抛出，则忽略并继续执行
    except OptionalDependencyNotAvailable:
        pass
    # 如果没有抛出异常，则导入相关模块和类
    else:
        from .modeling_xglm import XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, XGLMForCausalLM, XGLMModel, XGLMPreTrainedModel

    # 尝试检查是否Flax库可用，如果不可用则忽略并继续执行
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    # 如果OptionalDependencyNotAvailable异常被抛出，则忽略并继续执行
    except OptionalDependencyNotAvailable:
        pass
    # 如果没有抛出异常，则导入相关模块和类
    else:
        from .modeling_flax_xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel

    # 尝试检查是否TensorFlow库可用，如果不可用则忽略并继续执行
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    # 如果OptionalDependencyNotAvailable异常被抛出，则忽略并继续执行
    except OptionalDependencyNotAvailable:
        pass
    # 如果没有抛出异常，则导入相关模块和类
    else:
        from .modeling_tf_xglm import (
            TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFXGLMForCausalLM,
            TFXGLMModel,
            TFXGLMPreTrainedModel,
        )
else:
    # 导入内置模块 sys
    import sys

    # 将当前模块(__name__)的引用替换为一个延迟加载的模块对象 _LazyModule
    # _LazyModule 的构造参数依次为模块名称(__name__)、模块文件路径(__file__)、导入结构(_import_structure)
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\xlm\configuration_xlm.py`

# 导入所需的模块和类
from collections import OrderedDict  # 导入有序字典模块
from typing import Mapping  # 导入 Mapping 类型提示

from ...configuration_utils import PretrainedConfig  # 导入预训练配置基类
from ...onnx import OnnxConfig  # 导入 ONNX 配置
from ...utils import logging  # 导入日志工具

# 获取 logger 对象，用于记录日志
logger = logging.get_logger(__name__)

# 预训练模型配置文件映射表，将模型名称映射到其配置文件的 URL
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/config.json",
    "FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/config.json",
    "FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/config.json",
    "FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/config.json",
    "FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/config.json",
    "FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/config.json",
    "FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/config.json",
    "FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/config.json",
    "FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/config.json",
    "FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/config.json",
}

class XLMConfig(PretrainedConfig):
    """
    XLM 模型的配置类，用于存储 [`XLMModel`] 或 [`TFXLMModel`] 的配置信息。根据指定参数实例化一个 XLM 模型配置，定义模型的架构。
    使用默认值实例化配置将得到与 [FacebookAI/xlm-mlm-en-2048](https://huggingface.co/FacebookAI/xlm-mlm-en-2048) 架构类似的配置。

    配置对象继承自 [`PretrainedConfig`]，可用于控制模型输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。

    Examples:

    ```
    >>> from transformers import XLMConfig, XLMModel

    >>> # 初始化一个 XLM 配置
    >>> configuration = XLMConfig()

    >>> # 从配置初始化一个模型（随机权重）

    ```
    """
    # 创建一个 XLMModel 的实例，使用给定的 configuration 参数
    >>> model = XLMModel(configuration)

    # 访问模型配置信息
    >>> # Accessing the model configuration
    >>> configuration = model.config

    # 定义一个表示 XLM 模型类型的字符串变量
    model_type = "xlm"

    # 定义一个字典，将 XLM 模型属性名映射为别名
    attribute_map = {
        "hidden_size": "emb_dim",
        "num_attention_heads": "n_heads",
        "num_hidden_layers": "n_layers",
        "n_words": "vocab_size",  # 为了向后兼容性
    }

    # XLMConfig 类的构造函数，初始化 XLM 模型的各项配置参数
    def __init__(
        self,
        vocab_size=30145,
        emb_dim=2048,
        n_layers=12,
        n_heads=16,
        dropout=0.1,
        attention_dropout=0.1,
        gelu_activation=True,
        sinusoidal_embeddings=False,
        causal=False,
        asm=False,
        n_langs=1,
        use_lang_emb=True,
        max_position_embeddings=512,
        embed_init_std=2048**-0.5,
        layer_norm_eps=1e-12,
        init_std=0.02,
        bos_index=0,
        eos_index=1,
        pad_index=2,
        unk_index=3,
        mask_index=5,
        is_encoder=True,
        summary_type="first",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        start_n_top=5,
        end_n_top=5,
        mask_token_id=0,
        lang_id=0,
        pad_token_id=2,
        bos_token_id=0,
        **kwargs,
    ):
        """Constructs XLMConfig."""
        # 初始化 XLMConfig 对象的各个配置参数
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.gelu_activation = gelu_activation
        self.sinusoidal_embeddings = sinusoidal_embeddings
        self.causal = causal
        self.asm = asm
        self.n_langs = n_langs
        self.use_lang_emb = use_lang_emb
        self.layer_norm_eps = layer_norm_eps
        self.bos_index = bos_index
        self.eos_index = eos_index
        self.pad_index = pad_index
        self.unk_index = unk_index
        self.mask_index = mask_index
        self.is_encoder = is_encoder
        self.max_position_embeddings = max_position_embeddings
        self.embed_init_std = embed_init_std
        self.init_std = init_std
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_proj_to_labels = summary_proj_to_labels
        self.summary_first_dropout = summary_first_dropout
        self.start_n_top = start_n_top
        self.end_n_top = end_n_top
        self.mask_token_id = mask_token_id
        self.lang_id = lang_id

        # 如果 kwargs 中包含 'n_words' 参数，将其赋值给 self.n_words
        if "n_words" in kwargs:
            self.n_words = kwargs["n_words"]

        # 调用父类的构造函数，初始化基类的一些参数，如 pad_token_id 和 bos_token_id
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
# 从 transformers.models.bert.configuration_bert.BertOnnxConfig 复制过来的 XLMOnnxConfig 类
class XLMOnnxConfig(OnnxConfig):
    # 定义 inputs 属性，返回一个映射类型，其键为字符串，值为映射类型，映射类型的键为整数，值为字符串
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # 如果任务类型是 "multiple-choice"
        if self.task == "multiple-choice":
            # 设置动态轴的映射，0 对应 "batch"，1 对应 "choice"，2 对应 "sequence"
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            # 否则设置动态轴的映射，0 对应 "batch"，1 对应 "sequence"
            dynamic_axis = {0: "batch", 1: "sequence"}
        # 返回一个有序字典，包含三个键值对，键为字符串，值为 dynamic_axis 映射
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),         # 键为 "input_ids"，值为 dynamic_axis 映射
                ("attention_mask", dynamic_axis),    # 键为 "attention_mask"，值为 dynamic_axis 映射
                ("token_type_ids", dynamic_axis),    # 键为 "token_type_ids"，值为 dynamic_axis 映射
            ]
        )

`.\models\xlm\convert_xlm_original_pytorch_checkpoint_to_pytorch.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""


import argparse  # 导入用于解析命令行参数的模块
import json  # 导入处理 JSON 格式数据的模块

import numpy  # 导入处理数值运算的模块
import torch  # 导入 PyTorch 深度学习框架

from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES  # 导入 XLM 模型的词汇文件名
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging  # 导入配置文件、权重文件名以及日志模块


logging.set_verbosity_info()  # 设置日志输出级别为 info


def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
    # Load checkpoint 加载模型检查点文件
    chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")

    state_dict = chkpt["model"]  # 获取模型的状态字典

    # We have the base model one level deeper than the original XLM repository
    # 将模型的字典键名做适当的修改，使其符合转换后的 PyTorch 模型结构
    two_levels_state_dict = {}
    for k, v in state_dict.items():
        if "pred_layer" in k:
            two_levels_state_dict[k] = v
        else:
            two_levels_state_dict["transformer." + k] = v

    config = chkpt["params"]  # 获取模型的参数配置信息
    config = {n: v for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))}  # 过滤掉浮点数类型的配置项

    vocab = chkpt["dico_word2id"]  # 获取词汇表
    vocab = {s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""): i for s, i in vocab.items()}  # 处理词汇表内容

    # Save pytorch-model 保存转换后的 PyTorch 模型权重文件
    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
    torch.save(two_levels_state_dict, pytorch_weights_dump_path)

    # Save configuration file 保存模型的配置文件
    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(config, indent=2) + "\n")

    # Save vocab file 保存模型的词汇表文件
    pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
    with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(vocab, indent=2) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # 创建参数解析器
    # Required parameters 必须的命令行参数
    parser.add_argument(
        "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()  # 解析命令行参数
    convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)  # 调用转换函数进行转换

`.\models\xlm\modeling_tf_xlm.py`

# 计算位置编码并将其写入输出张量中
def create_sinusoidal_embeddings(n_pos, dim, out):
    # 生成位置编码矩阵，其中每个位置的编码包括正弦和余弦部分
    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
    # 将正弦编码写入输出张量的偶数索引位置
    out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2]))
    # 将余弦编码写入输出张量的奇数索引位置
    out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2]))

# 生成隐藏状态掩码和可选的注意力掩码
def get_masks(slen, lengths, causal, padding_mask=None):
    # 获取批次大小
    bs = shape_list(lengths)[0]
    # 如果存在填充掩码，则使用填充掩码作为掩码
    if padding_mask is not None:
        mask = padding_mask
    else:
        # 如果不是 causal 模式，则创建长度等于 slen 的序列 alen
        alen = tf.range(slen, dtype=lengths.dtype)
        # 创建一个掩码 mask，标记 alen 中小于每个长度值的位置为 True，其余为 False
        mask = alen < tf.expand_dims(lengths, axis=1)

    # 如果是 causal 模式，则创建一个上三角形式的注意力掩码 attn_mask
    # 否则，attn_mask 与 mask 相同
    if causal:
        attn_mask = tf.less_equal(
            tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
        )
    else:
        attn_mask = mask

    # 对掩码 mask 进行形状检查，确保其形状为 [bs, slen]
    tf.debugging.assert_equal(shape_list(mask), [bs, slen])
    # 如果是 causal 模式，则对 attn_mask 进行形状检查，确保其形状为 [bs, slen, slen]
    if causal:
        tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])

    # 返回 mask 和 attn_mask 作为结果
    return mask, attn_mask
class TFXLMMultiHeadAttention(keras.layers.Layer):
    # 类变量，用于生成唯一的层标识符
    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config, **kwargs):
        super().__init__(**kwargs)
        # 分配当前层的唯一标识符
        self.layer_id = next(TFXLMMultiHeadAttention.NEW_ID)
        self.dim = dim  # 设置注意力机制的维度
        self.n_heads = n_heads  # 设置注意力头的数量
        self.output_attentions = config.output_attentions  # 是否输出注意力权重
        assert self.dim % self.n_heads == 0  # 断言：确保维度可以被注意力头数量整除

        # 定义用于查询、键、值的线性层，并初始化
        self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
        self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
        self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
        self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
        self.dropout = keras.layers.Dropout(config.attention_dropout)  # 定义注意力机制的dropout层
        self.pruned_heads = set()  # 初始化一个空集合，用于记录被修剪的注意力头
        self.dim = dim  # 更新维度信息

    def prune_heads(self, heads):
        raise NotImplementedError  # 剪枝注意力头的方法，目前未实现

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 构建查询、键、值以及输出线性层的神经网络结构
        if getattr(self, "q_lin", None) is not None:
            with tf.name_scope(self.q_lin.name):
                self.q_lin.build([None, None, self.dim])
        if getattr(self, "k_lin", None) is not None:
            with tf.name_scope(self.k_lin.name):
                self.k_lin.build([None, None, self.dim])
        if getattr(self, "v_lin", None) is not None:
            with tf.name_scope(self.v_lin.name):
                self.v_lin.build([None, None, self.dim])
        if getattr(self, "out_lin", None) is not None:
            with tf.name_scope(self.out_lin.name):
                self.out_lin.build([None, None, self.dim])


class TFXLMTransformerFFN(keras.layers.Layer):
    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
        super().__init__(**kwargs)

        # 定义前馈神经网络的两个线性层，并初始化
        self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
        self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
        # 根据配置选择激活函数（GELU或ReLU）
        self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
        self.dropout = keras.layers.Dropout(config.dropout)  # 定义前馈神经网络的dropout层
        self.in_dim = in_dim  # 输入维度
        self.dim_hidden = dim_hidden  # 隐藏层维度

    def call(self, input, training=False):
        # 前向传播函数
        x = self.lin1(input)  # 第一层线性变换
        x = self.act(x)  # 应用激活函数
        x = self.lin2(x)  # 第二层线性变换
        x = self.dropout(x, training=training)  # 应用dropout层

        return x  # 返回前向传播的结果
    # 如果模型已经建立，直接返回，不进行重复建立
    if self.built:
        return
    # 设置模型状态为已建立
    self.built = True

    # 如果存在名为lin1的属性，并且不为None，则进行下面的操作
    if getattr(self, "lin1", None) is not None:
        # 使用tf.name_scope为self.lin1的操作定义命名空间
        with tf.name_scope(self.lin1.name):
            # 使用self.in_dim作为输入维度，构建self.lin1层
            self.lin1.build([None, None, self.in_dim])

    # 如果存在名为lin2的属性，并且不为None，则进行下面的操作
    if getattr(self, "lin2", None) is not None:
        # 使用tf.name_scope为self.lin2的操作定义命名空间
        with tf.name_scope(self.lin2.name):
            # 使用self.dim_hidden作为隐藏层维度，构建self.lin2层
            self.lin2.build([None, None, self.dim_hidden])
# 将类 TFXLMMainLayer 声明为可序列化的 Keras 层
@keras_serializable
class TFXLMMainLayer(keras.layers.Layer):
    # 使用 XLMConfig 类作为配置类
    config_class = XLMConfig

    # 构建层的方法，初始化位置嵌入等权重
    def build(self, input_shape=None):
        # 如果已经构建过，则直接返回
        if self.built:
            return
        # 标记此层为已构建
        self.built = True
        # 在命名作用域 "position_embeddings" 下添加位置嵌入权重
        with tf.name_scope("position_embeddings"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.dim],
                initializer=get_initializer(self.embed_init_std),
            )

        # 如果有多于一个语言并且使用语言嵌入，则添加语言嵌入权重
        if self.n_langs > 1 and self.use_lang_emb:
            with tf.name_scope("lang_embeddings"):
                self.lang_embeddings = self.add_weight(
                    name="embeddings",
                    shape=[self.n_langs, self.dim],
                    initializer=get_initializer(self.embed_init_std),
                )
        
        # 如果存在 embeddings 属性，则对其进行构建
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        
        # 如果存在 layer_norm_emb 属性，则对其进行构建
        if getattr(self, "layer_norm_emb", None) is not None:
            with tf.name_scope(self.layer_norm_emb.name):
                self.layer_norm_emb.build([None, None, self.dim])
        
        # 对每个自注意力层进行构建
        for layer in self.attentions:
            with tf.name_scope(layer.name):
                layer.build(None)
        
        # 对每个 LayerNorm 层进行构建
        for layer in self.layer_norm1:
            with tf.name_scope(layer.name):
                layer.build([None, None, self.dim])
        
        # 对每个前馈神经网络层进行构建
        for layer in self.ffns:
            with tf.name_scope(layer.name):
                layer.build(None)
        
        # 对每个 LayerNorm 层进行构建
        for layer in self.layer_norm2:
            with tf.name_scope(layer.name):
                layer.build([None, None, self.dim])

    # 获取输入嵌入的方法
    def get_input_embeddings(self):
        return self.embeddings

    # 设置输入嵌入的方法
    def set_input_embeddings(self, value):
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    # 剪枝模型中的注意力头部的方法
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    # 解包输入参数并调用模型的方法
    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
        position_ids=None,
        lengths=None,
        cache=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,



        ):
        # 这里会实际执行模型的计算过程，在此不做具体注释，具体操作会依赖模型的实现细节
        pass

# TFXLMPreTrainedModel 类，继承自 TFPreTrainedModel，用于处理模型权重初始化以及预训练模型下载和加载的抽象类
class TFXLMPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 使用 XLMConfig 类作为配置类
    config_class = XLMConfig
    # 基础模型名称前缀为 "transformer"
    base_model_prefix = "transformer"

    # 属性装饰器，用于返回输入嵌入
    @property
    def dummy_inputs(self):
        # 定义一个包含了一些假输入数据的函数
        # 创建包含输入数据的张量列表，数据类型为整型
        inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
        # 创建包含注意力掩码数据的张量列表，数据类型为整型
        attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
        # 如果需要使用语言嵌入并且有多个语言，则返回包含输入、注意力掩码和语言信息的字典
        if self.config.use_lang_emb and self.config.n_langs > 1:
            return {
                "input_ids": inputs_list,
                "attention_mask": attns_list,
                "langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
            }
        # 如果不需要使用语言嵌入或者只有一种语言，则返回包含输入和注意力掩码信息的字典
        else:
            return {"input_ids": inputs_list, "attention_mask": attns_list}
# 当 XLMWithLMHead 计算损失类似于其他语言模型时移除
@dataclass
class TFXLMWithLMHeadModelOutput(ModelOutput):
    """
    [`TFXLMWithLMHeadModel`] 输出的基类。

    Args:
        logits (`tf.Tensor`，形状为 `(batch_size, sequence_length, config.vocab_size)`):
            语言建模头部的预测分数（SoftMax 之前的每个词汇标记的分数）。
        hidden_states (`tuple(tf.Tensor)`，*可选*，当传入 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
            形状为 `(batch_size, sequence_length, hidden_size)` 的 `tf.Tensor` 元组。
            
            模型在每层输出的隐藏状态以及初始嵌入输出。
        attentions (`tuple(tf.Tensor)`，*可选*，当传入 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
            形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的 `tf.Tensor` 元组。
            
            注意力 SoftMax 之后的注意力权重，用于计算自注意力头中的加权平均值。
    """

    logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None


XLM_START_DOCSTRING = r"""

    该模型继承自 [`TFPreTrainedModel`]。请查阅超类文档以获取库实现的所有模型通用方法（如下载或保存模型、调整输入嵌入大小、修剪头等）。

    该模型也是 [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 的子类。可以将其用作常规的 TF 2.0 Keras 模型，并参考 TF 2.0 文档以获取与一般使用和行为相关的所有内容。

    <Tip>

    `transformers` 中的 TensorFlow 模型和层接受两种输入格式：

    - 将所有输入作为关键字参数传递（类似于 PyTorch 模型），或者
    - 将所有输入作为列表、元组或字典传递给第一个位置参数。

    支持第二种格式的原因是 Keras 方法更喜欢在将输入传递给模型和层时使用此格式。由于此支持，当使用诸如 `model.fit()` 等方法时，您只需将输入和标签以 `model.fit()` 支持的任何格式传递即可！但是，如果您希望在 Keras 方法之外（例如在使用 Keras `Functional` API 创建自己的层或模型时）使用第二种格式，那么您可以使用以下三种可能性将所有输入张量收集到第一个位置参数中：

    - 仅具有 `input_ids` 的单个张量且没有其他内容：`model(input_ids)`
    - 长度不同的列表，其中按照文档字符串中给定的顺序包含一个或多个输入张量：
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - 根据不同的输入情况，可以接受一个或多个输入张量的字典。
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!
    在使用子类化创建模型和层时，可以像传递给任何其他Python函数一样传递输入，因此无需担心这些细节。

    Parameters:
        config ([`XLMConfig`]): Model configuration class with all the parameters of the model.
            使用包含模型所有参数的配置类（例如`XLMConfig`）进行模型的配置。
            使用配置文件进行初始化不会加载与模型关联的权重，仅加载配置。
            可以查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型的权重。
"""

XLM_INPUTS_DOCSTRING = r"""
"""


@add_start_docstrings(
    "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_START_DOCSTRING,
)
class TFXLMModel(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 初始化 transformer 层，使用配置对象创建 TFXLMMainLayer，并命名为 "transformer"
        self.transformer = TFXLMMainLayer(config, name="transformer")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: tf.Tensor | None = None,
        langs: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        lengths: tf.Tensor | None = None,
        cache: Dict[str, tf.Tensor] | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
    ) -> TFBaseModelOutput | Tuple[tf.Tensor]:
        # 调用 transformer 层的前向传播函数，传入各种输入参数
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                # 构建 transformer 层
                self.transformer.build(None)


class TFXLMPredLayer(keras.layers.Layer):
    """
    Prediction layer (cross_entropy or adaptive_softmax).
    """

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        # 初始化预测层参数
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index

        if config.asm is False:
            # 如果不使用自适应 softmax，直接使用输入的嵌入层
            self.input_embeddings = input_embeddings
        else:
            # 抛出未实现的错误，因为自适应 softmax 模块未实现
            raise NotImplementedError
            # self.proj = nn.AdaptiveLogSoftmaxWithLoss(
            #     in_features=dim,
            #     n_classes=config.n_words,
            #     cutoffs=config.asm_cutoffs,
            #     div_value=config.asm_div_value,
            #     head_bias=True,  # default is False
            # )
    # 在神经网络层的构建方法中，初始化一个偏置项，每个标记对应一个输出偏置。
    self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")

    # 调用父类的构建方法
    super().build(input_shape)

# 获取输出的嵌入向量
def get_output_embeddings(self):
    return self.input_embeddings

# 设置输出的嵌入向量
def set_output_embeddings(self, value):
    # 更新输入嵌入的权重值
    self.input_embeddings.weight = value
    # 更新输入嵌入的词汇量大小
    self.input_embeddings.vocab_size = shape_list(value)[0]

# 获取偏置项
def get_bias(self):
    return {"bias": self.bias}

# 设置偏置项
def set_bias(self, value):
    # 更新偏置项的值
    self.bias = value["bias"]
    # 更新词汇量的大小
    self.vocab_size = shape_list(value["bias"])[0]

# 神经网络的调用方法，接受隐藏状态作为输入
def call(self, hidden_states):
    # 使用输入嵌入对隐藏状态进行线性变换
    hidden_states = self.input_embeddings(hidden_states, mode="linear")
    # 添加偏置项到隐藏状态中
    hidden_states = hidden_states + self.bias

    # 返回处理后的隐藏状态
    return hidden_states
"""
The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
# 引入装饰器和必要的模块
@add_start_docstrings(
    """
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XLM_START_DOCSTRING,  # 引用 XLM 的起始文档字符串
)
# TFXLMWithLMHeadModel 类定义，继承自 TFXLMPreTrainedModel
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
    
    # 初始化方法
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)  # 调用父类的初始化方法
        self.transformer = TFXLMMainLayer(config, name="transformer")  # 创建 XLM 主层对象
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
        # 创建 XLM 预测层对象，并连接到嵌入层
        
        # XLM 不支持过去的缓存特性
        self.supports_xla_generation = False

    # 获取语言模型头部
    def get_lm_head(self):
        return self.pred_layer

    # 获取前缀偏置名称（已弃用）
    def get_prefix_bias_name(self):
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return self.name + "/" + self.pred_layer.name

    # 为生成准备输入
    def prepare_inputs_for_generation(self, inputs, **kwargs):
        mask_token_id = self.config.mask_token_id  # 获取掩码标记 ID
        lang_id = self.config.lang_id  # 获取语言 ID

        effective_batch_size = inputs.shape[0]  # 计算有效批次大小
        mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id  # 创建掩码令牌
        inputs = tf.concat([inputs, mask_token], axis=1)  # 将掩码令牌连接到输入末尾

        if lang_id is not None:
            langs = tf.ones_like(inputs) * lang_id  # 如果存在语言 ID，创建相应的语言张量
        else:
            langs = None
        
        # 返回输入字典，包含输入 ID 和语言信息
        return {"input_ids": inputs, "langs": langs}

    # 调用方法，处理各种输入和选项
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,  # 示例使用的检查点
        output_type=TFXLMWithLMHeadModelOutput,  # 输出类型
        config_class=_CONFIG_FOR_DOC,  # 配置类
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # 输入 ID
        attention_mask: np.ndarray | tf.Tensor | None = None,  # 注意力掩码
        langs: np.ndarray | tf.Tensor | None = None,  # 语言标识
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # 令牌类型 ID
        position_ids: np.ndarray | tf.Tensor | None = None,  # 位置 ID
        lengths: np.ndarray | tf.Tensor | None = None,  # 序列长度
        cache: Optional[Dict[str, tf.Tensor]] = None,  # 缓存
        head_mask: np.ndarray | tf.Tensor | None = None,  # 头部掩码
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # 嵌入输入
        output_attentions: Optional[bool] = None,  # 输出注意力
        output_hidden_states: Optional[bool] = None,  # 输出隐藏状态
        return_dict: Optional[bool] = None,  # 返回字典
        training: bool = False,  # 是否训练模式
        ):
    # 定义函数的返回类型为 TFXLMWithLMHeadModelOutput 或者 (TFXLMWithLMHeadModelOutput, tf.Tensor) 的元组
    -> Union[TFXLMWithLMHeadModelOutput, Tuple[tf.Tensor]]:
        # 调用 self.transformer 对象的 __call__ 方法，传入各种输入参数
        transformer_outputs = self.transformer(
            input_ids=input_ids,                  # 输入的 token IDs
            attention_mask=attention_mask,        # 注意力遮罩
            langs=langs,                          # 语言 ID（如果模型支持多语言）
            token_type_ids=token_type_ids,        # token 类型 IDs（如果模型支持）
            position_ids=position_ids,            # 位置 IDs
            lengths=lengths,                      # 序列长度
            cache=cache,                          # 缓存（如果有）
            head_mask=head_mask,                  # 头部遮罩（如果有）
            inputs_embeds=inputs_embeds,          # 嵌入的输入（如果有）
            output_attentions=output_attentions,  # 是否输出注意力权重
            output_hidden_states=output_hidden_states,  # 是否输出隐藏状态
            return_dict=return_dict,              # 是否以字典形式返回输出
            training=training,                    # 是否处于训练模式
        )

        # 从 transformer_outputs 中取出第一个元素，通常是模型的输出
        output = transformer_outputs[0]
        # 将模型输出通过 self.pred_layer 进行预测
        outputs = self.pred_layer(output)

        # 如果 return_dict 为 False，则返回一个元组，包含 outputs 和 transformer_outputs 的其余部分
        if not return_dict:
            return (outputs,) + transformer_outputs[1:]

        # 如果 return_dict 为 True，则构造一个 TFXLMWithLMHeadModelOutput 对象，并返回
        return TFXLMWithLMHeadModelOutput(
            logits=outputs,                              # 预测的逻辑回归输出
            hidden_states=transformer_outputs.hidden_states,  # 隐藏状态列表
            attentions=transformer_outputs.attentions     # 注意力权重列表
        )

    # 构建模型，在此处初始化所有子层的权重
    def build(self, input_shape=None):
        # 如果模型已经构建好，直接返回
        if self.built:
            return
        # 标记模型为已构建状态
        self.built = True

        # 如果 self.transformer 存在，则在名为 self.transformer.name 的作用域内构建它
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)

        # 如果 self.pred_layer 存在，则在名为 self.pred_layer.name 的作用域内构建它
        if getattr(self, "pred_layer", None) is not None:
            with tf.name_scope(self.pred_layer.name):
                self.pred_layer.build(None)
"""
XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
"""
# 继承自 TFXLMPreTrainedModel 和 TFSequenceClassificationLoss 的 XLM 模型，用于序列分类或回归任务
class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 初始化时设置类别数量
        self.num_labels = config.num_labels

        # 创建 XLM 主层对象，命名为 "transformer"
        self.transformer = TFXLMMainLayer(config, name="transformer")
        
        # 创建序列摘要对象，用于生成序列摘要特征
        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")

    @unpack_inputs
    # 将输入解包并添加关于模型前向传播的文档字符串，展示输入的批次大小和序列长度
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 添加关于模型前向传播的代码示例的文档字符串，展示相关的检查点、输出类型和配置类
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        langs: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        lengths: np.ndarray | tf.Tensor | None = None,
        cache: Optional[Dict[str, tf.Tensor]] = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
        # ...
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 调用transformer模型进行前向传播，传入各种参数
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 获取transformer的输出作为序列总结的输入
        output = transformer_outputs[0]

        # 通过序列总结模型得到最终的logits
        logits = self.sequence_summary(output)

        # 如果提供了labels，则计算损失函数（交叉熵或均方损失）
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        # 如果不需要返回字典，则按照tuple的形式返回输出
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 如果需要返回字典，则构造TFSequenceClassifierOutput对象返回
        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果已经构建过，直接返回
        if self.built:
            return
        self.built = True
        # 如果transformer模型存在，则在命名空间下构建transformer模型
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # 如果序列总结模型存在，则在命名空间下构建序列总结模型
        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
"""
XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
# 定义了一个 XLM 模型，该模型在其顶部添加了一个多选分类头部，包括一个线性层和一个 softmax 层，用于例如 RocStories/SWAG 任务。

class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # 初始化 XLM 主层，命名为 "transformer"
        self.transformer = TFXLMMainLayer(config, name="transformer")
        
        # 初始化序列摘要层，用于生成序列摘要，命名为 "sequence_summary"
        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
        
        # 初始化 logits 投影层，一个全连接层用于多选分类任务，输出维度为 1
        self.logits_proj = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
        )
        
        # 保存配置对象
        self.config = config

    @property
    def dummy_inputs(self):
        """
        Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        """
        # 如果配置要求使用语言嵌入且语言数大于1，则返回包含 "input_ids" 和 "langs" 的虚拟输入
        if self.config.use_lang_emb and self.config.n_langs > 1:
            return {
                "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
                "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
            }
        else:
            # 否则，只返回包含 "input_ids" 的虚拟输入
            return {
                "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
            }

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        langs: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        lengths: np.ndarray | tf.Tensor | None = None,
        cache: Optional[Dict[str, tf.Tensor]] = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
        # 神经网络前向传播方法，接收并处理各种输入和配置参数，生成预测或特征输出
    # 定义方法签名，指定输入参数和返回类型
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        # 如果输入参数 input_ids 不为 None，则获取其第二维和第三维的大小
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]  # 获取选项数量
            seq_length = shape_list(input_ids)[2]   # 获取序列长度
        else:
            # 否则，获取 inputs_embeds 的第二维和第三维的大小
            num_choices = shape_list(inputs_embeds)[1]  # 获取选项数量
            seq_length = shape_list(inputs_embeds)[2]   # 获取序列长度

        # 将 input_ids 重新 reshape 成 (-1, seq_length)，如果 input_ids 不为 None
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        # 将 attention_mask 重新 reshape 成 (-1, seq_length)，如果 attention_mask 不为 None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        # 将 token_type_ids 重新 reshape 成 (-1, seq_length)，如果 token_type_ids 不为 None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        # 将 position_ids 重新 reshape 成 (-1, seq_length)，如果 position_ids 不为 None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        # 将 langs 重新 reshape 成 (-1, seq_length)，如果 langs 不为 None
        flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
        # 将 inputs_embeds 重新 reshape 成 (-1, seq_length, shape_list(inputs_embeds)[3])，如果 inputs_embeds 不为 None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        # 如果 lengths 不为 None，则发出警告并将其设为 None，因为 XLM 多选模型不能使用 lengths 参数
        if lengths is not None:
            logger.warning(
                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
                "attention mask instead.",
            )
            lengths = None

        # 调用 self.transformer 方法进行转换器的前向计算
        transformer_outputs = self.transformer(
            flat_input_ids,
            flat_attention_mask,
            flat_langs,
            flat_token_type_ids,
            flat_position_ids,
            lengths,
            cache,
            head_mask,
            flat_inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 获取 transformer 输出的第一个元素作为输出
        output = transformer_outputs[0]
        # 对输出进行序列总结，得到 logits
        logits = self.sequence_summary(output)
        # 对 logits 进行投影处理
        logits = self.logits_proj(logits)
        # 将 logits 重新 reshape 成 (-1, num_choices)，以符合多选题的格式
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        # 如果 labels 为 None，则损失为 None；否则，调用 self.hf_compute_loss 方法计算损失
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

        # 如果不需要返回字典形式的输出，则重新组织 output，并在其前面加上损失值（如果有）
        if not return_dict:
            output = (reshaped_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 返回 TFMultipleChoiceModelOutput 对象，包括损失、logits、隐藏状态和注意力权重
        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
    # 构建函数，用于构建神经网络层次结构
    def build(self, input_shape=None):
        # 如果已经构建过，则直接返回，避免重复构建
        if self.built:
            return
        # 将标记位设置为已构建
        self.built = True
        
        # 如果存在变换器(transformer)对象，则构建其内部结构
        if getattr(self, "transformer", None) is not None:
            # 在 TensorFlow 中使用名称作用域，指定变换器的名称范围
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        
        # 如果存在序列摘要(sequence_summary)对象，则构建其内部结构
        if getattr(self, "sequence_summary", None) is not None:
            # 在 TensorFlow 中使用名称作用域，指定序列摘要的名称范围
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
        
        # 如果存在分类投影(logits_proj)对象，则构建其内部结构
        if getattr(self, "logits_proj", None) is not None:
            # 在 TensorFlow 中使用名称作用域，指定分类投影的名称范围
            with tf.name_scope(self.logits_proj.name):
                # 构建分类投影层，输入形状为 [None, None, self.config.num_labels]
                self.logits_proj.build([None, None, self.config.num_labels])
@add_start_docstrings(
    """
    XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    XLM_START_DOCSTRING,
)
class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels  # 从配置中获取标签的数量

        self.transformer = TFXLMMainLayer(config, name="transformer")  # 初始化 XLM 主层模型
        self.dropout = keras.layers.Dropout(config.dropout)  # 设置 dropout 层，用于正则化
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
        )  # 设置分类器，用于输出标签的预测结果
        self.config = config  # 存储配置信息

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # 输入的 token IDs
        attention_mask: np.ndarray | tf.Tensor | None = None,  # 注意力掩码，控制哪些位置需要注意
        langs: np.ndarray | tf.Tensor | None = None,  # 语言 ID 或者语言掩码
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # token 类型 IDs，用于区分 segment
        position_ids: np.ndarray | tf.Tensor | None = None,  # 位置 IDs，用于标记 token 在句子中的位置
        lengths: np.ndarray | tf.Tensor | None = None,  # 句子长度
        cache: Optional[Dict[str, tf.Tensor]] = None,  # 缓存用于加速推断
        head_mask: np.ndarray | tf.Tensor | None = None,  # 头部掩码，用于控制哪些头部需要注意
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # 输入的嵌入表示
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态
        return_dict: Optional[bool] = None,  # 是否返回字典格式的输出
        labels: np.ndarray | tf.Tensor | None = None,  # 真实的标签
        training: bool = False,  # 是否处于训练模式
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # 调用 Transformer 模型来处理输入数据，获取转换后的输出
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 Transformer 输出中获取序列输出
        sequence_output = transformer_outputs[0]

        # 对序列输出应用 dropout，用于防止过拟合
        sequence_output = self.dropout(sequence_output, training=training)
        # 将 dropout 后的输出送入分类器，得到分类 logits
        logits = self.classifier(sequence_output)

        # 如果提供了 labels，则计算损失；否则损失设为 None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        # 如果不要求返回字典形式的输出，则按需返回不同的输出格式
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 返回 TFTokenClassifierOutput 对象，包含损失、logits、隐藏状态和注意力权重
        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果模型已经构建过，直接返回
        if self.built:
            return
        # 标记模型已构建
        self.built = True
        # 如果 Transformer 存在，则构建 Transformer 模型
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # 如果分类器存在，则构建分类器模型
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # 构建分类器，设置输入形状为 [None, None, self.config.hidden_size]
                self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
    on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_START_DOCSTRING,
)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 初始化一个 XLM 主模型层，命名为 "transformer"
        self.transformer = TFXLMMainLayer(config, name="transformer")
        # 初始化一个全连接层用于问题-答案抽取任务的输出，参数由配置文件中的标签数量决定
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
        )
        # 将配置信息保存到对象属性中
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        langs: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        lengths: np.ndarray | tf.Tensor | None = None,
        cache: Optional[Dict[str, tf.Tensor]] = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从transformer模型获得输出序列
        sequence_output = transformer_outputs[0]

        # 将序列输出通过问答输出层获得logits
        logits = self.qa_outputs(sequence_output)
        
        # 将logits按照最后一个维度分割为start_logits和end_logits
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        
        # 去除start_logits和end_logits中的多余维度
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        # 计算损失
        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        # 如果不返回字典格式的结果，组合输出
        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 返回TFQuestionAnsweringModelOutput格式的结果
        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果模型已经建立，直接返回
        if self.built:
            return
        # 标记模型为已建立状态
        self.built = True
        
        # 如果transformer模型已定义，则建立transformer模型
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        
        # 如果qa_outputs已定义，则建立qa_outputs
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])

Transformers-源码解析-一百二十五-

Transformers 源码解析（一百二十五）

.\models\whisper\tokenization_whisper_fast.py

.\models\whisper\__init__.py

.\models\xglm\configuration_xglm.py

.\models\xglm\convert_xglm_original_ckpt_to_trfms.py

.\models\xglm\modeling_flax_xglm.py

.\models\xglm\modeling_tf_xglm.py

.\models\xglm\modeling_xglm.py

.\models\xglm\tokenization_xglm.py

.\models\xglm\tokenization_xglm_fast.py

.\models\xglm\__init__.py

.\models\xlm\configuration_xlm.py

.\models\xlm\convert_xlm_original_pytorch_checkpoint_to_pytorch.py

.\models\xlm\modeling_tf_xlm.py

`.\models\whisper\tokenization_whisper_fast.py`

`.\models\whisper\init.py`

`.\models\xglm\configuration_xglm.py`

`.\models\xglm\convert_xglm_original_ckpt_to_trfms.py`

`.\models\xglm\modeling_flax_xglm.py`

`.\models\xglm\modeling_tf_xglm.py`

`.\models\xglm\modeling_xglm.py`

`.\models\xglm\tokenization_xglm.py`

`.\models\xglm\tokenization_xglm_fast.py`

`.\models\xglm\init.py`

`.\models\xlm\configuration_xlm.py`

`.\models\xlm\convert_xlm_original_pytorch_checkpoint_to_pytorch.py`

`.\models\xlm\modeling_tf_xlm.py`