Transformers Source Code Analysis (90)
.\models\pop2piano\tokenization_pop2piano.py
"""
Tokenization class for Pop2Piano.
"""
import json
import os
from typing import List, Optional, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...tokenization_utils import AddedToken, BatchEncoding, PaddingStrategy, PreTrainedTokenizer, TruncationStrategy
from ...utils import TensorType, is_pretty_midi_available, logging, requires_backends, to_numpy
if is_pretty_midi_available():
import pretty_midi
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab": "vocab.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab": {
"sweetcocoa/pop2piano": "https://huggingface.co/sweetcocoa/pop2piano/blob/main/vocab.json",
},
}
def token_time_to_note(number, cutoff_time_idx, current_idx):
"""
将时间令牌转换为音符索引。
Args:
number (int): 时间令牌的数量。
cutoff_time_idx (int or None): 时间截止索引(可选)。
current_idx (int): 当前索引位置。
Returns:
int: 更新后的当前索引位置。
"""
current_idx += number
if cutoff_time_idx is not None:
current_idx = min(current_idx, cutoff_time_idx)
return current_idx
def token_note_to_note(number, current_velocity, default_velocity, note_onsets_ready, current_idx, notes):
"""
将音符令牌转换为音符。
Args:
number (int): 音符令牌的数量。
current_velocity (int): 当前速度。
default_velocity (int): 默认速度。
note_onsets_ready (list or None): 准备好的音符发生时刻的列表或 None。
current_idx (int): 当前索引位置。
notes (list): 音符列表。
Returns:
list: 更新后的音符列表。
"""
if note_onsets_ready[number] is not None:
onset_idx = note_onsets_ready[number]
if onset_idx < current_idx:
offset_idx = current_idx
notes.append([onset_idx, offset_idx, number, default_velocity])
onsets_ready = None if current_velocity == 0 else current_idx
note_onsets_ready[number] = onsets_ready
else:
note_onsets_ready[number] = current_idx
return notes
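# Illustrative aside (not part of the original file): a minimal sketch of how the two helpers above
# cooperate. A TOKEN_TIME token advances the running time index; a TOKEN_NOTE token either records a
# pending onset for that pitch or, if the pitch already has a pending onset, closes it into a
# [onset_idx, offset_idx, pitch, velocity] entry.
def _demo_time_and_note_helpers():
    notes, note_onsets_ready = [], [None] * 128
    current_idx = token_time_to_note(number=4, cutoff_time_idx=None, current_idx=0)  # -> 4
    notes = token_note_to_note(60, current_velocity=77, default_velocity=77,
                               note_onsets_ready=note_onsets_ready, current_idx=current_idx, notes=notes)
    current_idx = token_time_to_note(number=2, cutoff_time_idx=None, current_idx=current_idx)  # -> 6
    notes = token_note_to_note(60, current_velocity=0, default_velocity=77,
                               note_onsets_ready=note_onsets_ready, current_idx=current_idx, notes=notes)
    assert notes == [[4, 6, 60, 77]]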
class Pop2PianoTokenizer(PreTrainedTokenizer):
"""
构造 Pop2Piano 分词器。此分词器不需要训练。
Args:
vocab (`str`): 包含词汇表的文件路径。
default_velocity (`int`, *optional*, 默认为 77):
创建 MIDI 音符时使用的默认速度。
num_bars (`int`, *optional*, 默认为 2):
每个令牌的截止时间索引。
"""
model_input_names = ["token_ids", "attention_mask"]
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
def __init__(
self,
vocab,
default_velocity=77,
num_bars=2,
unk_token="-1",
eos_token="1",
pad_token="0",
bos_token="2",
**kwargs,
):
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
self.default_velocity = default_velocity
self.num_bars = num_bars
with open(vocab, "rb") as file:
self.encoder = json.load(file)
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
pad_token=pad_token,
bos_token=bos_token,
**kwargs,
)
@property
def vocab_size(self):
"""Returns the vocabulary size of the tokenizer."""
return len(self.encoder)
def get_vocab(self):
"""Returns the vocabulary of the tokenizer."""
return dict(self.encoder, **self.added_tokens_encoder)
def _convert_id_to_token(self, token_id: int) -> list:
"""
Decodes the token ids generated by the transformer into notes.
Args:
token_id (`int`):
This denotes the ids generated by the transformers to be converted to Midi tokens.
Returns:
`List`: A list consists of token_type (`str`) and value (`int`).
"""
token_type_value = self.decoder.get(token_id, f"{self.unk_token}_TOKEN_TIME")
token_type_value = token_type_value.split("_")
token_type, value = "_".join(token_type_value[1:]), int(token_type_value[0])
return [token_type, value]
def _convert_token_to_id(self, token, token_type="TOKEN_TIME") -> int:
"""
Encodes the Midi tokens to transformer generated token ids.
Args:
token (`int`):
This denotes the token value.
token_type (`str`):
This denotes the type of the token. There are four types of midi tokens such as "TOKEN_TIME",
"TOKEN_VELOCITY", "TOKEN_NOTE" and "TOKEN_SPECIAL".
Returns:
`int`: returns the id of the token.
"""
return self.encoder.get(f"{token}_{token_type}", int(self.unk_token))
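# Illustrative aside (toy vocabulary, not the real vocab.json): how the "value_TOKEN_TYPE" keys
# round-trip through `_convert_token_to_id` / `_convert_id_to_token`-style lookups.
def _demo_token_round_trip():
    toy_vocab = {"1_TOKEN_SPECIAL": 1, "10_TOKEN_TIME": 3, "77_TOKEN_VELOCITY": 4, "60_TOKEN_NOTE": 5}
    toy_decoder = {v: k for k, v in toy_vocab.items()}
    token_id = toy_vocab.get("60_TOKEN_NOTE")  # encode: note 60 -> id 5
    token_type_value = toy_decoder[token_id].split("_")  # decode: ["60", "TOKEN", "NOTE"]
    token_type, value = "_".join(token_type_value[1:]), int(token_type_value[0])
    assert (token_type, value) == ("TOKEN_NOTE", 60)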
def relative_batch_tokens_ids_to_notes(
self,
tokens: np.ndarray,
beat_offset_idx: int,
bars_per_batch: int,
cutoff_time_idx: int,
):
"""
Converts relative tokens to notes which are then used to generate pretty midi object.
Args:
tokens (`numpy.ndarray`):
Tokens to be converted to notes.
beat_offset_idx (`int`):
Denotes beat offset index for each note in generated Midi.
bars_per_batch (`int`):
A parameter to control the Midi output generation.
cutoff_time_idx (`int`):
Denotes the cutoff time index for each note in generated Midi.
"""
notes = None
for index in range(len(tokens)):
_tokens = tokens[index]
_start_idx = beat_offset_idx + index * bars_per_batch * 4
_cutoff_time_idx = cutoff_time_idx + _start_idx
_notes = self.relative_tokens_ids_to_notes(
_tokens,
start_idx=_start_idx,
cutoff_time_idx=_cutoff_time_idx,
)
if len(_notes) == 0:
pass
elif notes is None:
notes = _notes
else:
notes = np.concatenate((notes, _notes), axis=0)
if notes is None:
return []
return notes
def relative_batch_tokens_ids_to_midi(
self,
tokens: np.ndarray,
beatstep: np.ndarray,
beat_offset_idx: int = 0,
bars_per_batch: int = 2,
cutoff_time_idx: int = 12,
):
"""
Converts tokens to Midi. This method calls `relative_batch_tokens_ids_to_notes` method to convert batch tokens
to notes then uses `notes_to_midi` method to convert them to Midi.
Args:
tokens (`numpy.ndarray`):
Denotes tokens which alongside beatstep will be converted to Midi.
beatstep (`np.ndarray`):
We get beatstep from feature extractor which is also used to get Midi.
beat_offset_idx (`int`, *optional*, defaults to 0):
Denotes beat offset index for each note in generated Midi.
bars_per_batch (`int`, *optional*, defaults to 2):
A parameter to control the Midi output generation.
cutoff_time_idx (`int`, *optional*, defaults to 12):
Denotes the cutoff time index for each note in generated Midi.
"""
beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
notes = self.relative_batch_tokens_ids_to_notes(
tokens=tokens,
beat_offset_idx=beat_offset_idx,
bars_per_batch=bars_per_batch,
cutoff_time_idx=cutoff_time_idx,
)
midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
return midi
def relative_tokens_ids_to_notes(self, tokens: np.ndarray, start_idx: float, cutoff_time_idx: float = None):
"""
Converts relative tokens to notes which will then be used to create Pretty Midi objects.
Args:
tokens (`numpy.ndarray`):
Relative Tokens which will be converted to notes.
start_idx (`float`):
A parameter which denotes the starting index.
cutoff_time_idx (`float`, *optional*):
A parameter used while converting tokens to notes.
"""
words = [self._convert_id_to_token(token) for token in tokens]
current_idx = start_idx
current_velocity = 0
note_onsets_ready = [None for i in range(sum([k.endswith("NOTE") for k in self.encoder.keys()]) + 1)]
notes = []
for token_type, number in words:
if token_type == "TOKEN_SPECIAL":
if number == 1:
break
elif token_type == "TOKEN_TIME":
current_idx = token_time_to_note(
number=number, cutoff_time_idx=cutoff_time_idx, current_idx=current_idx
)
elif token_type == "TOKEN_VELOCITY":
current_velocity = number
elif token_type == "TOKEN_NOTE":
notes = token_note_to_note(
number=number,
current_velocity=current_velocity,
default_velocity=self.default_velocity,
note_onsets_ready=note_onsets_ready,
current_idx=current_idx,
notes=notes,
)
else:
raise ValueError("Token type not understood!")
for pitch, note_onset in enumerate(note_onsets_ready):
if note_onset is not None:
if cutoff_time_idx is None:
cutoff = note_onset + 1
else:
cutoff = max(cutoff_time_idx, note_onset + 1)
offset_idx = max(current_idx, cutoff)
notes.append([note_onset, offset_idx, pitch, self.default_velocity])
if len(notes) == 0:
return []
else:
notes = np.array(notes)
note_order = notes[:, 0] * 128 + notes[:, 1]
notes = notes[note_order.argsort()]
return notes
def notes_to_midi(self, notes: np.ndarray, beatstep: np.ndarray, offset_sec: int = 0.0):
"""
Converts notes to Midi.
Args:
notes (`numpy.ndarray`):
This is used to create Pretty Midi objects.
beatstep (`numpy.ndarray`):
This is the extrapolated beatstep that we get from feature extractor.
offset_sec (`int`, *optional*, defaults to 0.0):
This represents the offset seconds which is used while creating each Pretty Midi Note.
"""
requires_backends(self, ["pretty_midi"])
new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
new_inst = pretty_midi.Instrument(program=0)
new_notes = []
for onset_idx, offset_idx, pitch, velocity in notes:
new_note = pretty_midi.Note(
velocity=velocity,
pitch=pitch,
start=beatstep[onset_idx] - offset_sec,
end=beatstep[offset_idx] - offset_sec,
)
new_notes.append(new_note)
new_inst.notes = new_notes
new_pm.instruments.append(new_inst)
new_pm.remove_invalid_notes()
return new_pm
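# Hedged usage sketch (assumes pretty_midi is installed and `tokenizer` is a Pop2PianoTokenizer built
# from a real vocab.json): converting a tiny notes array into a PrettyMIDI object with a hand-made
# beatstep grid of 0.5 s per beat.
def _demo_notes_to_midi(tokenizer):
    import numpy as np
    notes = np.array([[0, 2, 60, 77], [1, 3, 64, 77]])  # [onset_idx, offset_idx, pitch, velocity]
    beatstep = np.arange(0, 4) * 0.5  # beat index -> seconds
    pm = tokenizer.notes_to_midi(notes, beatstep, offset_sec=0.0)
    pm.write("demo.mid")
    return pm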
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the tokenizer's vocabulary dictionary to the provided save_directory.
Args:
save_directory (`str`):
A path to the directory where to saved. It will be created if it doesn't exist.
filename_prefix (`Optional[str]`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
)
with open(out_vocab_file, "w") as file:
file.write(json.dumps(self.encoder))
return (out_vocab_file,)
def encode_plus(
self,
notes: Union[np.ndarray, List[pretty_midi.Note]],
truncation_strategy: Optional[TruncationStrategy] = None,
max_length: Optional[int] = None,
**kwargs,
):
"""
Placeholder function for encoding notes into a format suitable for model input.
This is meant to be overridden by subclasses.
"""
pass
def batch_encode_plus(
self,
notes: Union[np.ndarray, List[pretty_midi.Note]],
truncation_strategy: Optional[TruncationStrategy] = None,
max_length: Optional[int] = None,
**kwargs,
) -> BatchEncoding:
r"""
This is the `batch_encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer
generated token ids. It works on multiple batches by calling `encode_plus` multiple times in a loop.
Args:
notes (`numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
This represents the midi notes. If `notes` is a `numpy.ndarray`:
- Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.
If `notes` is a `list` containing `pretty_midi.Note` objects:
- Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
Indicates the truncation strategy that is going to be used during truncation.
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
Returns:
`BatchEncoding` containing the tokens ids.
"""
encoded_batch_token_ids = []
for i in range(len(notes)):
encoded_batch_token_ids.append(
self.encode_plus(
notes[i],
truncation_strategy=truncation_strategy,
max_length=max_length,
**kwargs,
)["token_ids"]
)
return BatchEncoding({"token_ids": encoded_batch_token_ids})
def __call__(
self,
notes: Union[
np.ndarray,
List[pretty_midi.Note],
List[List[pretty_midi.Note]],
],
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs,
):
"""
This method allows the tokenizer object to be called as a function, enabling batch encoding of midi notes.
Args:
notes (Union[np.ndarray, List[pretty_midi.Note], List[List[pretty_midi.Note]]]):
Midi notes to be tokenized. Can be a numpy array or a nested list of pretty_midi.Note objects.
padding (Union[bool, str, PaddingStrategy], optional): Whether to pad sequences to the same length. Defaults to False.
truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy for sequences longer than `max_length`. Defaults to None.
max_length (int, optional): Maximum length of the returned sequences after padding/truncation. Defaults to None.
pad_to_multiple_of (int, optional): Pad the sequence length to a multiple of this value. Defaults to None.
return_attention_mask (bool, optional): Whether to return attention masks. Defaults to None.
return_tensors (Union[str, TensorType], optional): Return tensors format. Defaults to None.
verbose (bool, optional): Whether to print information about encoding. Defaults to True.
**kwargs: Additional keyword arguments passed to `encode_plus`.
Returns:
BatchEncoding: Contains token ids and optionally attention masks and tensor format.
"""
pass
def batch_decode(
self,
token_ids,
feature_extractor_output: BatchFeature,
return_midi: bool = True,
**kwargs,
):
"""
This method decodes a batch of token ids back into MIDI representation.
Args:
token_ids (list): List of token ids to be decoded.
feature_extractor_output (BatchFeature): Output from feature extractor.
return_midi (bool, optional): Whether to return MIDI objects. Defaults to True.
**kwargs: Additional keyword arguments.
Returns:
Dependent on `return_midi`, returns MIDI objects or other format as specified.
"""
pass
.\models\pop2piano\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_essentia_available,
is_librosa_available,
is_pretty_midi_available,
is_scipy_available,
is_torch_available,
)
_import_structure = {
"configuration_pop2piano": ["POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Pop2PianoConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pop2piano"] = [
"POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST",
"Pop2PianoForConditionalGeneration",
"Pop2PianoPreTrainedModel",
]
try:
if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_pop2piano"] = ["Pop2PianoFeatureExtractor"]
try:
if not (is_pretty_midi_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_pop2piano"] = ["Pop2PianoTokenizer"]
try:
if not (
is_pretty_midi_available()
and is_torch_available()
and is_librosa_available()
and is_essentia_available()
and is_scipy_available()
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["processing_pop2piano"] = ["Pop2PianoProcessor"]
if TYPE_CHECKING:
from .configuration_pop2piano import POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP, Pop2PianoConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pop2piano import (
POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST,
Pop2PianoForConditionalGeneration,
Pop2PianoPreTrainedModel,
)
try:
if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_pop2piano import Pop2PianoFeatureExtractor
try:
if not (is_pretty_midi_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_pop2piano import Pop2PianoTokenizer
try:
if not (
is_pretty_midi_available()
and is_torch_available()
and is_librosa_available()
and is_essentia_available()
and is_scipy_available()
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .processing_pop2piano import Pop2PianoProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\prophetnet\configuration_prophetnet.py
""" ProphetNet model configuration"""
from typing import Callable, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/prophetnet-large-uncased": (
"https://huggingface.co/microsoft/prophetnet-large-uncased/resolve/main/config.json"
),
}
class ProphetNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used to instantiate a
ProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the ProphetNet
[microsoft/prophetnet-large-uncased](https://huggingface.co/microsoft/prophetnet-large-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "prophetnet"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__(
self,
activation_dropout: Optional[float] = 0.1,
activation_function: Optional[Union[str, Callable]] = "gelu",
vocab_size: Optional[int] = 30522,
hidden_size: Optional[int] = 1024,
encoder_ffn_dim: Optional[int] = 4096,
num_encoder_layers: Optional[int] = 12,
num_encoder_attention_heads: Optional[int] = 16,
decoder_ffn_dim: Optional[int] = 4096,
num_decoder_layers: Optional[int] = 12,
num_decoder_attention_heads: Optional[int] = 16,
attention_dropout: Optional[float] = 0.1,
dropout: Optional[float] = 0.1,
max_position_embeddings: Optional[int] = 512,
init_std: Optional[float] = 0.02,
is_encoder_decoder: Optional[bool] = True,
add_cross_attention: Optional[bool] = True,
decoder_start_token_id: Optional[int] = 0,
ngram: Optional[int] = 2,
num_buckets: Optional[int] = 32,
relative_max_distance: Optional[int] = 128,
disable_ngram_loss: Optional[bool] = False,
eps: Optional[float] = 0.0,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = 0,
bos_token_id: Optional[int] = 1,
eos_token_id: Optional[int] = 2,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim
self.num_encoder_layers = num_encoder_layers
self.num_encoder_attention_heads = num_encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.num_decoder_layers = num_decoder_layers
self.num_decoder_attention_heads = num_decoder_attention_heads
self.max_position_embeddings = max_position_embeddings
self.init_std = init_std
self.activation_function = activation_function
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.disable_ngram_loss = disable_ngram_loss
self.eps = eps
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.dropout = dropout
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property
def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
" `num_decoder_layers`."
)
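# Quick usage sketch (standard Transformers pattern, not part of this file): build a config whose
# defaults mirror microsoft/prophetnet-large-uncased and instantiate a randomly initialized model from it.
from transformers import ProphetNetConfig, ProphetNetModel

configuration = ProphetNetConfig()
model = ProphetNetModel(configuration)  # weights are randomly initialized, not pretrained
configuration = model.config  # the config can be read back from the model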
.\models\prophetnet\convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
"""Convert ProphetNet checkpoint."""
import argparse
from torch import nn
from transformers_old.modeling_prophetnet import (
ProphetNetForConditionalGeneration as ProphetNetForConditionalGenerationOld,
)
from transformers_old.modeling_xlm_prophetnet import (
XLMProphetNetForConditionalGeneration as XLMProphetNetForConditionalGenerationOld,
)
from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging
logger = logging.get_logger(__name__)
logging.set_verbosity_info()
def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, pytorch_dump_folder_path: str):
"""
Copy/paste/tweak prohpetnet's weights to our prophetnet structure.
将 ProphetNet 的权重复制/粘贴/调整到我们的 ProphetNet 结构中。
"""
if "xprophetnet" in prophetnet_checkpoint_path:
prophet_old = XLMProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
prophet, loading_info = XLMProphetNetForConditionalGeneration.from_pretrained(
prophetnet_checkpoint_path, output_loading_info=True
)
else:
prophet_old = ProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
prophet, loading_info = ProphetNetForConditionalGeneration.from_pretrained(
prophetnet_checkpoint_path, output_loading_info=True
)
special_keys = ["key_proj", "value_proj", "query_proj"]
mapping = {
"self_attn": "ngram_self_attn",
"cross_attn": "encoder_attn",
"cross_attn_layer_norm": "encoder_attn_layer_norm",
"feed_forward_layer_norm": "final_layer_norm",
"feed_forward": "",
"intermediate": "fc1",
"output": "fc2",
"key_proj": "k_proj",
"query_proj": "q_proj",
"value_proj": "v_proj",
"word_embeddings": "embed_tokens",
"embeddings_layer_norm": "emb_layer_norm",
"relative_pos_embeddings": "relative_linear",
"ngram_embeddings": "ngram_input_embed",
"position_embeddings": "embed_positions",
}
print(f"Saving model to {pytorch_dump_folder_path}")
prophet.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--prophetnet_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_prophetnet_checkpoint_to_pytorch(args.prophetnet_checkpoint_path, args.pytorch_dump_folder_path)
.\models\prophetnet\modeling_prophetnet.py
import copy
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import LayerNorm
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_prophetnet import ProphetNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ProphetNetConfig"
_CHECKPOINT_FOR_DOC = "microsoft/prophetnet-large-uncased"
PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/prophetnet-large-uncased",
]
PROPHETNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`ProphetNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PROPHETNET_INPUTS_DOCSTRING = r"""
"""
PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding tokens are ignored by default.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values are selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values are selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See the returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See the returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
# compute the main-stream relative positions
main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
# subtract each position index to obtain relative positions
main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
# compute the predict-stream relative positions
predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
# repeat the predict-stream positions to match the main-stream length, then compute relative positions
predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
# bucket the main-stream relative positions
main_relative_position_buckets = compute_relative_buckets(
num_buckets,  # number of buckets
max_distance,  # maximum relative distance
main_stream_relative_positions,  # main-stream relative positions
is_bidirectional=False,  # unidirectional bucketing
)
# bucket the predict-stream relative positions
predict_relative_position_buckets = compute_relative_buckets(
num_buckets,
max_distance,
predicting_stream_relative_positions,
is_bidirectional=False,
)
# return the main-stream and predict-stream relative position buckets
return main_relative_position_buckets, predict_relative_position_buckets
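# Tiny illustration of the "main stream" relative positions computed above for a length-3 sequence
# (toy values; shapes follow the code above): entry [i, j] is position_j - position_i.
import torch

position_ids = torch.tensor([[1, 2, 3]])  # [1, seq_len]
main = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1) - position_ids.unsqueeze(-1)
# main[0] == tensor([[ 0,  1,  2],
#                    [-1,  0,  1],
#                    [-2, -1,  0]])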
@dataclass
class ProphetNetSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
"""
# Optional loss tensor
loss: Optional[torch.FloatTensor] = None
# Prediction logits of the main stream
logits: torch.FloatTensor = None
# Prediction logits of the predict (ngram) stream
logits_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the main stream
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the predict (ngram) stream
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the main stream
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the predict (ngram) stream
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-attention weights
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Last hidden state of the encoder
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
# Encoder hidden states
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Encoder attention weights
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@property
def decoder_cross_attentions(self):
# warn that `decoder_cross_attentions` is deprecated; use `cross_attentions` instead
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
# return the cross-attention weights
return self.cross_attentions
@dataclass
class ProphetNetSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
"""
# Last hidden state of the main decoder stream
last_hidden_state: torch.FloatTensor
# Last hidden state of the predict (ngram) stream
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the main stream
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the predict (ngram) stream
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the main stream
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the predict (ngram) stream
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-attention weights
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Last hidden state of the encoder
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
# Encoder hidden states
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Encoder attention weights
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@property
def decoder_cross_attentions(self):
# warn that `decoder_cross_attentions` is deprecated; use `cross_attentions` instead
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
# return the cross-attention weights
return self.cross_attentions
@dataclass
class ProphetNetDecoderModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
# Last hidden state of the main decoder stream
last_hidden_state: torch.FloatTensor
# Last hidden state of the predict (ngram) stream
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Hidden states of the main stream
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Hidden states of the predict (ngram) stream
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
# Self-attention weights of the main stream
attentions: Optional[Tuple[torch.FloatTensor]] = None
# Self-attention weights of the predict (ngram) stream
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ProphetNetDecoderLMOutput(ModelOutput):
"""
Model output class for the ProphetNet decoder, inheriting from ModelOutput.
Contains various tensors representing model predictions and intermediate states.
"""
loss: Optional[torch.FloatTensor] = None # Optional tensor for model training loss
logits: torch.FloatTensor = None # Tensor containing logits (predictions) from the decoder
logits_ngram: Optional[torch.FloatTensor] = None # Optional tensor for n-gram logits
past_key_values: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of past key/values for fast decoding
hidden_states: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of hidden states
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of n-gram hidden states
attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of attention tensors
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of n-gram attention tensors
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of cross-attention tensors
class ProphetNetPreTrainedModel(PreTrainedModel):
"""
Base class for all models in the ProphetNet series, inheriting from PreTrainedModel.
"""
config_class = ProphetNetConfig # Configuration class for ProphetNet models
base_model_prefix = "prophetnet" # Prefix used for the base model
supports_gradient_checkpointing = True # Indicates whether the model supports gradient checkpointing
def _init_weights(self, module):
"""
Initialize weights of linear and embedding modules based on configuration.
"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _shift_right(self, input_ids):
"""
Shift input ids to the right for autoregressive decoding.
"""
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
assert decoder_start_token_id is not None, (
"self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the"
" pad_token_id. See ProphetNet docs for more information"
)
# shift inputs to the right
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
return shifted_input_ids
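# Standalone sketch of the right shift performed by `_shift_right`, assuming ProphetNet's usual
# decoder_start_token_id = 0 and pad_token_id = 0: labels move one step to the right and any -100
# (ignored label position) is replaced by the pad id.
import torch

labels = torch.tensor([[101, -100, 103, 104]])
shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = 0  # decoder_start_token_id
shifted.masked_fill_(shifted == -100, 0)  # pad_token_id
# shifted == tensor([[  0, 101,   0, 103]])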
class ProphetNetPositionalEmbeddings(nn.Embedding):
"""
Positional embedding module for ProphetNet models.
Learns positional embeddings up to a fixed maximum size, handling padding ids.
"""
def __init__(self, config: ProphetNetConfig) -> None:
self.max_length = config.max_position_embeddings
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
# Forward pass: takes the input shape and device, plus optional attention mask, past key/values and position ids
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
# if position ids are pre-computed, the padding index must not be set
assert (position_ids is None) or (
self.padding_idx is None
), "If position_ids is pre-computed then padding_idx should not be set."
# if no position ids are provided
if position_ids is None:
# if past key/values exist
if past_key_values is not None:
# position ids are identical for every token during single-step decoding;
# without the int() cast, ONNX export can fail in some cases
prev_num_input_ids = past_key_values[0][0].shape[2]
num_input_ids = inputs_shape[1] + prev_num_input_ids
# the position id is the padding index plus the number of input tokens
position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
int(self.padding_idx + num_input_ids)
)
else:
# without past key/values and without an attention mask, build an all-ones attention mask
if attention_mask is None:
attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
# derive the position ids from the input tokens / attention mask
position_ids = (
torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
).long() + self.padding_idx
# make sure position ids do not exceed max_length - 1
position_ids = position_ids.clamp(0, self.max_length - 1)
# call the parent forward and also return the computed position ids
return super().forward(position_ids), position_ids
# private _forward that takes pre-computed position ids
def _forward(self, position_ids):
# call the parent forward with the given position ids
return super().forward(position_ids)
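# Minimal illustration of the cumsum trick used above: positions count only non-padded tokens and are
# offset by the padding index, so every padded position collapses onto padding_idx (assumed 0 here).
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
padding_idx = 0
position_ids = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() + padding_idx
# position_ids == tensor([[1, 2, 3, 0, 0]])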
class ProphetNetAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
config: ProphetNetConfig,
num_attn_heads: int,
):
super().__init__()
hidden_size = config.hidden_size
self.attention_dropout = config.attention_dropout  # attention dropout probability
self.dropout = config.dropout  # dropout probability on the output projection
self.num_attn_heads = num_attn_heads  # number of attention heads
self.head_dim = hidden_size // num_attn_heads  # dimension of each attention head
assert self.head_dim * num_attn_heads == hidden_size, (
"`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
" `config.num_decoder_attention_heads`"
)
self.key_proj = nn.Linear(hidden_size, hidden_size)  # key projection
self.value_proj = nn.Linear(hidden_size, hidden_size)  # value projection
self.query_proj = nn.Linear(hidden_size, hidden_size)  # query projection
self.out_proj = nn.Linear(hidden_size, hidden_size)  # output projection
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# reshape the tensor for multi-head attention: [bsz, num_heads, seq_len, head_dim]
return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states,
key_value_states: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
layer_head_mask: Optional[Tensor] = None,
past_key_value: Optional[Tuple[Tensor]] = None,
output_attentions: bool = False,
):
# forward pass performing the attention computation (body omitted in this walkthrough)
class ProphetNetNgramSelfAttention(nn.Module):
# constructor takes a ProphetNetConfig
def __init__(self, config: ProphetNetConfig):
# call the parent constructor
super().__init__()
# hidden size
self.hidden_size = config.hidden_size
# number of relative-position buckets
self.num_buckets = config.num_buckets
# maximum relative distance
self.relative_max_distance = config.relative_max_distance
# number of attention heads
self.num_attn_heads = config.num_decoder_attention_heads
# dropout probability
self.dropout = config.dropout
# attention dropout probability
self.attention_dropout = config.attention_dropout
# dimension of each attention head
self.head_dim = config.hidden_size // self.num_attn_heads
# ngram size
self.ngram = config.ngram
# the hidden size must be divisible by the number of attention heads
assert (
self.head_dim * self.num_attn_heads == config.hidden_size
), "config.hidden_size must be divisible by num_attn_heads"
# key, value and query projections
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
# output projection
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
# relative position embeddings
self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
# flag used for ONNX export
self.onnx_trace = False
# reshape to (batch_size, num_attn_heads, seq_len, head_dim)
def _shape(self, tensor, seq_len, batch_size):
return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
# prepare the module for ONNX export
def prepare_for_onnx_export_(self):
self.onnx_trace = True
# forward pass; takes several inputs and returns the attention outputs
def forward(
self,
hidden_states,
past_key_value: Optional[Tuple[Tensor]] = None,
attention_mask=None,
layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
):
# (forward body omitted in this walkthrough)
def get_main_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
):
# input hidden_states [batch_size, sequence_length, hidden_size]
# input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
# input position_ids [batch_size, sequence_length] or [1,1]
# unpack the dimensions of the attention weights
batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
# reshape the attention weights for the computations below
attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
# if no main relative position buckets are provided, compute them from scratch
if main_relative_position_buckets is None:
# shape of the hidden states
batch_size, sequence_length = hidden_states.shape[:2]
# relative positions: every key position minus the query position
relative_positions = (
torch.arange(1, attn_weights.shape[-1] + 1)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# bucket the main-stream relative positions
main_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# project the hidden states to relative position embeddings
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# reshape so that buckets and attention heads get their own dimensions
rel_pos_embeddings = rel_pos_embeddings.view(
rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
)
rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
# reshape to match the attention weights
rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))
# repeat the buckets for every attention head
main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
# flatten the buckets so they can be used as gather indices
main_relative_position_buckets = main_relative_position_buckets.view(
-1, main_relative_position_buckets.shape[-1]
)
# gather indices must be of long dtype
main_relative_position_buckets = main_relative_position_buckets.long()
# flatten the relative position embeddings accordingly
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
# gather the embeddings that correspond to the main relative position buckets
main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
# restore the original shape
main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
# return the main relative position embeddings
return main_relative_pos_embeddings
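# Tiny torch.gather illustration (toy numbers) of the bucket lookup performed above: for each row we
# pick, per query position, the embedding column indexed by its relative-position bucket.
import torch

rel_pos_embeddings = torch.arange(12.0).view(3, 4)  # 3 rows, 4 buckets
buckets = torch.tensor([[0, 3], [1, 1], [2, 0]])  # bucket index per position
picked = torch.gather(rel_pos_embeddings, dim=1, index=buckets)
# picked == tensor([[ 0.,  3.], [ 5.,  5.], [10.,  8.]])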
def get_predict_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
):
# input hidden_states [batch_size, sequence_length, ngram, hidden_size]
# input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length]
# input position_ids [batch_size, sequence_length] or [1,1]
# input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None
# get batch_size and sequence_length
batch_size, sequence_length = hidden_states.shape[0:2]
# if no predict relative position buckets are provided, compute them here
if predict_relative_position_buckets is None:
# key sequence length from the attention weights
key_sequence_length = attn_weights.shape[-1]
# position_ids must have the format 1 2 3 4 5 ... (key_sequence_length - 1)
assert (
position_ids[0][0] == key_sequence_length - 1
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
# build the relative positions
relative_positions = (
torch.arange(0, key_sequence_length)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
# subtract the position ids to obtain relative offsets
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# bucket the predict-stream relative positions
predict_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# transpose hidden_states from [batch_size, sequence_length, ngram, hidden_size] to [batch_size, ngram, sequence_length, hidden_size]
hidden_states = hidden_states.transpose(1, 2)
# project to relative position embeddings
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# reshape to [batch_size, ngram, sequence_length, num_buckets, num_heads]
rel_pos_embeddings = rel_pos_embeddings.view(
hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
)
# permute to [batch_size, ngram, num_heads, sequence_length, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
# flatten to [batch_size * ngram * sequence_length * num_heads, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
# expand the buckets to [ngram, batch_size, num_heads, sequence_length, -1]
predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
predict_relative_position_buckets = predict_relative_position_buckets.repeat(
self.ngram, 1, self.num_attn_heads, 1
)
# flatten to [ngram * batch_size * num_heads * sequence_length, -1]
predict_relative_position_buckets = predict_relative_position_buckets.view(
-1, predict_relative_position_buckets.size(-1)
).long()
# gather the predict relative position embeddings
predict_relative_pos_embeddings = torch.gather(
rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
)
# reshape back to [batch_size, ngram, num_heads, sequence_length, -1]
predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
)
# return the predict relative position embeddings
return predict_relative_pos_embeddings
class ProphetNetEncoderLayer(nn.Module):
"""
Encoder block for Prophetnet
"""
def __init__(self, config: ProphetNetConfig):
super().__init__()
# 1st residual block
# self-attention module built from ProphetNetAttention
self.self_attn = ProphetNetAttention(config, config.num_encoder_attention_heads)
# LayerNorm applied after self-attention
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# feed-forward module built from ProphetNetFeedForward
self.feed_forward = ProphetNetFeedForward(config, config.encoder_ffn_dim)
# LayerNorm applied after the feed-forward module
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions: bool = False,
):
# 1st residual block
# run self-attention; returns the attention output, the attention weights and an unused value
attention_output, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
# residual connection followed by LayerNorm
hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
# 2nd residual block
# run the feed-forward module
feed_forward_output = self.feed_forward(hidden_states)
# residual connection followed by LayerNorm
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class ProphetNetDecoderLayer(nn.Module):
"""
Decoder block for Prophetnet
"""
def __init__(self, config: ProphetNetConfig):
super().__init__()
# 1st residual block
# ngram self-attention module built from ProphetNetNgramSelfAttention
self.self_attn = ProphetNetNgramSelfAttention(config)
# LayerNorm applied after self-attention
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# only add cross-attention if the config asks for it
if config.add_cross_attention:
# cross-attention module built from ProphetNetAttention
self.cross_attn = ProphetNetAttention(config, config.num_decoder_attention_heads)
# LayerNorm applied after cross-attention
self.cross_attn_layer_norm = LayerNorm(config.hidden_size)
# 3rd residual block
# decoder feed-forward module built from ProphetNetFeedForward
self.feed_forward = ProphetNetFeedForward(config, config.decoder_ffn_dim)
# LayerNorm applied after the feed-forward module
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attn_mask=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
past_key_value=None,
use_cache: bool = True,
output_attentions: bool = False,
):
# 1st residual block
# if past key/values exist, the first two entries are the self-attention cache; otherwise None
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# run the ngram self-attention; returns the output, both sets of attention weights and the present key/values
ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
extended_predict_attention_mask=extended_predict_attention_mask,
main_relative_position_buckets=main_relative_position_buckets,
predict_relative_position_buckets=predict_relative_position_buckets,
position_ids=position_ids,
)
# residual connection followed by LayerNorm
hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
# if past key/values exist, the last two entries are the cross-attention cache; otherwise None
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attn_weights = None
if encoder_hidden_states is not None:
# 2nd residual block
# run cross-attention; returns the output, the attention weights and the present key/values
attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attn_mask,
layer_head_mask=cross_attn_layer_head_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
# residual connection followed by LayerNorm
hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
# append the cross-attention key/values to the present key/values (positions 3 and 4)
present_key_value = present_key_value + cross_attn_present_key_value
# 3rd residual block
# run the feed-forward module
feed_forward_output = self.feed_forward(hidden_states)
# residual connection followed by LayerNorm
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
# the output tuple starts with the hidden states
outputs = (hidden_states,)
# optionally add the self-, ngram- and cross-attention weights
if output_attentions:
outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)
# optionally add the present key/values when caching is enabled
if use_cache:
outputs += (present_key_value,)
# return the final outputs
return outputs
# Class wrapping the standalone encoder part of the ProphetNet model
@add_start_docstrings(
"The standalone encoder part of the ProphetNetModel.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetEncoder(ProphetNetPreTrainedModel):
"""
The ProphetNet encoder. The class can be initialized with pre-defined word embeddings instead of randomly
initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
# initialize the ProphetNet encoder
super().__init__(config)
# word embedding layer; if none is provided, create a new one with the configured padding index
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# positional embedding layer
self.position_embeddings = ProphetNetPositionalEmbeddings(config)
# LayerNorm applied to the embeddings
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# stack of encoder layers
self.layers = nn.ModuleList([
ProphetNetEncoderLayer(config)
for _ in range(config.num_encoder_layers)
])
# gradient checkpointing is off by default
self.gradient_checkpointing = False
# initialize weights and apply final processing
self.post_init()
# return the input word embeddings
def get_input_embeddings(self):
return self.word_embeddings
# set the input word embeddings
def set_input_embeddings(self, value):
self.word_embeddings = value
# forward pass of the encoder
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# (forward body omitted in this walkthrough)
# Class wrapping the standalone decoder part of the ProphetNet model
@add_start_docstrings(
"The standalone decoder part of the ProphetNetModel.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetDecoder(ProphetNetPreTrainedModel):
"""
The ProphetNet decoder. The class can be initialized with pre-defined word embeddings instead of randomly
initialized word embeddings.
"""
# Initializer of the ProphetNetDecoder: sets up all parameters and sub-modules
def __init__(self, config: ProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
# initialize the base model configuration
super().__init__(config)
# ngram size used by the model
self.ngram = config.ngram
# number of relative-position buckets
self.num_buckets = config.num_buckets
# maximum relative distance
self.relative_max_distance = config.relative_max_distance
# dropout probability
self.dropout = config.dropout
# maximum number of target positions
self.max_target_positions = config.max_position_embeddings
# word embedding layer; use the provided embeddings if given, otherwise create new ones
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# positional embedding layer
self.position_embeddings = ProphetNetPositionalEmbeddings(config)
# ngram embedding layer
self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
# stack of decoder layers
self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
# LayerNorm applied to the embeddings
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# gradient checkpointing is off by default
self.gradient_checkpointing = False
# initialize weights and apply final processing
self.post_init()
# return the input word embeddings
def get_input_embeddings(self):
return self.word_embeddings
# set the input word embeddings
def set_input_embeddings(self, value):
self.word_embeddings = value
# forward pass of the ProphetNetDecoder
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# (detailed argument descriptions are attached to the forward docstring by the decorators above;
# the forward body is omitted in this walkthrough)
def compute_buffered_relative_buckets(self, position_ids):
# get the batch size and sequence length
batch_size, sequence_length = position_ids.shape
# build position ids from 1 to self.max_target_positions on the right device
position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
# compute the main-stream and predict-stream relative buckets
main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
self.num_buckets, self.relative_max_distance, position_ids
)
# buffer (slice and repeat) the relative buckets
main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
predict_relative_buckets = torch.cat(
[
predict_relative_buckets[:, :sequence_length, :sequence_length],
predict_relative_buckets[
:, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
],
],
2,
).repeat(batch_size, 1, 1)
return main_relative_buckets, predict_relative_buckets
def prepare_attention_mask(self, hidden_states, attention_mask):
# get the batch size and sequence length
batch_size, seq_length = hidden_states.shape[:2]
# build the causal mask
causal_mask = torch.full(
(seq_length, seq_length),
torch.finfo(hidden_states.dtype).min,  # fill with the most negative value of the dtype
dtype=hidden_states.dtype,
device=hidden_states.device,
)
causal_mask = torch.triu(causal_mask, 1)  # keep only the strictly upper triangle (future positions)
extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
)
# add the regular (padding) attention mask if one was provided
if attention_mask is not None:
extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(hidden_states.dtype).min
extended_attention_mask = extended_causal_mask + extended_attention_mask
else:
extended_attention_mask = extended_causal_mask
return extended_attention_mask.to(hidden_states.dtype)
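# Small sketch of the additive causal mask built above: positions strictly above the diagonal get the
# dtype's most negative value, so they vanish after the softmax inside attention.
import torch

seq_length = 4
neg_inf = torch.finfo(torch.float32).min
causal_mask = torch.triu(torch.full((seq_length, seq_length), neg_inf), 1)
# row i allows attention only to positions <= i; masked entries are ~ -3.4e38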
# prepare the attention mask used by the predict stream
def prepare_predict_attention_mask(self, hidden_states, attention_mask):
# get the batch size and sequence length
batch_size, seq_length = hidden_states.shape[:2]
# build the causal mask for the predict stream
predict_causal_mask = ngram_attention_bias(
self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
)
# concatenate the two halves of the causal mask along the key dimension
predict_causal_mask = torch.cat(
[
predict_causal_mask[:, :seq_length, :seq_length],
predict_causal_mask[
:, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
],
],
dim=-1,
)
# expand the causal mask over the batch dimension and the attention heads
extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
)
# add the regular (padding) attention mask
if attention_mask is not None:
# build the extended attention mask; the predict-stream half is always kept at 0
extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
extended_attention_mask = extended_attention_mask.expand(
(batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
)
extended_attention_mask = torch.cat(
[extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
)
extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
else:
extended_predict_attention_mask = extended_predict_causal_mask
# cast to the hidden states' dtype and return
return extended_predict_attention_mask.to(hidden_states.dtype)
@add_start_docstrings(
"The bare ProphetNet Model outputting raw hidden-states without any specific head on top.",
PROPHETNET_START_DOCSTRING,
)
# 定义 ProphetNetModel 类,继承自 ProphetNetPreTrainedModel
class ProphetNetModel(ProphetNetPreTrainedModel):
# 定义 tied_weights_keys 列表,用于存储需要绑定权重的键名
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
# 初始化方法,接收 ProphetNetConfig 类型的 config 参数
def __init__(self, config: ProphetNetConfig):
# 调用父类 ProphetNetPreTrainedModel 的初始化方法
super().__init__(config)
# 创建词嵌入层,使用 nn.Embedding 类,设置词汇量大小、隐藏层大小和填充标记ID
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 复制 config 以创建编码器的配置,设置为不是编码-解码器模式且不使用缓存
encoder_config = copy.deepcopy(config)
encoder_config.is_encoder_decoder = False
encoder_config.use_cache = False
# 创建编码器实例,使用 ProphetNetEncoder 类,并传入配置和词嵌入层
self.encoder = ProphetNetEncoder(encoder_config, self.word_embeddings)
# 复制 config 以创建解码器的配置,设置为解码器模式且不是编码-解码器模式
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
# 创建解码器实例,使用 ProphetNetDecoder 类,并传入配置和词嵌入层
self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入词嵌入层的方法
def get_input_embeddings(self):
return self.word_embeddings
# 设置输入词嵌入层的方法,接收 value 参数
def set_input_embeddings(self, value):
# 设置词嵌入层为 value
self.word_embeddings = value
# 设置编码器和解码器的词嵌入层为相同的 value
self.encoder.word_embeddings = self.word_embeddings
self.decoder.word_embeddings = self.word_embeddings
# 绑定权重的私有方法
def _tie_weights(self):
# 如果配置中指定了绑定词嵌入层的权重
if self.config.tie_word_embeddings:
# 将编码器和解码器的词嵌入层权重绑定到同一个实例
self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)
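As an illustration of what `_tie_or_clone_weights` achieves when `config.tie_word_embeddings` is enabled, the sketch below (toy sizes, names of my choosing) shows encoder and decoder embeddings ending up as the same `Parameter`, so a single gradient update moves both.

```python
# Illustrative sketch of weight tying: both modules hold the very same Parameter object.
import torch.nn as nn

shared = nn.Embedding(10, 4, padding_idx=0)
encoder_emb = nn.Embedding(10, 4, padding_idx=0)
decoder_emb = nn.Embedding(10, 4, padding_idx=0)

encoder_emb.weight = shared.weight  # tie: same Parameter, updated together
decoder_emb.weight = shared.weight
print(encoder_emb.weight is decoder_emb.weight)  # True
```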
# 获取编码器实例的方法
def get_encoder(self):
return self.encoder
# 获取解码器实例的方法
def get_decoder(self):
return self.decoder
# 前向传播方法,接收多个输入参数,并设置了输出文档的注释和返回值类型
@add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 定义一个类变量,包含需要共享权重的模型层的名称列表
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
# 初始化方法,接收一个ProphetNetConfig类型的配置对象作为参数
def __init__(self, config: ProphetNetConfig):
# 调用父类初始化方法,传入配置对象
super().__init__(config)
# 创建ProphetNetModel对象,并将其保存在self.prophetnet中
self.prophetnet = ProphetNetModel(config)
# 设置padding_idx为配置对象中的pad_token_id属性
self.padding_idx = config.pad_token_id
# 根据配置对象的disable_ngram_loss属性设置self.disable_ngram_loss
self.disable_ngram_loss = config.disable_ngram_loss
# 创建一个线性层,将输入维度设为配置对象的hidden_size,输出维度设为配置对象的vocab_size
# 不使用偏置项(bias=False)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 调用初始化权重并应用最终处理方法
self.post_init()
# 返回lm_head作为输出的嵌入层对象
def get_output_embeddings(self):
return self.lm_head
# 将新的嵌入层对象赋值给lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 如果配置对象指定了tie_word_embeddings,则共享权重
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head)
# 返回prophetnet模型中的word_embeddings作为输入嵌入层对象
def get_input_embeddings(self):
return self.prophetnet.word_embeddings
# 前向传播方法,接受一系列可能为空的张量作为输入参数
@add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 定义一个方法用于计算损失,输入参数包括 logits(预测值)、labels(真实标签)、ignore_index(忽略的索引,默认为-100)
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建一个与 labels 维度相同的全零张量,用于存储扩展后的标签,填充值为 ignore_index
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
# 根据 config 中的 ngram 参数,扩展标签,将 labels 复制到 expend_targets 的不同维度中
for i in range(self.config.ngram):
if i > 0 and self.disable_ngram_loss:
break
expend_targets[i, :, :] = labels
# 调整 logits 的维度顺序,并确保其连续性
logits = logits.transpose(0, 1).contiguous()
# 计算 log_softmax,得到 lprobs,用于后续的负对数似然损失计算
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)), # 展平 logits 张量的前两个维度
dim=-1,
dtype=torch.float32,
)
# 使用负对数似然损失函数计算损失值,reduction 参数为 "mean" 表示计算平均损失
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
# 如果 config 中的 eps 大于 0.0,则执行 label 平滑操作
if self.config.eps > 0.0:
# 计算平滑损失
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
smooth_loss = smooth_loss[non_masked_tokens]
smooth_loss = smooth_loss.mean()
# 计算 eps_i
eps_i = self.config.eps / lprobs.size(-1)
# 结合 label 平滑和原始损失值,得到最终的损失值
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回计算得到的损失值
return loss
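The toy reproduction below walks through the same n-gram loss on made-up tensors (assumed values: `ngram=2`, `eps=0.1`, tiny vocabulary): the labels are repeated along a new n-gram dimension, padded positions rely on `nll_loss`'s default `ignore_index=-100`, and label smoothing mixes in the mean negative log-probability mass.

```python
# Toy reproduction of _compute_loss above; shapes and values are illustrative only.
import torch
import torch.nn.functional as F

ngram, batch, seq, vocab, eps, ignore_index = 2, 1, 3, 5, 0.1, -100
logits = torch.randn(batch, ngram, seq, vocab)    # (batch, ngram, seq, vocab)
labels = torch.tensor([[1, 4, ignore_index]])     # (batch, seq); last token is padding

expend_targets = labels.new_zeros(ngram, batch, seq).fill_(ignore_index)
for i in range(ngram):
    expend_targets[i] = labels                    # every n-gram stream predicts the same targets

lprobs = F.log_softmax(logits.transpose(0, 1).reshape(-1, vocab), dim=-1, dtype=torch.float32)
loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")

smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
smooth_loss = smooth_loss[expend_targets.ne(ignore_index).view(-1)].mean()
loss = (1.0 - eps) * loss + (eps / vocab) * smooth_loss
print(loss)
```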
# 为生成准备输入的方法,返回一个包含所需输入的字典
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# 断言 encoder_outputs 参数不为 None,确保其在生成时被传递
assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
# 如果 past_key_values 存在,仅保留 decoder_input_ids 的最后一个 token
if past_key_values:
decoder_input_ids = decoder_input_ids[:, -1:]
# 返回包含生成所需输入的字典
return {
"input_ids": None, # encoder_outputs 已定义,不需要 input_ids
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
# 根据标签准备 decoder_input_ids 的静态方法
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# 从 past_key_values 中重新排序缓存的静态方法,用于 beam search 生成
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# 对每一层的过去状态执行重新排序,以适应 beam search 的索引变化
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
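Since the cached key/value tensors are batch-major, reordering for beam search is just a gather along dimension 0 with the surviving beam indices. A small standalone illustration (toy shapes):

```python
# Minimal sketch of what _reorder_cache does per cached tensor.
import torch

past_key = torch.arange(4 * 2 * 3 * 5, dtype=torch.float32).view(4, 2, 3, 5)  # (batch*beams, heads, seq, head_dim)
beam_idx = torch.tensor([2, 2, 0, 1])            # which beam each new hypothesis continues
reordered = past_key.index_select(0, beam_idx)
print(reordered.shape)                            # torch.Size([4, 2, 3, 5])
```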
# 获取 encoder 的方法,返回 prophetnet 模型的 encoder 部分
def get_encoder(self):
return self.prophetnet.encoder
# 获取 decoder 的方法,返回 prophetnet 模型的 decoder 部分
def get_decoder(self):
return self.prophetnet.decoder
# 为 ProphetNetForCausalLM 类添加文档字符串,描述其作为 ProphetNetModel 的解码器部分,用于有因果关系的语言建模
@add_start_docstrings(
"The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal"
" language modeling.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
# 定义绑定权重的关键词列表,用于共享或复制权重
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
"prophetnet.decoder.word_embeddings.weight",
"lm_head.weight",
]
# 初始化方法,接收 ProphetNetConfig 类型的配置参数
def __init__(self, config: ProphetNetConfig):
# 深拷贝配置对象,设置为解码器模式,关闭编码-解码模式
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
# 调用父类初始化方法
super().__init__(config)
# 创建 ProphetNetDecoderWrapper 对象
self.prophetnet = ProphetNetDecoderWrapper(config)
# 设置填充 token 的索引
self.padding_idx = config.pad_token_id
# 是否禁用 ngram 损失的标志
self.disable_ngram_loss = config.disable_ngram_loss
# 创建线性层 lm_head,用于预测词汇表中词的概率分布
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 调用初始化权重和应用最终处理的方法
self.post_init()
# 获取输入嵌入的方法,返回 ProphetNet 解码器的词嵌入层
def get_input_embeddings(self):
return self.prophetnet.decoder.word_embeddings
# 设置输入嵌入的方法,设置 ProphetNet 解码器的词嵌入层
def set_input_embeddings(self, value):
self.prophetnet.decoder.word_embeddings = value
# 获取输出嵌入的方法,返回 lm_head 线性层,用于预测词汇表中词的概率分布
def get_output_embeddings(self):
return self.lm_head
# 设置输出嵌入的方法,设置 lm_head 线性层
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 绑定权重的方法,如果配置指定了共享词嵌入,则共享 ProphetNet 解码器的词嵌入层和 lm_head 线性层
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head)
# 设置解码器的方法,用给定的解码器替换当前的 ProphetNet 解码器
def set_decoder(self, decoder):
self.prophetnet.decoder = decoder
# 获取解码器的方法,返回当前 ProphetNet 解码器
def get_decoder(self):
return self.prophetnet.decoder
# 前向传播方法,执行 ProphetNet 解码器的前向传播,预测下一个词的分布
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 前向传播的参数列表,支持 ProphetNetDecoderLMOutput 类型的输出
**kwargs,
):
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建一个与labels具有相同大小的张量,填充为ignore_index,用于扩展目标张量
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
for i in range(self.config.ngram):
# 如果当前ngram大于0并且禁用了ngram损失,则退出循环
if i > 0 and self.disable_ngram_loss:
break
# 将labels复制到扩展目标张量的第i层
expend_targets[i, :, :] = labels
# 调整logits的维度顺序,并确保内存连续
logits = logits.transpose(0, 1).contiguous()
# 计算log_softmax以获取概率对数
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)),
dim=-1,
dtype=torch.float32,
)
# 计算负对数似然损失
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
if self.config.eps > 0.0:
# 计算平滑损失,排除掩码标记,并计算平均值
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
smooth_loss = smooth_loss[non_masked_tokens]
smooth_loss = smooth_loss.mean()
# 计算eps_i
eps_i = self.config.eps / lprobs.size(-1)
# 应用平滑损失到总损失中
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回最终的损失值
return loss
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
use_cache=None,
**kwargs,
):
# 如果attention_mask为空,则创建全为1的张量,表示所有token都被attention
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
# 如果past_key_values存在,则仅使用最后一个token作为输入
input_ids = input_ids[:, -1:]
# 返回用于生成的输入字典
return {
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
"attention_mask": attention_mask,
"head_mask": head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
# 重新排序缓存中的过去键值,以匹配beam search的顺序
# 从transformers.models.bart.modeling_bart.BartForCausalLM._reorder_cache复制而来
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# 根据beam_idx重新排序每一层的过去状态
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
# 定义一个名为 ProphetNetDecoderWrapper 的类,继承自 ProphetNetPreTrainedModel 类
class ProphetNetDecoderWrapper(ProphetNetPreTrainedModel):
"""
This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
classes.
"""
# 初始化方法,接受一个 ProphetNetConfig 类型的参数 config
def __init__(self, config: ProphetNetConfig):
# 调用父类的初始化方法,传入 config 参数
super().__init__(config)
# 创建一个 nn.Embedding 对象,用于词嵌入,参数包括词汇表大小、隐藏层大小和填充标记的索引
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 创建 ProphetNetDecoder 对象,传入 config 参数和之前创建的词嵌入对象
self.decoder = ProphetNetDecoder(config, word_embeddings=self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
# 方法,用于将词嵌入层的权重与解码器的输入词嵌入层权重相绑定
def _tie_weights(self):
self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings())
# 前向传播方法,将调用解码器的前向传播方法
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
.\models\prophetnet\tokenization_prophetnet.py
import collections
import os
import unicodedata
from typing import Iterable, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/prophetnet-large-uncased": (
"https://huggingface.co/microsoft/prophetnet-large-uncased/resolve/main/prophetnet.tokenizer"
),
}
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/prophetnet-large-uncased": {"do_lower_case": True},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/prophetnet-large-uncased": 512,
}
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
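A quick usage check of the helper just defined: it only strips surrounding whitespace and splits on runs of whitespace, nothing more.

```python
print(whitespace_tokenize("  ProphetNet predicts  future n-grams \n"))
# ['ProphetNet', 'predicts', 'future', 'n-grams']
```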
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""根据标点符号分割文本。"""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""在每个中日韩(CJK)字符周围添加空格。"""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""检查给定的码点是否是中日韩字符的码点。"""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""对文本执行无效字符删除和空白字符清理。"""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
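A usage sketch of the `WordpieceTokenizer` above with a toy vocabulary, showing the greedy longest-match-first behaviour described in the docstring; the vocabulary and tokens are illustrative.

```python
toy_vocab = {"un", "##aff", "##able", "runs", "[UNK]"}
wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
print(wp.tokenize("unaffable"))    # ['un', '##aff', '##able']
print(wp.tokenize("unknownword"))  # ['[UNK]'] -- the toy vocab cannot cover the whole word
```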
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class ProphetNetTokenizer(PreTrainedTokenizer):
r"""
Construct a ProphetNetTokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`):
Special second separator token, which can be generated by [`ProphetNetForConditionalGeneration`]. It is
used to separate bullet-point like sentences in summarization, *e.g.*.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
# Define constants related to vocabulary files, pretrained models, and configurations
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Define model input names required for `tokenizer.pad(...)` to function correctly
# For `ProphetNet`, `token_type_ids` is not a required argument.
model_input_names: List[str] = ["input_ids", "attention_mask"]
# 初始化方法,接受多个参数来配置分词器实例
def __init__(
self,
vocab_file: str,
do_lower_case: Optional[bool] = True,
do_basic_tokenize: Optional[bool] = True,
never_split: Optional[Iterable] = None,
unk_token: Optional[str] = "[UNK]",
sep_token: Optional[str] = "[SEP]",
x_sep_token: Optional[str] = "[X_SEP]",
pad_token: Optional[str] = "[PAD]",
mask_token: Optional[str] = "[MASK]",
tokenize_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
**kwargs,
):
# 检查给定的词汇文件是否存在,如果不存在则抛出异常
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# 加载词汇表文件内容到实例变量中
self.vocab = load_vocab(vocab_file)
# 创建一个从id到token的有序字典,以便根据id查找对应的token
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
# 根据参数决定是否进行基本分词
self.do_basic_tokenize = do_basic_tokenize
# 如果需要进行基本分词,则初始化BasicTokenizer实例
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
# 使用给定的词汇表和未知token初始化WordpieceTokenizer实例
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# 调用父类的初始化方法,传递相同的参数和额外的关键字参数
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
x_sep_token=x_sep_token,
pad_token=pad_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# 返回词汇表大小的属性方法
@property
def vocab_size(self):
return len(self.vocab)
# 返回包含词汇表和添加token编码器的字典
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
# 对给定文本进行分词,返回分词后的token列表
def _tokenize(self, text):
split_tokens = []
# 如果需要进行基本分词
if self.do_basic_tokenize:
# 使用BasicTokenizer分词器对文本进行分词
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# 如果token在never_split集合中,则直接添加到分词结果列表中
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
# 否则,使用WordpieceTokenizer对token进行进一步分词,并将结果扩展到split_tokens列表中
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
# 如果不需要基本分词,则直接使用WordpieceTokenizer对文本进行分词
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
# 根据token查找其在词汇表中对应的id,如果不存在则返回unk_token对应的id
def _convert_token_to_id(self, token: str):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# 根据id查找其在词汇表中对应的token,如果不存在则返回unk_token
def _convert_id_to_token(self, index: int):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: str):
"""
Converts a sequence of tokens (string) into a single string.
Args:
tokens (`str`): A sequence of tokens.
Returns:
`str`: The concatenated string without '##' symbols.
"""
# Join tokens into a single string, remove '##' and strip leading/trailing spaces
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: Optional[bool] = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*): Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
# Return a list of zeros of the same length as token_ids_0, with a single 1 appended
return ([0] * len(token_ids_0)) + [1]
else:
# Return a list of zeros of the combined length of token_ids_0 and token_ids_1, each followed by a 1
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
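A short worked example of the layout the method returns, reproduced in plain Python (the ids are arbitrary placeholders): the 1s mark only the trailing `[SEP]` positions that `build_inputs_with_special_tokens` appends.

```python
token_ids_0, token_ids_1 = [5, 6], [8, 9, 10]
single = [0] * len(token_ids_0) + [1]
pair = [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
print(single)  # [0, 0, 1]
print(pair)    # [0, 0, 1, 0, 0, 0, 1]
```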
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
# Return a list of zeros with a length equal to the sum of token_ids_0 and one separator token
return len(token_ids_0 + sep) * [0]
else:
# Return a list of zeros with a length equal to the combined sum of token_ids_0, token_ids_1, and two separator tokens
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# 将词汇表保存到指定目录下的文件中
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 初始化索引
index = 0
# 检查保存目录是否存在
if os.path.isdir(save_directory):
# 构建词汇表文件路径,包括可选的文件名前缀和默认的词汇表文件名
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
# 如果保存目录不存在,则直接将其作为文件路径
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
# 打开文件,写入词汇表内容
with open(vocab_file, "w", encoding="utf-8") as writer:
# 遍历词汇表中的每个词汇和对应的索引
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
# 检查当前索引是否连续
if index != token_index:
# 如果不连续,记录警告信息
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
# 更新索引
index = token_index
# 将词汇写入文件,每个词汇后面加上换行符
writer.write(token + "\n")
# 更新索引
index += 1
# 返回保存的文件路径,以元组形式返回
return (vocab_file,)
# Build model inputs that include the special tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
# If there is no second sequence, simply append the separator token to the first sequence
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
# Build the separator token list
sep = [self.sep_token_id]
# Return the two sequences joined, each followed by a separator token
return token_ids_0 + sep + token_ids_1 + sep
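A worked example of the single-sequence and pair layouts built above; `102` is only a placeholder separator id, the real value comes from the vocabulary file.

```python
sep_token_id = 102  # placeholder; the actual id depends on the loaded vocabulary
ids_a, ids_b = [11, 12, 13], [21, 22]
print(ids_a + [sep_token_id])                           # X [SEP]
print(ids_a + [sep_token_id] + ids_b + [sep_token_id])  # A [SEP] B [SEP]
```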
.\models\prophetnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig"],
"tokenization_prophetnet": ["ProphetNetTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_prophetnet"] = [
"PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"ProphetNetDecoder",
"ProphetNetEncoder",
"ProphetNetForCausalLM",
"ProphetNetForConditionalGeneration",
"ProphetNetModel",
"ProphetNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig
from .tokenization_prophetnet import ProphetNetTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_prophetnet import (
PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
ProphetNetDecoder,
ProphetNetEncoder,
ProphetNetForCausalLM,
ProphetNetForConditionalGeneration,
ProphetNetModel,
ProphetNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\pvt\configuration_pvt.py
""" Pvt model configuration"""
from collections import OrderedDict
from typing import Callable, List, Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PVT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"pvt-tiny-224": "https://huggingface.co/Zetatech/pvt-tiny-224",
}
class PvtConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PvtModel`]. It is used to instantiate an Pvt
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Pvt
[Xrenya/pvt-tiny-224](https://huggingface.co/Xrenya/pvt-tiny-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
image_size (`int`, *optional*, defaults to 224):
输入图像的大小,默认为224
num_channels (`int`, *optional*, defaults to 3):
输入通道的数量,默认为3
num_encoder_blocks (`int`, *optional*, defaults to 4):
编码器块的数量(Mix Transformer 编码器中的阶段数),默认为4
depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
每个编码器块中的层数,默认为 `[2, 2, 2, 2]`
sequence_reduction_ratios (`List[int]`, *optional*, defaults to `[8, 4, 2, 1]`):
每个编码器块中的序列减少比例,默认为 `[8, 4, 2, 1]`
hidden_sizes (`List[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
每个编码器块的维度,默认为 `[64, 128, 320, 512]`
patch_sizes (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
每个编码器块之前的补丁大小,默认为 `[4, 2, 2, 2]`
strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
每个编码器块之前的步长,默认为 `[4, 2, 2, 2]`
num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`):
每个 Transformer 编码器块中每个注意力层的注意力头数,默认为 `[1, 2, 5, 8]`
mlp_ratios (`List[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
Mix FFNs 中隐藏层大小与输入层大小的比例,默认为 `[8, 8, 4, 4]`
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串),支持 `"gelu"`, `"relu"`, `"selu"` 和 `"gelu_new"`, 默认为 `"gelu"`
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率,默认为 0.0
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
注意力概率的 dropout 比率,默认为 0.0
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态分布的标准差,默认为 0.02
drop_path_rate (`float`, *optional*, defaults to 0.0):
用于随机深度的 dropout 概率,在 Transformer 编码器的块中使用,默认为 0.0
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
层归一化层使用的 epsilon,默认为 1e-06
qkv_bias (`bool`, *optional*, defaults to `True`):
是否为查询、键和值添加可学习偏置,默认为 True
num_labels ('int', *optional*, defaults to 1000):
类别数量,默认为 1000
Example:
```python
>>> from transformers import PvtModel, PvtConfig
>>>
>>> configuration = PvtConfig()
>>>
>>> model = PvtModel(configuration)
>>>
>>> configuration = model.config
```"""
model_type = "pvt"
def __init__(
self,
image_size: int = 224,
num_channels: int = 3,
num_encoder_blocks: int = 4,
depths: List[int] = [2, 2, 2, 2],
sequence_reduction_ratios: List[int] = [8, 4, 2, 1],
hidden_sizes: List[int] = [64, 128, 320, 512],
patch_sizes: List[int] = [4, 2, 2, 2],
strides: List[int] = [4, 2, 2, 2],
num_attention_heads: List[int] = [1, 2, 5, 8],
mlp_ratios: List[int] = [8, 8, 4, 4],
hidden_act: Mapping[str, Callable] = "gelu",
hidden_dropout_prob: float = 0.0,
attention_probs_dropout_prob: float = 0.0,
initializer_range: float = 0.02,
drop_path_rate: float = 0.0,
layer_norm_eps: float = 1e-6,
qkv_bias: bool = True,
num_labels: int = 1000,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.num_channels = num_channels
self.num_encoder_blocks = num_encoder_blocks
self.depths = depths
self.sequence_reduction_ratios = sequence_reduction_ratios
self.hidden_sizes = hidden_sizes
self.patch_sizes = patch_sizes
self.strides = strides
self.mlp_ratios = mlp_ratios
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.drop_path_rate = drop_path_rate
self.layer_norm_eps = layer_norm_eps
self.num_labels = num_labels
self.qkv_bias = qkv_bias
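A usage sketch of the configuration class defined above: a smaller variant can be instantiated by overriding a few defaults (the values here are illustrative, not an official checkpoint configuration).

```python
config = PvtConfig(depths=[2, 2, 2, 2], hidden_sizes=[32, 64, 160, 256], num_labels=10)
print(config.num_encoder_blocks, config.hidden_sizes)  # 4 [32, 64, 160, 256]
```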
class PvtOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
@property
def default_onnx_opset(self) -> int:
return 12
.\models\pvt\convert_pvt_to_pytorch.py
"""Convert Pvt checkpoints from the original library."""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import PvtConfig, PvtForImageClassification, PvtImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.extend(
[
("cls_token", "pvt.encoder.patch_embeddings.3.cls_token"),
]
)
rename_keys.extend(
[
("norm.weight", "pvt.encoder.layer_norm.weight"),
("norm.bias", "pvt.encoder.layer_norm.bias"),
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"pvt.encoder.block.{i}.{j}.attention.self.kv.weight")
kv_bias = state_dict.pop(f"pvt.encoder.block.{i}.{j}.attention.self.kv.bias")
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[: config.hidden_sizes[i], :]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[
config.hidden_sizes[i] :, :
]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our PVT structure.
"""
if pvt_size == "tiny":
config_path = "Zetatech/pvt-tiny-224"
elif pvt_size == "small":
config_path = "Zetatech/pvt-small-224"
elif pvt_size == "medium":
config_path = "Zetatech/pvt-medium-224"
elif pvt_size == "large":
config_path = "Zetatech/pvt-large-224"
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
config = PvtConfig(name_or_path=config_path)
state_dict = torch.load(pvt_checkpoint, map_location="cpu")
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_k_v(state_dict, config)
model = PvtForImageClassification(config).eval()
model.load_state_dict(state_dict)
image_processor = PvtImageProcessor(size=config.image_size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
logits = outputs.logits.detach().cpu()
if pvt_size == "tiny":
expected_slice_logits = torch.tensor([-1.4192, -1.9158, -0.9702])
elif pvt_size == "small":
expected_slice_logits = torch.tensor([0.4353, -0.1960, -0.2373])
elif pvt_size == "medium":
expected_slice_logits = torch.tensor([-0.2914, -0.2231, 0.0321])
elif pvt_size == "large":
expected_slice_logits = torch.tensor([0.3740, -0.7739, -0.4214])
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model pytorch_model.bin to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pvt_size",
default="tiny",
type=str,
help="Size of the PVT pretrained model you'd like to convert.",
)
parser.add_argument(
"--pvt_checkpoint",
default="pvt_tiny.pth",
type=str,
help="Checkpoint of the PVT pretrained model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_pvt_checkpoint(args.pvt_size, args.pvt_checkpoint, args.pytorch_dump_folder_path)
.\models\pvt\image_processing_pvt.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\pvt\modeling_pvt.py
""" PyTorch PVT model."""
import collections
import math
from typing import Iterable, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_pvt import PvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PvtConfig"
_CHECKPOINT_FOR_DOC = "Zetatech/pvt-tiny-224"
_EXPECTED_OUTPUT_SHAPE = [1, 50, 512]
_IMAGE_CLASS_CHECKPOINT = "Zetatech/pvt-tiny-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
PVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Zetatech/pvt-tiny-224"
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
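A quick check of the `drop_path` function above: in training mode each sample is either zeroed or scaled by `1 / keep_prob`, so the expected value of the output matches the input, and at inference time it is the identity.

```python
import torch

x = torch.ones(8, 3, 2)
out = drop_path(x, drop_prob=0.25, training=True)
print(out[:, 0, 0])                          # each sample is either 0.0 or 1 / 0.75 ≈ 1.3333
print(drop_path(x, 0.25, False).equal(x))    # True: identity at inference time
```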
class PvtDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class PvtPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(
self,
config: PvtConfig,
image_size: Union[int, Iterable[int]],
patch_size: Union[int, Iterable[int]],
stride: int,
num_channels: int,
hidden_size: int,
cls_token: bool = False,
):
super().__init__()
self.config = config
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.position_embeddings = nn.Parameter(
torch.randn(1, num_patches + 1 if cls_token else num_patches, hidden_size)
)
self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size)) if cls_token else None
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=stride, stride=patch_size)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
num_patches = height * width
if num_patches == self.config.image_size * self.config.image_size:
return self.position_embeddings
embeddings = embeddings.reshape(1, height, width, -1).permute(0, 3, 1, 2)
interpolated_embeddings = F.interpolate(embeddings, size=(height, width), mode="bilinear")
interpolated_embeddings = interpolated_embeddings.reshape(1, -1, height * width).permute(0, 2, 1)
return interpolated_embeddings
def forward(self, pixel_values: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
patch_embed = self.projection(pixel_values)
*_, height, width = patch_embed.shape
patch_embed = patch_embed.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(patch_embed)
if self.cls_token is not None:
cls_token = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_token, embeddings), dim=1)
position_embeddings = self.interpolate_pos_encoding(self.position_embeddings[:, 1:], height, width)
position_embeddings = torch.cat((self.position_embeddings[:, :1], position_embeddings), dim=1)
else:
position_embeddings = self.interpolate_pos_encoding(self.position_embeddings, height, width)
embeddings = self.dropout(embeddings + position_embeddings)
return embeddings, height, width
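A standalone shape sketch of the patch projection used above (stage-1 numbers from the default config; the snippet builds its own Conv2d rather than the module): a strided convolution turns `(batch, channels, H, W)` pixels into `(batch, num_patches, hidden_size)` tokens via flatten-and-transpose.

```python
import torch
from torch import nn

pixel_values = torch.randn(1, 3, 224, 224)
projection = nn.Conv2d(3, 64, kernel_size=4, stride=4)   # first PVT stage: patch/stride 4
patch_embed = projection(pixel_values)                    # (1, 64, 56, 56)
tokens = patch_embed.flatten(2).transpose(1, 2)           # (1, 3136, 64) = 56 * 56 patches
print(patch_embed.shape, tokens.shape)
```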
class PvtSelfOutput(nn.Module):
def __init__(self, config: PvtConfig, hidden_size: int):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class PvtEfficientSelfAttention(nn.Module):
"""Efficient self-attention mechanism with reduction of the sequence [PvT paper](https://arxiv.org/abs/2102.12122)."""
def __init__(
self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sequences_reduction_ratio = sequences_reduction_ratio
if sequences_reduction_ratio > 1:
self.sequence_reduction = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequences_reduction_ratio, stride=sequences_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
def transpose_for_scores(self, hidden_states: int) -> torch.Tensor:
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
height: int,
width: int,
output_attentions: bool = False,
) -> Tuple[torch.Tensor]:
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sequences_reduction_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sequence_reduction(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
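A standalone shape sketch of the sequence reduction applied above (stage-1 numbers from the default config): keys and values are computed on a spatially down-sampled copy of the hidden states, so the attention score matrix shrinks from N x N to N x (N / r^2) while queries keep full resolution.

```python
import torch
from torch import nn

batch, height, width, channels, ratio = 1, 56, 56, 64, 8
hidden_states = torch.randn(batch, height * width, channels)        # (1, 3136, 64)
reduction = nn.Conv2d(channels, channels, kernel_size=ratio, stride=ratio)
reduced = reduction(hidden_states.permute(0, 2, 1).reshape(batch, channels, height, width))
reduced = reduced.reshape(batch, channels, -1).permute(0, 2, 1)      # (1, 49, 64)
print(hidden_states.shape, reduced.shape)
```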
class PvtAttention(nn.Module):
def __init__(
self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
):
super().__init__()
self.self = PvtEfficientSelfAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequences_reduction_ratio=sequences_reduction_ratio,
)
self.output = PvtSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False
) -> Tuple[torch.Tensor]:
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
class PvtFFN(nn.Module):
def __init__(
self,
config: PvtConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
):
super().__init__()
out_features = out_features if out_features is not None else in_features
self.dense1 = nn.Linear(in_features, hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class PvtLayer(nn.Module):
def __init__(
self,
config: PvtConfig,
hidden_size: int,
num_attention_heads: int,
drop_path: float,
sequences_reduction_ratio: float,
mlp_ratio: float,
):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.attention = PvtAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequences_reduction_ratio=sequences_reduction_ratio,
)
self.drop_path = PvtDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = PvtFFN(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
self_attention_outputs = self.attention(
hidden_states=self.layer_norm_1(hidden_states),
height=height,
width=width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states))
mlp_output = self.drop_path(mlp_output)
layer_output = hidden_states + mlp_output
outputs = (layer_output,) + outputs
return outputs
class PvtEncoder(nn.Module):
def __init__(self, config: PvtConfig):
super().__init__()
self.config = config
drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
PvtPatchEmbeddings(
config=config,
image_size=config.image_size if i == 0 else self.config.image_size // (2 ** (i + 1)),
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
cls_token=i == config.num_encoder_blocks - 1,
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
PvtLayer(
config=config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequences_reduction_ratio=config.sequence_reduction_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
num_blocks = len(self.block)
hidden_states = pixel_values
for idx, (embedding_layer, block_layer) in enumerate(zip(self.patch_embeddings, self.block)):
hidden_states, height, width = embedding_layer(hidden_states)
for block in block_layer:
layer_outputs = block(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if idx != num_blocks - 1:
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
PVT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~PvtConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PVT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`PvtImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Pvt encoder outputting raw hidden-states without any specific head on top.",
PVT_START_DOCSTRING,
)
class PvtModel(PvtPreTrainedModel):
def __init__(self, config: PvtConfig):
super().__init__(config)
self.config = config
self.encoder = PvtEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(PVT_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
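As a quick sanity check of the shapes produced by `PvtModel`, here is a minimal sketch (not part of the original file) that runs a randomly initialized model built from the default `PvtConfig`. The exact sequence length depends on the configured image size, the strides, and the CLS token prepended by the last stage's patch embedding.
```
import torch
from transformers import PvtConfig, PvtModel

# Randomly initialized model from the default configuration (no pretrained weights).
config = PvtConfig()
model = PvtModel(config).eval()

pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)

# With a 224x224 input and strides [4, 2, 2, 2], the final stage is 7x7 = 49 patches
# plus one CLS token, so this should print roughly (1, 50, hidden_sizes[-1]).
print(outputs.last_hidden_state.shape)
```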
@add_start_docstrings(
"""
Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
PVT_START_DOCSTRING,
)
class PvtForImageClassification(PvtPreTrainedModel):
def __init__(self, config: PvtConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.pvt = PvtModel(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(PVT_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor],
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.pvt(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output[:, 0, :])
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
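A brief, hedged usage sketch of the v1 classification head (not from the original file): the logits come from a single linear layer applied to the CLS token, i.e. `sequence_output[:, 0, :]`.
```
import torch
from transformers import PvtConfig, PvtForImageClassification

# Randomly initialized classifier with 10 labels (no pretrained weights); purely illustrative.
config = PvtConfig(num_labels=10)
model = PvtForImageClassification(config).eval()

pixel_values = torch.randn(2, 3, config.image_size, config.image_size)
with torch.no_grad():
    logits = model(pixel_values).logits

# The head is a linear layer on the CLS token, so logits are (batch_size, num_labels).
print(logits.shape)  # torch.Size([2, 10])
```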
.\models\pvt\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_pvt": ["PVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "PvtConfig", "PvtOnnxConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_pvt"] = ["PvtImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pvt"] = [
"PVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"PvtForImageClassification",
"PvtModel",
"PvtPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig, PvtOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_pvt import PvtImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pvt import (
PVT_PRETRAINED_MODEL_ARCHIVE_LIST,
PvtForImageClassification,
PvtModel,
PvtPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\pvt_v2\configuration_pvt_v2.py
"""Pvt V2 模型配置"""
from typing import Callable, List, Tuple, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
PVT_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"pvt_v2_b0": "https://huggingface.co/OpenGVLab/pvt_v2_b0",
"pvt_v2_b1": "https://huggingface.co/OpenGVLab/pvt_v2_b1",
"pvt_v2_b2": "https://huggingface.co/OpenGVLab/pvt_v2_b2",
"pvt_v2_b2_linear": "https://huggingface.co/OpenGVLab/pvt_v2_b2_linear",
"pvt_v2_b3": "https://huggingface.co/OpenGVLab/pvt_v2_b3",
"pvt_v2_b4": "https://huggingface.co/OpenGVLab/pvt_v2_b4",
"pvt_v2_b5": "https://huggingface.co/OpenGVLab/pvt_v2_b5",
}
class PvtV2Config(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PvtV2Model`]. It is used to instantiate a Pvt V2
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a configuration similar to that of the Pvt V2 B0
[OpenGVLab/pvt_v2_b0](https://huggingface.co/OpenGVLab/pvt_v2_b0) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import PvtV2Model, PvtV2Config
>>> # Initializing a pvt_v2_b0 style configuration
>>> configuration = PvtV2Config()
>>> # Initializing a model from the OpenGVLab/pvt_v2_b0 style configuration
>>> model = PvtV2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "pvt_v2"
def __init__(
self,
image_size: Union[int, Tuple[int, int]] = 224,
num_channels: int = 3,
num_encoder_blocks: int = 4,
depths: List[int] = [2, 2, 2, 2],
sr_ratios: List[int] = [8, 4, 2, 1],
hidden_sizes: List[int] = [32, 64, 160, 256],
patch_sizes: List[int] = [7, 3, 3, 3],
strides: List[int] = [4, 2, 2, 2],
num_attention_heads: List[int] = [1, 2, 5, 8],
mlp_ratios: List[int] = [8, 8, 4, 4],
hidden_act: Union[str, Callable] = "gelu",
hidden_dropout_prob: float = 0.0,
attention_probs_dropout_prob: float = 0.0,
initializer_range: float = 0.02,
drop_path_rate: float = 0.0,
layer_norm_eps: float = 1e-6,
qkv_bias: bool = True,
linear_attention: bool = False,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
image_size = (image_size, image_size) if isinstance(image_size, int) else image_size
self.image_size = image_size
self.num_channels = num_channels
self.num_encoder_blocks = num_encoder_blocks
self.depths = depths
self.sr_ratios = sr_ratios
self.hidden_sizes = hidden_sizes
self.patch_sizes = patch_sizes
self.strides = strides
self.mlp_ratios = mlp_ratios
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.drop_path_rate = drop_path_rate
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.linear_attention = linear_attention
self.stage_names = [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
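The `out_features`/`out_indices` handling above is what makes this config usable as a backbone config. A small sketch (assuming a transformers version that ships `PvtV2Config`) of how the two arguments are aligned against `stage_names`:
```
from transformers import PvtV2Config

# By default the backbone exposes only the last stage.
config = PvtV2Config()
print(config.stage_names)                    # ['stage1', 'stage2', 'stage3', 'stage4']
print(config.out_features, config.out_indices)

# Passing stage names fills in the matching indices (positions in config.stage_names), and vice versa.
config = PvtV2Config(out_features=["stage2", "stage4"])
print(config.out_features, config.out_indices)
```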
.\models\pvt_v2\convert_pvt_v2_to_pytorch.py
"""
Convert PvtV2 checkpoints from the original library.
"""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import PvtImageProcessor, PvtV2Config, PvtV2ForImageClassification
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.extend(
[
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.kv.weight")
kv_bias = state_dict.pop(f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.kv.bias")
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.key.weight"] = kv_weight[
: config.hidden_sizes[i], :
]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.value.weight"] = kv_weight[
config.hidden_sizes[i] :, :
]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.value.bias"] = kv_bias[
config.hidden_sizes[i] :
]
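`read_in_k_v` splits the original fused `kv` projection into the separate `key`/`value` linear layers that the HF implementation expects. Below is a self-contained sketch of that splitting logic on a dummy tensor (the names and sizes are made up for illustration):
```
import torch

hidden_size = 4  # toy size; the real script uses config.hidden_sizes[i]

# A fused kv projection stacks the key weights on top of the value weights: shape (2 * hidden, hidden).
kv_weight = torch.arange(2 * hidden_size * hidden_size, dtype=torch.float32).reshape(2 * hidden_size, hidden_size)
kv_bias = torch.arange(2 * hidden_size, dtype=torch.float32)

key_weight, value_weight = kv_weight[:hidden_size, :], kv_weight[hidden_size:, :]
key_bias, value_bias = kv_bias[:hidden_size], kv_bias[hidden_size:]

print(key_weight.shape, value_weight.shape)  # torch.Size([4, 4]) torch.Size([4, 4])
print(key_bias.shape, value_bias.shape)      # torch.Size([4]) torch.Size([4])
```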
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folder_path, verify_imagenet_weights=False):
"""
Copy/paste/tweak model's weights to our PVT structure.
"""
if pvt_v2_size == "b0":
config_path = "OpenGVLab/pvt_v2_b0"
elif pvt_v2_size == "b1":
config_path = "OpenGVLab/pvt_v2_b1"
elif pvt_v2_size == "b2":
config_path = "OpenGVLab/pvt_v2_b2"
elif pvt_v2_size == "b2-linear":
config_path = "OpenGVLab/pvt_v2_b2_linear"
elif pvt_v2_size == "b3":
config_path = "OpenGVLab/pvt_v2_b3"
elif pvt_v2_size == "b4":
config_path = "OpenGVLab/pvt_v2_b4"
elif pvt_v2_size == "b5":
config_path = "OpenGVLab/pvt_v2_b5"
else:
raise ValueError(
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
f"'{pvt_v2_size}' was given"
)
config = PvtV2Config.from_pretrained(config_path)
state_dict = torch.load(pvt_v2_checkpoint, map_location="cpu")
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_k_v(state_dict, config)
model = PvtV2ForImageClassification(config).eval()
model.load_state_dict(state_dict)
image_processor = PvtImageProcessor(size=config.image_size)
if verify_imagenet_weights:
print("Verifying conversion of pretrained ImageNet weights...")
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
logits = outputs.logits.detach().cpu()
if pvt_v2_size == "b0":
expected_slice_logits = torch.tensor([-1.1939, -1.4547, -0.1076])
elif pvt_v2_size == "b1":
expected_slice_logits = torch.tensor([-0.4716, -0.7335, -0.4600])
elif pvt_v2_size == "b2":
expected_slice_logits = torch.tensor([0.0795, -0.3170, 0.2247])
elif pvt_v2_size == "b2-linear":
expected_slice_logits = torch.tensor([0.0968, 0.3937, -0.4252])
elif pvt_v2_size == "b3":
expected_slice_logits = torch.tensor([-0.4595, -0.2870, 0.0940])
elif pvt_v2_size == "b4":
expected_slice_logits = torch.tensor([-0.1769, -0.1747, -0.0143])
elif pvt_v2_size == "b5":
expected_slice_logits = torch.tensor([-0.2943, -0.1008, 0.6812])
else:
raise ValueError(
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
f"'{pvt_v2_size}' was given"
)
assert torch.allclose(
logits[0, :3], expected_slice_logits, atol=1e-4
), "ImageNet weights not converted successfully."
print("ImageNet weights verified, conversion successful.")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model pytorch_model.bin to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pvt_v2_size",
default="b0",
type=str,
help="Size of the PVTv2 pretrained model you'd like to convert.",
)
parser.add_argument(
"--pvt_v2_checkpoint",
default="pvt_v2_b0.pth",
type=str,
help="Checkpoint of the PVTv2 pretrained model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify-imagenet-weights",
action="store_true",
default=False,
help="Verifies the correct conversion of author-published pretrained ImageNet weights.",
)
args = parser.parse_args()
convert_pvt_v2_checkpoint(
pvt_v2_size=args.pvt_v2_size,
pvt_v2_checkpoint=args.pvt_v2_checkpoint,
pytorch_dump_folder_path=args.pytorch_dump_folder_path,
verify_imagenet_weights=args.verify_imagenet_weights,
)
.\models\pvt_v2\modeling_pvt_v2.py
"""PyTorch PVTv2 模型."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_pvt_v2 import PvtV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PvtV2Config"
_CHECKPOINT_FOR_DOC = "OpenGVLab/pvt_v2_b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "OpenGVLab/pvt_v2_b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"
PVT_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"OpenGVLab/pvt_v2_b0",
"OpenGVLab/pvt_v2_b1",
"OpenGVLab/pvt_v2_b2",
"OpenGVLab/pvt_v2_b2_linear",
"OpenGVLab/pvt_v2_b3",
"OpenGVLab/pvt_v2_b4",
"OpenGVLab/pvt_v2_b5",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
每个样本都会丢弃路径(随机深度),主要用于残差块的主路径中。
Ross Wightman 的评论:这与我为 EfficientNet 等网络创建的 DropConnect 实现相同,
然而,原始名称具有误导性,因为 'Drop Connect' 是另一篇论文中的不同形式的 dropout……
参见讨论:https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ……
我选择将层和参数名称更改为 'drop path',而不是将 DropConnect 作为层名称混合使用,并使用 'survival rate' 作为参数。
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class PvtV2DropPath(nn.Module):
"""每个样本的随机深度(Drop Path,应用于残差块的主路径)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class PvtV2OverlapPatchEmbeddings(nn.Module):
"""将图像转换为补丁嵌入。"""
def __init__(self, config: PvtV2Config, layer_idx: int):
super().__init__()
patch_size = config.patch_sizes[layer_idx]
patch_size = (patch_size, patch_size) if isinstance(patch_size, int) else patch_size
stride = config.strides[layer_idx]
num_channels = config.num_channels if layer_idx == 0 else config.hidden_sizes[layer_idx - 1]
hidden_size = config.hidden_sizes[layer_idx]
self.patch_size = patch_size
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=(patch_size[0] // 2, patch_size[1] // 2),
)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
class PvtV2DepthWiseConv(nn.Module):
"""
Depth-wise (DW) convolution with zero padding, used to infuse positional information.
The number of groups equals the number of input channels, i.e. one filter per input channel, which reduces
parameters and compute cost; the key purpose of this layer is position encoding.
"""
def __init__(self, config: PvtV2Config, dim: int = 768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
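The depth-wise convolution operates on the token sequence by round-tripping through the 2D layout: `(batch, seq, channels)` → `(batch, channels, height, width)` → conv → back. A minimal standalone sketch of that round trip, using a plain `nn.Conv2d` with `groups=dim` like the module above wraps:
```
import torch
from torch import nn

batch_size, height, width, dim = 2, 8, 8, 16
hidden_states = torch.randn(batch_size, height * width, dim)

# One 3x3 filter per channel (groups == channels); padding 1 keeps the spatial size.
dwconv = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)

x = hidden_states.transpose(1, 2).view(batch_size, dim, height, width)
x = dwconv(x)
x = x.flatten(2).transpose(1, 2)

print(x.shape)  # torch.Size([2, 64, 16]) -- same (batch, seq_len, channels) layout as the input
```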
class PvtV2SelfAttention(nn.Module):
"""高效的自注意力机制。"""
def __init__(self, config: PvtV2Config, hidden_size: int, num_attention_heads: int, spatial_reduction_ratio: int):
super().__init__()
self.linear_attention = config.linear_attention
self.pruned_heads = set()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_probs_dropout_prob)
self.proj = nn.Linear(self.hidden_size, self.hidden_size)
self.proj_drop = nn.Dropout(config.hidden_dropout_prob)
self.spatial_reduction_ratio = spatial_reduction_ratio
if self.linear_attention:
self.pool = nn.AdaptiveAvgPool2d(7)
self.spatial_reduction = nn.Conv2d(self.hidden_size, self.hidden_size, kernel_size=1, stride=1)
self.layer_norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
self.act = nn.GELU()
elif spatial_reduction_ratio > 1:
self.spatial_reduction = nn.Conv2d(
self.hidden_size, self.hidden_size, kernel_size=spatial_reduction_ratio, stride=spatial_reduction_ratio
)
self.layer_norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
def transpose_for_scores(self, hidden_states) -> torch.Tensor:
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
height: int,
width: int,
output_attentions: bool = False,
) -> Tuple[torch.Tensor]:
batch_size, seq_len, num_channels = hidden_states.shape
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.linear_attention:
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = (
self.spatial_reduction(self.pool(hidden_states)).reshape(batch_size, num_channels, -1).permute(0, 2, 1)
)
hidden_states = self.act(self.layer_norm(hidden_states))
elif self.spatial_reduction_ratio > 1:
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = (
self.spatial_reduction(hidden_states).reshape(batch_size, num_channels, -1).permute(0, 2, 1)
)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.attn_drop(attention_probs)
context_layer = (attention_probs @ value_layer).transpose(1, 2).reshape(batch_size, seq_len, num_channels)
context_layer = self.proj(context_layer)
context_layer = self.proj_drop(context_layer)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
)
self.query = prune_linear_layer(self.query, index)
self.key = prune_linear_layer(self.key, index)
self.value = prune_linear_layer(self.value, index)
self.proj = prune_linear_layer(self.proj, index, dim=1)
self.num_attention_heads = self.num_attention_heads - len(heads)
self.all_head_size = self.attention_head_size * self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
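The point of the spatial reduction above is that queries keep the full `height * width` length while keys and values come from a shorter sequence: roughly `(H / sr) * (W / sr)` tokens for the strided-convolution branch, or a fixed `7 * 7 = 49` tokens when `linear_attention` pools to a 7x7 grid. A small sketch of the resulting key/value lengths (sizes chosen arbitrarily for illustration):
```
import torch
from torch import nn

height, width, hidden_size, sr = 56, 56, 64, 4
seq_len = height * width  # query length stays 3136

x = torch.randn(1, hidden_size, height, width)

# Strided-conv spatial reduction (the spatial_reduction_ratio > 1 branch).
spatial_reduction = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr, stride=sr)
kv_strided = spatial_reduction(x).flatten(2).transpose(1, 2)

# Linear-attention branch: adaptive pooling to a fixed 7x7 grid, regardless of input size.
kv_pooled = nn.AdaptiveAvgPool2d(7)(x).flatten(2).transpose(1, 2)

print(seq_len, kv_strided.shape[1], kv_pooled.shape[1])  # 3136 196 49
```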
class PvtV2BlockLayer(nn.Module):
def __init__(self, config: PvtV2Config, layer_idx: int, drop_path: float = 0.0):
super().__init__()
hidden_size: int = config.hidden_sizes[layer_idx]
num_attention_heads: int = config.num_attention_heads[layer_idx]
spatial_reduction_ratio: int = config.sr_ratios[layer_idx]
mlp_ratio: float = config.mlp_ratios[layer_idx]
self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.attention = PvtV2SelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
spatial_reduction_ratio=spatial_reduction_ratio,
)
self.drop_path = PvtV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = PvtV2ConvFeedForwardNetwork(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
self_attention_outputs = self.attention(
hidden_states=self.layer_norm_1(hidden_states),
height=height,
width=width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = hidden_states + mlp_output
outputs = (layer_output,) + outputs
return outputs
class PvtV2EncoderLayer(nn.Module):
def __init__(self, config: PvtV2Config, layer_idx: int):
super().__init__()
self.patch_embedding = PvtV2OverlapPatchEmbeddings(
config=config,
layer_idx=layer_idx,
)
drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
block_layers = []
for block_idx in range(config.depths[layer_idx]):
block_layers.append(
PvtV2BlockLayer(
config=config,
layer_idx=layer_idx,
drop_path=drop_path_decays[sum(config.depths[:layer_idx]) + block_idx],
)
)
self.blocks = nn.ModuleList(block_layers)
self.layer_norm = nn.LayerNorm(config.hidden_sizes[layer_idx], eps=config.layer_norm_eps)
def forward(self, hidden_states, output_attentions):
all_self_attentions = () if output_attentions else None
hidden_states, height, width = self.patch_embedding(hidden_states)
for block in self.blocks:
layer_outputs = block(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions += (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (all_self_attentions,)
return outputs, height, width
class PvtV2Encoder(nn.Module):
def __init__(self, config: PvtV2Config):
super().__init__()
self.config = config
self.gradient_checkpointing = False
self.layers = nn.ModuleList([PvtV2EncoderLayer(config, i) for i in range(config.num_encoder_blocks)])
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, layer in enumerate(self.layers):
if self.gradient_checkpointing and self.training:
layer_output = self._gradient_checkpointing_func(layer.__call__, hidden_states, output_attentions)
else:
layer_output = layer(hidden_states, output_attentions)
outputs, height, width = layer_output
hidden_states = outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1],)
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class PvtV2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = PvtV2Config
base_model_prefix = "pvt_v2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv2d):
fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
fan_out //= module.groups
module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if module.bias is not None:
module.bias.data.zero_()
PVT_V2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~PvtV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PVT_V2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`PvtImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Pvt-v2 encoder outputting raw hidden-states without any specific head on top.",
PVT_V2_START_DOCSTRING,
)
class PvtV2Model(PvtV2PreTrainedModel):
def __init__(self, config: PvtV2Config):
super().__init__(config)
self.config = config
self.encoder = PvtV2Encoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
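Unlike v1, `PvtV2Encoder` reshapes the output of every stage back to `(batch, channels, height, width)`, so `last_hidden_state` (and each entry of `hidden_states`) is a 4D feature map. A minimal shape sketch with a randomly initialized model built from the default, b0-sized config (assuming a transformers version that includes pvt_v2):
```
import torch
from transformers import PvtV2Config, PvtV2Model

config = PvtV2Config()  # b0-sized defaults, no pretrained weights
model = PvtV2Model(config).eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

# One 4D feature map per stage; spatial resolution drops by the configured strides [4, 2, 2, 2].
for hs in outputs.hidden_states:
    print(tuple(hs.shape))
print(tuple(outputs.last_hidden_state.shape))  # roughly (1, 256, 7, 7) for the default config
```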
@add_start_docstrings(
"""
Pvt-v2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
PVT_V2_START_DOCSTRING,
)
class PvtV2ForImageClassification(PvtV2PreTrainedModel):
def __init__(self, config: PvtV2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.pvt_v2 = PvtV2Model(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor],
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.pvt_v2(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = sequence_output.shape[0]
sequence_output = sequence_output.permute(0, 2, 3, 1)
sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1])
sequence_output = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
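Note the difference from v1: because the v2 encoder emits a 4D feature map and has no CLS token, the classification head mean-pools over all spatial positions before the linear layer. A tiny sketch of that pooling step on a dummy feature map:
```
import torch

batch_size, channels, height, width = 2, 256, 7, 7
sequence_output = torch.randn(batch_size, channels, height, width)

# (B, C, H, W) -> (B, H*W, C) -> mean over the H*W positions -> (B, C)
pooled = sequence_output.permute(0, 2, 3, 1).reshape(batch_size, -1, channels).mean(dim=1)
print(pooled.shape)  # torch.Size([2, 256])

# Equivalent, more direct form:
print(torch.allclose(pooled, sequence_output.mean(dim=(2, 3))))  # True
```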
@add_start_docstrings(
"""
PVTv2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
PVT_V2_START_DOCSTRING,
)
class PvtV2Backbone(PvtV2Model, BackboneMixin):
def __init__(self, config: PvtV2Config):
super().__init__(config)
super()._init_backbone(config)
self.num_features = config.hidden_sizes
"""
Initialization: takes a configuration object, initializes the PVTv2 backbone, and sets the model's
feature sizes to the hidden sizes from the configuration.
"""
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
Forward pass: takes the input pixel values and a few optional flags and returns the backbone outputs.
Args:
pixel_values (torch.FloatTensor): Input pixel values.
output_attentions (Optional[bool], optional): Whether to return attention weights. Defaults to None.
output_hidden_states (Optional[bool], optional): Whether to return hidden states. Defaults to None.
return_dict (Optional[bool], optional): Whether to return a dict-style output. Defaults to None.
Returns:
BackboneOutput: Output object containing the feature maps, hidden states, and attention weights.
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("OpenGVLab/pvt_v2_b0")
>>> model = AutoBackbone.from_pretrained(
... "OpenGVLab/pvt_v2_b0", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 256, 7, 7]
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
"""
Fall back to the config default for `return_dict`.
Fall back to the config default for `output_hidden_states`.
"""
outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=True,
return_dict=return_dict,
)
"""
Run the encoder forward pass with the pixel values and the other arguments.
Hidden states are always requested so that the feature maps can be extracted.
"""
hidden_states = outputs.hidden_states
feature_maps = ()
for idx, stage in enumerate(self.stage_names):
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
"""
Select the feature maps of the requested stages (by name) from the hidden states.
"""
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
"""
When not returning a dict, build and return a tuple containing the feature maps and, optionally, the hidden states.
"""
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
"""
Return a BackboneOutput object containing the feature maps, hidden states, and attention weights.
"""