Transformers 源码解析（二十六）

`.\models\clip\processing_clip.py`

# coding=utf-8
# 版权所有 2021 年 HuggingFace Inc. 团队
#
# 根据 Apache 许可证 2.0 版本进行许可；除非符合许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件按“原样”分发，不提供任何形式的担保或条件，
# 无论是明示的还是默示的。详细信息请参阅许可证。
"""
CLIP 的图像/文本处理类
"""

import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class CLIPProcessor(ProcessorMixin):
    r"""
    Processor that bundles a CLIP image processor and a CLIP tokenizer behind a single object.

    Decoding helpers ([`~CLIPProcessor.decode`], [`~CLIPProcessor.batch_decode`]) are forwarded to the wrapped
    tokenizer.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor. The parameter defaults to `None`, but a value must be supplied (directly, or through
            the deprecated `feature_extractor` keyword); otherwise a `ValueError` is raised.
        tokenizer ([`CLIPTokenizerFast`], *optional*):
            The tokenizer. A `ValueError` is raised when it is `None`.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        # Backward compatibility: accept the deprecated `feature_extractor` keyword, warn about it, and remove it
        # from `kwargs`.
        legacy_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            legacy_extractor = kwargs.pop("feature_extractor")

        # An explicitly passed `image_processor` takes precedence over the legacy keyword.
        if image_processor is None:
            image_processor = legacy_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Refer to that
        method's docstring for details.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.decode`]. Refer to that method's
        docstring for details.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated with order preserved."""
        combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
        # `dict.fromkeys` keeps the first occurrence of every name, in insertion order.
        return list(dict.fromkeys(combined))

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor

`.\models\clip\tokenization_clip.py`

# coding=utf-8
# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for CLIP."""

import json                # 导入处理 JSON 格式的模块
import os                  # 导入操作系统功能的模块
import unicodedata         # 导入 Unicode 数据处理模块
from functools import lru_cache  # 导入 functools 模块中的 lru_cache 装饰器
from typing import List, Optional, Tuple  # 导入类型提示相关的功能

import regex as re         # 导入正则表达式库 regex

from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
                            # 导入上级目录中的 tokenization_utils 模块的部分功能
from ...utils import logging   # 导入上级目录中的 logging 模块

logger = logging.get_logger(__name__)  # Module-level logger

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",     # File name of the vocabulary
    "merges_file": "merges.txt",    # File name of the BPE merge rules
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
    },  # Checkpoint name -> URL of its vocabulary file
    "merges_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
    },  # Checkpoint name -> URL of its merges file
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai/clip-vit-base-patch32": 77,  # Checkpoint name -> size; exposed below as `max_model_input_sizes`
}

PRETRAINED_INIT_CONFIGURATION = {
    "openai/clip-vit-base-patch32": {},  # Checkpoint name -> extra init kwargs (none for this checkpoint)
}

@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible mapping from every byte value (0-255) to a single printable unicode character.

    Bytes that already correspond to a printable, non-whitespace character map to that same character. Every other
    byte (whitespace, control characters, ...) is assigned the next unused code point starting at 256, so the BPE
    code never has to deal with whitespace or control characters.

    Returns:
        `Dict[int, str]`: mapping with 256 entries; the printable bytes come first, followed by the remapped bytes in
        increasing byte order. The result is memoized by `lru_cache`.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    printable_lookup = set(printable)

    byte_values = list(printable)
    code_points = list(printable)
    shift = 0
    for byte in range(2**8):
        if byte in printable_lookup:
            continue
        # Non-printable byte: park it on a fresh code point above the byte range.
        byte_values.append(byte)
        code_points.append(2**8 + shift)
        shift += 1

    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}

def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sequence of symbols (a tuple of variable-length strings, or a plain string).

    Returns:
        `set`: every `(word[i], word[i + 1])` pair. A word with fewer than two symbols, including an empty one,
        yields an empty set instead of raising `IndexError`.
    """
    # Zipping the word with itself shifted by one enumerates all adjacent pairs and is naturally empty for
    # zero- or one-symbol words.
    return set(zip(word, word[1:]))

def whitespace_clean(text):
    """Collapse every run of whitespace in `text` into a single space and strip both ends."""
    return re.sub(r"\s+", " ", text).strip()
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    stripped = text.strip()
    # An empty (or whitespace-only) input produces no tokens.
    return stripped.split() if stripped else []


# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        self.do_lower_case = do_lower_case
        # Stored as a set; a missing value means "no protected tokens".
        self.never_split = set() if never_split is None else set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # Per-call protected tokens are added to the instance-level ones without mutating the latter.
        protected = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # Surrounding CJK characters with spaces makes the whitespace split below isolate each of them.
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # NFC keeps different encodings of the same character from being treated as different characters.
        normalized = unicodedata.normalize("NFC", text)

        pieces = []
        for token in whitespace_tokenize(normalized):
            if token not in protected:
                if self.do_lower_case:
                    token = token.lower()
                    # When lower-casing, accents are stripped unless explicitly disabled with `False`.
                    wants_strip = self.strip_accents is not False
                else:
                    # Without lower-casing, accents are stripped only when explicitly requested.
                    wants_strip = bool(self.strip_accents)
                if wants_strip:
                    token = self._run_strip_accents(token)
            pieces.extend(self._run_split_on_punc(token, protected))

        # Re-join and re-split so that no empty fragment survives.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD separates base characters from their combining marks (category "Mn"), which are then dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]

        groups = []
        begin_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                # Each punctuation character is a group of its own and ends the current word.
                groups.append([ch])
                begin_new_group = True
            else:
                if begin_new_group:
                    groups.append([])
                    begin_new_group = False
                groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Inclusive code point ranges treated as CJK by this tokenizer.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        kept = []
        for ch in text:
            code = ord(ch)
            # Drop NUL, the unicode replacement character, and control characters.
            if code == 0 or code == 0xFFFD or _is_control(ch):
                continue
            # Any kind of whitespace becomes a plain space.
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
    # NOTE(review): no `class` statement is visible directly above this docstring in this view; the text below
    # describes the CLIP tokenizer, so its class header appears to be missing -- confirm against the upstream file.
    """
    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from `PreTrainedTokenizer` which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.

    """
    vocab_files_names = VOCAB_FILES_NAMES  # Names of the vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # Checkpoint name -> vocabulary file URLs
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # Checkpoint name -> positional embedding size
    model_input_names = ["input_ids", "attention_mask"]  # Names of the model inputs

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        """
        Load the vocabulary and the BPE merge table, then initialize the parent class.

        Args:
            vocab_file (`str`): Path to the JSON vocabulary file.
            merges_file (`str`): Path to the BPE merges file.
            errors (`str`): Error mode stored on the instance and later used when decoding bytes to UTF-8.
            unk_token, bos_token, eos_token, pad_token: Special tokens forwarded to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # Plain-string special tokens are wrapped so that surrounding whitespace is not stripped.
        if isinstance(bos_token, str):
            bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
        if isinstance(eos_token, str):
            eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
        if isinstance(unk_token, str):
            unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)

        # Prefer `ftfy` for text fixing; fall back to the local BasicTokenizer when it is not installed.
        try:
            import ftfy
        except ImportError:
            logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
            self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
            self.fix_text = None
        else:
            self.fix_text = ftfy.fix_text

        # token -> id, and the reverse id -> token table.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {token_id: token for token, token_id in self.encoder.items()}

        self.errors = errors  # how to handle errors in decoding

        # byte -> printable character, and the reverse table.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {char: byte for byte, char in self.byte_encoder.items()}

        with open(merges_file, encoding="utf-8") as merges_handle:
            merge_lines = merges_handle.read().strip().split("\n")
        # The first line is skipped (`save_vocabulary` writes a "#version" header there). The upper bound is kept
        # exactly as written; presumably it derives from the vocabulary size -- TODO confirm.
        merge_lines = merge_lines[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(line.split()) for line in merge_lines]
        # Lower rank == merge is applied earlier.
        self.bpe_ranks = {merge: rank for rank, merge in enumerate(merges)}

        # The special tokens are pre-seeded so that `bpe` returns them untouched.
        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}

        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )

        # The parent initializer runs last, once every table above is in place.
        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in `self.encoder` (the vocabulary loaded from the vocab file)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return a new dict combining `self.encoder` with `self.added_tokens_encoder` (the latter wins on a clash)."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the special tokens.

        The layout is the one whose length `create_token_type_ids_from_sequences` measures:

        - single sequence: `bos + token_ids_0 + eos`
        - pair of sequences: `bos + token_ids_0 + eos + eos + token_ids_1 + eos`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros, one per position of the sequence built by
            `build_inputs_with_special_tokens` (special tokens included).
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        # Only the length of the assembled sequence matters; every position gets type id 0.
        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def bpe(self, token):
        """
        Apply the ranked BPE merges to one token.

        The token is split into single characters, with `"</w>"` appended to the last one. The adjacent pair with
        the lowest rank in `self.bpe_ranks` is then merged repeatedly until no ranked pair is left or the word has
        collapsed into a single symbol.

        Args:
            token (`str`): Non-empty token, already mapped through the byte encoder.

        Returns:
            `str`: The resulting symbols joined by single spaces. Results of the merge loop are memoized in
            `self.cache`; a token that produces no pairs is returned as `token + "</w>"` without being cached.
        """
        if token in self.cache:
            return self.cache[token]

        symbols = tuple(token[:-1]) + (token[-1] + "</w>",)
        candidates = get_pairs(symbols)
        if not candidates:
            return token + "</w>"

        ranks = self.bpe_ranks
        while True:
            # Pairs that are not in the merge table rank as +inf and therefore never win against a ranked pair.
            best = min(candidates, key=lambda pair: ranks.get(pair, float("inf")))
            if best not in ranks:
                break
            left, right = best

            # Single left-to-right pass that fuses every non-overlapping occurrence of (left, right).
            merged = []
            pos = 0
            last = len(symbols) - 1
            while pos <= last:
                if pos < last and symbols[pos] == left and symbols[pos + 1] == right:
                    merged.append(left + right)
                    pos += 2
                else:
                    merged.append(symbols[pos])
                    pos += 1

            symbols = tuple(merged)
            if len(symbols) == 1:
                break
            candidates = get_pairs(symbols)

        result = " ".join(symbols)
        self.cache[token] = result
        return result

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []  # Initialize list to store BPE tokens
        if self.fix_text is None:
            text = " ".join(self.nlp.tokenize(text))  # Tokenize text using self.nlp if fix_text is None
        else:
            text = whitespace_clean(self.fix_text(text)).lower()  # Clean and lowercase text using whitespace_clean function if fix_text is defined

        for token in re.findall(self.pat, text):  # Iterate through tokens found using regex pattern self.pat in text
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Encode each character of token into byte_encoder and join them into a string
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))  # Extend bpe_tokens with BPE tokens split by space
        return bpe_tokens  # Return list of BPE tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))  # Return ID of token from encoder or return ID of unk_token if token is not found
    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        # Look the index up in the reverse vocabulary; `.get` yields `None` for an index that is not present.
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        joined = "".join(tokens)
        # Undo the byte -> printable character mapping to recover the raw bytes.
        raw = bytearray(self.byte_decoder[char] for char in joined)
        decoded = raw.decode("utf-8", errors=self.errors)
        # "</w>" marks the end of a word: turn it into a space, then drop leading/trailing whitespace.
        return decoded.replace("</w>", " ").strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary and the BPE merges into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory that receives the files.
            filename_prefix (`str`, *optional*):
                When given, `"<prefix>-"` is prepended to both file names.

        Returns:
            `Tuple[str]`: `(vocab_file, merge_file)` paths. When `save_directory` is not a directory an error is
            logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        prefix = (filename_prefix + "-") if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
        merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])

        # `ensure_ascii=False` keeps non-ASCII tokens readable in the UTF-8 encoded file.
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            expected_rank = 0
            # Merges are written in rank order, one pair per line.
            for merge_pair, rank in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if rank != expected_rank:
                    # A gap in the ranks hints at a corrupted merge table; warn and resynchronize.
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(merge_file)
                    )
                    expected_rank = rank
                writer.write(" ".join(merge_pair) + "\n")
                expected_rank += 1

        return vocab_file, merge_file

`.\models\clip\tokenization_clip_fast.py`

# coding=utf-8
# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization classes for CLIP.
"""

from typing import List, Optional, Tuple

from tokenizers import pre_tokenizers  # 导入 tokenizers 库中的 pre_tokenizers 模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入快速分词器的基类
from ...utils import logging  # 导入日志工具
from .tokenization_clip import CLIPTokenizer  # 导入 CLIPTokenizer 类

logger = logging.get_logger(__name__)  # Module-level logger

# Default file names: vocabulary, BPE merges, and the serialized fast tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# For each file kind: checkpoint name -> URL of that file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
    },
    "merges_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        "openai/clip-vit-base-patch32": (
            "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer.json"
        ),
    },
}

# Checkpoint name -> size; exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai/clip-vit-base-patch32": 77,
}


class CLIPTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    # Slow (pure Python) counterpart of this tokenizer.
    slow_tokenizer_class = CLIPTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        """
        Initialize the parent class, validate the backend tokenizer format and patch its `decode` method.

        Raises:
            ValueError: If the backend tokenizer's pre-tokenizer is not a `pre_tokenizers.Sequence`.
        """
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

        # Only tokenizers whose pre-tokenizer is a `Sequence` are accepted; anything else is rejected with
        # conversion instructions.
        if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
            raise ValueError(
                "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been"
                " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using"
                " to be compatible with this version.The easiest way to do so is"
                ' `CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`. If you want'
                " to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of"
                " transformers."
            )

        self._wrap_decode_method_backend_tokenizer()

    # Very ugly hack to enable padding to be correctly decoded, see
    # https://github.com/huggingface/tokenizers/issues/872
    def _wrap_decode_method_backend_tokenizer(self):
        """Replace the backend tokenizer's `decode` with a wrapper that post-processes the decoded text."""
        orig_decode_method = self.backend_tokenizer.decode

        def new_decode_method(*args, **kwargs):
            text = orig_decode_method(*args, **kwargs)
            # Turn the model's end-of-word suffix into a space and trim the result.
            text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip()
            return text

        self.backend_tokenizer.decode = new_decode_method

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the special tokens.

        The layout is the one whose length `create_token_type_ids_from_sequences` measures:

        - single sequence: `bos + token_ids_0 + eos`
        - pair of sequences: `bos + token_ids_0 + eos + eos + token_ids_1 + eos`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        # Single sequence: one zero per position of `bos + ids + eos`.
        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]

        # Sequence pair: one zero per position of `bos + ids_0 + eos + eos + ids_1 + eos`.
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the tokenizer model files through the backend model's `save` method.

        Args:
            save_directory (`str`):
                Directory passed to the backend model's `save`.
            filename_prefix (`str`, *optional*):
                Passed to the backend model's `save` as `name`.

        Returns:
            `Tuple[str]`: The value returned by the backend `save`, converted to a tuple.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

`.\models\clip\init.py`

# 版权声明和许可信息，告知此文件受 Apache 2.0 许可证保护
# 详情可参阅 http://www.apache.org/licenses/LICENSE-2.0
#
# 在这里导入必要的模块和函数
from typing import TYPE_CHECKING
# 从当前项目的工具模块中导入所需的函数和异常类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    is_vision_available,
)

# Map each submodule name to the public names it provides. Entries for optional backends are
# added below only when the corresponding dependency is reported as available.
_import_structure = {
    "configuration_clip": [
        "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "CLIPConfig",
        "CLIPOnnxConfig",
        "CLIPTextConfig",
        "CLIPVisionConfig",
    ],
    "processing_clip": ["CLIPProcessor"],
    "tokenization_clip": ["CLIPTokenizer"],
}

# Fast tokenizer: registered only if `is_tokenizers_available()` is true. Nothing is imported
# here; the raise/except pair is just the skip mechanism when the dependency is missing.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_clip_fast"] = ["CLIPTokenizerFast"]

# Image preprocessing classes: registered only if `is_vision_available()` is true.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["feature_extraction_clip"] = ["CLIPFeatureExtractor"]
    _import_structure["image_processing_clip"] = ["CLIPImageProcessor"]

# PyTorch models: registered only if `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_clip"] = [
        "CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CLIPModel",
        "CLIPPreTrainedModel",
        "CLIPTextModel",
        "CLIPTextModelWithProjection",
        "CLIPVisionModel",
        "CLIPVisionModelWithProjection",
        "CLIPForImageClassification",
    ]

# TensorFlow models: registered only if `is_tf_available()` is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_clip"] = [
        "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFCLIPModel",
        "TFCLIPPreTrainedModel",
        "TFCLIPTextModel",
        "TFCLIPVisionModel",
    ]

# Flax models: registered only if `is_flax_available()` is true.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_flax_clip"] = [
        "FlaxCLIPModel",
        "FlaxCLIPPreTrainedModel",
        "FlaxCLIPTextModel",
        "FlaxCLIPTextPreTrainedModel",
        "FlaxCLIPTextModelWithProjection",
        "FlaxCLIPVisionModel",
        "FlaxCLIPVisionPreTrainedModel",
    ]

# Under static type checking, import every public name eagerly so type checkers can resolve it.
# Each guarded branch mirrors the matching `_import_structure` entry registered above.
if TYPE_CHECKING:
    from .configuration_clip import (
        CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CLIPConfig,
        CLIPOnnxConfig,
        CLIPTextConfig,
        CLIPVisionConfig,
    )
    from .processing_clip import CLIPProcessor
    from .tokenization_clip import CLIPTokenizer

    # Fast tokenizer, only when `is_tokenizers_available()` is true.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_clip_fast import CLIPTokenizerFast

    # Image preprocessing classes, only when `is_vision_available()` is true.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_clip import CLIPFeatureExtractor
        from .image_processing_clip import CLIPImageProcessor

    # PyTorch models, only when `is_torch_available()` is true.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_clip import (
            CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            CLIPForImageClassification,
            CLIPModel,
            CLIPPreTrainedModel,
            CLIPTextModel,
            CLIPTextModelWithProjection,
            CLIPVisionModel,
            CLIPVisionModelWithProjection,
        )

    # TensorFlow models, only when `is_tf_available()` is true.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_clip import (
            TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFCLIPModel,
            TFCLIPPreTrainedModel,
            TFCLIPTextModel,
            TFCLIPVisionModel,
        )

    # Flax models, only when `is_flax_available()` is true.
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_clip import (
            FlaxCLIPModel,
            FlaxCLIPPreTrainedModel,
            FlaxCLIPTextModel,
            FlaxCLIPTextModelWithProjection,
            FlaxCLIPTextPreTrainedModel,
            FlaxCLIPVisionModel,
            FlaxCLIPVisionPreTrainedModel,
        )
else:
    import sys

    # At runtime, replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # `_import_structure` (presumably deferring the submodule imports until first access).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\clipseg\configuration_clipseg.py`

# coding=utf-8
# 上面这行声明了文件的编码格式为 UTF-8，确保可以正确处理中文等特殊字符
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
# 版权声明，版权归 HuggingFace Inc. 团队所有，保留所有权利
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 授权协议声明，使用 Apache License, Version 2.0，允许在符合许可的情况下使用该文件
# you may not use this file except in compliance with the License.
# 除非符合许可，否则不得使用本文件
# You may obtain a copy of the License at
# 可以通过上述链接获取许可协议的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非法律要求或书面同意，本软件按"原样"分发，不附带任何形式的担保或条件
# See the License for the specific language governing permissions and
# limitations under the License.
# 查看许可协议以了解具体的语言控制权限和限制
""" CLIPSeg model configuration"""
# 说明这是 CLIPSeg 模型的配置文件

import os
# 导入操作系统相关功能
from typing import Union
# 导入 Union 类型，用于类型注解

from ...configuration_utils import PretrainedConfig
# 导入 PretrainedConfig 类，用于模型预训练配置
from ...utils import logging
# 导入 logging 工具，用于日志记录

# Module-level logger, obtained from the library's `logging` utility for this module's name.
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint identifier to the URL of its `config.json`.
CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "CIDAS/clipseg-rd64": "https://huggingface.co/CIDAS/clipseg-rd64/resolve/main/config.json",
}

class CLIPSegTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an
    CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CLIPSeg
    [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`CLIPSegModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 49406):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 49407):
            End of stream token id.

    Example:

    ```python
    >>> from transformers import CLIPSegTextConfig, CLIPSegTextModel

    >>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64 style configuration
    >>> configuration = CLIPSegTextConfig()

    >>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
    >>> model = CLIPSegTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clipseg_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        # The special-token ids and any remaining keyword arguments are handled by the base class.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load a text configuration from `pretrained_model_name_or_path`.

        If the loaded configuration dictionary has `model_type == "clipseg"`, its nested `"text_config"` entry is
        used. A warning is logged when the loaded `model_type` differs from this class's `model_type`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the text config dict if we are loading from CLIPSegConfig
        if config_dict.get("model_type") == "clipseg":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
# CLIPSegVisionConfig 是一个配置类，用于存储 CLIPSegModel 的配置信息。
# 这个配置类定义了 CLIPSeg 模型的架构，根据指定的参数实例化一个模型。
# 当使用默认参数实例化时，会生成与 CIDAS/clipseg-rd64 架构类似的配置。
# 配置对象继承自 PretrainedConfig，可以用来控制模型的输出。详细信息请参阅 PretrainedConfig 的文档。

class CLIPSegVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an
    CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CLIPSeg
    [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel

    >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64 style configuration
    >>> configuration = CLIPSegVisionConfig()

    >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
    >>> model = CLIPSegVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clipseg_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Everything not named above is forwarded to the base configuration.
        super().__init__(**kwargs)

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Image input geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Weight initialization.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

        # Dropout, normalization and activation.
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load a vision configuration from `pretrained_model_name_or_path`.

        If the loaded configuration dictionary has `model_type == "clipseg"`, its nested `"vision_config"` entry is
        used. A warning is logged when the loaded `model_type` differs from this class's `model_type`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite "clipseg" configuration nests the vision settings under "vision_config".
        loaded_from_composite = config_dict.get("model_type") == "clipseg"
        if loaded_from_composite:
            config_dict = config_dict["vision_config"]

        # Warn (but do not fail) when the stored model type is not the one this class represents.
        if "model_type" in config_dict and hasattr(cls, "model_type"):
            if config_dict["model_type"] != cls.model_type:
                logger.warning(
                    f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                    f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
                )

        return cls.from_dict(config_dict, **kwargs)
# 定义 `CLIPSegConfig` 类，继承自 `PretrainedConfig` 类，用于存储 `CLIPSegModel` 的配置信息。
# 该类用于实例化 CLIPSeg 模型，根据指定的参数定义文本模型和视觉模型的配置。
# 使用默认参数实例化配置对象将产生与 `CIDAS/clipseg-rd64` 架构相似的配置。
class CLIPSegConfig(PretrainedConfig):
    r"""
    [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to
    instantiate a CLIPSeg model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg
    [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPSegTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIPSeg implementation.
        extract_layers (`List[int]`, *optional*, defaults to `[3, 6, 9]`):
            Layers to extract when forwarding the query image through the frozen visual backbone of CLIP.
        reduce_dim (`int`, *optional*, defaults to 64):
            Dimensionality to reduce the CLIP vision embedding.
        decoder_num_attention_heads (`int`, *optional*, defaults to 4):
            Number of attention heads in the decoder of CLIPSeg.
        decoder_attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
        decoder_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
        conditional_layer (`int`, *optional*, defaults to 0):
            The layer to use of the Transformer encoder whose activations will be combined with the condition
            embeddings using FiLM (Feature-wise Linear Modulation). If 0, the last layer is used.
        use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`):
            Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained
            segmentation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import CLIPSegConfig, CLIPSegModel

    >>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64 style configuration
    >>> configuration = CLIPSegConfig()

    >>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
    >>> model = CLIPSegModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a CLIPSegConfig from a CLIPSegTextConfig and a CLIPSegVisionConfig

    >>> # Initializing a CLIPSegTextConfig and a CLIPSegVisionConfig
    >>> config_text = CLIPSegTextConfig()
    >>> config_vision = CLIPSegVisionConfig()

    >>> # Building a CLIPSegConfig from the two sub-configurations
    >>> config = CLIPSegConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

`.\models\clipseg\convert_clipseg_original_pytorch_to_hf.py`

# 设置编码格式为 UTF-8
# 版权声明，该代码由 HuggingFace Inc. 团队版权所有
#
# 根据 Apache 许可证 2.0 版本发布，除非符合许可证规定，否则不得使用此文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件是基于“原样”分发的，不提供任何形式的明示或暗示保证
# 请参阅许可证了解具体语言和限制
#

"""从原始存储库转换 CLIPSeg 检查点。URL: https://github.com/timojl/clipseg."""

# 导入所需的库
import argparse  # 解析命令行参数

import requests  # 发送 HTTP 请求
import torch  # PyTorch 库
from PIL import Image  # Python Imaging Library，处理图像

from transformers import (  # 导入 Transformers 库中的相关模块和类
    CLIPSegConfig,
    CLIPSegForImageSegmentation,
    CLIPSegProcessor,
    CLIPSegTextConfig,
    CLIPSegVisionConfig,
    CLIPTokenizer,
    ViTImageProcessor,
)

# 定义函数，根据模型名称获取对应的 CLIPSegConfig 配置对象
def get_clipseg_config(model_name):
    """
    Build the `CLIPSegConfig` for the checkpoint identified by `model_name`.

    The text config uses its defaults and the vision config uses `patch_size=16`. Two options are derived from
    substrings of `model_name`:

    - `"refined"` in the name enables `use_complex_transposed_convolution`.
    - `"rd16"` in the name selects `reduce_dim=16`; otherwise `reduce_dim=64`.

    Args:
        model_name (`str`):
            Name of the checkpoint being converted.

    Returns:
        The configuration returned by `CLIPSegConfig.from_text_vision_configs`.
    """
    text_config = CLIPSegTextConfig()
    vision_config = CLIPSegVisionConfig(patch_size=16)

    # A membership test already yields a bool, so no conditional expression is needed.
    use_complex_transposed_convolution = "refined" in model_name
    reduce_dim = 16 if "rd16" in model_name else 64

    config = CLIPSegConfig.from_text_vision_configs(
        text_config,
        vision_config,
        use_complex_transposed_convolution=use_complex_transposed_convolution,
        reduce_dim=reduce_dim,
    )
    return config

# 定义函数，重命名键名以匹配转换后的 CLIPSeg 模型
def rename_key(name):
    """
    Translate one parameter name from the original CLIPSeg checkpoint into the converted model's naming scheme.

    The substitutions are applied in a fixed order because later steps depend on the result of earlier ones
    (for example, the `"visual"` checks run after `"visual.transformer"` has already been rewritten).

    Args:
        name (`str`):
            Parameter name from the original state dict.

    Returns:
        `str`: The renamed parameter name.
    """
    # NOTE: `str.replace` leaves the string untouched when the substring is absent, so plain
    # substitutions need no membership test; only rules with an extra condition keep an `if`.

    # Prefix of the wrapped CLIP model.
    name = name.replace("clip_model", "clip")

    # A transformer under "visual" is the vision tower; any other transformer is the text tower.
    if "transformer" in name:
        if "visual" in name:
            name = name.replace("visual.transformer", "vision_model")
        else:
            name = name.replace("transformer", "text_model")

    # Encoder layer internals.
    for old, new in (
        ("resblocks", "encoder.layers"),
        ("ln_1", "layer_norm1"),
        ("ln_2", "layer_norm2"),
        ("c_fc", "fc1"),
        ("c_proj", "fc2"),
    ):
        name = name.replace(old, new)

    # Only rename attention modules that are not already called "self_attn".
    if "attn" in name and "self" not in name:
        name = name.replace("attn", "self_attn")

    # Text encoder.
    name = name.replace("token_embedding", "text_model.embeddings.token_embedding")
    if "positional_embedding" in name and "visual" not in name:
        name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight")

    # Remaining text/vision encoder pieces, projections and the transposed convolution.
    # NOTE(review): the spelling "pre_layrnorm" is reproduced verbatim; presumably it matches the
    # attribute name on the target model - confirm before "fixing" it.
    for old, new in (
        ("ln_final", "text_model.final_layer_norm"),
        ("visual.class_embedding", "vision_model.embeddings.class_embedding"),
        ("visual.conv1", "vision_model.embeddings.patch_embedding"),
        ("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight"),
        ("visual.ln_pre", "vision_model.pre_layrnorm"),
        ("visual.ln_post", "vision_model.post_layernorm"),
        ("visual.proj", "visual_projection.weight"),
        ("text_projection", "text_projection.weight"),
        ("trans_conv", "transposed_convolution"),
    ):
        name = name.replace(old, new)

    # These modules get a "decoder." prefix on the whole name.
    decoder_markers = ("film_mul", "film_add", "reduce", "transposed_convolution")
    if any(marker in name for marker in decoder_markers):
        name = "decoder." + name

    # Decoder layer internals.
    for old, new in (
        ("blocks", "decoder.layers"),
        ("linear1", "mlp.fc1"),
        ("linear2", "mlp.fc2"),
    ):
        name = name.replace(old, new)

    # Plain "norm1"/"norm2" become "layer_norm1"/"layer_norm2" unless already prefixed; the
    # condition is re-evaluated for each entry, exactly like two consecutive `if` statements.
    for norm in ("norm1", "norm2"):
        if norm in name and "layer_" not in name:
            name = name.replace(norm, "layer_" + norm)

    return name
# Convert an original checkpoint state dict into the key layout used by the converted model
def convert_state_dict(orig_state_dict, config):
    """
    Rewrite `orig_state_dict` in place so that its keys follow the converted model's naming.

    Fused attention input projections (`in_proj` weights and biases) are split into separate
    query/key/value entries. Every other entry is renamed through `rename_key`, and entries whose
    new name contains `visual_projection` or `text_projection` are transposed.

    Args:
        orig_state_dict: Mapping from original parameter names to tensors. It is mutated and returned.
        config: Object exposing `vision_config.hidden_size`, `text_config.hidden_size` and `reduce_dim`.

    Returns:
        The same `orig_state_dict` object, holding the converted entries.
    """

    def store_qkv(layer_prefix, fused, width, kind):
        # The first `width` rows go to q_proj, the next `width` rows to k_proj and the last `width` rows to v_proj.
        row_ranges = (slice(None, width), slice(width, width * 2), slice(-width, None))
        for proj, rows in zip(("q_proj", "k_proj", "v_proj"), row_ranges):
            piece = fused[rows, :] if kind == "weight" else fused[rows]
            orig_state_dict[f"{layer_prefix}.self_attn.{proj}.{kind}"] = piece

    # Walk over a snapshot of the keys: entries are popped and re-inserted while looping.
    for key in list(orig_state_dict):
        val = orig_state_dict.pop(key)
        kind = "weight" if "weight" in key else "bias"
        parts = key.split(".")

        if key.startswith("clip_model") and "attn.in_proj" in key:
            # CLIP backbone: the layer index sits one position later in vision keys than in text keys.
            if "visual" in key:
                tower, layer_num, dim = "vision_model", int(parts[4]), config.vision_config.hidden_size
            else:
                tower, layer_num, dim = "text_model", int(parts[3]), config.text_config.hidden_size
            store_qkv(f"clip.{tower}.encoder.layers.{layer_num}", val, dim, kind)
        elif "self_attn" in key and "out_proj" not in key:
            # Decoder attention: the layer index is the second key component.
            layer_num = int(parts[1])
            store_qkv(f"decoder.layers.{layer_num}", val, config.reduce_dim, kind)
        else:
            new_name = rename_key(key)
            is_projection = "visual_projection" in new_name or "text_projection" in new_name
            orig_state_dict[new_name] = val.T if is_projection else val

    return orig_state_dict
    # 使用 state_dict 的副本遍历所有键
    for key in state_dict.copy().keys():
        # 如果键以 "model" 开头，则从 state_dict 中删除该键
        if key.startswith("model"):
            state_dict.pop(key, None)

    # 重命名一些键值
    state_dict = convert_state_dict(state_dict, config)
    # 加载经过转换后的 state_dict 到模型中，允许部分不严格匹配
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

    # 检查缺失的键是否符合预期
    if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]:
        raise ValueError("Missing keys that are not expected: {}".format(missing_keys))
    # 检查意外的键是否符合预期
    if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]:
        raise ValueError(f"Unexpected keys: {unexpected_keys}")

    # 创建图像处理器和文本处理器
    image_processor = ViTImageProcessor(size=352)
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer)

    # 准备图像和文本输入
    image = prepare_img()
    text = ["a glass", "something to fill", "wood", "a jar"]

    # 使用处理器处理文本和图像输入，进行填充并返回 PyTorch 张量
    inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt")

    # 使用无梯度计算环境执行模型推理
    with torch.no_grad():
        outputs = model(**inputs)

    # 验证输出的特定值是否符合预期
    expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645])
    expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328])
    if model_name == "clipseg-rd64-refined":
        expected_masks_slice = torch.tensor(
            [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]]
        )
    elif model_name == "clipseg-rd64":
        expected_masks_slice = torch.tensor(
            [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]]
        )
    elif model_name == "clipseg-rd16":
        expected_masks_slice = torch.tensor(
            [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]]
        )
    else:
        # 如果模型名称不受支持，则引发 ValueError
        raise ValueError(f"Model name {model_name} not supported.")

    # 使用 allclose 函数验证张量是否在给定的容差内相等
    assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)
    assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)
    assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)
    print("Looks ok!")

    # 如果指定了 pytorch_dump_folder_path，则保存模型和处理器
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果指定了 push_to_hub，则将模型和处理器推送到 Hub
    if push_to_hub:
        print(f"Pushing model and processor for {model_name} to the hub")
        model.push_to_hub(f"CIDAS/{model_name}")
        processor.push_to_hub(f"CIDAS/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: gather the conversion options, then run the conversion.
    supported_models = ["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"]

    parser = argparse.ArgumentParser()

    # Model variant to convert. Not actually required: it falls back to "clipseg-rd64".
    parser.add_argument(
        "--model_name",
        type=str,
        default="clipseg-rd64",
        choices=supported_models,
        help=(
            "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning"
            " reduce dimension)"
        ),
    )

    # Location of the original checkpoint.
    # NOTE(review): the default is a path on the original author's machine; pass this option explicitly.
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth",
        help=(
            "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and"
            " the decoder weights."
        ),
    )

    # Optional output directory for the converted model.
    parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model directory.",
    )

    # Optional flag: also upload the converted model to the hub.
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the converted model to the 🤗 hub.",
    )

    args = parser.parse_args()

    # Arguments are passed positionally, in the same order as before.
    convert_clipseg_checkpoint(
        args.model_name,
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        args.push_to_hub,
    )


这段代码是一个命令行工具的入口点，使用 argparse 模块解析命令行参数，并调用 `convert_clipseg_checkpoint` 函数进行处理。

`.\models\clipseg\modeling_clipseg.py`

# 设置文件编码为UTF-8

# 版权声明及许可证信息，该文件受 Apache 许可证版本 2.0 保护
# 除非符合许可证的规定，否则不得使用本文件
# 可以通过以下链接获取许可证的副本：http://www.apache.org/licenses/LICENSE-2.0

# 导入必要的库和模块
""" PyTorch CLIPSeg 模型."""

# 复制对象
import copy
# 数学计算函数库
import math
# 数据类装饰器
from dataclasses import dataclass
# 任意类型
from typing import Any, Optional, Tuple, Union

# 导入PyTorch库
import torch
# PyTorch模块
import torch.utils.checkpoint
# 导入神经网络模块
from torch import nn

# 导入自定义的模块
# 用于处理激活函数
from ...activations import ACT2FN
# 模型的注意力掩码工具函数
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
# 模型输出类
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
# 预训练模型基类
from ...modeling_utils import PreTrainedModel
# 工具函数
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 模型检查点，用于文档
_CHECKPOINT_FOR_DOC = "CIDAS/clipseg-rd64-refined"

# 预训练模型存档列表
CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "CIDAS/clipseg-rd64-refined",
    # 可在 https://huggingface.co/models?filter=clipseg 查看所有 CLIPSeg 模型
]

# Contrastive loss, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """
    Return the cross-entropy of `logits` where the target class of row `i` is `i`.

    Args:
        logits: Score tensor; its first dimension determines the number of targets.

    Returns:
        The cross-entropy loss as computed by `torch.nn.functional.cross_entropy`.
    """
    # Targets 0..len(logits)-1, created on the same device as the scores.
    targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, targets)

# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clipseg
def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    """
    Average the contrastive loss of `similarity` and of its transpose.

    Args:
        similarity: 2-D similarity matrix (it is transposed with `.t()`).

    Returns:
        The mean of the two contrastive losses.
    """
    # First the matrix as given (caption loss), then its transpose (image loss).
    caption_loss, image_loss = (contrastive_loss(scores) for scores in (similarity, similarity.t()))
    return (caption_loss + image_loss) / 2.0

@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLIPSeg
class CLIPSegOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    # Every field defaults to None; see the class docstring for the meaning of each one.
    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the stored values as a tuple, converting the two nested model outputs with their own `to_tuple`."""
        nested_outputs = ["text_model_output", "vision_model_output"]
        values = []
        for key in self.keys():
            if key in nested_outputs:
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)
@dataclass
class CLIPSegDecoderOutput(ModelOutput):
    """
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
            when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor`, one for the output of each layer, plus one for the output of the embeddings
            if the model has an embedding layer.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
            `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class CLIPSegImageSegmentationOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    # Every field defaults to None.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    conditional_embeddings: torch.FloatTensor = None
    pooled_output: torch.FloatTensor = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> Tuple[Any]:
        """
        Return the stored values as a tuple.

        The `vision_model_output` and `decoder_output` entries are themselves converted with their own
        `to_tuple` method; every other entry is included unchanged.
        """
        nested_outputs = ["vision_model_output", "decoder_output"]
        values = []
        for key in self.keys():
            if key in nested_outputs:
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)


class CLIPSegVisionEmbeddings(nn.Module):
    """
    Turn a batch of images into a sequence of patch embeddings with a leading class token and position embeddings.

    Adapted from `transformers.models.clip.modeling_clip.CLIPVisionEmbeddings` (CLIP -> CLIPSeg). When the number of
    patches differs from the configured one, the learned position embeddings are interpolated to the actual patch
    grid, which may be rectangular.
    """

    def __init__(self, config: CLIPSegVisionConfig):
        """
        Args:
            config: Vision configuration providing `hidden_size`, `image_size`, `patch_size` and `num_channels`.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learned embedding that is prepended to every patch sequence.
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # kernel_size == stride, so each output location covers one non-overlapping patch.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        # One extra position for the class embedding.
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # Non-persistent buffer: it is not stored in the state dict.
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_position_embeddings(self, new_size):
        """
        Resample the learned patch position embeddings to a `new_size = (rows, cols)` patch grid.

        Args:
            new_size: Sequence of two ints giving the target patch grid.

        Returns:
            Tensor of shape `(1 + rows * cols, hidden_size)`: the first position embedding unchanged, followed by
            the bicubically interpolated patch position embeddings.

        Raises:
            ValueError: If `new_size` does not contain exactly two values.
        """
        if len(new_size) != 2:
            raise ValueError("new_size should consist of 2 values")

        num_patches_one_direction = int(self.num_patches**0.5)
        # Lay out all position embeddings except the first as a (1, hidden, side, side) grid.
        patch_positions = self.position_embedding.weight[1:].T.view(
            1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction
        )
        resampled = (
            nn.functional.interpolate(patch_positions, new_size, mode="bicubic", align_corners=False)
            .squeeze(0)
            .view(self.config.hidden_size, new_size[0] * new_size[1])
            .T
        )
        return torch.cat([self.position_embedding.weight[:1], resampled])

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """
        Embed `pixel_values` and add position information.

        Args:
            pixel_values: Image batch accepted by the patch convolution.

        Returns:
            Tensor of shape `(batch_size, 1 + num_patches_in_input, hidden_size)`.
        """
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values)
        # Read the real patch grid before flattening. Deriving it from sqrt(num_patches) assumed a square grid
        # and produced a shape mismatch for rectangular inputs.
        grid_size = (patch_embeds.shape[-2], patch_embeds.shape[-1])
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # NOTE(review): interpolation is still triggered by the patch count, as before, so inputs that already
        # worked keep producing exactly the same values.
        if embeddings.shape[1] != self.num_positions:
            embeddings = embeddings + self.interpolate_position_embeddings(grid_size)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->CLIPSeg
class CLIPSegTextEmbeddings(nn.Module):
    """Sum of token embeddings and learned position embeddings for the text tower."""

    def __init__(self, config: CLIPSegTextConfig):
        """
        Args:
            config: Text configuration providing `hidden_size`, `vocab_size` and `max_position_embeddings`.
        """
        super().__init__()
        hidden = config.hidden_size

        # Lookup tables for token ids and for positions.
        self.token_embedding = nn.Embedding(config.vocab_size, hidden)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, hidden)

        # Default position ids 0..max_position_embeddings-1, shaped (1, max_position_embeddings).
        # The buffer is registered with persistent=False, so it is NOT saved in the state dict.
        default_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_positions, persistent=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """
        Embed the input and add position embeddings.

        Args:
            input_ids: Token ids; used to look up token embeddings when `inputs_embeds` is not given.
            position_ids: Position indices; defaults to the first `seq_length` entries of the registered buffer.
            inputs_embeds: Pre-computed token embeddings, used instead of `input_ids` when given.

        Returns:
            `inputs_embeds` (given or looked up) plus the position embeddings.
        """
        # Sequence length comes from the ids when present, otherwise from the embeddings.
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)


# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->CLIPSeg
class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        """
        Args:
            config: Configuration providing `hidden_size`, `num_attention_heads` and `attention_dropout`.

        Raises:
            ValueError: If `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # 1 / sqrt(head_dim)
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # Four square linear maps, created in the original order (k, v, q, out) so that parameter
        # registration and random initialization happen in the same sequence.
        for proj_name in ("k_proj", "v_proj", "q_proj", "out_proj"):
            setattr(self, proj_name, nn.Linear(self.embed_dim, self.embed_dim))

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)` and return it as a contiguous tensor."""
        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        # Swap the sequence and head axes so that the head axis comes first.
        return per_head.transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # 略
    # 定义初始化方法，接受一个配置对象作为参数
    def __init__(self, config):
        """
        Args:
            config: Configuration providing `hidden_act`, `hidden_size` and `intermediate_size`.
        """
        super().__init__()
        self.config = config
        # Activation looked up by name in ACT2FN.
        self.activation_fn = ACT2FN[config.hidden_act]
        hidden_size, intermediate_size = config.hidden_size, config.intermediate_size
        # Expand to the intermediate width, then project back to the hidden width.
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, hidden_size)

    # 定义前向传播方法，接受一个 torch.Tensor 类型的隐藏状态输入，并返回一个 torch.Tensor 类型的输出
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `fc1`, the activation function and `fc2`, in that order, to `hidden_states`."""
        expanded = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(activated)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->CLIPSeg
class CLIPSegEncoderLayer(nn.Module):
    """Encoder layer: layer norm then self-attention, and layer norm then MLP, each wrapped in a residual connection."""

    def __init__(self, config: CLIPSegConfig):
        """
        Args:
            config: Configuration providing `hidden_size` and `layer_norm_eps`; it is also passed on to the
                attention and MLP sub-modules.
        """
        super().__init__()
        self.embed_dim = config.hidden_size
        # Sub-modules are created in this exact order: attention, norm 1, MLP, norm 2.
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # Attention sub-block: normalize, attend, add the input back.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block: normalize, apply the MLP, add the input back.
        mlp_output = self.mlp(self.layer_norm2(hidden_states))
        hidden_states = hidden_states + mlp_output

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
    # 初始化模型的权重
    def _init_weights(self, module):
        """
        Initialize the weights of `module` according to its type.

        Embedding, attention, MLP and projection weights are drawn from zero-mean normal distributions whose
        standard deviations are scaled by `config.initializer_factor`. `nn.LayerNorm` modules are reset to
        weight 1 and bias 0, and the bias of any `nn.Linear` that has one is zeroed.
        """
        factor = self.config.initializer_factor

        if isinstance(module, CLIPSegTextEmbeddings):
            for table in (module.token_embedding, module.position_embedding):
                table.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPSegVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPSegAttention):
            # The q/k/v projections share a depth-scaled std; the output projection uses its own.
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            for projection in (module.q_proj, module.k_proj, module.v_proj):
                nn.init.normal_(projection.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            # fc1 uses `fc_std`, while fc2 uses the depth-scaled `in_proj_std`.
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            nn.init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)

        # These two checks are deliberately separate from the chain above: they apply to plain
        # LayerNorm / Linear modules, which none of the branches above match.
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
# 定义一个多行字符串，包含关于 CLIPSeg 模型的文档说明，描述了它是一个 PyTorch 的 nn.Module 子类，如何使用以及参数说明
CLIPSEG_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# 定义文本输入的文档说明，解释了输入参数包括 input_ids、attention_mask、position_ids、output_attentions、output_hidden_states 和 return_dict
CLIPSEG_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Documentation template for the vision inputs: pixel_values, output_attentions, output_hidden_states
# and return_dict. The text is part of the string itself, so annotations about it belong out here;
# a comment line placed inside the raw string would become part of the generated documentation.
CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            像素值。默认情况下将忽略填充。可以使用 [`AutoImageProcessor`] 获取像素值。有关详细信息，请参见 [`CLIPImageProcessor.__call__`]。
        output_attentions (`bool`, *optional*):
            是否返回所有注意力层的注意力张量。有关更多详细信息，请参见返回的张量下的 `attentions`。
        output_hidden_states (`bool`, *optional*):
            是否返回所有层的隐藏状态。有关更多详细信息，请参见返回的张量下的 `hidden_states`。
        return_dict (`bool`, *optional*):
            是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""

CLIPSEG_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->CLIPSeg
class CLIPSegEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        """
        Args:
            config: Configuration; `num_hidden_layers` sets how many encoder layers are created.
        """
        super().__init__()
        self.config = config
        # One CLIPSegEncoderLayer per configured hidden layer, all built from the same config.
        layer_count = config.num_hidden_layers
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _layer_index in range(layer_count)])
        # Gradient checkpointing starts disabled.
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# 定义 CLIPSegTextTransformer 类，继承自 nn.Module
class CLIPSegTextTransformer(nn.Module):
    # 从 transformers.models.clip.modeling_clip.CLIPTextTransformer.__init__ 复制而来，将 CLIP 替换为 CLIPSeg
    def __init__(self, config: CLIPSegTextConfig):
        """
        Create the text embeddings, the encoder stack and the final layer norm.

        Args:
            config: Text configuration; `hidden_size`, `layer_norm_eps` and `eos_token_id` are read here.
        """
        super().__init__()
        self.config = config
        hidden_dim = config.hidden_size

        # Sub-modules are created in the original order (embeddings, encoder, final norm) so that module
        # registration order is unchanged.
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)

        # Kept on the module because it is needed when computing `pooled_output`.
        self.eos_token_id = config.eos_token_id

    # 将 CLIPSEG_TEXT_INPUTS_DOCSTRING 添加到模型前向方法的文档字符串中
    # 使用 replace_return_docstrings 将返回文档字符串替换为 BaseModelOutputWithPooling 的输出类型和 CLIPSegTextConfig 配置类
    # 从 transformers.models.clip.modeling_clip.CLIPTextTransformer.forward 复制而来，将 clip 替换为 clipseg，CLIP 替换为 CLIPSeg
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):



# 定义 CLIPSegTextModel 类，继承自 CLIPSegPreTrainedModel
class CLIPSegTextModel(CLIPSegPreTrainedModel):
    """Text-only CLIPSeg model that delegates all computation to a `CLIPSegTextTransformer`."""

    config_class = CLIPSegTextConfig

    # Class names of modules that should not be split.
    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        """Build the wrapped text transformer from `config`, then run `post_init()`."""
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding module of the wrapped text transformer."""
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped text transformer with `value`."""
        self.text_model.embeddings.token_embedding = value

    # The `Returns:` line in the docstring below must stay alone on its line: it is the placeholder that
    # `replace_return_docstrings` substitutes.
    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        # Every argument is forwarded unchanged to the wrapped text transformer.
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
# 定义一个名为 CLIPSegVisionTransformer 的类，继承自 nn.Module
class CLIPSegVisionTransformer(nn.Module):
    """Vision tower: embeddings, a pre layer norm, the encoder stack, and a post layer norm on the pooled token."""

    def __init__(self, config: CLIPSegVisionConfig):
        """
        Build the vision embeddings, both layer norms and the encoder from `config`.

        Args:
            config: Vision configuration; `hidden_size` and `layer_norm_eps` are read here.
        """
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPSegVisionEmbeddings(config)
        # NOTE: the attribute name `pre_layrnorm` (sic) is kept as-is; renaming it would change the names of
        # this module's parameters.
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # The `Returns:` line in the docstring below must stay alone on its line: it is the placeholder that
    # `replace_return_docstrings` substitutes.
    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        # Flags left as `None` fall back to the configuration. Without this, the default `return_dict=None`
        # would always take the tuple branch below.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Embed the pixel values, then normalize before the encoder.
        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the encoder output is the last hidden state; the pooled output is its first position
        # along dim 1, passed through the post layer norm.
        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            # Tuple form: (last_hidden_state, pooled_output, *remaining encoder outputs).
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


# 定义一个名为 CLIPSegVisionModel 的类，继承自 CLIPSegPreTrainedModel
class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    """Vision-only CLIPSeg model that delegates all computation to a `CLIPSegVisionTransformer`."""

    config_class = CLIPSegVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPSegVisionConfig):
        """Build the wrapped vision transformer from `config`, then run `post_init()`."""
        super().__init__(config)
        self.vision_model = CLIPSegVisionTransformer(config)
        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch embedding module of the wrapped vision transformer."""
        vision_embeddings = self.vision_model.embeddings
        return vision_embeddings.patch_embedding

    # The `Returns:` line in the docstring below must stay alone on its line: it is the placeholder that
    # `replace_return_docstrings` substitutes.
    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Run the wrapped vision transformer on `pixel_values`.

        Args:
            pixel_values (`torch.FloatTensor`, *optional*): Pixel values of the input images.
            output_attentions (`bool`, *optional*): Whether to return attention weights.
            output_hidden_states (`bool`, *optional*): Whether to return hidden states.
            return_dict (`bool`, *optional*): Whether to return a model output object instead of a tuple.

        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```
        """
        # Every argument is forwarded unchanged to the wrapped vision transformer.
        forwarded = {
            "pixel_values": pixel_values,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        return self.vision_model(**forwarded)
# 使用装饰器为类添加文档字符串，以提供类的基本信息
@add_start_docstrings(CLIPSEG_START_DOCSTRING)
class CLIPSegModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        """
        Build the text and vision transformers, their projection layers and the logit scale.

        Args:
            config: Composite configuration holding `text_config`, `vision_config`, `projection_dim` and
                `logit_scale_init_value`.

        Raises:
            ValueError: If `config.text_config` is not a `CLIPSegTextConfig` or `config.vision_config` is not
                a `CLIPSegVisionConfig`.
        """
        super().__init__(config)

        # Reject sub-configurations of the wrong type before building anything.
        text_config = config.text_config
        if not isinstance(text_config, CLIPSegTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        vision_config = config.vision_config
        if not isinstance(vision_config, CLIPSegVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # Dimensions used by the projection layers below.
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # The two towers.
        self.text_model = CLIPSegTextTransformer(text_config)
        self.vision_model = CLIPSegVisionTransformer(vision_config)

        # Bias-free linear maps from each tower's hidden size to `projection_dim`.
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Scalar parameter initialized from the configuration.
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained
            by applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Flags left as `None` fall back to the CLIPSeg model's own configuration rather than those of the
        # vision and text components.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text outputs is the pooled output; project it to get the text features.
        return self.text_projection(text_outputs[1])

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings
            obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Flags left as `None` fall back to the CLIPSeg model's own configuration rather than those of the
        # vision and text components.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the vision outputs is the pooled output; project it to get the image features.
        return self.visual_projection(vision_outputs[1])
# 定义一个 CLIPSeg 解码器层，与 CLIPSegEncoderLayer 相同，不同之处在于在 self-attention/MLP 之后应用归一化，而不是之前。
class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer.__init__ with CLIP->CLIPSeg
    def __init__(self, config: CLIPSegConfig):
        """
        Build the attention block, the MLP and their two layer norms.

        Args:
            config: Configuration; `hidden_size` and `layer_norm_eps` are read here, and the object is passed
                on to `CLIPSegAttention` and `CLIPSegMLP`.
        """
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): mask applied to causal attention
                `(batch, 1, tgt_len, src_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.

        Returns:
            A tuple `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        residual = hidden_states
        attn_output, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        # Post-norm: the residual is added first and the layer norm is applied to the sum.
        hidden_states = self.layer_norm1(residual + attn_output)

        mlp_output = self.mlp(hidden_states)
        hidden_states = self.layer_norm2(hidden_states + mlp_output)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)


# The class header below was missing from the listing, which left the two methods that follow attached to
# `CLIPSegDecoderLayer` (overriding its own `__init__`/`forward`) and left `CLIPSegDecoder` undefined.
class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        """
        Build the decoder: FiLM layers, the transposed convolution head, per-layer reductions and decoder layers.

        Args:
            config: Composite configuration. Reads `conditional_layer`, `projection_dim`, `reduce_dim`,
                `use_complex_transposed_convolution`, `extract_layers`, `decoder_num_attention_heads`,
                `decoder_intermediate_size` and `vision_config` (`patch_size`, `hidden_size`).
        """
        super().__init__(config)

        # Index of the decoder step at which the conditional embeddings are applied.
        self.conditional_layer = config.conditional_layer

        # FiLM (feature-wise linear modulation): a multiplicative and an additive projection of the
        # conditional embeddings, both from `projection_dim` to `reduce_dim`.
        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            # A 3x3 convolution followed by two transposed convolutions, each using its kernel size as stride.
            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            # Single transposed convolution whose kernel size and stride both equal the patch size.
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        # One reduction layer and one decoder layer per extracted vision layer.
        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        # The decoder layers use a modified copy of the vision configuration; the deep copy keeps
        # `config.vision_config` itself untouched.
        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(depth)])

    def forward(
        self,
        hidden_states: Tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        """
        Decode a sequence of activations into segmentation logits.

        Args:
            hidden_states: Activations to decode; they are consumed in reverse order.
            conditional_embeddings: Conditioning vectors applied through the FiLM layers; the first dimension
                is used as the batch size.
            output_attentions: Whether to collect the attention weights of every decoder layer.
            output_hidden_states: Whether to collect the output of every decoder layer.
            return_dict: Whether to return a `CLIPSegDecoderOutput` instead of a tuple.

        Returns:
            A `CLIPSegDecoderOutput` when `return_dict` is truthy, otherwise a tuple of the non-`None` values
            among `(logits, all_hidden_states, all_attentions)`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            # Reduce the activation and accumulate it onto the running output.
            reduced = reduce(activation)
            output = reduced if output is None else reduced + output

            if i == self.conditional_layer:
                # FiLM conditioning: the first two dims are swapped for the modulation and swapped back after.
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )
            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # Remove the CLS token and reshape to [batch_size, reduce_dim, seq_len].
        output = output[:, 1:, :].permute(0, 2, 1)

        # The remaining positions are laid out on a square grid of side `size`.
        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        # Apply the transposed convolution head and drop dim 1 when it has size 1.
        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )
@add_start_docstrings(
    """
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """,
    CLIPSEG_START_DOCSTRING,
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        """
        Build the CLIPSeg backbone and the segmentation decoder from `config`.

        Args:
            config: Composite configuration; `extract_layers` is stored on the model and the whole object is
                passed to `CLIPSegModel` and `CLIPSegDecoder`.
        """
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        # Layers from which features are extracted.
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: Optional[int] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        conditional_pixel_values: Optional[torch.Tensor] = None,
    ):
        """
        Compute conditional embeddings from text prompts or from prompt images.

        `input_ids` takes precedence: when it is given, `conditional_pixel_values` is ignored. The CLIP
        forward pass runs under `torch.no_grad()` in both cases.

        Args:
            batch_size: Expected number of prompts; compared against the length of the prompt input.
            input_ids: Token ids of the text prompts.
            attention_mask: Attention mask passed along with `input_ids`.
            position_ids: Position ids passed along with `input_ids`.
            conditional_pixel_values: Pixel values of the prompt images.

        Returns:
            The result of `self.clip.get_text_features(...)` or `self.clip.get_image_features(...)`.

        Raises:
            ValueError: If the number of prompts differs from `batch_size`, or if neither `input_ids` nor
                `conditional_pixel_values` is provided.
        """
        if input_ids is not None:
            # Text prompts.
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )
        elif conditional_pixel_values is not None:
            # Image prompts.
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_image_features(conditional_pixel_values)
        else:
            raise ValueError(
                "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
            )

        return conditional_embeddings

    # The `Returns:` line in the docstring below must stay alone on its line: it is the placeholder that
    # `replace_return_docstrings` substitutes, and that decorator cannot process a function without a
    # docstring.
    @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Returns:

        """
        # NOTE(review): the implementation is not included in this listing. The method is a stub that
        # returns `None`.
        pass  # Placeholder for actual implementation, not provided here

`.\models\clipseg\processing_clipseg.py`

# 设置文件编码为 UTF-8
# 版权声明，使用 Apache License 2.0 许可证
# 警告：如果没有符合许可证要求的代码，不能使用本文件
"""
CLIPSeg 的图像/文本处理器类
"""

# 导入警告模块
import warnings

# 导入 ProcessorMixin 和 BatchEncoding 类
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class CLIPSegProcessor(ProcessorMixin):
    r"""
    Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single
    processor.

    [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the
    [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.

    Args:
        image_processor ([`ViTImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`CLIPTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViTImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """
        Validate the two components and hand them to the parent initializer.

        The deprecated `feature_extractor` keyword is still accepted (with a `FutureWarning`) and is used
        only when `image_processor` is `None`.

        Raises:
            ValueError: If no image processor (or legacy feature extractor) or no tokenizer is given.
        """
        # Warn first, then take the legacy argument out of `kwargs`.
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
        feature_extractor = kwargs.pop("feature_extractor", None)

        # An explicit `image_processor` wins over the legacy argument.
        if image_processor is None:
            image_processor = feature_extractor

        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on every access."""
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning` on every access."""
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor

`.\models\clipseg\__init__.py`

# 版权声明及许可信息
# Copyright 2022 The HuggingFace Team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 导入类型检查模块
from typing import TYPE_CHECKING

# 导入自定义的异常类和模块延迟加载工具函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义模块的导入结构
_import_structure = {
    "configuration_clipseg": [
        "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP",  # archive map of pretrained CLIPSeg configurations
        "CLIPSegConfig",  # composite CLIPSeg configuration
        "CLIPSegTextConfig",  # text model configuration
        "CLIPSegVisionConfig",  # vision model configuration
    ],
    "processing_clipseg": ["CLIPSegProcessor"],  # CLIPSeg processor
}

# Register the modeling names only when torch is available; when it is not, the exception raised below is
# caught immediately and the modeling entry is simply left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the modeling submodule and its public names.
    _import_structure["modeling_clipseg"] = [
        "CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST",  # archive list of pretrained CLIPSeg models
        "CLIPSegModel",  # CLIPSeg model
        "CLIPSegPreTrainedModel",  # base class of the CLIPSeg models
        "CLIPSegTextModel",  # text model
        "CLIPSegVisionModel",  # vision model
        "CLIPSegForImageSegmentation",  # CLIPSeg model for image segmentation
    ]

# Under static type checking, import the names directly from their submodules.
if TYPE_CHECKING:
    from .configuration_clipseg import (
        CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP,  # archive map of pretrained CLIPSeg configurations
        CLIPSegConfig,  # composite CLIPSeg configuration
        CLIPSegTextConfig,  # text model configuration
        CLIPSegVisionConfig,  # vision model configuration
    )
    from .processing_clipseg import CLIPSegProcessor  # CLIPSeg processor

    # Same torch-availability guard as above, this time for the direct imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the modeling names as well.
        from .modeling_clipseg import (
            CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST,  # archive list of pretrained CLIPSeg models
            CLIPSegForImageSegmentation,  # CLIPSeg model for image segmentation
            CLIPSegModel,  # CLIPSeg model
            CLIPSegPreTrainedModel,  # base class of the CLIPSeg models
            CLIPSegTextModel,  # text model
            CLIPSegVisionModel,  # vision model
        )

# Otherwise, replace this module's entry in `sys.modules` with a `_LazyModule` built from
# `_import_structure` (presumably so submodules are imported on first access; confirm against `_LazyModule`).
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\clvp\configuration_clvp.py`

# 设置文件编码格式为 utf-8
# 版权声明及许可协议说明
# 这是 CLVP 模型的配置文件

# 导入操作系统模块和类型检查模块
import os
from typing import TYPE_CHECKING, Union

# 如果类型检查为真，则执行代码块
if TYPE_CHECKING:
    pass

# 从 HuggingFace 函数库中导入预训练配置和日志工具
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps a CLVP checkpoint identifier to the URL of its `config.json`.
CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "susnato/clvp_dev": "https://huggingface.co/susnato/clvp_dev/resolve/main/config.json",
}

# CLVP 编码器配置类，用于存储 CLVP 编码器的配置信息
class ClvpEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ClvpEncoder`]. It is used to instantiate a CLVP
    text or CLVP speech encoder according to the specified arguments. Instantiating a configuration with the defaults
    will yield a similar configuration to that of the encoder of the CLVP
    [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    # 设置模型类型为 "clvp_encoder"
    model_type = "clvp_encoder"
    # 初始化函数，用于设置模型的
# 定义 CLVP 解码器配置类，继承自预训练配置类 PretrainedConfig
class ClvpDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ClvpDecoder`]. It is used to instantiate a CLVP
    Decoder Model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Decoder part of the CLVP
    [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The architecture is similar to GPT2.

    Every constructor argument is stored on the instance under an attribute of the same name. `bos_token_id` and
    `eos_token_id` are additionally forwarded to [`PretrainedConfig`] together with any extra keyword arguments.

    Example:

    ```
    >>> from transformers import ClvpDecoderConfig, ClvpDecoder

    >>> # Initializing a ClvpDecoderConfig with susnato/clvp_dev style configuration
    >>> decoder_configuration = ClvpDecoderConfig()

    >>> # Initializing a ClvpDecoder (with random weights) from the susnato/clvp_dev style configuration
    >>> model = ClvpDecoder(decoder_configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier compared against the "model_type" key of loaded config dicts in `from_pretrained`.
    model_type = "clvp_decoder"

    def __init__(
        self,
        vocab_size=8194,
        max_position_embeddings=608,
        max_text_tokens=404,
        hidden_size=1024,
        num_hidden_layers=30,
        num_attention_heads=16,
        n_inner=None,
        num_mel_attn_blocks=6,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attention_dropout=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        use_cache=True,
        bos_token_id=8192,
        eos_token_id=8193,
        feature_size=80,
        use_attention_bias=True,
        initializer_factor=1.0,
        decoder_fixing_codes=[83, 45, 45, 248],
        **kwargs,
    ):
        # NOTE(review): `decoder_fixing_codes` has a mutable list default that is stored as-is, so instances built
        # with the default share one list object; callers should not mutate it in place.

        # Size and layout of the decoder.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.max_text_tokens = max_text_tokens
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_inner = n_inner
        self.num_mel_attn_blocks = num_mel_attn_blocks
        self.activation_function = activation_function

        # Dropout, normalisation and initialisation settings.
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        # Sequence-summary settings.
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels

        # Remaining decoder options.
        self.use_cache = use_cache
        self.feature_size = feature_size
        self.use_attention_bias = use_attention_bias
        self.initializer_factor = initializer_factor
        self.decoder_fixing_codes = decoder_fixing_codes

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Initialise the parent exactly once, forwarding the special token ids explicitly along with any
        # remaining keyword arguments.
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a decoder configuration from a pretrained model name or path.

        If the loaded config dict has `"model_type" == "clvp"`, its `"decoder_config"` entry is used instead of
        the top-level dict. A warning is logged when the resulting `"model_type"` differs from `cls.model_type`.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Passed unchanged to `cls.get_config_dict`.
            **kwargs:
                Passed to `cls.get_config_dict`; the kwargs it returns are then passed to `cls.from_dict`.

        Returns:
            The object returned by `cls.from_dict`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite "clvp" config nests the decoder settings under "decoder_config".
        if config_dict.get("model_type") == "clvp":
            config_dict = config_dict["decoder_config"]

        # Warn (but do not fail) when the stored model type does not match this class.
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
# `ClvpConfig` 是存储 [`ClvpModelForConditionalGeneration`] 配置的类。
# 该配置类用于实例化 CLVP 模型，定义文本模型、语音模型和解码器模型的配置。
# 使用默认参数实例化配置对象将生成类似于 CLVP [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) 架构的配置。
# 配置对象继承自 [`PretrainedConfig`]，用于控制模型输出。更多信息请参阅 [`PretrainedConfig`] 的文档。

class ClvpConfig(PretrainedConfig):
    r"""
    Composite configuration holding the text encoder, speech encoder and decoder configurations of a CLVP model,
    plus the projection and logit-scale settings.

    Each sub-configuration is given as a dict of keyword arguments; when one is `None` an empty dict is used, so the
    corresponding sub-config class is built with its own defaults.
    """

    model_type = "clvp"
    is_composition = True

    def __init__(
        self,
        text_config=None,
        speech_config=None,
        decoder_config=None,
        projection_dim=768,
        logit_scale_init_value=2.6592,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fall back to empty dicts (i.e. sub-config defaults) for anything not supplied.
        if text_config is None:
            logger.info("`text_config` is `None`. Initializing the `ClvpEncoderConfig` with default values.")
            text_config = {}

        if speech_config is None:
            logger.info("`speech_config` is `None`. initializing the `ClvpEncoderConfig` with default values.")
            speech_config = {}

        if decoder_config is None:
            logger.info("`decoder_config` is `None`. initializing the `ClvpDecoderConfig` with default values.")
            decoder_config = {}

        # Scalar settings of the combined model.
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = initializer_factor

        # Both encoders share the same configuration class; the decoder has its own.
        self.text_config = ClvpEncoderConfig(**text_config)
        self.speech_config = ClvpEncoderConfig(**speech_config)
        self.decoder_config = ClvpDecoderConfig(**decoder_config)

    @classmethod
    def from_sub_model_configs(
        cls,
        text_config: ClvpEncoderConfig,
        speech_config: ClvpEncoderConfig,
        decoder_config: ClvpDecoderConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`ClvpConfig`] (or a derived class) from already-built sub-model configuration objects.

        Args:
            text_config (`ClvpEncoderConfig`):
                Text model configuration of type [`ClvpEncoderConfig`].
            speech_config (`ClvpEncoderConfig`):
                Speech model configuration of type [`ClvpEncoderConfig`].
            decoder_config (`ClvpDecoderConfig`):
                Decoder model configuration of type [`ClvpDecoderConfig`].

        Returns:
            [`ClvpConfig`]: An instance of a configuration object
        """
        # The constructor expects plain dicts, so serialise each sub-config first.
        text_dict = text_config.to_dict()
        speech_dict = speech_config.to_dict()
        decoder_dict = decoder_config.to_dict()

        return cls(text_config=text_dict, speech_config=speech_dict, decoder_config=decoder_dict, **kwargs)

`.\models\clvp\convert_clvp_to_hf.py`

# 设置编码方式为 UTF-8，确保脚本可以处理各种字符集
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
CLVP权重转换脚本
"""

import argparse  # 导入处理命令行参数的模块
import os  # 导入操作系统功能的模块

import torch  # 导入PyTorch库
from huggingface_hub import hf_hub_download  # 从Hugging Face Hub下载模块

from transformers import ClvpConfig, ClvpModelForConditionalGeneration  # 导入CLVP模型相关组件


# Checkpoint name -> URL of the original checkpoint file. `convert_clvp_weights` uses the last URL path
# component as the local file name and downloads the file when it is missing.
_MODELS = {
    "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth",
    "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth",
}

# NOTE(review): `dim` and `sub_dim` are not referenced in the visible part of this script; presumably they are
# used by the decoder conversion, whose body is not shown here.
dim = 1024
sub_dim = dim // 16  # 64

# Substring -> replacement pairs applied to encoder weight names, in insertion order, by
# `convert_encoder_weights`. Order matters: e.g. "1.wrap.net.0" must be handled before its prefix "1.wrap".
CLVP_ENCODERS_MAPPING = {
    "text_transformer.transformer.attn_layers": "text_encoder_model",
    "speech_transformer.transformer.attn_layers": "speech_encoder_model",
    "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm",
    "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm",
    "to_text_latent": "text_encoder_model.projection",
    "to_speech_latent": "speech_encoder_model.projection",
    "text_emb": "text_encoder_model.token_embedding",
    "speech_emb": "speech_encoder_model.token_embedding",
    "1.wrap.net.0": "mlp.fc1",
    "1.wrap.net.3": "mlp.fc2",
    "1.wrap": "self_attn",
    "to_out": "out_proj",
    "to_q": "q_proj",
    "to_k": "k_proj",
    "to_v": "v_proj",
    "temperature": "logit_scale",
}

# Substring -> replacement pairs for decoder weight names.
# NOTE(review): not referenced in the visible code; presumably consumed by `convert_decoder_weights` — confirm.
CLVP_DECODER_MAPPING = {
    "conditioning_encoder.init": "conditioning_encoder.mel_conv",
    "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks",
    "mel_attn_blocks": "group_norms",
    ".norm.weight": ".weight",
    ".norm.bias": ".bias",
    "text_embedding": "conditioning_encoder.text_token_embedding",
    "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding",
    "final_norm": "speech_decoder_model.final_norm",
    "mel_head": "speech_decoder_model.lm_head",
    "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm",
    "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer",
    "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer",
    "gpt.h": "speech_decoder_model.model.decoder.layers",
    "ln_1": "input_layernorm",
    "ln_2": "post_attention_layernorm",
}


def update_index(present_index):
    """
    Return `present_index` halved, rounding down.

    Consecutive pairs of indices collapse onto one value (0 and 1 -> 0, 2 and 3 -> 1, ...).
    `convert_encoder_weights` uses this to renumber `transformer.attn_layers.layers.<N>` entries.

    Integer floor division is used instead of float division followed by `int()`, so the result stays exact
    for arbitrarily large integers and the even/odd cases need no separate branches.

    Args:
        present_index (`int`): Index in the original layer numbering.

    Returns:
        `int`: `present_index // 2`.
    """
    return present_index // 2


def convert_encoder_weights(original_weights):
    """
    Rename the encoder weights of the original checkpoint.

    Keys are processed in sorted order. Each key goes through three renaming steps: RMS-norm gains ("0.0.g"),
    layer renumbering via `update_index`, and the substring table `CLVP_ENCODERS_MAPPING`.

    Note that entries are moved, not copied: `original_weights` is emptied as the function runs.

    Args:
        original_weights (`dict`): Weight name -> value mapping of the original checkpoint.

    Returns:
        `dict`: The same values stored under their new names.
    """
    renamed = {}

    # Snapshot the sorted keys first, since entries are popped from the dict while iterating.
    for source_name in sorted(original_weights.keys()):
        target_name = source_name

        # Norm gains: the parity of the layer index (5th dot-separated field) selects which norm it becomes.
        if "0.0.g" in target_name:
            layer_position = int(target_name.split(".")[4])
            norm_name = "input_rmsnorm.weight" if layer_position % 2 == 0 else "post_attention_rmsnorm.weight"
            target_name = target_name.replace("0.0.g", norm_name)

        # Renumber the layer index so that each pair of original entries maps to one layer.
        if "transformer.attn_layers.layers" in target_name:
            raw_index = target_name.split(".")[4]
            merged_index = update_index(int(raw_index))
            target_name = target_name.replace(
                f"transformer.attn_layers.layers.{raw_index}", f"transformer.attn_layers.layers.{merged_index}"
            )

        # Apply the substring table in its insertion order; a fragment that is absent leaves the name untouched.
        for old_fragment, new_fragment in CLVP_ENCODERS_MAPPING.items():
            target_name = target_name.replace(old_fragment, new_fragment)

        renamed[target_name] = original_weights.pop(source_name)

    return renamed
# 定义一个函数，用于将原始权重转换为新的权重格式
def convert_decoder_weights(original_weights):
    """
    Convert the decoder weights of the original checkpoint to the new naming.

    NOTE(review): the renaming loop appears to be missing from this excerpt. As written, the function only
    computes the sorted key list and returns an empty dict.

    Args:
        original_weights (`dict`): Weight name -> value mapping of the original decoder checkpoint.

    Returns:
        `dict`: The converted weights (empty as the code stands here).
    """
    # Container for the converted weights.
    converted_weights = {}
    # Sorted list of the original weight names (unused in the visible code).
    original_weights_keys = sorted(original_weights.keys())
    # Return the converted weights.
    return converted_weights


# 定义一个私有函数，用于从指定 URL 下载文件到指定路径
def _download(url: str, root: str):
    """
    Download one checkpoint file from the Hugging Face Hub.

    The repository id is taken from the 4th and 5th "/"-separated components of `url`, and the file name within
    the repository from its last two components.

    Args:
        url (`str`): Full URL of the file on the Hub.
        root (`str`): Passed to `hf_hub_download` as `force_filename`.
    """
    # Split once and reuse the pieces for both the repo id and the file name.
    url_parts = url.split("/")
    hf_hub_download(
        repo_id=f"{url_parts[3]}/{url_parts[4]}",
        filename=f"{url_parts[-2]}/{url_parts[-1]}",
        force_filename=root,
        local_dir_use_symlinks=False,
    )


# 定义一个函数，用于转换 CLVP 模型的权重格式
def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path):
    """
    Convert the original CLVP checkpoints and save the resulting model.

    For every entry of `_MODELS`, the checkpoint file is looked up in `checkpoint_path` (downloading it when
    absent) and loaded on CPU. The encoder and decoder weights are then renamed, loaded strictly into a freshly
    built `ClvpModelForConditionalGeneration`, and saved to `pytorch_dump_folder_path`.

    Args:
        checkpoint_path: Folder containing (or receiving) the original checkpoint files.
        pytorch_dump_folder_path: Folder passed to `save_pretrained`.
    """
    for model_name, model_url in _MODELS.items():
        # The local file is named after the last path component of the URL.
        local_file = os.path.join(checkpoint_path, model_url.split("/")[-1])
        if not os.path.exists(local_file):
            print(f"\n{model_name} was not found! Downloading it to {local_file}")
            _download(url=model_url, root=local_file)

        state_dict = torch.load(local_file, map_location="cpu")
        if model_name == "clvp":
            clvp_checkpoint = state_dict
        else:
            decoder_checkpoint = state_dict

    # Merge the renamed encoder and decoder weights into one state dict.
    merged_state_dict = {}
    merged_state_dict.update(**convert_encoder_weights(clvp_checkpoint))
    merged_state_dict.update(**convert_decoder_weights(decoder_checkpoint))

    # Build the target model from the reference configuration and load the weights strictly.
    hf_config = ClvpConfig.from_pretrained("susnato/clvp_dev")
    hf_model = ClvpModelForConditionalGeneration(hf_config)
    hf_model.load_state_dict(merged_state_dict, strict=True)

    hf_model.save_pretrained(pytorch_dump_folder_path)
    print(f"Model saved at {pytorch_dump_folder_path}!")


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    # Build the command-line parser.
    parser = argparse.ArgumentParser()
    # Folder holding the downloaded original checkpoints. The argument is not marked `required`.
    parser.add_argument(
        "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)"
    )
    # Output folder for the converted model; defaults to None.
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model. (Please enter full path)",
    )
    # Parse the command-line arguments.
    args = parser.parse_args()

    # Convert the CLVP weights and save the resulting model.
    convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path)

`.\models\clvp\feature_extraction_clvp.py`

# coding=utf-8
# 定义了文件编码格式为 UTF-8

# 版权声明，声明代码版权归 HuggingFace Inc. 团队所有
# 根据 Apache 许可证版本 2.0 发布，除非符合许可证规定，否则不得使用此文件
# 可以在以下网址获取许可证的副本：http://www.apache.org/licenses/LICENSE-2.0

# 如果适用法律要求或书面同意，以“原样”方式分发本软件，不提供任何形式的担保或条件
# 请参阅许可证了解具体的法律条文及其约束
# 此处导入需要的模块和类
"""
Feature extractor class for CLVP
"""

# 导入必要的模块
from typing import List, Optional, Union

# 导入 NumPy 库，用于处理数值计算
import numpy as np

# 导入音频相关的工具函数
from ...audio_utils import mel_filter_bank, spectrogram, window_function

# 导入特征提取的序列工具函数
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor

# 导入特征提取的批处理功能
from ...feature_extraction_utils import BatchFeature

# 导入自定义的张量类型和日志记录工具
from ...utils import TensorType, logging

# 获取当前文件的日志记录器
logger = logging.get_logger(__name__)


# 定义 CLVP 特征提取器类，继承自 SequenceFeatureExtractor 类
class ClvpFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a CLVP feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
    Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 22050):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        default_audio_length (`int`, *optional*, defaults to 6):
            The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
            automatically be set to default_audio_length * `self.sampling_rate`.
        hop_length (`int`, *optional*, defaults to 256):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 1024):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        mel_norms (`list` of length `feature_size`, *optional*):
            If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
            mel-filter.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether to return the attention mask. If left to the default, it will return the attention mask.

            [What are attention masks?](../glossary#attention-mask)
    """
    # 定义模型输入的名称，包括输入特征和注意力掩码
    model_input_names = ["input_features", "attention_mask"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=22050,
        default_audio_length=6,
        hop_length=256,
        chunk_length=30,
        n_fft=1024,
        padding_value=0.0,
        mel_norms=None,
        return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
        **kwargs,
    ):
        """
        Store the extraction settings and precompute the mel filter bank.

        `feature_size`, `sampling_rate`, `padding_value`, `return_attention_mask` and any extra keyword arguments
        are forwarded to the parent class; the remaining arguments are kept as attributes of the same name.
        """
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )

        # Plain settings, stored under the argument names.
        self.sampling_rate = sampling_rate
        self.default_audio_length = default_audio_length
        self.mel_norms = mel_norms
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length

        # Derived sizes: samples in one chunk, and that sample count divided by the hop length.
        samples_per_chunk = chunk_length * sampling_rate
        self.n_samples = samples_per_chunk
        self.nb_max_frames = samples_per_chunk // hop_length

        # Mel filter bank over the 1 + n_fft // 2 frequency bins, limited to 0-8000 Hz.
        frequency_bins = 1 + (n_fft // 2)
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=frequency_bins,
            num_mel_filters=feature_size,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="htk",
        )
    def _np_extract_fbank_features(self, waveform: np.ndarray) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio, then normalize it along each mel filter if
        `mel_norms` is provided.

        Args:
            waveform (`np.ndarray`): Raw audio passed to `spectrogram`.

        Returns:
            `np.ndarray`: The natural-log mel spectrogram, divided per mel filter by `mel_norms` when set.
        """
        # Power mel spectrogram with a Hann window. `log_mel=None` keeps the values linear here, because the
        # logarithm is applied explicitly below.
        log_spec = spectrogram(
            waveform,
            window_function(self.n_fft, "hann"),
            frame_length=self.n_fft,
            hop_length=self.hop_length,
            power=2.0,
            mel_filters=self.mel_filters,
            log_mel=None,
        )

        # Clamp from below only (no upper bound) so the logarithm never sees values under 1e-5.
        log_spec = np.log(np.clip(log_spec, a_min=1e-5, a_max=None))

        # Optional per-filter normalisation; `[:, None]` turns `mel_norms` into a column so it broadcasts
        # along the second axis (assumes axis 0 indexes the mel filters — TODO confirm).
        if self.mel_norms is not None:
            log_spec = log_spec / np.array(self.mel_norms)[:, None]

        return log_spec

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        sampling_rate: Optional[int] = None,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = True,
        padding: Optional[str] = "max_length",
        max_length: Optional[int] = None,
        **kwargs,

`.\models\clvp\modeling_clvp.py`

# 导入必要的库和模块
import copy  # 导入 copy 模块用于复制对象
import math  # 导入 math 模块用于数学运算
from dataclasses import dataclass  # 导入 dataclass 用于定义数据类
from typing import Dict, Optional, Tuple, Union  # 导入类型提示相关模块

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 模块
from torch import nn  # 导入 PyTorch 的神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数映射
from ...generation import GenerationConfig  # 导入生成配置相关模块
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
# 导入注意力掩码相关函数
from ...modeling_outputs import (  # 导入模型输出相关类
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary  # 导入模型工具类和序列摘要类
from ...pytorch_utils import Conv1D  # 导入 PyTorch 的一维卷积类
from ...utils import (  # 导入工具函数和类
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_clvp import (  # 导入 CLVP 模型的配置类
    ClvpConfig,
    ClvpDecoderConfig,
    ClvpEncoderConfig,
)


logger = logging.get_logger(__name__)  # module-level logger

_CHECKPOINT_FOR_DOC = "susnato/clvp_dev"  # checkpoint name used in the documentation

CLVP_PRETRAINED_MODEL_ARCHIVE_LIST = [  # list of pretrained CLVP checkpoints
    "susnato/clvp_dev",
    # See all CLVP models at https://huggingface.co/models?filter=clvp
]


# 从 transformers.models.clip.modeling_clip.contrastive_loss 复制过来
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of `logits` where the target class of row `i` is `i`."""
    # Row i should score highest in column i, so the targets are simply 0..len(logits)-1.
    diagonal_targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, diagonal_targets)


# 从 transformers.models.clip.modeling_clip.clip_loss 复制过来，将函数名和变量名改为 clvp_loss 和 speech_loss
def clvp_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Average of the contrastive loss of `similarity` and of its transpose."""
    # One term per direction of the similarity matrix: rows as queries, then columns as queries.
    row_wise = contrastive_loss(similarity)
    column_wise = contrastive_loss(similarity.t())
    return (row_wise + column_wise) / 2.0


# 从 transformers.models.llama.modeling_llama.rotate_half 复制过来
def rotate_half(x):
    """Split the last dimension in two halves and return `(-second_half, first_half)` concatenated."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)


def apply_rotary_pos_emb(q, k, v, cos, sin, position_ids, unsqueeze_dim=1):
    """
    Apply the rotary position embedding to the query, key and value tensors.

    `cos` and `sin` are indexed with `position_ids` and unsqueezed at `unsqueeze_dim` before being combined
    with each tensor as `t * cos + rotate_half(t) * sin`.

    Returns:
        `tuple`: The rotated `(q, k, v)` tensors, in that order.
    """
    # Select the entries for the requested positions and add an axis so they broadcast against q, k and v.
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    # The same rotation is applied to all three tensors.
    return tuple((tensor * cos) + (rotate_half(tensor) * sin) for tensor in (q, k, v))
def _pad_extra_bos_eos_tokens(
    input_ids,
    attention_mask=None,
    pad_token_id=0,
    bos_token_id=255,
    eos_token_id=0,
    add_bos_token=True,
    add_eos_token=True,
):
    """
    This method adds extra bos and eos tokens to input_ids and accordingly modifies the attention_mask which is used in
    `ClvpConditioningEncoder` and the generation loop of the `ClvpModelForConditionalGeneration`.
    """

    # 在开头添加 bos token
    if add_bos_token:
        # 使用 torch 的函数在 input_ids 前面填充一个位置，值为 bos_token_id
        input_ids = torch.nn.functional.pad(input_ids, (1, 0), value=bos_token_id)
        # 如果有 attention_mask，则在开头填充一个有效位置，值为 1
        attention_mask = (
            torch.nn.functional.pad(attention_mask, (1, 0), value=1) if attention_mask is not None else attention_mask
        )

    # 创建 modified_input_ids 变量并初始化为 input_ids
    modified_input_ids = input_ids
    # 如果要添加 eos token
    if add_eos_token:
        # 根据 input_ids 的形状创建一个扩展后的 modified_input_ids
        modified_input_ids = torch.zeros(
            (input_ids.shape[0], input_ids.shape[1] + 1), dtype=input_ids.dtype, device=input_ids.device
        )
        # 遍历每个 input_id
        for i, each_input_id in enumerate(input_ids):
            # 找到有效 token 结束的位置，然后添加 eos token
            if torch.isin(each_input_id, pad_token_id).sum():
                # 找到第一个 pad_token_id 的位置
                pos = torch.where(each_input_id == pad_token_id)[0].min()
                # 在找到的位置前后添加 eos_token_id 构成新的 modified_input_ids
                modified_input_ids[i] = torch.concatenate(
                    [each_input_id[:pos], torch.tensor([eos_token_id], device=input_ids.device), each_input_id[pos:]]
                )
            else:
                # 如果没有 pad tokens，则在结尾添加 eos token
                modified_input_ids[i] = torch.nn.functional.pad(each_input_id, (0, 1), value=eos_token_id)
        # 如果有 attention_mask，则在开头填充一个有效位置，值为 1
        attention_mask = (
            torch.nn.functional.pad(attention_mask, (1, 0), value=1) if attention_mask is not None else attention_mask
        )

    # 返回修改后的 input_ids 和 attention_mask
    return modified_input_ids, attention_mask
    # `embeds` 是一个可选参数，表示模型应用投影层到汇聚输出后得到的嵌入向量。
    embeds: Optional[torch.FloatTensor] = None
    
    # `last_hidden_state` 是必须的参数，表示模型最后一层的隐藏状态。
    last_hidden_state: torch.FloatTensor = None
    
    # `pooler_output` 是一个可选参数，表示经过汇聚层处理后得到的汇聚输出。
    pooler_output: Optional[torch.FloatTensor] = None
    
    # `hidden_states` 是一个可选参数，是一个元组，包含模型每一层的隐藏状态输出。
    # 如果模型有嵌入层，则包含嵌入层的输出，形状为 `(batch_size, sequence_length, hidden_size)`。
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    
    # `attentions` 是一个可选参数，是一个元组，包含模型每一层的注意力权重。
    # 每个元素的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`，
    # 表示经过注意力 softmax 后的注意力权重，用于计算自注意力头的加权平均值。
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ClvpOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for speech-text similarity.
        speech_ids (`torch.LongTensor`, *optional*):
            speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
        logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
            The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
            The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
            model.
        speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
            model.
        text_model_output (`BaseModelOutputWithPooling`):
            The pooled output of the `last_hidden_state` of the text encoder Model.
        speech_model_output (`BaseModelOutputWithPooling`):
            The pooled output of the `last_hidden_state` of the speech encoder Model.
        decoder_hidden_states (`torch.FloatTensor`, *optional*):
            The hidden states of the decoder model.
        text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
            The hidden states of the text encoder model.
        speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
            The hidden states of the speech encoder model.
    """

    loss: Optional[torch.FloatTensor] = None  # contrastive loss for speech-text similarity
    speech_ids: Optional[torch.LongTensor] = None  # speech ids (or candidates) generated by `ClvpForCausalLM`
    logits_per_speech: torch.FloatTensor = None  # scaled dot products of `speech_embeds` with `text_embeds`
    logits_per_text: torch.FloatTensor = None  # scaled dot products of `text_embeds` with `speech_embeds`
    text_embeds: torch.FloatTensor = None  # projected pooled output of the text encoder
    speech_embeds: torch.FloatTensor = None  # projected pooled output of the speech encoder
    text_model_output: BaseModelOutputWithPooling = None  # pooled output of the text encoder's last hidden state
    speech_model_output: BaseModelOutputWithPooling = None  # pooled output of the speech encoder's last hidden state
    decoder_hidden_states: torch.FloatTensor = None  # hidden states of the decoder model
    text_encoder_hidden_states: torch.FloatTensor = None  # hidden states of the text encoder model
    speech_encoder_hidden_states: torch.FloatTensor = None  # hidden states of the speech encoder model


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Clvp
class ClvpRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned per-feature gain."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        ClvpRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: size of the last (feature) dimension; one gain value is learned per feature.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.variance_epsilon = eps
        # The gain starts at one, so the freshly built layer only rescales by the RMS.
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize `hidden_states` by its root mean square along the last axis and apply the gain."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 and the result is cast back before the gain is applied.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)
class ClvpRotaryPositionalEmbedding(nn.Module):
    """
    Rotary Position Embedding Class for CLVP. It was proposed in the paper 'ROFORMER: ENHANCED TRANSFORMER WITH ROTARY
    POSITION EMBEDDING', Please see https://arxiv.org/pdf/2104.09864v1.pdf .

    The module returns a table of rotation angles of shape `(1, sequence_length, dim)` and keeps the table computed
    for the most recently seen sequence length so that it can be reused.
    """

    def __init__(self, config):
        super().__init__()
        # Rotary width: derived from the projection size and head count, but never smaller than 32.
        rotary_dim = max(config.projection_dim // (config.num_attention_heads * 2), 32)
        exponents = torch.arange(0, rotary_dim, 2, dtype=torch.int64).float() / rotary_dim

        # Inverse frequencies live in a buffer so that they follow the module across devices.
        self.register_buffer("inv_freq", 1.0 / (10000**exponents))
        self.cached_sequence_length = None
        self.cached_rotary_positional_embedding = None

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Return the angle table for the sequence length found in dimension 1 of `hidden_states`."""
        sequence_length = hidden_states.shape[1]

        cache_hit = (
            self.cached_rotary_positional_embedding is not None
            and sequence_length == self.cached_sequence_length
        )
        if cache_hit:
            return self.cached_rotary_positional_embedding

        self.cached_sequence_length = sequence_length
        positions = torch.arange(sequence_length, device=hidden_states.device).type_as(self.inv_freq)
        # Outer product position x inverse frequency, written with broadcasting.
        angles = positions[:, None] * self.inv_freq[None, :]
        # The angles are duplicated along the last axis and a leading batch axis of size one is added.
        table = torch.cat((angles, angles), dim=-1).unsqueeze(0)

        self.cached_rotary_positional_embedding = table
        return self.cached_rotary_positional_embedding


class ClvpSelfAttention(nn.Module):
    """
    Multi-headed attention to combine Absolute and Rotary Positional Embeddings into a single Attention module.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        if hasattr(config, "max_position_embeddings"):
            max_positions = config.max_position_embeddings
            # Create a triangular bias matrix for masking future positions
            bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
            bias = bias.view(1, 1, max_positions, max_positions)
            # Register the bias matrix as a non-persistent buffer
            self.register_buffer("bias", bias, persistent=False)

        # Projection layers for query, key, and value
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
        # Output projection layer
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention._shape
    # Private helper `_shape`: reshapes the input tensor into the per-head layout used by the attention heads
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 调整张量的形状为 (batch_size, seq_len, num_heads, head_dim)，并交换维度 1 和 2
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 定义前向传播方法 `forward`
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        rotary_pos_emb: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        use_cache: Optional[bool] = False,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    """
    This class defines an encoder layer for the CLVP model, comprising self-attention mechanism and MLP for processing hidden states.
    """

    def __init__(self, config: ClvpConfig):
        """
        Create the sub-modules of one encoder layer: self-attention, MLP and the two RMS norms.

        Args:
            config: configuration object; `hidden_size` and `layer_norm_eps` are read here and the whole object is
                forwarded to the attention and MLP sub-modules.
        """
        super().__init__()
        self.config = config
        hidden_dim = config.hidden_size
        self.embed_dim = hidden_dim

        # Sub-modules are created in the same order as before so parameter registration order is unchanged.
        self.self_attn = ClvpSelfAttention(config)
        self.mlp = ClvpEncoderMLP(config)

        # One norm is applied to the attention input, the other to the MLP input.
        norm_eps = config.layer_norm_eps
        self.input_rmsnorm = ClvpRMSNorm(hidden_dim, eps=norm_eps)
        self.post_attention_rmsnorm = ClvpRMSNorm(hidden_dim, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        rotary_pos_emb: torch.FloatTensor,
        attention_mask: torch.LongTensor,
        position_ids: torch.LongTensor,
        output_attentions: Optional[bool] = False,
    ) -> torch.FloatTensor:
        """
        Run normalization, self-attention, normalization and the MLP in sequence (no residual connections).

        NOTE(review): another `forward` is defined further down at the same indentation; if both belong to the same
        class body, the later definition replaces this one.

        Args:
            hidden_states: input to the layer.
            rotary_pos_emb: rotary position embeddings forwarded to the attention module.
            attention_mask: attention mask forwarded to the attention module.
            position_ids: position ids forwarded to the attention module.
            output_attentions: forwarded to the attention module.

        Returns:
            The hidden states produced by the MLP.
        """
        # Apply the input RMS norm.
        hidden_states = self.input_rmsnorm(hidden_states)
        # Arguments are passed by keyword: the attention module takes `past_key_value` as its fifth positional
        # parameter, so a positional `output_attentions` would be bound to the wrong argument.
        attention_outputs = self.self_attn(
            hidden_states=hidden_states,
            rotary_pos_emb=rotary_pos_emb,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
        )
        # The attended hidden states are the first element of the attention output.
        hidden_states = attention_outputs[0]
        # Normalize again and run the MLP.
        hidden_states = self.post_attention_rmsnorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        return hidden_states
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        rotary_pos_emb: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        position_ids: torch.LongTensor,
        output_attentions: bool = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, embed_dim)`):
                input to the layer.
            rotary_pos_emb (`torch.FloatTensor`):
                rotary position embeddings generated by `ClvpRotaryPositionalEmbedding` module.
            attention_mask (`torch.FloatTensor` of shape `(batch, 1, tgt_len, src_len)`):
                attention mask where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor`):
                Denotes position ids of the input tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple whose first element is the layer output; when `output_attentions` is truthy, the last element of
            the attention module's output is appended.
        """
        # Keep the input for the first residual connection.
        residual = hidden_states

        # Pre-attention RMS normalization.
        hidden_states = self.input_rmsnorm(hidden_states)

        # Self-attention.
        attention_outputs = self.self_attn(
            hidden_states=hidden_states,
            rotary_pos_emb=rotary_pos_emb,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
        )

        # The attended hidden states are the first element of the attention output.
        hidden_states = attention_outputs[0]

        # First residual connection.
        hidden_states = residual + hidden_states

        # Keep the intermediate result for the second residual connection.
        residual = hidden_states

        # Post-attention RMS normalization followed by the MLP.
        hidden_states = self.post_attention_rmsnorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        # Second residual connection.
        hidden_states = residual + hidden_states

        # The result is always returned as a tuple.
        outputs = (hidden_states,)

        # Optionally expose the attention weights.
        if output_attentions:
            outputs += (attention_outputs[-1],)

        return outputs
# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->ClvpDecoderMLP
class ClvpDecoderMLP(nn.Module):
    """Feed-forward block of the decoder: expand, activate, project back, then apply dropout."""

    def __init__(self, intermediate_size, config):
        """
        Args:
            intermediate_size: width of the hidden expansion between the two projections.
            config: configuration object; reads `hidden_size`, `activation_function` and `resid_pdrop`.
        """
        super().__init__()
        hidden_dim = config.hidden_size
        # `Conv1D(out_features, in_features)`: the first projection expands hidden_dim -> intermediate_size,
        # the second one maps intermediate_size -> hidden_dim.
        self.c_fc = Conv1D(intermediate_size, hidden_dim)
        self.c_proj = Conv1D(hidden_dim, intermediate_size)
        # The activation is looked up by the name stored in the config.
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        """Apply `c_fc`, the activation, `c_proj` and dropout, in that order."""
        expanded = self.act(self.c_fc(hidden_states))
        return self.dropout(self.c_proj(expanded))


class ClvpDecoderLayer(nn.Module):
    """One pre-norm decoder layer: LayerNorm -> self-attention -> residual, then LayerNorm -> MLP -> residual."""

    def __init__(self, config):
        """
        Args:
            config: configuration object; reads `hidden_size`, `n_inner` and `layer_norm_epsilon`, and is forwarded
                to the attention and MLP sub-modules.
        """
        super().__init__()
        hidden_size = config.hidden_size
        # Fall back to four times the hidden size when the config leaves the MLP width unset.
        inner_dim = 4 * hidden_size if config.n_inner is None else config.n_inner
        norm_eps = config.layer_norm_epsilon

        # Sub-modules are created in the same order as before so parameter registration order is unchanged.
        self.input_layernorm = nn.LayerNorm(hidden_size, eps=norm_eps)
        self.attn = ClvpSelfAttention(config)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=norm_eps)
        self.mlp = ClvpDecoderMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        """
        Run the layer on `hidden_states`.

        Returns:
            A tuple starting with the layer output, followed by the remaining elements of the attention output. When
            `use_cache` is falsy, the first of those remaining elements is dropped.
        """
        # Attention sub-block on the normalized input; the un-normalized input is the residual.
        attn_outputs = self.attn(
            self.input_layernorm(hidden_states),
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output, extras = attn_outputs[0], attn_outputs[1:]
        after_attention = attn_output + hidden_states

        # Feed-forward sub-block, again with a residual connection around it.
        feed_forward_hidden_states = self.mlp(self.post_attention_layernorm(after_attention))
        hidden_states = after_attention + feed_forward_hidden_states

        if not use_cache:
            # Without caching, the first extra element of the attention output is not returned.
            extras = extras[1:]
        return (hidden_states,) + extras


class ClvpConditioningEncoder(nn.Module):
    """
    This class processes the log-mel spectrograms (extracted by the Feature Extractor) and text tokens (produced by the
    tokenizer).
    """
    # 在这里编写该类的其他逻辑和功能
    def __init__(self, config: ClvpConfig):
        """
        Create the text embeddings, the mel projection and the stack of GroupNorm + attention blocks.

        Args:
            config: top-level configuration; its `text_config` and `decoder_config` attributes are stored and used
                to size the layers.
        """
        super().__init__()

        # Keep both sub-configurations on the module.
        self.text_config = config.text_config
        self.decoder_config = config.decoder_config
        hidden_dim = self.decoder_config.hidden_size

        # Token embedding table of shape (text vocab size, decoder hidden size).
        self.text_token_embedding = nn.Embedding(self.text_config.vocab_size, hidden_dim)
        # Position embedding table of shape (max text tokens, decoder hidden size).
        self.text_position_embedding = nn.Embedding(self.decoder_config.max_text_tokens, hidden_dim)

        # Kernel-size-1 convolution mapping mel feature channels to the decoder hidden size.
        self.mel_conv = nn.Conv1d(self.decoder_config.feature_size, hidden_dim, kernel_size=1)

        # One GroupNorm per attention block; the group count is derived from the hidden size.
        num_groups = self.compute_groupnorm_groups(hidden_dim)
        num_blocks = self.decoder_config.num_mel_attn_blocks
        group_norm_layers = [nn.GroupNorm(num_groups, hidden_dim, eps=1e-5, affine=True) for _ in range(num_blocks)]
        self.group_norms = nn.ModuleList(group_norm_layers)

        # Self-attention blocks, one per GroupNorm above.
        attention_layers = [ClvpSelfAttention(self.decoder_config) for _ in range(num_blocks)]
        self.mel_attn_blocks = nn.ModuleList(attention_layers)

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False

    def compute_groupnorm_groups(self, channels: int, groups: int = 32):
        """
        Calculates the value of `num_groups` for nn.GroupNorm. This logic is taken from the official tortoise
        repository. link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/models/arch_util.py#L26

        Args:
            channels: number of channels that the GroupNorm layer will normalize.
            groups: starting number of groups; it is overridden for `channels <= 64`.

        Returns:
            The largest value obtained by repeatedly halving the starting group count that divides `channels`.

        Raises:
            ValueError: if the resulting number of groups is 2 or less.
        """
        # Small channel counts start from a smaller group count.
        if channels <= 16:
            groups = 8
        elif channels <= 64:
            groups = 16

        # Halve the group count until it divides the channel count (1 always does, so the loop terminates).
        while channels % groups != 0:
            groups //= 2

        if groups <= 2:
            raise ValueError(
                f"Number of groups for the GroupNorm must be greater than 2, but it is {groups}. "
                "Please consider using a different `hidden_size`"
            )

        return groups

    def forward(
        self,
        input_features: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this family of models.
    config_class = ClvpConfig
    # Prefix used for the base model's attribute name.
    base_model_prefix = "clvp"
    # Declares that gradient checkpointing is supported.
    supports_gradient_checkpointing = True
    # Key name that is skipped during device placement.
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        # 获取初始化因子
        factor = self.config.initializer_factor
        # 如果是 Embedding 层，使用正态分布初始化权重
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        # 如果是 Linear 或 Conv1D 层，同时初始化权重和偏置
        elif isinstance(module, (nn.Linear, Conv1D, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果是 ClvpEncoderMLP 层，根据不同的层进行不同的初始化方式
        elif isinstance(module, ClvpEncoderMLP):
            # 计算输入投影的标准差和全连接层的标准差
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            # 使用正态分布初始化权重
            nn.init.normal_(module.fc1.proj.weight if getattr(module.fc1, "proj") else module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        # 如果是 ClvpEncoder 层，根据配置初始化权重
        elif isinstance(module, ClvpEncoder):
            config = self.config.text_config if hasattr(self.config, "text_config") else self.config
            factor = config.initializer_factor
            module.projection.weight.data.normal_(mean=0.0, std=factor * (config.hidden_size**-0.5))
        # 如果是 ClvpConditioningEncoder 层，使用正态分布初始化权重和偏置
        elif isinstance(module, ClvpConditioningEncoder):
            module.mel_conv.weight.data.normal_(mean=0.0, std=factor)
            module.mel_conv.bias.data.zero_()
        # 如果是 ClvpForCausalLM 层，根据名称初始化特定参数的权重
        elif isinstance(module, ClvpForCausalLM):
            for name, p in module.named_parameters():
                if name == "c_proj.weight":
                    p.data.normal_(
                        mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers))
                    )
        # 如果是 LayerNorm 层，初始化偏置为零，权重为1
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Shared introduction that is prepended to the docstrings of the CLVP model classes.
CLVP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ClvpConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Define the CLVP_INPUTS_DOCSTRING constant, which holds the documentation of the input arguments
CLVP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, time_dim)`):
            Indicates log mel-spectrogram representations for audio returned by [`ClvpFeatureExtractor`].
        conditioning_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
            inputs_embeds for `ClvpConditioningEncoder`. Can be used in place of `input_ids`.
        text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
            inputs_embeds for the text encoder model passed in place of `input_ids`.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding text token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Define the CLVP_DECODER_INPUTS_DOCSTRING constant; it contains no documentation text for now
CLVP_DECODER_INPUTS_DOCSTRING = r"""
"""


class ClvpEncoder(ClvpPreTrainedModel):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`ClvpEncoderLayer`].

    Args:
        config: ClvpConfig
    """
    # 初始化函数，接受一个配置对象作为参数
    def __init__(self, config: ClvpConfig):
        """
        Create the token embedding, optional rotary embedding, encoder layers, summary head, final norm and
        projection.

        Args:
            config: configuration object; reads `vocab_size`, `hidden_size`, `use_rotary_embedding`,
                `num_hidden_layers`, `layer_norm_eps` and `projection_dim`.
        """
        super().__init__(config)

        self.config = config
        hidden_dim = config.hidden_size

        # Token embedding table of shape (vocab_size, hidden_size).
        self.token_embedding = nn.Embedding(config.vocab_size, hidden_dim)

        # The rotary embedding module only exists when the config enables it.
        if config.use_rotary_embedding:
            self.rotary_pos_emb = ClvpRotaryPositionalEmbedding(config)
        else:
            self.rotary_pos_emb = None

        # Stack of `num_hidden_layers` encoder layers.
        self.layers = nn.ModuleList([ClvpEncoderLayer(config) for _ in range(config.num_hidden_layers)])

        # Sequence summary module built from the config.
        self.sequence_summary = SequenceSummary(config)

        # Final layer normalization over the hidden size.
        self.final_layer_norm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)

        # Bias-free linear projection from the hidden size to the projection dimension.
        self.projection = nn.Linear(hidden_dim, config.projection_dim, bias=False)

        self.gradient_checkpointing = False

        # Run the post-initialization hook.
        self.post_init()

    # 返回token_embedding属性，即词嵌入层对象
    def get_input_embeddings(self):
        """Return the token embedding layer stored in `self.token_embedding`."""
        return self.token_embedding

    # 设置token_embedding属性为指定的值
    def set_input_embeddings(self, value):
        """Replace the token embedding layer (`self.token_embedding`) with `value`."""
        self.token_embedding = value

    # 前向传播方法
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
class ClvpDecoder(ClvpPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ClvpDecoderLayer`]
    """

    def __init__(self, config):
        super().__init__(config)

        # Keep the configuration on the instance.
        self.config = config
        hidden_dim = config.hidden_size

        # Token embedding table of shape (vocab_size, hidden_size).
        self.input_embeds_layer = nn.Embedding(config.vocab_size, hidden_dim)
        # Position embedding table of shape (max_position_embeddings, hidden_size).
        self.position_embeds_layer = nn.Embedding(config.max_position_embeddings, hidden_dim)

        # Dropout layer using the embedding dropout probability from the config.
        self.drop = nn.Dropout(config.embd_pdrop)
        # Stack of `num_hidden_layers` decoder layers.
        self.layers = nn.ModuleList([ClvpDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Layer normalization over the hidden size.
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding layer with `new_embeddings`."""
        self.input_embeds_layer = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer_index, head_indices in heads_to_prune.items():
            self.layers[layer_index].attn.prune_heads(head_indices)

    @add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Performs forward pass of the decoder model.

        Args:
            input_ids: Optionally provided input IDs.
            attention_mask: Optionally provided attention mask.
            token_type_ids: Optionally provided token type IDs.
            position_ids: Optionally provided position IDs.
            head_mask: Optionally provided head mask.
            past_key_values: Optionally provided past key values.
            inputs_embeds: Optionally provided input embeddings.
            use_cache: Optionally use cache.
            output_attentions: Optionally output attentions.
            output_hidden_states: Optionally output hidden states.
            return_dict: Optionally return as dictionary.

        Returns:
            Model output.
        """
        # NOTE(review): only the declaration is present in this file; the body is a stub that returns None.
        return None

@add_start_docstrings(
    "The bare Clvp decoder model outputting raw hidden-states without any specific head on top.",
    CLVP_START_DOCSTRING,
)
class ClvpModel(ClvpPreTrainedModel):
    def __init__(self, config: ClvpDecoderConfig):
        super().__init__(config)
        # Keep the configuration and build the wrapped decoder from it.
        self.config = config
        self.decoder = ClvpDecoder(self.config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding layer."""
        return self.decoder.input_embeds_layer

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding layer with `value`."""
        self.decoder.input_embeds_layer = value

    def get_decoder(self):
        """Return the wrapped decoder module."""
        return self.decoder

    @add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        # Every flag left as None falls back to the corresponding value stored in the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            # Re-wrap the decoder result field by field in the documented output class.
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=decoder_outputs.last_hidden_state,
                past_key_values=decoder_outputs.past_key_values,
                hidden_states=decoder_outputs.hidden_states,
                attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
            )

        # Tuple mode: hand the decoder output back unchanged.
        return decoder_outputs
@add_start_docstrings(
    "The CLVP decoder model with a language modelling head on top.",
    CLVP_START_DOCSTRING,
)
class ClvpForCausalLM(ClvpPreTrainedModel):
    # CLVPForCausalLM 类的构造函数，初始化模型配置和相关组件
    def __init__(self, config):
        """
        Build the causal-LM wrapper: a `ClvpModel` backbone, a final layer norm and a linear LM head.

        Args:
            config: Model configuration. `hidden_size` sizes the layer norm and the head input,
                `vocab_size` sizes the head output.
        """
        super().__init__(config)
        self.config = config

        # Submodules are created in the same order as before (backbone, norm, head).
        self.model = ClvpModel(self.config)

        hidden_size = self.config.hidden_size
        self.final_norm = nn.LayerNorm(hidden_size)
        self.lm_head = nn.Linear(hidden_size, self.config.vocab_size, bias=True)

        # Base-class hook run once all submodules exist (weight init / final processing).
        self.post_init()

    # 返回模型解码器的输入嵌入层
    def get_input_embeddings(self):
        return self.model.decoder.input_embeds_layer

    # 设置模型解码器的输入嵌入层
    def set_input_embeddings(self, new_embeddings):
        self.model.decoder.input_embeds_layer = new_embeddings

    # 辅助方法：准备模型的输入，接受输入张量、开始词标识符和模型关键字参数
    def _prepare_model_inputs(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ):
        # NOTE(review): the body is elided (`...`) in this excerpt, so as written the method does
        # nothing and returns None. Presumably the full implementation builds the generation inputs
        # from `inputs`, `bos_token_id` and `model_kwargs` -- confirm against the complete file.
        ...

    # 生成推断过程的输入准备方法，接受输入的标识符、过去的键值对、输入嵌入和条件嵌入等参数
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, conditioning_embeds=None, **kwargs
    ):
        """
        Build the keyword arguments handed to the model for one generation step.

        Args:
            input_ids: Token ids generated so far. Sliced as `input_ids[:, k:]`, so it needs at least
                two dimensions.
            past_key_values: Optional cache. When truthy, `past_key_values[0][0].shape[2]` is taken as the
                number of already-cached positions and `input_ids` is cut down to the uncached suffix.
            inputs_embeds: Optional embeddings, forwarded in place of `input_ids` only when
                `past_key_values` is None.
            conditioning_embeds: When given together with a non-None `past_key_values`, `position_ids` is
                overridden with a one-element tensor holding the un-truncated length of `input_ids`.
            **kwargs: `token_type_ids`, `attention_mask`, `position_ids` and `use_cache` are read; any
                other key is ignored.

        Returns:
            dict: `input_ids` or `inputs_embeds`, plus `past_key_values`, `use_cache`, `position_ids` and
            `token_type_ids`.
        """
        # Length before any truncation; used below when conditioning embeddings are present.
        input_ids_length = input_ids.shape[-1]
        token_type_ids = kwargs.get("token_type_ids", None)

        if past_key_values:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input id.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default behaviour: keep only the final id.
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]
            if token_type_ids is not None:
                # Keep token types aligned with the truncated ids.
                token_type_ids = token_type_ids[:, -input_ids.shape[1] :]

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # Derive positions from the mask so padded batches get consistent positions.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            # NOTE: caller-supplied `position_ids` are discarded here (existing behaviour, kept as is).
            position_ids = None

        if conditioning_embeds is not None and past_key_values is not None:
            position_ids = torch.tensor([input_ids_length], dtype=torch.long, device=input_ids.device)

        # Embeddings are only usable on the first step, i.e. while there is no cache yet.
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "token_type_ids": token_type_ids,
            }
        )
        return model_inputs

    # 添加预定义的文档字符串到模型的前向方法
    @add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. The labels **are shifted** inside the model, so passing
            `labels = input_ids` is fine. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels
            set to `-100` are ignored (masked); the loss is only computed for labels in
            `[0, ..., config.vocab_size]`.
        """
        # Every flag the caller left as None falls back to the model configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the backbone output holds the hidden states: normalize, then project to the vocabulary.
        lm_logits = self.lm_head(self.final_norm(backbone_outputs[0]))

        loss = None
        if labels is not None:
            # Keep the labels on the same device as the logits.
            labels = labels.to(lm_logits.device)
            # Position t predicts token t + 1: drop the last logit and the first label.
            shifted_logits = lm_logits[..., :-1, :].contiguous()
            shifted_labels = labels[..., 1:].contiguous()
            flat_logits = shifted_logits.view(-1, shifted_logits.size(-1))
            flat_labels = shifted_labels.view(-1)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=lm_logits,
                past_key_values=backbone_outputs.past_key_values,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
                cross_attentions=backbone_outputs.cross_attentions,
            )

        # Tuple form: logits replace the hidden states, and the loss is prepended only when it was computed.
        output = (lm_logits,) + backbone_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    @staticmethod
    # 从 GPT2LMHeadModel._reorder_cache 复制过来的静态方法
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        # 返回一个元组的元组，每个元组包含重新排序后的 `past_key_values` 中的每一层的状态
        return tuple(
            # 对于 `past_key_values` 中的每一层的状态，使用 `beam_idx` 来重新选择对应的状态，并移到相应设备上
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            # 对于 `past_key_values` 中的每一层，进行上述操作并组成元组
            for layer_past in past_key_values
        )
# 为 CLVP 生成条件生成模型的类声明文档字符串，描述其包含文本编码器、语音编码器和语音解码器模型的结构和功能
@add_start_docstrings(
    "The composite CLVP model with a text encoder, speech encoder and speech decoder model."
    "The speech decoder model generates the speech_ids from the text and the text encoder and speech encoder works"
    "together to filter out the best speech_ids.",
    CLVP_START_DOCSTRING,
)
class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
    # 设置配置类为 ClvpConfig
    config_class = ClvpConfig

    # 初始化方法，接受一个 ClvpConfig 类型的参数 config
    def __init__(self, config: ClvpConfig):
        """
        Validate the three sub-configurations and build the conditioning encoder, the speech decoder,
        the text and speech encoders and the learnable `logit_scale`.

        Args:
            config (`ClvpConfig`): Composite configuration holding `text_config`, `speech_config` and
                `decoder_config`.

        Raises:
            ValueError: If `text_config` or `speech_config` is not a `ClvpEncoderConfig`, or if
                `decoder_config` is not a `ClvpDecoderConfig`.
        """
        super().__init__(config)

        # Each sub-config is read and validated in turn; the first mismatch raises.
        text_config = config.text_config
        if not isinstance(text_config, ClvpEncoderConfig):
            raise ValueError(
                "config.text_config is expected to be of type `ClvpEncoderConfig` but is of type"
                f" {type(config.text_config)}."
            )

        speech_config = config.speech_config
        if not isinstance(speech_config, ClvpEncoderConfig):
            raise ValueError(
                "config.speech_config is expected to be of type `ClvpEncoderConfig` but is of type"
                f" {type(config.speech_config)}."
            )

        decoder_config = config.decoder_config
        if not isinstance(decoder_config, ClvpDecoderConfig):
            raise ValueError(
                "config.decoder_config is expected to be of type `ClvpDecoderConfig` but is of type"
                f" {type(config.decoder_config)}."
            )

        # Submodules are created in the same order as before.
        self.conditioning_encoder = ClvpConditioningEncoder(config)
        self.speech_decoder_model = ClvpForCausalLM(decoder_config)
        self.text_encoder_model = ClvpEncoder(text_config)
        self.speech_encoder_model = ClvpEncoder(speech_config)

        # Learnable scalar initialised from `logit_scale_init_value`.
        initial_scale = torch.tensor(self.config.logit_scale_init_value)
        self.logit_scale = nn.Parameter(initial_scale)

        # Base-class hook run once all submodules exist (weight init / final processing).
        self.post_init()

    # 从原始代码库中提取的注释，指向具体代码位置的链接
    # 链接地址: https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/api.py#L117
    def fix_speech_decoder_output(self, speech_ids: torch.LongTensor) -> torch.LongTensor:
        """
        This method modifies the output of the decoder model, such as replacing the `eos_token_id` and changing the
        last few tokens of each sequence.

        Args:
            speech_ids (`torch.LongTensor`):
                This refers to the output of the decoder model.
        """
        # 获取解码器修正代码列表
        decoder_fixing_codes = self.config.decoder_config.decoder_fixing_codes
        
        # 去掉每个序列开头的第一个 token（通常是起始 token）
        speech_ids = speech_ids[:, 1:]

        # 找到所有结束 token 的索引位置
        stop_token_indices = torch.where(speech_ids == self.speech_decoder_model.config.eos_token_id, 1, 0)
        
        # 使用 decoder_fixing_codes[0] 替换所有结束 token 的位置
        speech_ids = torch.masked_fill(speech_ids, mask=stop_token_indices.bool(), value=decoder_fixing_codes[0])

        # 遍历每个序列的结束 token 索引
        for i, each_seq_stop_token_index in enumerate(stop_token_indices):
            # 如果某个序列中没有找到结束 token，则跳过对该序列的处理
            if each_seq_stop_token_index.sum() == 0:
                continue

            # 找到当前序列中第一个结束 token 的位置
            stm = each_seq_stop_token_index.argmax()
            
            # 将该位置及之后的 token 替换为 decoder_fixing_codes[0]
            speech_ids[i, stm:] = decoder_fixing_codes[0]
            
            # 如果序列长度允许，将序列末尾的最后三个 token 替换为指定的 decoder_fixing_codes[1:]
            if stm - 3 < speech_ids.shape[1]:
                speech_ids[i, -3:] = torch.tensor(
                    [decoder_fixing_codes[1:]], device=speech_ids.device, dtype=torch.long
                )

        return speech_ids

    def get_text_features(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:
        r"""
        This method can be used to extract text_embeds from a text. The text embeddings obtained by applying the
        projection layer to the pooled output of the CLVP text encoder model.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                [What are input IDs?](../glossary#input-ids)
            text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
                inputs_embeds for the text encoder model passed in place of `input_ids`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

        Returns:
            `torch.FloatTensor` of shape `(batch_size, output_dim)`:
                The text embeddings obtained by applying the projection layer to the pooled output of the CLVP Text
                Model.

        Examples:

        ```
        >>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration

        >>> # Define the Text
        >>> text = "This is an example text."

        >>> # Define processor and model
        >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
        >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

        >>> # Generate processor output and text embeds
        >>> processor_output = processor(text=text, return_tensors="pt")
        >>> text_embeds = model.get_text_features(input_ids=processor_output["input_ids"])
        ```
        """

        # 使用 text_encoder_model 对象进行文本编码器的前向传播，生成文本嵌入
        outputs = self.text_encoder_model(
            input_ids=input_ids,
            inputs_embeds=text_encoder_inputs_embeds,
            attention_mask=attention_mask,
        )

        # 返回经过投影层处理后的文本嵌入
        return outputs[0]

    def get_speech_features(
        self,
        speech_ids: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        **kwargs,
    @add_start_docstrings_to_model_forward(CLVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClvpOutput, config_class=ClvpConfig)
    # 定义类中的前向传播方法，接收多个输入参数
    def forward(
        self,
        input_ids: torch.LongTensor = None,  # input token ids (LongTensor)
        input_features: torch.FloatTensor = None,  # input features (FloatTensor)
        conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # embeddings for the conditioning encoder, optional FloatTensor
        text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # embeddings for the text encoder, optional FloatTensor
        attention_mask: Optional[torch.LongTensor] = None,  # attention mask, optional LongTensor
        return_loss: Optional[bool] = None,  # whether to return the loss, optional bool
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states, optional bool
        output_attentions: Optional[bool] = False,  # whether to return attention weights, optional bool, defaults to False
        return_dict: Optional[bool] = None,  # whether to return a dict-style result, optional bool
    ):
    
    # 使用装饰器标记此方法不会计算梯度
    @torch.no_grad()
    # 定义类中的生成方法，接收多个输入参数
    def generate(
        self,
        input_ids: torch.LongTensor = None,  # input token ids (LongTensor)
        input_features: torch.FloatTensor = None,  # input features (FloatTensor)
        attention_mask: Optional[torch.LongTensor] = None,  # attention mask, optional LongTensor
        generation_config: Optional[GenerationConfig] = None,  # generation configuration, optional GenerationConfig
        pad_to_max_mel_tokens: Optional[int] = None,  # number of mel tokens to pad to, optional int
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states, optional bool
        **kwargs,  # additional keyword arguments

Transformers-源码解析-二十六-

Transformers 源码解析（二十六）

.\models\clip\processing_clip.py

.\models\clip\tokenization_clip.py

.\models\clip\tokenization_clip_fast.py

.\models\clip\__init__.py

.\models\clipseg\configuration_clipseg.py

.\models\clipseg\convert_clipseg_original_pytorch_to_hf.py

.\models\clipseg\modeling_clipseg.py

.\models\clipseg\processing_clipseg.py

.\models\clipseg\__init__.py

.\models\clvp\configuration_clvp.py

.\models\clvp\convert_clvp_to_hf.py

.\models\clvp\feature_extraction_clvp.py

.\models\clvp\modeling_clvp.py

`.\models\clip\processing_clip.py`

`.\models\clip\tokenization_clip.py`

`.\models\clip\tokenization_clip_fast.py`

`.\models\clip\__init__.py`

`.\models\clipseg\configuration_clipseg.py`

`.\models\clipseg\convert_clipseg_original_pytorch_to_hf.py`

`.\models\clipseg\modeling_clipseg.py`

`.\models\clipseg\processing_clipseg.py`

`.\models\clipseg\__init__.py`

`.\models\clvp\configuration_clvp.py`

`.\models\clvp\convert_clvp_to_hf.py`

`.\models\clvp\feature_extraction_clvp.py`

`.\models\clvp\modeling_clvp.py`