Transformers 源码解析(二十六)
.\models\clip\processing_clip.py
"""
Image/Text processor class for CLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class CLIPProcessor(ProcessorMixin):
    r"""
    Bundle a CLIP image processor and a CLIP tokenizer into a single processor object.

    The processor keeps both components as attributes, forwards [`~CLIPProcessor.decode`] and
    [`~CLIPProcessor.batch_decode`] to the tokenizer, and exposes the union of both components' model input names.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor. Although the parameter has a default of `None`, construction fails with a
            `ValueError` when neither it nor the deprecated `feature_extractor` keyword is supplied.
        tokenizer ([`CLIPTokenizerFast`], *optional*):
            The tokenizer. Construction fails with a `ValueError` when it is `None`.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """
        Validate the two components and hand them to the parent initializer.

        The deprecated `feature_extractor` keyword is still accepted: it triggers a `FutureWarning` and is used
        only when `image_processor` itself was not given.

        Raises:
            ValueError: If no image processor or no tokenizer is available.
        """
        legacy_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            legacy_extractor = kwargs.pop("feature_extractor")

        # An explicitly passed image processor always wins over the legacy keyword.
        if image_processor is None:
            image_processor = legacy_extractor

        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        Forward every argument to the tokenizer's `batch_decode` and return its result unchanged.
        """
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward every argument to the tokenizer's `decode` and return its result unchanged.
        """
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, duplicates removed, order kept."""
        combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
        # `dict.fromkeys` de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(combined))

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
.\models\clip\tokenization_clip.py
"""Tokenization classes for CLIP."""
import json
import os
import unicodedata
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)

# File names used for the vocabulary (JSON) and the BPE merges (text) files.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download URLs of the vocabulary files, keyed first by file kind and then by checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
    },
    "merges_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
    },
}

# Positional-embedding size per checkpoint; exposed on the tokenizer as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai/clip-vit-base-patch32": 77,
}

# Per-checkpoint default init kwargs (none for this checkpoint).
PRETRAINED_INIT_CONFIGURATION = {
    "openai/clip-vit-base-patch32": {},
}
@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    Bytes in the three printable ranges (`!`..`~`, `¡`..`¬`, `®`..`ÿ`) map to the character with the same code
    point. Every other byte (whitespace, control characters, ...) is mapped to a fresh code point starting at 256,
    so no byte ever maps to a whitespace/control character that the BPE code cannot handle.

    Returns:
        `dict`: Mapping from each of the 256 byte values to a distinct one-character string. The result is cached,
        so every caller receives the same dictionary object.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    already_mapped = set(printable)

    byte_values = list(printable)
    code_points = list(printable)

    # Remaining bytes are assigned 256, 257, ... in increasing byte order.
    shift = 0
    for byte in range(2**8):
        if byte in already_mapped:
            continue
        byte_values.append(byte)
        code_points.append(2**8 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sliceable sequence of symbols (typically a tuple of variable-length strings).

    Returns:
        `set`: Every `(left, right)` pair of neighbouring symbols. Words with fewer than two symbols, including
        the empty word, yield an empty set.
    """
    # Pairing the sequence with itself shifted by one enumerates all neighbours and, unlike indexing
    # `word[0]`, does not fail on an empty word.
    return set(zip(word, word[1:]))
def whitespace_clean(text):
    """Collapse every run of whitespace in `text` into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def whitespace_tokenize(text):
    """Strip `text` and split it on whitespace; an empty or all-whitespace string gives an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class BasicTokenizer(object):
    """
    Basic tokenizer: text clean-up, optional lower-casing and accent stripping, and punctuation splitting.

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether to lower-case tokens while tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens that must never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether to surround CJK characters with spaces so they become separate tokens. This should likely be
            deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether to strip all accents. When left unset, the behaviour follows `do_lower_case` (as in the
            original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            Whether to split on punctuation. Skipping the split lets a later tokenizer see the full context of a
            word, e.g. for contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        self.do_lower_case = do_lower_case
        self.never_split = set() if never_split is None else set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Run basic tokenization on a piece of text.

        Args:
            text: The string to tokenize.
            never_split (`List[str]`, *optional*):
                Extra tokens that must not be split, merged with the instance-level `never_split` set. Kept for
                backward compatibility.

        Returns:
            `List[str]`: The resulting tokens.
        """
        if never_split:
            protected = self.never_split.union(set(never_split))
        else:
            protected = self.never_split

        cleaned = self._clean_text(text)
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)
        # Normalise so that the same character written with different code points is treated identically.
        normalized = unicodedata.normalize("NFC", cleaned)

        lower = self.do_lower_case
        # With lower-casing, accents are stripped unless explicitly disabled (`False`);
        # without it, they are stripped only when explicitly enabled.
        strip = (self.strip_accents is not False) if lower else self.strip_accents

        pieces = []
        for token in whitespace_tokenize(normalized):
            if token not in protected:
                if lower:
                    token = token.lower()
                if strip:
                    token = self._run_strip_accents(token)
            pieces.extend(self._run_split_on_punc(token, protected))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Remove combining marks (Unicode category `Mn`) from the NFD-decomposed text."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Split `text` so that each punctuation character becomes a token of its own."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]

        groups = []
        inside_word = False
        for char in text:
            if _is_punctuation(char):
                # Punctuation always forms a single-character group and closes the current word.
                groups.append([char])
                inside_word = False
                continue
            if not inside_word:
                groups.append([])
                inside_word = True
            groups[-1].append(char)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character with a space on each side."""
        return "".join(f" {char} " if self._is_chinese_char(ord(char)) else char for char in text)

    def _is_chinese_char(self, cp):
        """Tell whether code point `cp` lies in one of the CJK ranges listed below."""
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters, and turn every whitespace character into a plain space."""
        kept = []
        for char in text:
            code_point = ord(char)
            if code_point == 0 or code_point == 0xFFFD or _is_control(char):
                continue
            kept.append(" " if _is_whitespace(char) else char)
        return "".join(kept)
"""
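Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer inherits from `PreTrainedTokenizer`, which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The token used for padding, for example when batching sequences of different lengths.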
"""
# NOTE(review): these look like class attributes of the slow CLIP tokenizer, but its `class` header is not
# present in this excerpt -- confirm against the upstream file.
# File names, download URLs and maximum input sizes come from the module-level tables.
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs produced for the model.
model_input_names = ["input_ids", "attention_mask"]
def __init__(
    self,
    vocab_file,
    merges_file,
    errors="replace",
    unk_token="<|endoftext|>",
    bos_token="<|startoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<|endoftext|>",
    **kwargs,
):
    """
    Load the vocabulary and BPE merges and set up all tables needed for tokenization.

    Args:
        vocab_file (`str`): Path to the JSON vocabulary file.
        merges_file (`str`): Path to the BPE merges file.
        errors (`str`): Error mode used when decoding bytes to UTF-8.
        unk_token, bos_token, eos_token, pad_token: Special tokens. Plain strings given for `bos_token`,
            `eos_token` and `unk_token` are wrapped into `AddedToken` objects that strip on neither side.
        **kwargs: Forwarded to the parent initializer.
    """
    if isinstance(bos_token, str):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
    if isinstance(eos_token, str):
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
    if isinstance(unk_token, str):
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)

    # Prefer `ftfy` for text fixing; without it fall back to the custom BasicTokenizer.
    try:
        import ftfy

        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
        self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
        self.fix_text = None

    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {index: token for token, index in self.encoder.items()}
    self.errors = errors

    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {char: byte for byte, char in self.byte_encoder.items()}

    with open(merges_file, encoding="utf-8") as merges_handle:
        merge_lines = merges_handle.read().strip().split("\n")
    # Skip the first line of the file and keep at most this many merge rules.
    merge_lines = merge_lines[1 : 49152 - 256 - 2 + 1]
    merge_pairs = [tuple(line.split()) for line in merge_lines]
    # A lower rank means the merge is applied earlier.
    self.bpe_ranks = {pair: rank for rank, pair in enumerate(merge_pairs)}

    # The two special tokens are pre-seeded so `bpe` returns them untouched.
    self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}

    self.pat = re.compile(
        r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
        re.IGNORECASE,
    )

    # Called last, after the vocabulary tables above have been populated.
    super().__init__(
        errors=errors,
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
        pad_token=pad_token,
        **kwargs,
    )
@property
def vocab_size(self):
    """Number of entries in the base vocabulary (`self.encoder`)."""
    base_vocab = self.encoder
    return len(base_vocab)
def get_vocab(self):
    """Return a new dict combining the base vocabulary with the added tokens (added tokens take precedence)."""
    base_vocab = self.encoder
    extra_tokens = self.added_tokens_encoder
    return dict(base_vocab, **extra_tokens)
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequences by adding the special tokens.

    The layout is `bos X eos` for a single sequence and `bos A eos eos B eos` for a pair, which is the same
    layout `create_token_type_ids_from_sequences` measures.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of input IDs with the special tokens in place.
    """
    bos_token = [self.bos_token_id]
    eos_token = [self.eos_token_id]

    if token_ids_1 is None:
        return bos_token + token_ids_0 + eos_token
    return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token


def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
    zeros is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros, one per position of the sequence including its special tokens.
    """
    # Single sequence: bos + ids + eos -> two special tokens.
    if token_ids_1 is None:
        return [0] * (len(token_ids_0) + 2)
    # Pair: bos + A + eos + eos + B + eos -> four special tokens.
    return [0] * (len(token_ids_0) + len(token_ids_1) + 4)
def bpe(self, token):
    """
    Apply the learned BPE merges to a single token.

    The token is split into characters, with `</w>` appended to the last one, and the best-ranked adjacent pair
    is merged repeatedly until no known merge applies or a single symbol remains.

    Args:
        token (`str`): Non-empty token to encode.

    Returns:
        `str`: The resulting symbols joined by single spaces. Multi-symbol results are stored in `self.cache`.
    """
    cached = self.cache.get(token)
    if token in self.cache:
        return cached

    word = tuple(token[:-1]) + (token[-1] + "</w>",)
    pairs = get_pairs(word)
    if not pairs:
        # Single-character token: nothing to merge.
        return token + "</w>"

    while True:
        # Pairs without a rank compare as +inf, so they are chosen only when no known merge is left.
        best = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
        if best not in self.bpe_ranks:
            break
        first, second = best

        # Single left-to-right pass that fuses each non-overlapping occurrence of (first, second).
        merged = []
        position = 0
        size = len(word)
        while position < size:
            symbol = word[position]
            if symbol == first and position + 1 < size and word[position + 1] == second:
                merged.append(first + second)
                position += 2
            else:
                merged.append(symbol)
                position += 1

        word = tuple(merged)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    result = " ".join(word)
    self.cache[token] = result
    return result
def _tokenize(self, text):
    """
    Tokenize a string into BPE tokens.

    The text is cleaned (with `self.fix_text` plus whitespace clean-up and lower-casing when available,
    otherwise with the fallback `self.nlp` tokenizer), split with `self.pat`, mapped byte by byte through
    `self.byte_encoder`, and finally encoded with `self.bpe`.
    """
    if self.fix_text is not None:
        text = whitespace_clean(self.fix_text(text)).lower()
    else:
        text = " ".join(self.nlp.tokenize(text))

    pieces = []
    for match in re.findall(self.pat, text):
        # Represent every UTF-8 byte of the match by its printable stand-in character.
        mapped = "".join(map(self.byte_encoder.__getitem__, match.encode("utf-8")))
        pieces.extend(self.bpe(mapped).split(" "))
    return pieces
def _convert_token_to_id(self, token):
    """Look up the id of `token` in the vocabulary, falling back to the id of the unknown token."""
    unknown_id = self.encoder.get(self.unk_token)
    return self.encoder.get(token, unknown_id)
def _convert_id_to_token(self, index):
    """Look up the token string for `index` in the reverse vocabulary; unknown ids give `None`."""
    reverse_vocab = self.decoder
    return reverse_vocab.get(index)
def convert_tokens_to_string(self, tokens):
    """
    Turn a sequence of BPE tokens back into a plain string.

    The tokens are concatenated, each character is mapped back to its byte through `self.byte_decoder`, the
    bytes are decoded as UTF-8 using `self.errors`, and every `</w>` marker becomes a space.
    """
    joined = "".join(tokens)
    raw_bytes = bytearray(self.byte_decoder[char] for char in joined)
    decoded = raw_bytes.decode("utf-8", errors=self.errors)
    return decoded.replace("</w>", " ").strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary (JSON) and the BPE merges (text) into `save_directory`.

    Args:
        save_directory (`str`): Existing directory to write into.
        filename_prefix (`str`, *optional*): When given, `"<prefix>-"` is prepended to both file names.

    Returns:
        The `(vocab_file, merge_file)` paths, or `None` (after logging an error) when `save_directory` is not
        a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
    merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])

    with open(vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

    ranked_merges = sorted(self.bpe_ranks.items(), key=lambda kv: kv[1])
    expected_rank = 0
    with open(merge_file, "w", encoding="utf-8") as writer:
        writer.write("#version: 0.2\n")
        for pair, rank in ranked_merges:
            if rank != expected_rank:
                # A gap in the ranks hints at a damaged merge table; warn and resynchronise.
                logger.warning(
                    "Saving vocabulary to {}: BPE merge indices are not consecutive."
                    " Please check that the tokenizer is not corrupted!".format(merge_file)
                )
                expected_rank = rank
            writer.write(" ".join(pair) + "\n")
            expected_rank += 1

    return vocab_file, merge_file
.\models\clip\tokenization_clip_fast.py
"""
Fast tokenization classes for CLIP.
"""
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_clip import CLIPTokenizer
# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)

# File names used for the vocabulary, the BPE merges and the serialized fast tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# Download URLs of the tokenizer files, keyed first by file kind and then by checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
    },
    "merges_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        "openai/clip-vit-base-patch32": (
            "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer.json"
        ),
    },
}

# Positional-embedding size per checkpoint; exposed on the tokenizer as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai/clip-vit-base-patch32": 77,
}
class CLIPTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = CLIPTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        """
        Initialize the backend tokenizer through the parent class and validate its format.

        Raises:
            ValueError: If the backend tokenizer's pre-tokenizer is not a `pre_tokenizers.Sequence`, i.e. the
                tokenizer file was not produced in the format this class expects.
        """
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

        if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
            raise ValueError(
                "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been"
                " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using"
                " to be compatible with this version. The easiest way to do so is"
                ' `CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo", from_slow=True)`. If you'
                " want to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of"
                " transformers."
            )

        self._wrap_decode_method_backend_tokenizer()

    def _wrap_decode_method_backend_tokenizer(self):
        """Replace the backend `decode` with a wrapper that turns end-of-word suffixes into spaces."""
        orig_decode_method = self.backend_tokenizer.decode

        def new_decode_method(*args, **kwargs):
            text = orig_decode_method(*args, **kwargs)
            # The raw decoder output still carries the end-of-word marker; render it as a space.
            text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip()
            return text

        self.backend_tokenizer.decode = new_decode_method

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the special tokens.

        The layout is `bos X eos` for a single sequence and `bos A eos eos B eos` for a pair, which is the same
        layout `create_token_type_ids_from_sequences` measures.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens in place.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend model's vocabulary files to the specified directory.

        Args:
            save_directory (`str`):
                Directory where the vocabulary will be saved.
            filename_prefix (`str`, *optional*):
                Optional prefix passed to the backend as the file name.

        Returns:
            `Tuple[str]`: The file paths reported by the backend model's `save`.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
.\models\clip\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
# Maps each submodule to the public names it provides; this table is handed to `_LazyModule` at the bottom of
# the file. Entries that have no optional dependency are listed unconditionally.
_import_structure = {
    "configuration_clip": [
        "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "CLIPConfig",
        "CLIPOnnxConfig",
        "CLIPTextConfig",
        "CLIPVisionConfig",
    ],
    "processing_clip": ["CLIPProcessor"],
    "tokenization_clip": ["CLIPTokenizer"],
}

# Fast tokenizer: registered only when `is_tokenizers_available()` is true.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_clip_fast"] = ["CLIPTokenizerFast"]

# Feature extractor and image processor: registered only when `is_vision_available()` is true.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["feature_extraction_clip"] = ["CLIPFeatureExtractor"]
    _import_structure["image_processing_clip"] = ["CLIPImageProcessor"]

# PyTorch models: registered only when `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_clip"] = [
        "CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CLIPModel",
        "CLIPPreTrainedModel",
        "CLIPTextModel",
        "CLIPTextModelWithProjection",
        "CLIPVisionModel",
        "CLIPVisionModelWithProjection",
        "CLIPForImageClassification",
    ]

# TensorFlow models: registered only when `is_tf_available()` is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_clip"] = [
        "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFCLIPModel",
        "TFCLIPPreTrainedModel",
        "TFCLIPTextModel",
        "TFCLIPVisionModel",
    ]

# Flax models: registered only when `is_flax_available()` is true.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_flax_clip"] = [
        "FlaxCLIPModel",
        "FlaxCLIPPreTrainedModel",
        "FlaxCLIPTextModel",
        "FlaxCLIPTextPreTrainedModel",
        "FlaxCLIPTextModelWithProjection",
        "FlaxCLIPVisionModel",
        "FlaxCLIPVisionPreTrainedModel",
    ]
# Under static type checking the real imports are executed, guarded by the same availability checks used to
# build `_import_structure`; otherwise the module object is replaced by a `_LazyModule`.
if TYPE_CHECKING:
    pass
    from .configuration_clip import (
        CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CLIPConfig,
        CLIPOnnxConfig,
        CLIPTextConfig,
        CLIPVisionConfig,
    )
    from .processing_clip import CLIPProcessor
    from .tokenization_clip import CLIPTokenizer

    # Fast tokenizer (requires `is_tokenizers_available()`).
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_clip_fast import CLIPTokenizerFast

    # Feature extractor and image processor (require `is_vision_available()`).
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_clip import CLIPFeatureExtractor
        from .image_processing_clip import CLIPImageProcessor

    # PyTorch models (require `is_torch_available()`).
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_clip import (
            CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            CLIPForImageClassification,
            CLIPModel,
            CLIPPreTrainedModel,
            CLIPTextModel,
            CLIPTextModelWithProjection,
            CLIPVisionModel,
            CLIPVisionModelWithProjection,
        )

    # TensorFlow models (require `is_tf_available()`).
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_clip import (
            TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFCLIPModel,
            TFCLIPPreTrainedModel,
            TFCLIPTextModel,
            TFCLIPVisionModel,
        )

    # Flax models (require `is_flax_available()`).
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_clip import (
            FlaxCLIPModel,
            FlaxCLIPPreTrainedModel,
            FlaxCLIPTextModel,
            FlaxCLIPTextModelWithProjection,
            FlaxCLIPTextPreTrainedModel,
            FlaxCLIPVisionModel,
            FlaxCLIPVisionPreTrainedModel,
        )
else:
    import sys

    # Swap this module in `sys.modules` for a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\clipseg\configuration_clipseg.py
""" CLIPSeg model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)

# Download URL of the configuration file, keyed by checkpoint name.
CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "CIDAS/clipseg-rd64": "https://huggingface.co/CIDAS/clipseg-rd64/resolve/main/config.json",
}
class CLIPSegTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an
    CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CLIPSeg
    [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`CLIPSegModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 49406):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 49407):
            End of stream token id.

    Example:

    ```python
    >>> from transformers import CLIPSegTextConfig, CLIPSegTextModel

    >>> # Initializing a CLIPSegTextConfig with default values
    >>> configuration = CLIPSegTextConfig()

    >>> # Initializing a CLIPSegTextModel from that configuration
    >>> model = CLIPSegTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clipseg_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        # The special-token ids are handled by the parent class; everything else is stored as-is.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load the text configuration from a checkpoint name or path.

        When the loaded dictionary describes a composite `"clipseg"` configuration, only its `"text_config"`
        section is used. A warning is logged when the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Composite checkpoint: extract the nested text section.
        if config_dict.get("model_type") == "clipseg":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class CLIPSegVisionConfig(PretrainedConfig):
    r"""
    Configuration of the vision part of a [`CLIPSegModel`]. It is used to instantiate a CLIPSeg vision model
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the CLIPSeg
    [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel

    >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64 style configuration
    >>> configuration = CLIPSegVisionConfig()

    >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
    >>> model = CLIPSegVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clipseg_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer encoder dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Input image geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        # Initialisation, regularisation and activation settings.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load the vision configuration from a checkpoint name or path.

        When the loaded dictionary describes a composite `"clipseg"` configuration, only its `"vision_config"`
        section is used. A warning is logged when the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Composite checkpoint: extract the nested vision section.
        if config_dict.get("model_type") == "clipseg":
            config_dict = config_dict["vision_config"]

        declares_type = "model_type" in config_dict and hasattr(cls, "model_type")
        if declares_type and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class CLIPSegConfig(PretrainedConfig):
r"""
[`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to
instantiate a CLIPSeg model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg
[CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`CLIPSegTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. Default is used as per the original CLIPSeg implementation.
extract_layers (`List[int]`, *optional*, defaults to `[3, 6, 9]`):
Layers to extract when forwarding the query image through the frozen visual backbone of CLIP.
reduce_dim (`int`, *optional*, defaults to 64):
Dimensionality to reduce the CLIP vision embedding.
decoder_num_attention_heads (`int`, *optional*, defaults to 4):
Number of attention heads in the decoder of CLIPSeg.
decoder_attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
conditional_layer (`int`, *optional*, defaults to 0):
The layer to use of the Transformer encoder whose activations will be combined with the condition
embeddings using FiLM (Feature-wise Linear Modulation). If 0, the last layer is used.
use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`):
Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained
segmentation.
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import CLIPSegConfig, CLIPSegModel
# 初始化一个 CLIPSegConfig,使用 CIDAS/clipseg-rd64 风格的配置
>>> configuration = CLIPSegConfig()
# 使用 CIDAS/clipseg-rd64 风格的配置初始化一个 CLIPSegModel(带有随机权重)
>>> model = CLIPSegModel(configuration)
# 访问模型的配置信息
>>> configuration = model.config
# 我们也可以从 CLIPSegTextConfig 和 CLIPSegVisionConfig 初始化一个 CLIPSegConfig
# 初始化一个 CLIPSegTextConfig 和 CLIPSegVisionConfig
>>> config_text = CLIPSegTextConfig()
>>> config_vision = CLIPSegVisionConfig()
# 使用 CLIPSegTextConfig 和 CLIPSegVisionConfig 初始化一个 CLIPSegConfig 对象
>>> config = CLIPSegConfig.from_text_vision_configs(config_text, config_vision)
.\models\clipseg\convert_clipseg_original_pytorch_to_hf.py
"""从原始存储库转换 CLIPSeg 检查点。URL: https://github.com/timojl/clipseg."""
import argparse
import requests
import torch
from PIL import Image
from transformers import (
CLIPSegConfig,
CLIPSegForImageSegmentation,
CLIPSegProcessor,
CLIPSegTextConfig,
CLIPSegVisionConfig,
CLIPTokenizer,
ViTImageProcessor,
)
def get_clipseg_config(model_name):
    """Build the `CLIPSegConfig` that matches an original CLIPSeg checkpoint name.

    Args:
        model_name: Name of the checkpoint. A name containing "refined" enables the
            complex transposed convolution; a name containing "rd16" selects
            `reduce_dim=16`, any other name selects 64.

    Returns:
        The configuration produced by `CLIPSegConfig.from_text_vision_configs` from a
        default text configuration and a vision configuration with `patch_size=16`.
    """
    text_config = CLIPSegTextConfig()
    vision_config = CLIPSegVisionConfig(patch_size=16)

    # The membership test already yields a bool; no conditional expression is needed.
    use_complex_transposed_convolution = "refined" in model_name
    reduce_dim = 16 if "rd16" in model_name else 64

    return CLIPSegConfig.from_text_vision_configs(
        text_config,
        vision_config,
        use_complex_transposed_convolution=use_complex_transposed_convolution,
        reduce_dim=reduce_dim,
    )
def rename_key(name):
    """Translate one parameter name of the original checkpoint to the converted layout.

    The substitutions are applied strictly in sequence, and later steps look at the
    result of earlier ones (for example "resblocks" is rewritten before the generic
    "blocks" rule runs), so the order of the stages below must not be changed.

    Args:
        name: Parameter name as found in the original state dict.

    Returns:
        The renamed parameter name.
    """
    # `str.replace` leaves the string untouched when the pattern is absent, so the
    # unconditional substitutions need no membership test of their own.
    name = name.replace("clip_model", "clip")

    # The transformer prefix depends on which tower the key belongs to.
    if "visual" in name:
        name = name.replace("visual.transformer", "vision_model")
    else:
        name = name.replace("transformer", "text_model")

    encoder_layer_renames = (
        ("resblocks", "encoder.layers"),
        ("ln_1", "layer_norm1"),
        ("ln_2", "layer_norm2"),
        ("c_fc", "fc1"),
        ("c_proj", "fc2"),
    )
    for old, new in encoder_layer_renames:
        name = name.replace(old, new)

    # Keys that already mention "self" (e.g. "self_attn") are left alone.
    if "self" not in name:
        name = name.replace("attn", "self_attn")

    name = name.replace("token_embedding", "text_model.embeddings.token_embedding")

    # The bare positional embedding is the text one; the visual one is handled below.
    if "visual" not in name:
        name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight")

    embedding_and_projection_renames = (
        ("ln_final", "text_model.final_layer_norm"),
        ("visual.class_embedding", "vision_model.embeddings.class_embedding"),
        ("visual.conv1", "vision_model.embeddings.patch_embedding"),
        ("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight"),
        ("visual.ln_pre", "vision_model.pre_layrnorm"),
        ("visual.ln_post", "vision_model.post_layernorm"),
        ("visual.proj", "visual_projection.weight"),
        ("text_projection", "text_projection.weight"),
        ("trans_conv", "transposed_convolution"),
    )
    for old, new in embedding_and_projection_renames:
        name = name.replace(old, new)

    # These parameters live under the "decoder." prefix in the converted layout.
    decoder_markers = ("film_mul", "film_add", "reduce", "transposed_convolution")
    if any(marker in name for marker in decoder_markers):
        name = "decoder." + name

    decoder_layer_renames = (
        ("blocks", "decoder.layers"),
        ("linear1", "mlp.fc1"),
        ("linear2", "mlp.fc2"),
    )
    for old, new in decoder_layer_renames:
        name = name.replace(old, new)

    # The "layer_" check is re-evaluated for each suffix: once "norm1" has been
    # expanded the name contains "layer_", which then blocks the "norm2" rule.
    for norm in ("norm1", "norm2"):
        if "layer_" not in name:
            name = name.replace(norm, "layer_" + norm)

    return name
def convert_state_dict(orig_state_dict, config):
    """Rewrite a state dict in place from the original naming to the converted naming.

    Fused attention input projections are split into separate query/key/value
    entries; every other entry is renamed through `rename_key`, and the two
    projection matrices are additionally transposed.

    Args:
        orig_state_dict: Mapping from original parameter names to tensors. It is
            emptied and refilled with the converted entries.
        config: Configuration supplying `vision_config.hidden_size`,
            `text_config.hidden_size` and `reduce_dim`, which are used as the size of
            each query/key/value slice.

    Returns:
        The same mapping object that was passed in.
    """
    for key in list(orig_state_dict):
        val = orig_state_dict.pop(key)

        fused_clip_attention = key.startswith("clip_model") and "attn.in_proj" in key
        fused_decoder_attention = "self_attn" in key and "out_proj" not in key

        if not fused_clip_attention and not fused_decoder_attention:
            new_name = rename_key(key)
            if "visual_projection" in new_name or "text_projection" in new_name:
                val = val.T
            orig_state_dict[new_name] = val
            continue

        key_split = key.split(".")
        if fused_clip_attention:
            # The layer index sits one position later in vision keys than in text keys.
            if "visual" in key:
                layer_num = int(key_split[4])
                dim = config.vision_config.hidden_size
                prefix = "vision_model"
            else:
                layer_num = int(key_split[3])
                dim = config.text_config.hidden_size
                prefix = "text_model"
            target = f"clip.{prefix}.encoder.layers.{layer_num}.self_attn"
        else:
            layer_num = int(key_split[1])
            dim = config.reduce_dim
            target = f"decoder.layers.{layer_num}.self_attn"

        is_weight = "weight" in key
        suffix = "weight" if is_weight else "bias"
        # Query, key and value are stacked along the first axis, `dim` rows each.
        row_ranges = (
            ("q_proj", slice(None, dim)),
            ("k_proj", slice(dim, dim * 2)),
            ("v_proj", slice(-dim, None)),
        )
        for projection, rows in row_ranges:
            index = (rows, slice(None)) if is_weight else rows
            orig_state_dict[f"{target}.{projection}.{suffix}"] = val[index]

    return orig_state_dict
# NOTE(review): these statements are the tail of the checkpoint-conversion routine; its
# `def` line and the code creating `state_dict`, `config` and `model` are not visible here.
# Drop every checkpoint entry whose name starts with "model" before converting.
for key in state_dict.copy().keys():
    if key.startswith("model"):
        state_dict.pop(key, None)
# Rename/split the remaining entries, then load non-strictly so that the mismatches
# can be validated explicitly below.
state_dict = convert_state_dict(state_dict, config)
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
# The only keys allowed to be missing are the two `position_ids` entries.
if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]:
    raise ValueError("Missing keys that are not expected: {}".format(missing_keys))
# The only tolerated leftover entries are the decoder's `reduce` weight and bias.
if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]:
    raise ValueError(f"Unexpected keys: {unexpected_keys}")
# Assemble the processor from an image processor (size=352) and a CLIP tokenizer.
image_processor = ViTImageProcessor(size=352)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer)
# Run the converted model once: the same image is paired with each of the four prompts.
image = prepare_img()
text = ["a glass", "something to fill", "wood", "a jar"]
inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# Reference values that the outputs are compared against below (atol=1e-3).
expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645])
expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328])
# The expected top-left 3x3 logits slice depends on the checkpoint variant.
if model_name == "clipseg-rd64-refined":
    expected_masks_slice = torch.tensor(
        [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]]
    )
elif model_name == "clipseg-rd64":
    expected_masks_slice = torch.tensor(
        [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]]
    )
elif model_name == "clipseg-rd16":
    expected_masks_slice = torch.tensor(
        [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]]
    )
else:
    raise ValueError(f"Model name {model_name} not supported.")
# NOTE(review): these checks use `assert`, so they are skipped when Python runs with -O.
assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)
assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)
assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)
print("Looks ok!")
# Optionally save the converted model and processor to a local folder.
if pytorch_dump_folder_path is not None:
    print(f"Saving model and processor to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    processor.save_pretrained(pytorch_dump_folder_path)
# Optionally upload both under the "CIDAS/<model_name>" repository name.
if push_to_hub:
    print(f"Pushing model and processor for {model_name} to the hub")
    model.push_to_hub(f"CIDAS/{model_name}")
    processor.push_to_hub(f"CIDAS/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the four options and run the conversion.
    # The long help texts are named up front so the declarations below stay compact.
    model_name_help = (
        "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning"
        " reduce dimension)"
    )
    checkpoint_path_help = (
        "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and"
        " the decoder weights."
    )
    dump_folder_help = "Path to the output PyTorch model directory."
    push_to_hub_help = "Whether or not to push the converted model to the 🤗 hub."

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        default="clipseg-rd64",
        choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"],
        help=model_name_help,
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth",
        help=checkpoint_path_help,
    )
    parser.add_argument("--pytorch_dump_folder_path", type=str, default=None, help=dump_folder_help)
    parser.add_argument("--push_to_hub", action="store_true", help=push_to_hub_help)

    args = parser.parse_args()
    convert_clipseg_checkpoint(
        args.model_name,
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        args.push_to_hub,
    )
这段代码是一个命令行工具的入口点,使用 argparse 模块解析命令行参数,并调用 `convert_clipseg_checkpoint` 函数进行处理。
.\models\clipseg\modeling_clipseg.py
""" PyTorch CLIPSeg 模型."""
import copy
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
# Logger for this module, obtained from the library's logging helper.
logger = logging.get_logger(__name__)
# Checkpoint name intended for documentation, judging by the constant's name
# (it is not referenced by the code visible in this section).
_CHECKPOINT_FOR_DOC = "CIDAS/clipseg-rd64-refined"
# Names of the pretrained CLIPSeg checkpoints listed for this model.
CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "CIDAS/clipseg-rd64-refined",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of `logits` against the diagonal: row `i` is labelled with class `i`.

    Args:
        logits: Score matrix whose first dimension gives the number of rows/labels.

    Returns:
        The cross-entropy loss computed by `torch.nn.functional.cross_entropy`.
    """
    # Labels 0..N-1, created on the same device as the logits.
    targets = torch.arange(logits.shape[0], device=logits.device)
    return nn.functional.cross_entropy(logits, targets)
def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Symmetric contrastive loss over a similarity matrix.

    The contrastive loss is evaluated on `similarity` and on its transpose, and the
    two results are averaged.

    Args:
        similarity: Two-dimensional similarity matrix.

    Returns:
        The mean of the two directional losses.
    """
    caption_loss, image_loss = (contrastive_loss(view) for view in (similarity, similarity.t()))
    return (caption_loss + image_loss) / 2.0
@dataclass
class CLIPSegOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the populated fields as a tuple, flattening the two nested model outputs."""
        nested_outputs = ("text_model_output", "vision_model_output")
        values = []
        for key in self.keys():
            if key in nested_outputs:
                # Nested outputs are converted with their own `to_tuple`.
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)
@dataclass
class CLIPSegDecoderOutput(ModelOutput):
    """
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` or `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor`, one element per layer's hidden-state output, plus the output of the
            embedding layer if the model has one.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor`, one element per layer's attention weights, of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`. Attention weights after the softmax, used to compute the weighted
            average in the self-attention heads.
    """

    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class CLIPSegImageSegmentationOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    conditional_embeddings: torch.FloatTensor = None
    pooled_output: torch.FloatTensor = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the populated fields as a tuple.

        `vision_model_output` and `decoder_output` are themselves converted with their
        own `to_tuple`; every other field is included as is.
        """
        nested_outputs = ("vision_model_output", "decoder_output")
        values = []
        for key in self.keys():
            if key in nested_outputs:
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)
class CLIPSegVisionEmbeddings(nn.Module):
    """Turn pixel values into a token sequence: class token + patch embeddings + position embeddings.

    The source annotates `__init__` as copied from
    `transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.__init__` with `CLIP` replaced by `CLIPSeg`.
    """

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learned class token that is placed in front of the patch tokens.
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # Kernel size equals stride, so each patch is projected independently.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        # One position per patch plus one for the class token.
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_position_embeddings(self, new_size):
        """Resize the patch position embeddings to a `new_size` grid with bicubic interpolation.

        Args:
            new_size: Two values giving the target grid height and width.

        Returns:
            The class-token position embedding followed by the resized patch position embeddings.

        Raises:
            ValueError: If `new_size` does not hold exactly two values.
        """
        if len(new_size) != 2:
            raise ValueError("new_size should consist of 2 values")

        grid_side = int(self.num_patches**0.5)
        hidden = self.config.hidden_size
        weight = self.position_embedding.weight

        # Row 0 belongs to the class token; only the patch rows are laid out as a 2D grid.
        patch_grid = weight[1:].T.view(1, hidden, grid_side, grid_side)
        resized = nn.functional.interpolate(patch_grid, new_size, mode="bicubic", align_corners=False)
        resized_rows = resized.squeeze(0).view(hidden, new_size[0] * new_size[1]).T

        return torch.cat([weight[:1], resized_rows])

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed `pixel_values` and add position information.

        When the number of tokens differs from `num_positions`, the position embeddings
        are interpolated to the (assumed square) patch grid of the input.
        """
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if embeddings.shape[1] == self.num_positions:
            return embeddings + self.position_embedding(self.position_ids)

        grid_side = int(math.sqrt(embeddings.shape[1] - 1))
        embeddings = embeddings + self.interpolate_position_embeddings((grid_side, grid_side))
        # NOTE(review): casting a tensor to its own dtype has no effect; kept as in the original.
        return embeddings.to(embeddings.dtype)
class CLIPSegTextEmbeddings(nn.Module):
    """Sum of token embeddings and learned position embeddings for the text input."""

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # Default position indices 0..max_position_embeddings-1, kept out of the state dict.
        default_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_positions, persistent=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """Return token embeddings plus position embeddings.

        Args:
            input_ids: Token indices; used to look up embeddings when `inputs_embeds` is not given.
            position_ids: Position indices; defaults to the first `seq_length` stored positions.
            inputs_embeds: Pre-computed token embeddings, used instead of `input_ids` when given.
        """
        # The sequence length comes from whichever input was supplied.
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)
class CLIPSegAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
    """Create the query/key/value/output projections for multi-head attention.

    Args:
        config: Configuration providing `hidden_size`, `num_attention_heads` and
            `attention_dropout`.

    Raises:
        ValueError: If `hidden_size` is not a multiple of `num_attention_heads`.
    """
    super().__init__()
    self.config = config
    self.embed_dim = config.hidden_size
    self.num_heads = config.num_attention_heads

    # Every head receives an equal share of the embedding, so the split must be exact.
    self.head_dim, remainder = divmod(self.embed_dim, self.num_heads)
    if remainder != 0:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
            f" {self.num_heads})."
        )

    self.scale = self.head_dim**-0.5
    self.dropout = config.attention_dropout

    # Creation order is kept (k, v, q, out) so parameter registration is unchanged.
    self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
    self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
    self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
    self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
def __init__(self, config):
    """Build the two-layer feed-forward block.

    Args:
        config: Configuration providing `hidden_act`, `hidden_size` and `intermediate_size`.
    """
    super().__init__()
    self.config = config
    # The activation is looked up by name in the ACT2FN table.
    self.activation_fn = ACT2FN[config.hidden_act]
    # Expand to the intermediate width, then project back to the hidden width.
    self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
    self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    """Apply `fc1`, the activation function and `fc2`, in that order."""
    expanded = self.fc1(hidden_states)
    activated = self.activation_fn(expanded)
    return self.fc2(activated)
class CLIPSegEncoderLayer(nn.Module):
    """One encoder block: layer norm + self-attention, then layer norm + MLP, each with a residual add."""

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where
                padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        # Attention sub-block: normalize first, then add the result back onto the input.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block with the same normalize-then-residual structure.
        mlp_output = self.mlp(self.layer_norm2(hidden_states))
        hidden_states = hidden_states + mlp_output

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
def _init_weights(self, module):
    """Initialize the weights of `module` according to its type.

    All standard deviations are scaled by `self.config.initializer_factor`. After the
    type-specific initialization, `LayerNorm` modules are reset to weight 1 / bias 0
    and the bias of `Linear` modules (when present) is zeroed.
    """
    factor = self.config.initializer_factor

    if isinstance(module, CLIPSegTextEmbeddings):
        for embedding in (module.token_embedding, module.position_embedding):
            embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
    elif isinstance(module, CLIPSegVisionEmbeddings):
        nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
        for embedding in (module.patch_embedding, module.position_embedding):
            nn.init.normal_(embedding.weight, std=module.config.initializer_range * factor)
    elif isinstance(module, CLIPSegAttention):
        in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
        out_proj_std = (module.embed_dim**-0.5) * factor
        for projection in (module.q_proj, module.k_proj, module.v_proj):
            nn.init.normal_(projection.weight, std=in_proj_std)
        nn.init.normal_(module.out_proj.weight, std=out_proj_std)
    elif isinstance(module, CLIPSegMLP):
        in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
        fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
        nn.init.normal_(module.fc1.weight, std=fc_std)
        nn.init.normal_(module.fc2.weight, std=in_proj_std)
    elif isinstance(module, CLIPSegModel):
        projections = (
            (module.text_projection, module.text_embed_dim),
            (module.visual_projection, module.vision_embed_dim),
        )
        for projection, embed_dim in projections:
            nn.init.normal_(projection.weight, std=embed_dim**-0.5 * factor)

    # These two checks are independent of the chain above and apply to any module.
    if isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
    if isinstance(module, nn.Linear) and module.bias is not None:
        module.bias.data.zero_()
# Shared introductory documentation text describing the model class and its `config` parameter.
CLIPSEG_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Documentation text for the arguments of the text-only forward pass.
CLIPSEG_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Documentation text for the arguments of the vision-only forward pass; it is passed to
# `add_start_docstrings_to_model_forward` on the vision `forward` methods.
# NOTE(review): the text below is a runtime string and is therefore left exactly as found.
CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
# 定义函数签名和参数说明
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
像素值。默认情况下将忽略填充。可以使用 [`AutoImageProcessor`] 获取像素值。有关详细信息,请参见 [`CLIPImageProcessor.__call__`]。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回的张量下的 `attentions`。
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态。有关更多详细信息,请参见返回的张量下的 `hidden_states`。
return_dict (`bool`, *optional*):
是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""
# Documentation text for the arguments of the combined text + image forward pass.
CLIPSEG_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class CLIPSegEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`CLIPSegEncoderLayer`].
Args:
config: CLIPSegConfig
"""
def __init__(self, config: CLIPSegConfig):
    """Build the stack of `config.num_hidden_layers` encoder layers."""
    super().__init__()
    self.config = config

    # Every layer is constructed from the same configuration object.
    stack = nn.ModuleList()
    for _ in range(config.num_hidden_layers):
        stack.append(CLIPSegEncoderLayer(config))
    self.layers = stack

    self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class CLIPSegTextTransformer(nn.Module):
def __init__(self, config: CLIPSegTextConfig):
    """Assemble the text tower: embeddings, encoder stack and a final layer norm.

    Args:
        config: Text configuration providing `hidden_size`, `layer_norm_eps` and `eos_token_id`.
    """
    super().__init__()
    self.config = config
    embed_dim = config.hidden_size
    self.embeddings = CLIPSegTextEmbeddings(config)
    self.encoder = CLIPSegEncoder(config)
    self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
    # Kept on the module itself; presumably used by `forward` to locate the pooled token
    # (the body of `forward` is not visible here).
    self.eos_token_id = config.eos_token_id
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
class CLIPSegTextModel(CLIPSegPreTrainedModel):
    """Stand-alone text model: a `CLIPSegTextTransformer` wrapped as a pretrained model."""

    config_class = CLIPSegTextConfig

    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Run the base class's post-construction hook.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding module of the text tower."""
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the text tower with `value`."""
        self.text_model.embeddings.token_embedding = value

    # The decorators mirror the ones on the vision model's `forward`: the first prepends the
    # argument documentation, the second fills in the empty "Returns:" placeholder below.
    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        # Delegate to the wrapped text transformer, forwarding every argument unchanged.
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
class CLIPSegVisionTransformer(nn.Module):
    """Vision tower: embeddings -> pre layer norm -> encoder -> post layer norm on the first token."""

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPSegVisionEmbeddings(config)
        # The attribute name (including its "layrnorm" spelling) is part of this module's
        # parameter names, so it must not be renamed.
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        # Resolve unset flags from the configuration. Without this, the default
        # `return_dict=None` is falsy and the tuple branch below would always be taken,
        # regardless of what the configuration requests.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # The pooled representation is the first token after the final layer norm.
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    """Stand-alone wrapper around `CLIPSegVisionTransformer`; `forward` delegates to it unchanged."""

    config_class = CLIPSegVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPSegVisionTransformer(config)
        # `post_init` comes from a base class not visible here; it is called after the submodule is built.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the `patch_embedding` module of the vision model's embeddings."""
        vision_embeddings = self.vision_model.embeddings
        return vision_embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Forward pass; all arguments are handed to the wrapped vision model by keyword.

        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```
        """
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return vision_outputs
# Joint text/vision model: a text transformer and a vision transformer, each followed by a bias-free linear
# projection into a shared `projection_dim` space, plus a learnable `logit_scale`.
# NOTE(review): no `forward` is present in this excerpt; only the two feature-extraction helpers are visible.
@add_start_docstrings(CLIPSEG_START_DOCSTRING)
class CLIPSegModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        # Validate both sub-configs before building anything from them.
        text_config = config.text_config
        if not isinstance(text_config, CLIPSegTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
                f" {type(text_config)}."
            )

        vision_config = config.vision_config
        if not isinstance(vision_config, CLIPSegVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
                f" {type(vision_config)}."
            )

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPSegTextTransformer(text_config)
        self.vision_model = CLIPSegVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # `post_init` comes from a base class not visible here; it runs once all submodules exist.
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Flags left as None fall back to the model configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text model output is used as the pooled representation.
        return self.text_projection(text_outputs[1])

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Flags left as None fall back to the model configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the vision model output is used as the pooled representation.
        return self.visual_projection(vision_outputs[1])
class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        hidden_dim = config.hidden_size
        self.embed_dim = hidden_dim
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): mask applied to causal attention
                `(batch, 1, tgt_len, src_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.

        Returns:
            A tuple holding the layer output, followed by the attention weights when `output_attentions` is truthy.
        """
        # Attention sub-block: residual add first, LayerNorm afterwards.
        attn_output, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.layer_norm1(hidden_states + attn_output)

        # Feed-forward sub-block, same residual-then-norm ordering.
        mlp_output = self.mlp(hidden_states)
        hidden_states = self.layer_norm2(hidden_states + mlp_output)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
class CLIPSegDecoder(CLIPSegPreTrainedModel):
    """
    Decoder that combines selected vision-encoder activations with a conditional embedding and produces
    segmentation logits through a transposed convolution.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        # Index of the decoder step at which the conditional embedding is injected (see `forward`).
        self.conditional_layer = config.conditional_layer

        # Scale and shift projections of the conditional embedding into the reduced decoder width.
        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        patch_size = config.vision_config.patch_size
        if config.use_complex_transposed_convolution:
            # Two transposed convolutions, each with kernel == stride == patch_size // 4.
            transposed_kernels = (patch_size // 4, patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            # A single transposed convolution with kernel == stride == patch_size.
            self.transposed_convolution = nn.ConvTranspose2d(config.reduce_dim, 1, patch_size, stride=patch_size)

        # One reduction layer and one decoder layer per extracted encoder activation.
        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        # The decoder layers reuse the vision config with decoder-specific sizes; the copy keeps the original
        # vision config untouched.
        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(depth)])

    def forward(
        self,
        hidden_states: Tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        """
        Decode encoder activations into segmentation logits.

        Args:
            hidden_states: Sequence of encoder activations; they are consumed in reverse order.
            conditional_embeddings: Conditioning tensor; its first dimension is used as the batch size.
            output_attentions: When truthy, collect the attention weights of every decoder layer.
            output_hidden_states: When truthy, collect the output of every decoder layer.
            return_dict: When falsy, return a tuple of the non-None results instead of a
                `CLIPSegDecoderOutput`.

        Returns:
            `CLIPSegDecoderOutput` (or a tuple) with `logits` and, if requested, hidden states and attentions.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            # Accumulate the reduced activation onto the running output.
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                # Scale-and-shift conditioning. The first two axes are swapped before and after, presumably so
                # the per-sample conditioning vectors broadcast over the sequence axis — TODO confirm shapes.
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )

            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # Drop index 0 along dim 1 (presumably the class token — confirm) and move the feature axis forward.
        output = output[:, 1:, :].permute(0, 2, 1)

        # The remaining sequence length is treated as a square grid of side `size`.
        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )
@add_start_docstrings(
    """
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """,
    CLIPSEG_START_DOCSTRING,
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # `post_init` comes from a base class not visible here; it runs once all submodules exist.
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: int = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        conditional_pixel_values: Optional[torch.Tensor] = None,
    ):
        """
        Compute the conditioning embedding from either text prompts or prompt images.

        `input_ids` takes precedence over `conditional_pixel_values` when both are given. The chosen prompt must
        have length `batch_size`. The embedding is computed under `torch.no_grad()`.

        Raises:
            ValueError: If the prompt length differs from `batch_size`, or if neither prompt kind is provided.
        """
        if input_ids is not None:
            # Text prompts: one per query image.
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                return self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )

        if conditional_pixel_values is not None:
            # Image prompts: one per query image.
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                return self.clip.get_image_features(conditional_pixel_values)

        raise ValueError(
            "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
        )

    @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # NOTE(review): the body is only a placeholder in this excerpt; the actual forward logic is not shown.
        pass
.\models\clipseg\processing_clipseg.py
"""
CLIPSeg 的图像/文本处理器类
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class CLIPSegProcessor(ProcessorMixin):
    r"""
    Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor.

    [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See
    [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.

    Args:
        image_processor ([`ViTImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`CLIPTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViTImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        # Deprecated alias: `feature_extractor` is honoured only when `image_processor` itself was not given.
        legacy_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            legacy_extractor = kwargs.pop("feature_extractor")

        if image_processor is None:
            image_processor = legacy_extractor

        # Both components are mandatory despite their `None` defaults.
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        decoded = self.tokenizer.batch_decode(*args, **kwargs)
        return decoded

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        decoded = self.tokenizer.decode(*args, **kwargs)
        return decoded

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning` on access."""
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
.\models\clipseg\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Maps each submodule name to the public names it provides. It is handed to `_LazyModule` at the bottom of
# this file.
_import_structure = {
    "configuration_clipseg": [
        "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "CLIPSegConfig",
        "CLIPSegTextConfig",
        "CLIPSegVisionConfig",
    ],
    "processing_clipseg": ["CLIPSegProcessor"],
}

# The modeling entries are registered only when `is_torch_available()` returns true; otherwise they are
# silently left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_clipseg"] = [
        "CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CLIPSegModel",
        "CLIPSegPreTrainedModel",
        "CLIPSegTextModel",
        "CLIPSegVisionModel",
        "CLIPSegForImageSegmentation",
    ]

if TYPE_CHECKING:
    # Under static type checking the names are imported eagerly, mirroring `_import_structure` above.
    from .configuration_clipseg import (
        CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CLIPSegConfig,
        CLIPSegTextConfig,
        CLIPSegVisionConfig,
    )
    from .processing_clipseg import CLIPSegProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_clipseg import (
            CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST,
            CLIPSegForImageSegmentation,
            CLIPSegModel,
            CLIPSegPreTrainedModel,
            CLIPSegTextModel,
            CLIPSegVisionModel,
        )

else:
    import sys

    # At runtime this module's entry in `sys.modules` is replaced by a `_LazyModule` built from
    # `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\clvp\configuration_clvp.py
import os
from typing import TYPE_CHECKING, Union
if TYPE_CHECKING:
pass
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"susnato/clvp_dev": "https://huggingface.co/susnato/clvp_dev/resolve/main/config.json",
}
class ClvpEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ClvpEncoder`]. It is used to instantiate a CLVP
    text or CLVP speech encoder according to the specified arguments. Instantiating a configuration with the defaults
    will yield a similar configuration to that of the encoder of the CLVP
    [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    # Model-type identifier of this configuration class.
    model_type = "clvp_encoder"

    # NOTE(review): the constructor (and any other members) of this class are not present in this excerpt; only
    # `model_type` is visible here.
# Configuration class of the CLVP decoder; derives from `PretrainedConfig`.
class ClvpDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ClvpDecoder`]. It is used to instantiate a CLVP
    Decoder Model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Decoder part of the CLVP
    [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The architecture is similar to GPT2.

    Example:

    ```python
    >>> from transformers import ClvpDecoderConfig, ClvpDecoder

    >>> decoder_configuration = ClvpDecoderConfig()

    >>> model = ClvpDecoder(decoder_configuration)

    >>> configuration = model.config
    ```"""

    # Model-type identifier of this configuration class.
    model_type = "clvp_decoder"

    def __init__(
        self,
        vocab_size=8194,
        max_position_embeddings=608,
        max_text_tokens=404,
        hidden_size=1024,
        num_hidden_layers=30,
        num_attention_heads=16,
        n_inner=None,
        num_mel_attn_blocks=6,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attention_dropout=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        use_cache=True,
        bos_token_id=8192,
        eos_token_id=8193,
        feature_size=80,
        use_attention_bias=True,
        initializer_factor=1.0,
        decoder_fixing_codes=[83, 45, 45, 248],
        **kwargs,
    ):
        """
        Store every keyword argument as a same-named attribute, then initialise the parent class.

        `bos_token_id` and `eos_token_id` are additionally forwarded to `PretrainedConfig.__init__` together with
        any extra `**kwargs`.
        """
        # NOTE(review): `decoder_fixing_codes` has a mutable list default and is stored without copying, so
        # instances created with the default share one list object.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.max_text_tokens = max_text_tokens
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_inner = n_inner
        self.num_mel_attn_blocks = num_mel_attn_blocks
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.use_cache = use_cache
        self.feature_size = feature_size
        self.use_attention_bias = use_attention_bias
        self.initializer_factor = initializer_factor
        self.decoder_fixing_codes = decoder_fixing_codes

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Single parent initialisation, carrying the special-token ids and the remaining keyword arguments.
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a decoder configuration from a pretrained model name or path.

        When the loaded dictionary belongs to a composite `"clvp"` configuration, only its `"decoder_config"`
        entry is used. A warning is logged when the stored `model_type` differs from `cls.model_type`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Loading from a composite `ClvpConfig`: keep only the decoder sub-configuration.
        if config_dict.get("model_type") == "clvp":
            config_dict = config_dict["decoder_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class ClvpConfig(PretrainedConfig):
    r"""
    [`ClvpConfig`] is the configuration class to store the configuration of a
    [`ClvpModelForConditionalGeneration`]. It is used to instantiate a CLVP model, defining the text model, speech
    model and decoder model configs. Instantiating a configuration with the defaults will yield a similar
    configuration to that of the CLVP [susnato/clvp_dev](https://huggingface.co/susnato/clvp_dev) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "clvp"
    is_composition = True

    def __init__(
        self,
        text_config=None,
        speech_config=None,
        decoder_config=None,
        projection_dim=768,
        logit_scale_init_value=2.6592,
        initializer_factor=1.0,
        **kwargs,
    ):
        """
        Build the three sub-configurations from dictionaries and store the remaining settings.

        A sub-config passed as `None` is replaced by an empty dictionary (an info message is logged), so the
        corresponding sub-configuration class is instantiated with its defaults.
        """
        super().__init__(**kwargs)

        # Missing sub-configs fall back to the defaults of their classes.
        if text_config is None:
            logger.info("`text_config` is `None`. Initializing the `ClvpEncoderConfig` with default values.")
            text_config = {}

        if speech_config is None:
            logger.info("`speech_config` is `None`. initializing the `ClvpEncoderConfig` with default values.")
            speech_config = {}

        if decoder_config is None:
            logger.info("`decoder_config` is `None`. initializing the `ClvpDecoderConfig` with default values.")
            decoder_config = {}

        # Text and speech share the encoder config class; the decoder has its own.
        self.text_config = ClvpEncoderConfig(**text_config)
        self.speech_config = ClvpEncoderConfig(**speech_config)
        self.decoder_config = ClvpDecoderConfig(**decoder_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = initializer_factor

    @classmethod
    def from_sub_model_configs(
        cls,
        text_config: ClvpEncoderConfig,
        speech_config: ClvpEncoderConfig,
        decoder_config: ClvpDecoderConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`ClvpConfig`] (or a derived class) from CLVP text model configuration, CLVP speech model
        configuration and CLVP decoder model configuration.

        Args:
            text_config (`ClvpEncoderConfig`):
                Text model configuration of type [`ClvpEncoderConfig`].
            speech_config (`ClvpEncoderConfig`):
                Speech model configuration of type [`ClvpEncoderConfig`].
            decoder_config (`ClvpDecoderConfig`):
                Decoder model configuration of type [`ClvpDecoderConfig`].

        Returns:
            [`ClvpConfig`]: An instance of a configuration object
        """
        # The constructor expects plain dictionaries, so each sub-config is serialised first.
        text_dict = text_config.to_dict()
        speech_dict = speech_config.to_dict()
        decoder_dict = decoder_config.to_dict()

        return cls(
            text_config=text_dict,
            speech_config=speech_dict,
            decoder_config=decoder_dict,
            **kwargs,
        )
.\models\clvp\convert_clvp_to_hf.py
"""
CLVP权重转换脚本
"""
import argparse
import os
import torch
from huggingface_hub import hf_hub_download
from transformers import ClvpConfig, ClvpModelForConditionalGeneration
# Source URL of each original checkpoint, keyed by the name `convert_clvp_weights` uses to tell them apart.
_MODELS = {
    "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth",
    "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth",
}

# NOTE(review): `dim` and `sub_dim` are not referenced by any code visible in this excerpt; presumably they are
# used by the decoder conversion, whose body is not shown — confirm.
dim = 1024
sub_dim = dim // 16

# Substring replacements applied to encoder checkpoint keys by `convert_encoder_weights`.
# Replacements run sequentially in insertion order, so the order matters: more specific patterns such as
# "1.wrap.net.0" must come before the shorter "1.wrap" that they contain.
CLVP_ENCODERS_MAPPING = {
    "text_transformer.transformer.attn_layers": "text_encoder_model",
    "speech_transformer.transformer.attn_layers": "speech_encoder_model",
    "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm",
    "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm",
    "to_text_latent": "text_encoder_model.projection",
    "to_speech_latent": "speech_encoder_model.projection",
    "text_emb": "text_encoder_model.token_embedding",
    "speech_emb": "speech_encoder_model.token_embedding",
    "1.wrap.net.0": "mlp.fc1",
    "1.wrap.net.3": "mlp.fc2",
    "1.wrap": "self_attn",
    "to_out": "out_proj",
    "to_q": "q_proj",
    "to_k": "k_proj",
    "to_v": "v_proj",
    "temperature": "logit_scale",
}

# Substring replacements for decoder checkpoint keys.
# NOTE(review): nothing visible in this excerpt reads this mapping; presumably it is consumed by the decoder
# conversion — confirm.
CLVP_DECODER_MAPPING = {
    "conditioning_encoder.init": "conditioning_encoder.mel_conv",
    "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks",
    "mel_attn_blocks": "group_norms",
    ".norm.weight": ".weight",
    ".norm.bias": ".bias",
    "text_embedding": "conditioning_encoder.text_token_embedding",
    "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding",
    "final_norm": "speech_decoder_model.final_norm",
    "mel_head": "speech_decoder_model.lm_head",
    "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm",
    "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer",
    "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer",
    "gpt.h": "speech_decoder_model.model.decoder.layers",
    "ln_1": "input_layernorm",
    "ln_2": "post_attention_layernorm",
}
def update_index(present_index):
    """
    Map an original layer index to its converted index by halving it.

    Consecutive index pairs (0, 1), (2, 3), ... map to 0, 1, ...; an odd index maps to the same value as the
    even index just below it.

    Args:
        present_index (`int`): Index of the layer in the original checkpoint.

    Returns:
        `int`: `present_index // 2`.
    """
    # Floor division stays in integer arithmetic (exact for arbitrarily large ints) and covers both the even and
    # the odd case, which previously went through float division.
    return present_index // 2
def convert_encoder_weights(original_weights):
    """
    Rename the keys of an encoder checkpoint.

    Keys are processed in sorted order. Each entry is popped from `original_weights` (the input mapping is
    emptied as a side effect) and stored under its new name.

    Args:
        original_weights: Mapping from original parameter names to weights.

    Returns:
        `dict`: The same weights keyed by their converted names.
    """
    renamed = {}

    for source_key in sorted(original_weights):
        target_key = source_key

        # Norm gains: the fifth dot-separated component is the layer index; its parity selects the norm name.
        if "0.0.g" in target_key:
            layer_idx = target_key.split(".")[4]
            norm_name = "input_rmsnorm.weight" if int(layer_idx) % 2 == 0 else "post_attention_rmsnorm.weight"
            target_key = target_key.replace("0.0.g", norm_name)

        # Renumber the layer index through `update_index`.
        if "transformer.attn_layers.layers" in target_key:
            layer_idx = target_key.split(".")[4]
            new_idx = update_index(int(layer_idx))
            target_key = target_key.replace(
                f"transformer.attn_layers.layers.{layer_idx}", f"transformer.attn_layers.layers.{new_idx}"
            )

        # Apply the substring renames in mapping order; `str.replace` is a no-op when the pattern is absent.
        for pattern, replacement in CLVP_ENCODERS_MAPPING.items():
            target_key = target_key.replace(pattern, replacement)

        renamed[target_key] = original_weights.pop(source_key)

    return renamed
def convert_decoder_weights(original_weights):
    """
    Convert the decoder checkpoint.

    NOTE(review): in this excerpt the function only sorts the keys and returns an empty dictionary; no key is
    actually converted. The conversion loop appears to have been left out — confirm against the full script,
    since `convert_clvp_weights` later loads the result with `strict=True`.
    """
    converted_weights = {}
    # Computed but not used by anything below in this excerpt.
    original_weights_keys = sorted(original_weights.keys())
    return converted_weights
def _download(url: str, root: str):
    """
    Download one file from the Hugging Face Hub, deriving repo id and filename from `url`.

    Args:
        url: File URL; segments 3 and 4 (split on "/") form the repo id, the last two segments form the filename.
        root: Passed to `hf_hub_download` as `force_filename`.
    """
    segments = url.split("/")
    hf_hub_download(
        repo_id=f"{segments[3]}/{segments[4]}",
        filename=f"{segments[-2]}/{segments[-1]}",
        force_filename=root,
        local_dir_use_symlinks=False,
    )
def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path):
    """
    Convert the original CLVP checkpoints and save the resulting model.

    Each checkpoint listed in `_MODELS` is looked up in `checkpoint_path` and downloaded if missing. The
    converted weights are loaded strictly into a `ClvpModelForConditionalGeneration` and saved.

    Args:
        checkpoint_path: Folder that holds (or will receive) the original checkpoints.
        pytorch_dump_folder_path: Destination passed to `save_pretrained`.
    """
    converted_checkpoint = {}

    for model_name, model_url in _MODELS.items():
        # The local file name is the last URL segment.
        local_path = os.path.join(checkpoint_path, model_url.split("/")[-1])
        if not os.path.exists(local_path):
            print(f"\n{model_name} was not found! Downloading it to {local_path}")
            _download(url=model_url, root=local_path)

        loaded = torch.load(local_path, map_location="cpu")
        if model_name == "clvp":
            clvp_checkpoint = loaded
        else:
            decoder_checkpoint = loaded

    # Encoder weights first, decoder weights second; later keys would overwrite earlier ones on collision.
    converted_checkpoint.update(**convert_encoder_weights(clvp_checkpoint))
    converted_checkpoint.update(**convert_decoder_weights(decoder_checkpoint))

    config = ClvpConfig.from_pretrained("susnato/clvp_dev")
    model = ClvpModelForConditionalGeneration(config)

    # strict=True: every model parameter must be present in the converted checkpoint, and vice versa.
    model.load_state_dict(converted_checkpoint, strict=True)
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Model saved at {pytorch_dump_folder_path}!")
# Command-line entry point: parse the two paths and run the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Folder holding the original checkpoints; no default, so it is None when the flag is omitted.
    parser.add_argument(
        "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)"
    )
    # Destination for the converted model.
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model. (Please enter full path)",
    )
    args = parser.parse_args()

    convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path)
.\models\clvp\feature_extraction_clvp.py
"""
Feature extractor class for CLVP
"""
from typing import List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class ClvpFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a CLVP feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 22050):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
default_audio_length (`int`, *optional*, defaults to 6):
The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
automatically be set to default_audio_length * `self.sampling_rate`.
hop_length (`int`, *optional*, defaults to 256):
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.
n_fft (`int`, *optional*, defaults to 1024):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
mel_norms (`list` of length `feature_size`, *optional*):
If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
mel-filter.
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether to return the attention mask. If left to the default, it will return the attention mask.
[What are attention masks?](../glossary#attention-mask)
"""
# Names of the model inputs: the input features and the attention mask
model_input_names = ["input_features", "attention_mask"]
def __init__(
    self,
    feature_size=80,
    sampling_rate=22050,
    default_audio_length=6,
    hop_length=256,
    chunk_length=30,
    n_fft=1024,
    padding_value=0.0,
    mel_norms=None,
    return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
    **kwargs,
):
    """
    Store the STFT / mel settings and precompute the mel filter bank.

    `feature_size`, `sampling_rate`, `padding_value`, `return_attention_mask` and any extra `kwargs` are forwarded
    to the parent initializer; the remaining arguments are kept as attributes on the instance.
    """
    super().__init__(
        return_attention_mask=return_attention_mask,
        padding_value=padding_value,
        sampling_rate=sampling_rate,
        feature_size=feature_size,
        **kwargs,
    )

    # Plain copies of the constructor arguments.
    self.sampling_rate = sampling_rate
    self.default_audio_length = default_audio_length
    self.chunk_length = chunk_length
    self.hop_length = hop_length
    self.n_fft = n_fft
    self.mel_norms = mel_norms

    # Samples per chunk, and how many hop-sized frames fit into that many samples.
    samples_per_chunk = chunk_length * sampling_rate
    self.n_samples = samples_per_chunk
    self.nb_max_frames = samples_per_chunk // hop_length

    # Mel filter bank covering 0 Hz - 8 kHz over the one-sided FFT bins.
    frequency_bins = 1 + (n_fft // 2)
    self.mel_filters = mel_filter_bank(
        num_frequency_bins=frequency_bins,
        num_mel_filters=feature_size,
        min_frequency=0.0,
        max_frequency=8000.0,
        sampling_rate=sampling_rate,
        norm="slaney",
        mel_scale="htk",
    )
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
    """
    Compute the log-mel spectrogram of `waveform` and, when `self.mel_norms` is set, divide each mel-filter row by
    its corresponding norm.
    """
    hann_window = window_function(self.n_fft, "hann")

    # Power mel spectrogram; the logarithm is applied below rather than inside `spectrogram` (log_mel=None).
    mel_spec = spectrogram(
        waveform,
        hann_window,
        frame_length=self.n_fft,
        hop_length=self.hop_length,
        power=2.0,
        mel_filters=self.mel_filters,
        log_mel=None,
    )

    # Floor the values at 1e-5 before taking the natural log.
    floored = np.clip(mel_spec, a_min=1e-5, a_max=None)
    log_mel_spec = np.log(floored)

    if self.mel_norms is None:
        return log_mel_spec

    # One norm per mel filter, broadcast along the second axis.
    return log_mel_spec / np.array(self.mel_norms)[:, None]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
sampling_rate: Optional[int] = None,
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
padding: Optional[str] = "max_length",
max_length: Optional[int] = None,
**kwargs,
.\models\clvp\modeling_clvp.py
import copy
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...generation import GenerationConfig
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPooling,
CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import Conv1D
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_clvp import (
ClvpConfig,
ClvpDecoderConfig,
ClvpEncoderConfig,
)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "susnato/clvp_dev"
CLVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"susnato/clvp_dev",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of `logits` against the diagonal, i.e. the target class of row `i` is `i`."""
    diagonal_targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, diagonal_targets)
def clvp_loss(similarity: torch.Tensor) -> torch.Tensor:
    """CLVP loss: the mean of the contrastive loss over `similarity` and over its transpose."""
    row_wise = contrastive_loss(similarity)
    column_wise = contrastive_loss(similarity.t())
    return (row_wise + column_wise) / 2.0
def rotate_half(x):
    """Split the last dimension in two and return `(-second_half, first_half)` concatenated along it."""
    midpoint = x.shape[-1] // 2
    first_half, second_half = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-second_half, first_half), dim=-1)
def apply_rotary_pos_emb(q, k, v, cos, sin, position_ids, unsqueeze_dim=1):
    """
    Apply rotary position embeddings to the query, key and value tensors.

    `cos` and `sin` are indexed with `position_ids` and given an extra axis at `unsqueeze_dim` before being
    combined with each tensor as `t * cos + rotate_half(t) * sin`.

    Returns:
        A tuple `(q_embed, k_embed, v_embed)`.
    """
    cos_at_pos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_pos = sin[position_ids].unsqueeze(unsqueeze_dim)
    return tuple((tensor * cos_at_pos) + (rotate_half(tensor) * sin_at_pos) for tensor in (q, k, v))
def _pad_extra_bos_eos_tokens(
    input_ids,
    attention_mask=None,
    pad_token_id=0,
    bos_token_id=255,
    eos_token_id=0,
    add_bos_token=True,
    add_eos_token=True,
):
    """
    This method adds extra bos and eos tokens to input_ids and accordingly modifies the attention_mask which is used in
    `ClvpConditioningEncoder` and the generation loop of the `ClvpModelForConditionalGeneration`.

    Returns:
        A tuple `(modified_input_ids, attention_mask)`; `attention_mask` stays `None` when it was not provided.
    """
    pad = torch.nn.functional.pad

    # Prepend the bos token (and a matching 1 in the mask) to every row.
    if add_bos_token:
        input_ids = pad(input_ids, (1, 0), value=bos_token_id)
        if attention_mask is not None:
            attention_mask = pad(attention_mask, (1, 0), value=1)

    if not add_eos_token:
        return input_ids, attention_mask

    batch_size, sequence_length = input_ids.shape[0], input_ids.shape[1]
    modified_input_ids = torch.zeros(
        (batch_size, sequence_length + 1), dtype=input_ids.dtype, device=input_ids.device
    )
    for row_index, row in enumerate(input_ids):
        if torch.isin(row, pad_token_id).sum():
            # Insert eos right before the first pad token of the row.
            first_pad = torch.where(row == pad_token_id)[0].min()
            eos = torch.tensor([eos_token_id], device=input_ids.device)
            modified_input_ids[row_index] = torch.concatenate([row[:first_pad], eos, row[first_pad:]])
        else:
            # No pad token in this row: eos goes at the very end.
            modified_input_ids[row_index] = pad(row, (0, 1), value=eos_token_id)

    # The mask grows by one column; the extra 1 is added on the left.
    if attention_mask is not None:
        attention_mask = pad(attention_mask, (1, 0), value=1)

    return modified_input_ids, attention_mask
embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ClvpOutput(ModelOutput):
"""
Output container bundling the contrastive loss, the generated speech ids, the speech/text similarity logits,
the projected embeddings and the per-model outputs and hidden states. Every field defaults to `None`.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for speech-text similarity.
    speech_ids (`torch.LongTensor`, *optional*):
        speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
    logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
        The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
        The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
        model.
    speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
        model.
    text_model_output (`BaseModelOutputWithPooling`):
        The pooled output of the `last_hidden_state` of the text encoder Model.
    speech_model_output (`BaseModelOutputWithPooling`):
        The pooled output of the `last_hidden_state` of the speech encoder Model.
    decoder_hidden_states (`torch.FloatTensor`, *optional*):
        The hidden states of the decoder model.
    text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
        The hidden states of the text encoder model.
    speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
        The hidden states of the speech encoder model.
"""
loss: Optional[torch.FloatTensor] = None
speech_ids: Optional[torch.LongTensor] = None
logits_per_speech: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
speech_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
speech_model_output: BaseModelOutputWithPooling = None
decoder_hidden_states: torch.FloatTensor = None
text_encoder_hidden_states: torch.FloatTensor = None
speech_encoder_hidden_states: torch.FloatTensor = None
class ClvpRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        ClvpRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize in float32, cast back to the input dtype, then apply the learned scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        # Mean of squares over the feature axis (no mean subtraction).
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)
class ClvpRotaryPositionalEmbedding(nn.Module):
    """
    Rotary Position Embedding Class for CLVP. It was proposed in the paper 'ROFORMER: ENHANCED TRANSFORMER WITH ROTARY
    POSITION EMBEDDING', Please see https://arxiv.org/pdf/2104.09864v1.pdf .
    """

    def __init__(self, config):
        super().__init__()
        # Rotary dimension: projection_dim / (2 * num_attention_heads), but never below 32.
        dim = max(config.projection_dim // (config.num_attention_heads * 2), 32)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))

        self.register_buffer("inv_freq", inv_freq)
        self.cached_sequence_length = None
        self.cached_rotary_positional_embedding = None

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """
        Return the rotary angles for every position of `hidden_states` (sequence length is read from dim 1).

        The result has shape `(1, sequence_length, dim)` and is cached. The cache is reused only while the sequence
        length is unchanged AND the cached tensor still matches the device and dtype of `inv_freq`; the cache is a
        plain attribute, so it does not follow the module through `.to()` / `.half()` / `.double()` and would
        otherwise be returned stale.
        """
        sequence_length = hidden_states.shape[1]

        cached = self.cached_rotary_positional_embedding
        if (
            cached is not None
            and sequence_length == self.cached_sequence_length
            and cached.device == self.inv_freq.device
            and cached.dtype == self.inv_freq.dtype
        ):
            return cached

        self.cached_sequence_length = sequence_length
        time_stamps = torch.arange(sequence_length, device=hidden_states.device).type_as(self.inv_freq)
        # Outer product position x inverse frequency, duplicated to span the full rotary dimension.
        freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
        embeddings = torch.cat((freqs, freqs), dim=-1)

        self.cached_rotary_positional_embedding = embeddings.unsqueeze(0)
        return self.cached_rotary_positional_embedding
class ClvpSelfAttention(nn.Module):
"""
Multi-headed attention to combine Absolute and Rotary Positional Embeddings into a single Attention module.
"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
if hasattr(config, "max_position_embeddings"):
max_positions = config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
bias = bias.view(1, 1, max_positions, max_positions)
self.register_buffer("bias", bias, persistent=False)
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.FloatTensor,
rotary_pos_emb: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
"""
This class defines an encoder layer for the CLVP model, comprising self-attention mechanism and MLP for processing hidden states.
"""
def __init__(self, config: ClvpConfig):
    """Create the self-attention block, the MLP and the two RMSNorm layers, all sized by `config.hidden_size`."""
    super().__init__()
    self.config = config

    hidden_size = config.hidden_size
    norm_eps = config.layer_norm_eps
    self.embed_dim = hidden_size

    self.self_attn = ClvpSelfAttention(config)
    self.mlp = ClvpEncoderMLP(config)
    self.input_rmsnorm = ClvpRMSNorm(hidden_size, eps=norm_eps)
    self.post_attention_rmsnorm = ClvpRMSNorm(hidden_size, eps=norm_eps)
def forward(
self,
hidden_states: torch.FloatTensor,
rotary_pos_emb: torch.FloatTensor,
attention_mask: torch.LongTensor,
position_ids: torch.LongTensor,
output_attentions: Optional[bool] = False,
) -> torch.FloatTensor:
# Applies input RMSNorm -> self-attention -> post-attention RMSNorm -> MLP and returns only the hidden states;
# no residual additions are made and the attention weights are discarded.
# NOTE(review): another `forward` is defined immediately after this one. If both sit in the same class body, the
# later definition replaces this one and this code never runs — confirm which variant is intended.
hidden_states = self.input_rmsnorm(hidden_states)
# NOTE(review): the arguments are positional, so `output_attentions` is bound to whatever the fifth parameter of
# `self.self_attn.forward` is (for `ClvpSelfAttention` that appears to be `past_key_value`), and the result is
# unpacked into exactly two values — verify both against the attention module.
hidden_states, attention_weights = self.self_attn(
hidden_states, rotary_pos_emb, attention_mask, position_ids, output_attentions
)
hidden_states = self.post_attention_rmsnorm(hidden_states)
hidden_states = self.mlp(hidden_states)
return hidden_states
def forward(
    self,
    hidden_states: torch.FloatTensor,
    rotary_pos_emb: torch.FloatTensor,
    attention_mask: torch.FloatTensor,
    position_ids: torch.LongTensor,
    output_attentions: bool = False,
) -> Tuple[torch.FloatTensor]:
    """
    Run one pre-norm encoder layer: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, embed_dim)`):
            input to the layer.
        rotary_pos_emb (`torch.FloatTensor`):
            rotary position embeddings generated by `ClvpRotaryPositionalEmbedding` module.
        attention_mask (`torch.FloatTensor` of shape `(batch, 1, tgt_len, src_len)`):
            attention mask where padding elements are indicated by very large negative values.
        position_ids (`torch.LongTensor`):
            Denotes position ids of the input tokens.
        output_attentions (`bool`, *optional*, defaults to `False`):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.

    Returns:
        A tuple `(hidden_states,)`, extended with the last element of the attention outputs when
        `output_attentions` is truthy.
    """
    # The method reads `self.input_rmsnorm`, `self.self_attn`, ... so `self` must be the first parameter.
    residual = hidden_states
    hidden_states = self.input_rmsnorm(hidden_states)

    attention_outputs = self.self_attn(
        hidden_states=hidden_states,
        rotary_pos_emb=rotary_pos_emb,
        attention_mask=attention_mask,
        position_ids=position_ids,
        output_attentions=output_attentions,
    )
    hidden_states = attention_outputs[0]
    hidden_states = residual + hidden_states

    residual = hidden_states
    hidden_states = self.post_attention_rmsnorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)
    if output_attentions:
        outputs += (attention_outputs[-1],)

    return outputs
class ClvpDecoderMLP(nn.Module):
    """Decoder feed-forward block: `c_fc` -> activation -> `c_proj` -> dropout."""

    def __init__(self, intermediate_size, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, hidden_size)
        self.c_proj = Conv1D(hidden_size, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        """Apply the two projections with the configured activation in between, followed by dropout."""
        activated = self.act(self.c_fc(hidden_states))
        projected = self.c_proj(activated)
        return self.dropout(projected)
class ClvpDecoderLayer(nn.Module):
    """Pre-LayerNorm decoder layer: self-attention and an MLP, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        norm_eps = config.layer_norm_epsilon
        # Fall back to 4x the hidden size when the config does not set `n_inner`.
        if config.n_inner is not None:
            inner_dim = config.n_inner
        else:
            inner_dim = 4 * hidden_size

        self.input_layernorm = nn.LayerNorm(hidden_size, eps=norm_eps)
        self.attn = ClvpSelfAttention(config)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=norm_eps)
        self.mlp = ClvpDecoderMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        """
        Run the layer and return `(hidden_states, *extras)`.

        `extras` are the attention outputs after the first element; when `use_cache` is falsy the first of those
        extras is dropped as well.
        """
        # Self-attention sub-block.
        attn_outputs = self.attn(
            self.input_layernorm(hidden_states),
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output, extras = attn_outputs[0], attn_outputs[1:]
        hidden_states = attn_output + hidden_states

        # Feed-forward sub-block.
        feed_forward_hidden_states = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + feed_forward_hidden_states

        if not use_cache:
            extras = extras[1:]
        return (hidden_states,) + extras
class ClvpConditioningEncoder(nn.Module):
"""
This class processes the log-mel spectrograms (extracted by the Feature Extractor) and text tokens (produced by the
tokenizer) as inputs for the decoder model.
"""
def __init__(self, config: ClvpConfig):
    """Create the text token/position embeddings, the mel 1x1 convolution and the mel attention stack."""
    super().__init__()
    self.text_config = config.text_config
    self.decoder_config = config.decoder_config

    decoder_cfg = self.decoder_config
    hidden_size = decoder_cfg.hidden_size
    num_blocks = decoder_cfg.num_mel_attn_blocks

    # Text side: token and position embeddings, both projected to the decoder hidden size.
    self.text_token_embedding = nn.Embedding(self.text_config.vocab_size, hidden_size)
    self.text_position_embedding = nn.Embedding(decoder_cfg.max_text_tokens, hidden_size)

    # Mel side: kernel-size-1 convolution followed by `num_mel_attn_blocks` (GroupNorm, attention) pairs.
    self.mel_conv = nn.Conv1d(decoder_cfg.feature_size, hidden_size, kernel_size=1)
    num_groups = self.compute_groupnorm_groups(hidden_size)
    self.group_norms = nn.ModuleList(
        [nn.GroupNorm(num_groups, hidden_size, eps=1e-5, affine=True) for _ in range(num_blocks)]
    )
    self.mel_attn_blocks = nn.ModuleList([ClvpSelfAttention(decoder_cfg) for _ in range(num_blocks)])

    self.gradient_checkpointing = False
def compute_groupnorm_groups(self, channels: int, groups: int = 32):
    """
    Calculates the value of `num_groups` for nn.GroupNorm. This logic is taken from the official tortoise
    repository. link :
    https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/models/arch_util.py#L26

    Raises:
        ValueError: If the resulting number of groups is 2 or less.
    """
    # Small channel counts override the requested starting value.
    if channels <= 16:
        groups = 8
    elif channels <= 64:
        groups = 16

    # Halve until the group count divides the channel count.
    while channels % groups:
        groups = int(groups / 2)

    if groups > 2:
        return groups

    raise ValueError(
        f"Number of groups for the GroupNorm must be greater than 2, but it is {groups}."
        f"Please consider using a different `hidden_size`"
    )
def forward(
self,
input_features: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ClvpConfig
base_model_prefix = "clvp"
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
    """
    Initialize the weights of `module` according to its type.

    Embeddings and linear / convolution layers get a normal init with std `initializer_factor * 0.02` (biases are
    zeroed); `ClvpEncoderMLP`, `ClvpEncoder` and `ClvpConditioningEncoder` get dedicated scaling; `nn.LayerNorm`
    is reset to weight 1 and bias 0.
    """
    factor = self.config.initializer_factor
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=factor * 0.02)
    elif isinstance(module, (nn.Linear, Conv1D, nn.Conv1d)):
        module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, ClvpEncoderMLP):
        in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
        fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
        # `getattr` needs a default here: without one a `fc1` lacking `proj` raises AttributeError and the
        # `fc1.weight` fallback can never be taken.
        fc1_proj = getattr(module.fc1, "proj", None)
        nn.init.normal_(fc1_proj.weight if fc1_proj else module.fc1.weight, std=fc_std)
        nn.init.normal_(module.fc2.weight, std=in_proj_std)
    elif isinstance(module, ClvpEncoder):
        config = self.config.text_config if hasattr(self.config, "text_config") else self.config
        factor = config.initializer_factor
        module.projection.weight.data.normal_(mean=0.0, std=factor * (config.hidden_size**-0.5))
    elif isinstance(module, ClvpConditioningEncoder):
        module.mel_conv.weight.data.normal_(mean=0.0, std=factor)
        module.mel_conv.bias.data.zero_()
    elif isinstance(module, ClvpForCausalLM):
        # NOTE(review): `named_parameters()` on a parent module yields dotted, prefixed names, so an exact match
        # on "c_proj.weight" may never hit for nested MLPs — confirm whether a suffix match was intended.
        for name, p in module.named_parameters():
            if name == "c_proj.weight":
                p.data.normal_(
                    mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers))
                )

    if isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
CLVP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Parameters:
config ([`ClvpConfig`]): Model configuration class with all the parameters of the model.
CLVP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, time_dim)`):
Indicates log mel-spectrogram representations for audio returned by [`ClvpFeatureExtractor`].
conditioning_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for `ClvpConditioningEncoder`. Can be used in place of `input_ids`.
text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for the text encoder model passed in place of `input_ids`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding text token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
CLVP_DECODER_INPUTS_DOCSTRING = r"""
"""
class ClvpEncoder(ClvpPreTrainedModel):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`ClvpEncoderLayer`].
Args:
config: ClvpConfig
"""
def __init__(self, config: ClvpConfig):
    """Build the token embedding, optional rotary embedding, encoder layers, pooling, final norm and projection."""
    super().__init__(config)
    self.config = config

    hidden_size = config.hidden_size
    self.token_embedding = nn.Embedding(config.vocab_size, hidden_size)

    # Rotary embeddings are only created when the config asks for them.
    if config.use_rotary_embedding:
        self.rotary_pos_emb = ClvpRotaryPositionalEmbedding(config)
    else:
        self.rotary_pos_emb = None

    encoder_layers = [ClvpEncoderLayer(config) for _ in range(config.num_hidden_layers)]
    self.layers = nn.ModuleList(encoder_layers)

    self.sequence_summary = SequenceSummary(config)
    self.final_layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
    self.projection = nn.Linear(hidden_size, config.projection_dim, bias=False)

    self.gradient_checkpointing = False

    self.post_init()
def get_input_embeddings(self):
    """Return the module stored in `self.token_embedding`."""
    return self.token_embedding
def set_input_embeddings(self, value):
    """Replace `self.token_embedding` with `value`."""
    self.token_embedding = value
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class ClvpDecoder(ClvpPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ClvpDecoderLayer`]
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        hidden_size = config.hidden_size
        norm_eps = config.layer_norm_epsilon

        # Token and learned position embeddings, followed by embedding dropout.
        self.input_embeds_layer = nn.Embedding(config.vocab_size, hidden_size)
        self.position_embeds_layer = nn.Embedding(config.max_position_embeddings, hidden_size)
        self.drop = nn.Dropout(config.embd_pdrop)

        decoder_layers = [ClvpDecoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(decoder_layers)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=norm_eps)

        self.gradient_checkpointing = False

        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding module."""
        self.input_embeds_layer = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer_index, heads in heads_to_prune.items():
            target_layer = self.layers[layer_index]
            target_layer.attn.prune_heads(heads)

    @add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Performs forward pass of the decoder model.

        NOTE: the body of this method is not present in this excerpt; as written it returns `None`.

        Args:
            input_ids: Optionally provided input IDs.
            attention_mask: Optionally provided attention mask.
            token_type_ids: Optionally provided token type IDs.
            position_ids: Optionally provided position IDs.
            head_mask: Optionally provided head mask.
            past_key_values: Optionally provided past key values.
            inputs_embeds: Optionally provided input embeddings.
            use_cache: Optionally use cache.
            output_attentions: Optionally output attentions.
            output_hidden_states: Optionally output hidden states.
            return_dict: Optionally return as dictionary.
        Returns:
            Model output.
        """
        return None
@add_start_docstrings(
    "The bare Clvp decoder model outputting raw hidden-states without any specific head on top.",
    CLVP_START_DOCSTRING,
)
class ClvpModel(ClvpPreTrainedModel):
    def __init__(self, config: ClvpDecoderConfig):
        super().__init__(config)
        self.config = config
        self.decoder = ClvpDecoder(config)

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.decoder.input_embeds_layer

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module."""
        self.decoder.input_embeds_layer = value

    def get_decoder(self):
        """Return the wrapped `ClvpDecoder`."""
        return self.decoder

    @add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        # Any flag left as `None` falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Tuple mode: hand the decoder result back untouched.
        if not return_dict:
            return decoder_outputs

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
        )
@add_start_docstrings(
"The CLVP decoder model with a language modelling head on top.",
CLVP_START_DOCSTRING,
)
class ClvpForCausalLM(ClvpPreTrainedModel):
def __init__(self, config):
    """Wrap a `ClvpModel` with a final LayerNorm and a linear language-modelling head."""
    super().__init__(config)
    self.config = config

    hidden_size = config.hidden_size
    self.model = ClvpModel(config)
    self.final_norm = nn.LayerNorm(hidden_size)
    self.lm_head = nn.Linear(hidden_size, config.vocab_size, bias=True)

    self.post_init()
def get_input_embeddings(self):
    """Return the token embedding module of the wrapped decoder."""
    decoder = self.model.decoder
    return decoder.input_embeds_layer
def set_input_embeddings(self, new_embeddings):
    """Replace the token embedding module of the wrapped decoder."""
    decoder = self.model.decoder
    decoder.input_embeds_layer = new_embeddings
def _prepare_model_inputs(
    self,
    inputs: Optional[torch.Tensor] = None,
    bos_token_id: Optional[int] = None,
    model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
):
    """Placeholder: the body is elided (`...`) in this excerpt, so as written the method returns `None`."""
    ...
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, inputs_embeds=None, conditioning_embeds=None, **kwargs
):
    """
    Assemble the keyword arguments for one generation step.

    Args:
        input_ids: Token ids generated so far; trimmed to the not-yet-cached suffix when a cache is present.
        past_key_values: Cached key/values; the cached length is read from `past_key_values[0][0].shape[2]`.
        inputs_embeds: Used instead of `input_ids`, but only when no cache object was passed.
        conditioning_embeds: When given together with a cache, `position_ids` becomes the full `input_ids` length.
        **kwargs: `token_type_ids`, `attention_mask`, `position_ids` and `use_cache` are read from here.

    Returns:
        A dict with either `inputs_embeds` or `input_ids`, plus `past_key_values`, `use_cache`, `position_ids`
        and `token_type_ids`.
    """
    # Remembered before trimming: the length of the complete sequence.
    input_ids_length = input_ids.shape[-1]
    token_type_ids = kwargs.get("token_type_ids", None)

    # With a cache, only the tokens that are not cached yet are fed to the model.
    if past_key_values:
        past_length = past_key_values[0][0].shape[2]

        if input_ids.shape[1] > past_length:
            remove_prefix_length = past_length
        else:
            # Otherwise keep only the final token.
            remove_prefix_length = input_ids.shape[1] - 1

        input_ids = input_ids[:, remove_prefix_length:]
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -input_ids.shape[1] :]

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # Positions are derived from the mask on the fly; masked slots get the dummy position 1.
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -1].unsqueeze(-1)
    else:
        # Note: position_ids passed in explicitly are dropped on this path.
        position_ids = None

    if conditioning_embeds is not None and past_key_values is not None:
        position_ids = torch.tensor([input_ids_length], dtype=torch.long, device=input_ids.device)

    # `inputs_embeds` is only honoured when no cache object was passed at all.
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "token_type_ids": token_type_ids,
        }
    )
    return model_inputs
@add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Targets for the language modeling loss. The one-position shift between logits and labels is applied
        inside this method, so `labels = input_ids` may be passed directly. Values are taken from
        `[-100, 0, ..., config.vocab_size]`; positions set to `-100` are ignored (masked) and the loss is
        computed only over labels in `[0, ..., config.vocab_size]`.
    """
    # Any flag left as `None` falls back to the value stored on the config.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    decoder_outputs = self.model(
        input_ids=input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Final layer norm followed by the vocabulary projection.
    normed_states = self.final_norm(decoder_outputs[0])
    lm_logits = self.lm_head(normed_states)

    loss = None
    if labels is not None:
        labels = labels.to(lm_logits.device)
        # Position t predicts token t + 1: drop the last logit and the first label.
        predictions = lm_logits[..., :-1, :].contiguous()
        targets = labels[..., 1:].contiguous()
        criterion = CrossEntropyLoss()
        loss = criterion(predictions.view(-1, predictions.size(-1)), targets.view(-1))

    if return_dict:
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
        )

    # Tuple output: logits replace the hidden states; the loss is prepended when present.
    output = (lm_logits,) + decoder_outputs[1:]
    return output if loss is None else (loss,) + output
@staticmethod
def _reorder_cache(
    past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
    """
    This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
    [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
    beam_idx at every generation step.

    Args:
        past_key_values: Per-layer tuples of cached tensors; each tensor is indexed along dimension 0.
        beam_idx: Indices selecting which entries of dimension 0 to keep, in the new order.

    Returns:
        A tuple with the same nesting as `past_key_values` whose tensors are re-ordered by `beam_idx`.
    """
    return tuple(
        # `beam_idx` is moved to each tensor's device before indexing.
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
        for layer_past in past_key_values
    )
@add_start_docstrings(
"The composite CLVP model with a text encoder, speech encoder and speech decoder model."
"The speech decoder model generates the speech_ids from the text and the text encoder and speech encoder works"
"together to filter out the best speech_ids.",
CLVP_START_DOCSTRING,
)
class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
config_class = ClvpConfig
def __init__(self, config: ClvpConfig):
    """
    Validate the three sub-configs and build the conditioning encoder, speech decoder, text encoder and
    speech encoder, plus the learnable `logit_scale` parameter.

    Raises:
        ValueError: If `config.text_config` or `config.speech_config` is not a `ClvpEncoderConfig`, or
            `config.decoder_config` is not a `ClvpDecoderConfig`.
    """
    super().__init__(config)

    # (attribute on `config`, required class, class name as printed in the error message)
    required_sub_configs = (
        ("text_config", ClvpEncoderConfig, "ClvpEncoderConfig"),
        ("speech_config", ClvpEncoderConfig, "ClvpEncoderConfig"),
        ("decoder_config", ClvpDecoderConfig, "ClvpDecoderConfig"),
    )
    for attr_name, expected_type, type_label in required_sub_configs:
        sub_config = getattr(config, attr_name)
        if not isinstance(sub_config, expected_type):
            raise ValueError(
                f"config.{attr_name} is expected to be of type `{type_label}` but is of type"
                f" {type(sub_config)}."
            )

    # Sub-modules are created in the same order as before so parameter registration order is unchanged.
    self.conditioning_encoder = ClvpConditioningEncoder(config)
    self.speech_decoder_model = ClvpForCausalLM(config.decoder_config)
    self.text_encoder_model = ClvpEncoder(config.text_config)
    self.speech_encoder_model = ClvpEncoder(config.speech_config)

    initial_logit_scale = torch.tensor(self.config.logit_scale_init_value)
    self.logit_scale = nn.Parameter(initial_logit_scale)

    self.post_init()
def fix_speech_decoder_output(self, speech_ids: torch.LongTensor) -> torch.LongTensor:
    """
    Post-process the decoder output: drop the first column, overwrite every `eos_token_id` and everything
    after the first one with the first fixing code, and rewrite the last three positions of such sequences.

    Args:
        speech_ids (`torch.LongTensor`):
            Output of the decoder model; indexed as a 2-D tensor (rows are sequences).

    Returns:
        `torch.LongTensor`: the fixed ids, one column shorter than the input. Rows without any
        `eos_token_id` are returned unchanged apart from the dropped first column.
    """
    fixing_codes = self.config.decoder_config.decoder_fixing_codes
    fill_code = fixing_codes[0]

    # The first column is discarded before any stop-token handling.
    speech_ids = speech_ids[:, 1:]

    stop_flags = torch.where(speech_ids == self.speech_decoder_model.config.eos_token_id, 1, 0)
    speech_ids = torch.masked_fill(speech_ids, mask=stop_flags.bool(), value=fill_code)

    for row, row_flags in enumerate(stop_flags):
        # No stop token in this row: leave it untouched.
        if row_flags.sum() == 0:
            continue

        # `argmax` on a 0/1 row yields the position of the first stop token.
        first_stop = row_flags.argmax()
        speech_ids[row, first_stop:] = fill_code

        # NOTE(review): `first_stop` is always smaller than the sequence length, so this
        # condition appears to hold for every row that reaches it -- confirm the intent.
        if first_stop - 3 < speech_ids.shape[1]:
            tail_codes = torch.tensor([fixing_codes[1:]], device=speech_ids.device, dtype=torch.long)
            speech_ids[row, -3:] = tail_codes

    return speech_ids
def get_text_features(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
    r"""
    Run the CLVP text encoder and return the first element of its output, i.e. the text embeddings.

    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
            provide it.

            [What are input IDs?](../glossary#input-ids)
        text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
            Embeddings handed to the text encoder as `inputs_embeds`, in place of `input_ids`.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

    Returns:
        `torch.FloatTensor` of shape `(batch_size, output_dim)`:
            The text embeddings produced by the CLVP text encoder (element 0 of its output).

    Examples:

    ```
    >>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration

    >>> # Define the Text
    >>> text = "This is an example text."

    >>> # Define processor and model
    >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
    >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

    >>> # Generate processor output and text embeds
    >>> processor_output = processor(text=text, return_tensors="pt")
    >>> text_embeds = model.get_text_features(input_ids=processor_output["input_ids"])
    ```
    """
    encoder_kwargs = {
        "input_ids": input_ids,
        "inputs_embeds": text_encoder_inputs_embeds,
        "attention_mask": attention_mask,
    }
    encoder_outputs = self.text_encoder_model(**encoder_kwargs)
    return encoder_outputs[0]
def get_speech_features(
self,
speech_ids: Optional[torch.LongTensor] = None,
input_ids: Optional[torch.LongTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
**kwargs,
@add_start_docstrings_to_model_forward(CLVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ClvpOutput, config_class=ClvpConfig)
def forward(
self,
input_ids: torch.LongTensor = None,
input_features: torch.FloatTensor = None,
conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
return_loss: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
):
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor = None,
input_features: torch.FloatTensor = None,
attention_mask: Optional[torch.LongTensor] = None,
generation_config: Optional[GenerationConfig] = None,
pad_to_max_mel_tokens: Optional[int] = None,
output_hidden_states: Optional[bool] = None,
**kwargs,