Transformers Source Code Analysis (106)
.\models\speech_to_text_2\tokenization_speech_to_text_2.py
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/vocab.json"
),
},
"tokenizer_config_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/tokenizer_config.json"
),
},
"merges_file": {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/merges.txt"
),
},
}
BPE_TOKEN_MERGES = "</w>"
BPE_TOKEN_VOCAB = "@@ "
def get_pairs(word):
"""
    Return the set of symbol pairs in a word. A word is represented as a tuple of symbols (symbols being
    variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
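# Illustrative example (not part of the original file): `get_pairs` collects the adjacent
# symbol pairs of a word; this pair set drives the merge loop in `bpe()` below.
example_word = ("l", "o", "w</w>")  # "low" with the end-of-word marker on the last symbol
print(get_pairs(example_word))  # {('l', 'o'), ('o', 'w</w>')} (set order may vary)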
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/s2t-wav2vec2-large-en-de": 1024}
class Speech2Text2Tokenizer(PreTrainedTokenizer):
"""
构建 Speech2Text2Tokenizer。
该分词器继承自 `PreTrainedTokenizer`,其中包含一些主要方法。用户应参考超类以获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
**kwargs
Additional keyword arguments passed along to `PreTrainedTokenizer`.
```
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
pad_token="<pad>",
eos_token="</s>",
unk_token="<unk>",
do_lower_case=False,
merges_file=None,
**kwargs,
):
self.do_lower_case = do_lower_case
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
if merges_file is None:
logger.info(f"No merges files provided. {self.__class__.__name__} can only be used for decoding.")
self.bpe_ranks = None
self.cache = None
else:
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n " + BPE_TOKEN_MERGES:
word = "\n" + BPE_TOKEN_MERGES
if word.endswith(BPE_TOKEN_MERGES):
word = word.replace(BPE_TOKEN_MERGES, "")
word = word.replace(" ", BPE_TOKEN_VOCAB)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
if self.bpe_ranks is None:
raise ValueError(
"This tokenizer was instantiated without a `merges.txt` file, so"
" that it can only be used for decoding, not for encoding. "
"Make sure to provide `merges.txt` file at instantiation to enable "
"encoding."
)
if self.do_lower_case:
text = text.lower()
text = text.split()
split_tokens = []
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an index (integer) using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
result = self.decoder.get(index, self.unk_token)
return result
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Converts a list of output tokens into a single string.
"""
string = " ".join(tokens)
string = "".join(string.split(BPE_TOKEN_VOCAB))
return string
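# Illustrative example (not part of the original file): "@@ " (BPE_TOKEN_VOCAB) marks a
# non-final BPE piece. `convert_tokens_to_string` removes the markers and glues the pieces
# back together, while real word boundaries remain spaces.
example_tokens = ["hel@@", "lo", "wor@@", "ld"]
joined = " ".join(example_tokens)              # "hel@@ lo wor@@ ld"
print("".join(joined.split(BPE_TOKEN_VOCAB)))  # "hello world"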
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merges_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
if self.bpe_ranks is None:
return (vocab_file,)
with open(merges_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return (vocab_file, merges_file)
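# Minimal usage sketch (not part of the original file). "vocab.json" and "merges.txt" are
# hypothetical local files; with a merges file the tokenizer can encode, without one it is
# decode-only and `_tokenize` raises a ValueError (see the guard above).
from transformers import Speech2Text2Tokenizer

tokenizer = Speech2Text2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")
ids = tokenizer("hallo welt").input_ids
print(tokenizer.decode(ids, skip_special_tokens=True))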
.\models\speech_to_text_2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_speech_available,
is_torch_available,
)
_import_structure = {
"configuration_speech_to_text_2": ["SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2Text2Config"],
"processing_speech_to_text_2": ["Speech2Text2Processor"],
"tokenization_speech_to_text_2": ["Speech2Text2Tokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_to_text_2"] = [
"SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Speech2Text2ForCausalLM",
"Speech2Text2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_speech_to_text_2 import SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2Text2Config
from .processing_speech_to_text_2 import Speech2Text2Processor
from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_to_text_2 import (
SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST,
Speech2Text2ForCausalLM,
Speech2Text2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
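# Note (not part of the original file): because of `_LazyModule`, importing `transformers`
# does not load this package's heavy submodules; a submodule is only imported when one of
# its exported names is first accessed, e.g.:
import transformers

tokenizer_cls = transformers.models.speech_to_text_2.Speech2Text2Tokenizer  # triggers the lazy import
print(tokenizer_cls.__name__)  # Speech2Text2Tokenizer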
.\models\splinter\configuration_splinter.py
"""
Splinter model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/config.json",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/config.json",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/config.json",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/config.json",
}
class SplinterConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`SplinterModel`]. It is used to instantiate a
Splinter model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Splinter
[tau/splinter-base](https://huggingface.co/tau/splinter-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "splinter"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
use_cache=True,
pad_token_id=0,
question_token_id=104,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.question_token_id = question_token_id
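# Usage sketch (not part of the original file): the defaults approximate the
# tau/splinter-base architecture, and any field can be overridden at construction time.
from transformers import SplinterConfig

config = SplinterConfig()
print(config.hidden_size, config.num_hidden_layers, config.question_token_id)  # 768 12 104

tiny_config = SplinterConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4, intermediate_size=256)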
.\models\splinter\modeling_splinter.py
""" PyTorch Splinter model."""
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, ModelOutput, QuestionAnsweringModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_splinter import SplinterConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "tau/splinter-base"
_CONFIG_FOR_DOC = "SplinterConfig"
SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"tau/splinter-base",
"tau/splinter-base-qass",
"tau/splinter-large",
"tau/splinter-large-qass",
]
class SplinterEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: Optional[int] = 0,
) -> Tuple:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class SplinterSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
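# Shape sketch (not part of the original file): `transpose_for_scores` turns the projected
# hidden states [batch, seq, hidden] into [batch, heads, seq, head_size] so attention scores
# can be computed independently per head.
import torch

batch, seq, heads, head_size = 2, 5, 12, 64
x = torch.randn(batch, seq, heads * head_size)  # e.g. the output of self.query(hidden_states)
x = x.view(batch, seq, heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 5, 64])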
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Body elided in this excerpt: standard multi-head self-attention (query/key/value
        # projections, optional relative-position scores, optional cached key/values),
        # returning the context layer plus optional attention probabilities / present key value.
        ...
class SplinterSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class SplinterAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = SplinterSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = SplinterSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class SplinterIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class SplinterOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class SplinterLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = SplinterAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = SplinterAttention(config, position_embedding_type="absolute")
self.intermediate = SplinterIntermediate(config)
self.output = SplinterOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
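# Sketch (not part of the original file) of the chunking idea behind `apply_chunking_to_forward`:
# the feed-forward block is position-wise, so it can be run on slices of the sequence dimension
# and concatenated, trading a little speed for lower peak memory; `chunk_size_feed_forward=0`
# means "no chunking". This is an illustrative re-implementation, not the library helper itself.
import torch

def chunked_feed_forward(forward_fn, hidden_states, chunk_size, seq_len_dim=1):
    if chunk_size == 0:
        return forward_fn(hidden_states)
    chunks = hidden_states.split(chunk_size, dim=seq_len_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_len_dim)

ff = torch.nn.Linear(8, 8)
x = torch.randn(2, 10, 8)
assert torch.allclose(chunked_feed_forward(ff, x, chunk_size=4), ff(x), atol=1e-6)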
class SplinterEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([SplinterLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
"""
SPLINTER_INPUTS_DOCSTRING定义了一个用于SPLINTER模型输入说明文档的字符串常量。
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Splinter Model transformer outputting raw hidden-states without any specific head on top.",
SPLINTER_START_DOCSTRING,
)
class SplinterModel(SplinterPreTrainedModel):
"""
The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
"""
def __init__(self, config):
super().__init__(config)
self.config = config
# Initialize embeddings and encoder layers
self.embeddings = SplinterEmbeddings(config)
self.encoder = SplinterEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over layers and prune specified heads in attention layers
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the SplinterModel. See superclass for more details.
"""
# Implement the forward pass using the specified inputs and return model outputs
# including attentions, hidden states, and past key values if applicable
...
"""
实现了基于问题感知的跨度选择(QASS)头部,参考Splinter的论文描述。
"""
# 初始化函数,设置QASS模型的各个层
def __init__(self, config):
super().__init__()
# 定义转换层,将输入特征转换为问题起始和结束的表示
self.query_start_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
self.query_end_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
# 定义转换层,将输入特征转换为序列起始和结束的表示
self.start_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
self.end_transform = SplinterFullyConnectedLayer(config.hidden_size, config.hidden_size)
# 定义分类器层,用于预测起始位置和结束位置的概率分布
self.start_classifier = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.end_classifier = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
# 前向传播函数,接收输入和位置信息,返回起始位置和结束位置的预测 logits
def forward(self, inputs, positions):
_, _, dim = inputs.size()
index = positions.unsqueeze(-1).repeat(1, 1, dim) # 创建位置索引张量 [batch_size, num_positions, dim]
gathered_reps = torch.gather(inputs, dim=1, index=index) # 根据位置索引收集输入特征表示 [batch_size, num_positions, dim]
# 对问题起始和结束的特征表示进行变换
query_start_reps = self.query_start_transform(gathered_reps) # [batch_size, num_positions, dim]
query_end_reps = self.query_end_transform(gathered_reps) # [batch_size, num_positions, dim]
# 对序列起始和结束的特征表示进行变换
start_reps = self.start_transform(inputs) # [batch_size, seq_length, dim]
end_reps = self.end_transform(inputs) # [batch_size, seq_length, dim]
# 使用分类器预测起始位置的 logits
hidden_states = self.start_classifier(query_start_reps) # [batch_size, num_positions, dim]
start_reps = start_reps.permute(0, 2, 1) # 调整维度顺序为 [batch_size, dim, seq_length]
start_logits = torch.matmul(hidden_states, start_reps) # 计算起始位置的 logits
# 使用分类器预测结束位置的 logits
hidden_states = self.end_classifier(query_end_reps) # [batch_size, num_positions, dim]
end_reps = end_reps.permute(0, 2, 1) # 调整维度顺序为 [batch_size, dim, seq_length]
end_logits = torch.matmul(hidden_states, end_reps) # 计算结束位置的 logits
# 返回起始位置和结束位置的预测 logits
return start_logits, end_logits
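# Shape sketch (not part of the original file): the QASS head scores every context position
# against each gathered question representation with a bilinear form, roughly
# start_logits[b, q, t] = start_classifier(query_start_reps[b, q]) . start_reps[b, t].
import torch

batch, num_questions, seq_len, dim = 2, 3, 7, 16
query_start_reps = torch.randn(batch, num_questions, dim)  # transformed question-token reps
start_reps = torch.randn(batch, seq_len, dim)              # transformed per-position reps
start_classifier = torch.nn.Linear(dim, dim, bias=False)

hidden = start_classifier(query_start_reps)                       # [batch, num_questions, dim]
start_logits = torch.matmul(hidden, start_reps.permute(0, 2, 1))  # [batch, num_questions, seq_len]
print(start_logits.shape)  # torch.Size([2, 3, 7])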
# The decorator below attaches a docstring describing this class's role in extractive question-answering
# tasks such as SQuAD: a linear layer on top of the hidden-states output computes the span start and end logits.
@add_start_docstrings(
"""
Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
    SPLINTER_START_DOCSTRING,  # prepend the shared Splinter start docstring
)
# Splinter model for question answering, inheriting from SplinterPreTrainedModel
class SplinterForQuestionAnswering(SplinterPreTrainedModel):
def __init__(self, config):
super().__init__(config)
        # Initialize the Splinter backbone
        self.splinter = SplinterModel(config)
        # Initialize the question-aware span selection (QASS) head
        self.splinter_qass = QuestionAwareSpanSelectionHead(config)
        # Store the id of the [QUESTION] token
        self.question_token_id = config.question_token_id
        # Initialize weights and apply final processing
self.post_init()
    # Attach the inputs docstring to the forward method, describing the accepted arguments
@add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # Attach a code-sample docstring with the checkpoint, output type and config class
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
    # Forward pass: processes the inputs and returns the question-answering outputs
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
question_positions: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        # Body elided in this excerpt: runs the Splinter backbone, gathers the question-token
        # representations, applies the QASS head and optionally computes the span-extraction loss.
        ...


@dataclass
class SplinterForPreTrainingOutput(ModelOutput):
    """
    Class for outputs of Splinter as a span selection model.

    Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# Optional: A scalar tensor representing the total loss for span extraction tasks.
loss: Optional[torch.FloatTensor] = None
# Optional: Tensor containing scores for the start positions of spans in each question.
start_logits: torch.FloatTensor = None
# Optional: Tensor containing scores for the end positions of spans in each question.
end_logits: torch.FloatTensor = None
# Optional: Tuple of tensors representing hidden states of the model at each layer and embeddings.
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: Tuple of tensors representing attention weights for each layer's self-attention mechanism.
attentions: Optional[Tuple[torch.FloatTensor]] = None
@add_start_docstrings(
"""
Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
instead.
""",
SPLINTER_START_DOCSTRING,
)
class SplinterForPreTraining(SplinterPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.splinter = SplinterModel(config)
self.splinter_qass = QuestionAwareSpanSelectionHead(config)
self.question_token_id = config.question_token_id
self.post_init()
@add_start_docstrings_to_model_forward(
SPLINTER_INPUTS_DOCSTRING.format("batch_size, num_questions, sequence_length")
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
question_positions: Optional[torch.LongTensor] = None,
    ):
        # Body elided in this excerpt: derives `question_positions` from occurrences of
        # `question_token_id` when not provided (see `_prepare_question_positions` below),
        # runs the backbone and the QASS head, and computes the loss when start/end positions are given.
        ...
def _prepare_question_positions(self, input_ids: torch.Tensor) -> torch.Tensor:
rows, flat_positions = torch.where(input_ids == self.config.question_token_id)
num_questions = torch.bincount(rows)
positions = torch.full(
(input_ids.size(0), num_questions.max()),
self.config.pad_token_id,
dtype=torch.long,
device=input_ids.device,
)
cols = torch.cat([torch.arange(n) for n in num_questions])
positions[rows, cols] = flat_positions
return positions
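# Worked example (not part of the original file) of `_prepare_question_positions`, assuming
# `question_token_id=104` and `pad_token_id=0` as in the default SplinterConfig: rows with
# fewer [QUESTION] tokens are padded with `pad_token_id`.
import torch

question_token_id, pad_token_id = 104, 0
input_ids = torch.tensor([
    [101, 104, 7, 104, 102],  # two [QUESTION] tokens, at positions 1 and 3
    [101, 9, 104, 8, 102],    # one [QUESTION] token, at position 2
])
rows, flat_positions = torch.where(input_ids == question_token_id)
num_questions = torch.bincount(rows)  # tensor([2, 1])
positions = torch.full((input_ids.size(0), int(num_questions.max())), pad_token_id, dtype=torch.long)
cols = torch.cat([torch.arange(int(n)) for n in num_questions])
positions[rows, cols] = flat_positions
print(positions)  # tensor([[1, 3], [2, 0]])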
.\models\splinter\tokenization_splinter.py
"""Splinter 的标记化类。"""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/vocab.txt",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/vocab.txt",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/vocab.txt",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"tau/splinter-base": 512,
"tau/splinter-base-qass": 512,
"tau/splinter-large": 512,
"tau/splinter-large-qass": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"tau/splinter-base": {"do_lower_case": False},
"tau/splinter-base-qass": {"do_lower_case": False},
"tau/splinter-large": {"do_lower_case": False},
"tau/splinter-large-qass": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""从词汇文件加载词汇表到一个有序字典中。"""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""对文本进行基本的空格清理和分割。"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class SplinterTokenizer(PreTrainedTokenizer):
r"""
构建一个 Splinter 分词器,基于 WordPiece 算法。
这个分词器继承自 [`PreTrainedTokenizer`],包含大多数主要方法。用户可以参考这个超类获取更多关于这些方法的信息。
"""
Args:
vocab_file (`str`):
包含词汇表的文件。
do_lower_case (`bool`, *optional*, defaults to `True`):
在分词时是否将输入转换为小写。
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
在使用 WordPiece 分词之前是否进行基本分词。
never_split (`Iterable`, *optional*):
在分词时永远不会分割的标记集合。仅在 `do_basic_tokenize=True` 时生效。
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
未知标记。词汇表中不存在的标记会被设置为该标记。
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
分隔符标记,用于从多个序列构建一个序列,例如用于序列分类或问答任务中。
也是使用特殊标记构建序列时的最后一个标记。
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
用于填充的标记,例如在批处理不同长度的序列时使用。
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
分类器标记,用于序列分类任务。使用特殊标记构建序列时的第一个标记。
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
掩码标记,用于掩码语言建模任务中。模型将尝试预测此标记。
question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
用于构建问题表示的标记。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日文应该禁用此选项(参见此处的问题链接)。
strip_accents (`bool`, *optional*):
是否删除所有重音符号。如果未指定此选项,则将根据 `lowercase` 的值确定(与原始的 BERT 行为相同)。
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
question_token="[QUESTION]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
self.question_token = question_token
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def question_token_id(self):
"""
`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
representation.
"""
return self.convert_tokens_to_ids(self.question_token)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) into a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a pair of sequences for question answering tasks by concatenating and adding special
tokens. A Splinter sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if self.padding_side == "right":
return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
else:
return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
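# Illustrative example (not part of the original file), with made-up token IDs
# (cls=101, sep=102, question=104, "."=119): for a (question, context) pair with
# padding_side == "right" the assembled sequence is [CLS] question [QUESTION] . [SEP] context [SEP].
cls, sep, question_suffix = [101], [102], [104, 119]
question_ids, context_ids = [7, 8], [21, 22, 23]
print(cls + question_ids + question_suffix + sep + context_ids + sep)
# [101, 7, 8, 104, 119, 102, 21, 22, 23, 102]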
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve a mask indicating the positions of special tokens in the input sequences.
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
already_has_special_tokens (`bool`):
Whether the input IDs already include special tokens
Returns:
`List[int]`: List indicating positions of special tokens (1 for special token, 0 for others)
"""
special_tokens_mask = [0] * len(token_ids_0)
if token_ids_1 is not None:
special_tokens_mask += [1] * len(token_ids_1)
special_tokens_mask[:1] = [1]
if token_ids_1 is None:
special_tokens_mask[-1:] = [1]
else:
special_tokens_mask[len(token_ids_0) + 2] = 1
if self.question_token_id is not None:
special_tokens_mask[len(token_ids_0):len(token_ids_0) + 2] = [1, 1]
return special_tokens_mask
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
if self.padding_side == "right":
return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Args:
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
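# Illustrative example (not part of the original file): `_tokenize_chinese_chars` surrounds
# every CJK codepoint with spaces, so downstream whitespace tokenization treats each character
# as its own token.
basic = BasicTokenizer()
print(basic._tokenize_chinese_chars("ab中文cd"))  # "ab 中  文 cd"
print(basic.tokenize("ab中文cd"))                 # ['ab', '中', '文', 'cd']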
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
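# Illustrative example (not part of the original file): greedy longest-match-first WordPiece
# with a toy vocabulary (a real vocabulary comes from vocab.txt).
toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
print(wp.tokenize("unaffable"))   # ['un', '##aff', '##able']
print(wp.tokenize("misspelled"))  # ['[UNK]'] -- no prefix of the word is in the vocabulary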
.\models\splinter\tokenization_splinter_fast.py
"""Fast Tokenization classes for Splinter."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_splinter import SplinterTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"tau/splinter-base": "https://huggingface.co/tau/splinter-base/resolve/main/vocab.txt",
"tau/splinter-base-qass": "https://huggingface.co/tau/splinter-base-qass/resolve/main/vocab.txt",
"tau/splinter-large": "https://huggingface.co/tau/splinter-large/resolve/main/vocab.txt",
"tau/splinter-large-qass": "https://huggingface.co/tau/splinter-large-qass/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"tau/splinter-base": 512,
"tau/splinter-base-qass": 512,
"tau/splinter-large": 512,
"tau/splinter-large-qass": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"tau/splinter-base": {"do_lower_case": False},
"tau/splinter-base-qass": {"do_lower_case": False},
"tau/splinter-large": {"do_lower_case": False},
"tau/splinter-large-qass": {"do_lower_case": False},
}
class SplinterTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" Splinter tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
构建一个快速的 Splinter 分词器,基于 HuggingFace 的 tokenizers 库,基于 WordPiece。
This class inherits from PreTrainedTokenizerFast, which includes most of the primary methods. Users should refer to
the superclass for more information on those methods.
此类继承自 PreTrainedTokenizerFast,该类包含大多数主要方法。用户应参考超类以获取有关这些方法的更多信息。
```
# 定义函数参数和默认值的说明
Args:
vocab_file (`str`):
Vocabulary 文件的路径。
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在标记化时将输入转换为小写。
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
未知标记。如果标记不在词汇表中,则无法转换为 ID,并将其设置为此标记。
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
分隔符标记,用于从多个序列构建一个序列,例如用于序列分类或问答任务中的问题与文本的分隔。
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
填充标记,用于将不同长度的序列进行批处理时进行填充。
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
分类器标记,在序列分类任务中作为序列的第一个标记。
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
掩码标记,用于掩码语言建模任务中模型尝试预测的标记。
question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
构建问题表示时使用的标记。
clean_text (`bool`, *optional*, defaults to `True`):
是否在标记化前清理文本,例如删除控制字符并替换所有空格。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否标记化中文字符,对于日文可能需要禁用此选项。
strip_accents (`bool`, *optional*):
是否去除所有重音符号。如果未指定,则将根据 `lowercase` 的值来确定(与原始的 BERT 行为一致)。
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
子词的前缀。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = SplinterTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
question_token="[QUESTION]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
additional_special_tokens=(question_token,),
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
self.do_lower_case = do_lower_case
@property
def question_token_id(self):
"""
`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
representation.
"""
return self.convert_tokens_to_ids(self.question_token)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a pair of sequences for question answering tasks by concatenating and adding special
tokens. A Splinter sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
Args:
token_ids_0 (`List[int]`):
The question token IDs if pad_on_right, else context tokens IDs
token_ids_1 (`List[int]`, *optional*):
The context token IDs if pad_on_right, else question token IDs
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if self.padding_side == "right":
return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
else:
return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
if self.padding_side == "right":
return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's model vocabulary to a directory.
Args:
save_directory (str): The directory where the vocabulary files will be saved.
filename_prefix (str, *optional*): Optional prefix for the saved vocabulary files.
Returns:
`Tuple[str]`: Tuple of filenames saved.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
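To make the special-token layout above concrete, here is a minimal, self-contained sketch (plain Python with made-up token IDs rather than a real vocabulary) of how `build_inputs_with_special_tokens` arranges a question/context pair when `padding_side == "right"`:
```
# A minimal sketch (not the library API): mirrors [CLS] question [QUESTION] . [SEP] context [SEP].
# All IDs below are hypothetical.
cls_id, sep_id, question_id, dot_id = 101, 102, 104, 119  # hypothetical special-token IDs
question_ids = [2054, 2003]   # hypothetical "what is"
context_ids = [7592, 2088]    # hypothetical "hello world"

def build_pair(token_ids_0, token_ids_1):
    question_suffix = [question_id, dot_id]
    return [cls_id] + token_ids_0 + question_suffix + [sep_id] + token_ids_1 + [sep_id]

print(build_pair(question_ids, context_ids))
# [101, 2054, 2003, 104, 119, 102, 7592, 2088, 102]
```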
.\models\splinter\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_splinter": ["SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SplinterConfig"],
"tokenization_splinter": ["SplinterTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_splinter_fast"] = ["SplinterTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_splinter"] = [
"SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST",
"SplinterForQuestionAnswering",
"SplinterForPreTraining",
"SplinterLayer",
"SplinterModel",
"SplinterPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_splinter import SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP, SplinterConfig
from .tokenization_splinter import SplinterTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_splinter_fast import SplinterTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_splinter import (
SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST,
SplinterForPreTraining,
SplinterForQuestionAnswering,
SplinterLayer,
SplinterModel,
SplinterPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
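The `_LazyModule` registration above defers the heavy imports until an attribute is actually requested. As a rough illustration only (this is not the actual `_LazyModule` implementation), the same effect can be sketched with a module-level `__getattr__` (PEP 562), meant to live inside a package `__init__.py`:
```
# Simplified sketch of deferred imports: a name is only resolved to a real import on first access.
import importlib

_import_structure = {"configuration_splinter": ["SplinterConfig"]}

def __getattr__(name):
    for module_name, symbols in _import_structure.items():
        if name in symbols:
            # Import the submodule lazily and fetch the requested symbol from it.
            module = importlib.import_module("." + module_name, __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```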
.\models\squeezebert\configuration_squeezebert.py
""" SqueezeBERT model configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/config.json"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/config.json",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/config.json"
),
}
class SqueezeBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SqueezeBertModel`]. It is used to instantiate a
SqueezeBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SqueezeBERT
[squeezebert/squeezebert-uncased](https://huggingface.co/squeezebert/squeezebert-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import SqueezeBertConfig, SqueezeBertModel
>>> # Initializing a SqueezeBERT configuration
>>> configuration = SqueezeBertConfig()
>>> # Initializing a model (with random weights) from the configuration above
>>> model = SqueezeBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
checkpoints.
"""
pretrained_config_archive_map = SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "squeezebert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
embedding_size=768,
q_groups=4,
k_groups=4,
v_groups=4,
post_attention_groups=1,
intermediate_groups=4,
output_groups=4,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.embedding_size = embedding_size
self.q_groups = q_groups
self.k_groups = k_groups
self.v_groups = v_groups
self.post_attention_groups = post_attention_groups
self.intermediate_groups = intermediate_groups
self.output_groups = output_groups
class SqueezeBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
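As a quick usage sketch (assuming `transformers` is installed), the grouped-convolution hyperparameters above can be overridden when constructing the configuration:
```
# Build a SqueezeBERT configuration with wider convolution groups than the defaults.
from transformers import SqueezeBertConfig

config = SqueezeBertConfig(q_groups=8, k_groups=8, v_groups=8, intermediate_groups=8)
print(config.q_groups, config.hidden_size)  # 8 768
```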
.\models\squeezebert\modeling_squeezebert.py
"""
PyTorch SqueezeBert model.
"""
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_squeezebert import SqueezeBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "squeezebert/squeezebert-uncased"
_CONFIG_FOR_DOC = "SqueezeBertConfig"
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"squeezebert/squeezebert-uncased",
"squeezebert/squeezebert-mnli",
"squeezebert/squeezebert-mnli-headless",
]
class SqueezeBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MatMulWrapper(nn.Module):
"""
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
"""
def __init__(self):
super().__init__()
def forward(self, mat1, mat2):
"""
Run the forward matrix multiplication.
:param mat1: the first torch tensor
:param mat2: the second torch tensor
:return: the matrix product of the two tensors
Typical tensor dimensions in BERT: mat1.shape: [B, <optional extra dims>, M, K]
mat2.shape: [B, <optional extra dims>, K, N]; output shape: [B, <optional extra dims>, M, N]
"""
return torch.matmul(mat1, mat2)
class SqueezeBertLayerNorm(nn.LayerNorm):
"""
This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.
N = batch C = channels W = sequence length
"""
def __init__(self, hidden_size, eps=1e-12):
nn.LayerNorm.__init__(self, normalized_shape=hidden_size, eps=eps)
def forward(self, x):
x = x.permute(0, 2, 1)
x = nn.LayerNorm.forward(self, x)
return x.permute(0, 2, 1)
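A small, standalone sketch of the same permute-normalize-permute trick using plain `torch` (all sizes are made up):
```
# Normalize the channel dimension of an NCW tensor: permute to NWC, LayerNorm over C, permute back.
import torch
from torch import nn

batch, channels, width = 2, 8, 5
x = torch.randn(batch, channels, width)        # NCW layout
ln = nn.LayerNorm(channels)
y = ln(x.permute(0, 2, 1)).permute(0, 2, 1)    # each position is normalized over its C channels
print(y.shape)                                 # torch.Size([2, 8, 5])
print(y.mean(dim=1).abs().max() < 1e-5)        # per-position mean over C is ~0
```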
class ConvDropoutLayerNorm(nn.Module):
"""
ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
"""
def __init__(self, cin, cout, groups, dropout_prob):
super().__init__()
self.conv1d = nn.Conv1d(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups)
self.layernorm = SqueezeBertLayerNorm(cout)
self.dropout = nn.Dropout(dropout_prob)
def forward(self, hidden_states, input_tensor):
x = self.conv1d(hidden_states)
x = self.dropout(x)
x = x + input_tensor
x = self.layernorm(x)
return x
class ConvActivation(nn.Module):
"""
ConvActivation: Conv, Activation
"""
def __init__(self, cin, cout, groups, act):
super().__init__()
self.conv1d = nn.Conv1d(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups)
self.act = ACT2FN[act]
def forward(self, x):
output = self.conv1d(x)
return self.act(output)
class SqueezeBertSelfAttention(nn.Module):
def __init__(self, config, cin, q_groups=1, k_groups=1, v_groups=1):
"""
config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
groups = number of groups to use in conv1d layers
"""
super().__init__()
if cin % config.num_attention_heads != 0:
raise ValueError(
f"cin ({cin}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(cin / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=q_groups)
self.key = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=k_groups)
self.value = nn.Conv1d(in_channels=cin, out_channels=cin, kernel_size=1, groups=v_groups)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
self.matmul_qk = MatMulWrapper()
self.matmul_qkv = MatMulWrapper()
def transpose_for_scores(self, x):
"""
- input: [N, C, W]
- output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
"""
new_x_shape = (x.size()[0], self.num_attention_heads, self.attention_head_size, x.size()[-1])
x = x.view(*new_x_shape)
return x.permute(0, 1, 3, 2)
def transpose_key_for_scores(self, x):
"""
- input: [N, C, W]
- output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
"""
new_x_shape = (x.size()[0], self.num_attention_heads, self.attention_head_size, x.size()[-1])
x = x.view(*new_x_shape)
return x
def transpose_output(self, x):
"""
- input: [N, C1, W, C2]
- output: [N, C, W]
"""
x = x.permute(0, 1, 3, 2).contiguous()
new_x_shape = (x.size()[0], self.all_head_size, x.size()[3])
x = x.view(*new_x_shape)
return x
def forward(self, hidden_states, attention_mask, output_attentions):
"""
Forward pass that computes the self-attention output.
hidden_states: input hidden-state tensor in [N, C, W] data layout.
attention_mask: attention-mask tensor in [N, W] data layout; no transposition is needed.
output_attentions: boolean indicating whether to return the attention scores.
Returns a dict containing the context layer and, optionally, the attention scores.
"""
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_key_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
attention_score = self.matmul_qk(query_layer, key_layer)
attention_score = attention_score / math.sqrt(self.attention_head_size)
attention_score = attention_score + attention_mask
attention_probs = self.softmax(attention_score)
attention_probs = self.dropout(attention_probs)
context_layer = self.matmul_qkv(attention_probs, value_layer)
context_layer = self.transpose_output(context_layer)
result = {"context_layer": context_layer}
if output_attentions:
result["attention_score"] = attention_score
return result
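To follow the [N, C, W] bookkeeping above, here is a plain-`torch` shape walkthrough of the `view`/`permute` done in `transpose_for_scores` (all sizes are made up):
```
# Shape walkthrough: N=2 examples, C=8 channels, W=5 positions, 2 heads of size 4.
import torch

N, C, W, heads = 2, 8, 5, 2
head_size = C // heads
x = torch.randn(N, C, W)               # [N, C, W]
x = x.view(N, heads, head_size, W)     # [N, C1, C2, W], C1 = head index, C2 = per-head channels
q = x.permute(0, 1, 3, 2)              # [N, C1, W, C2], the layout used for queries and values
print(q.shape)                         # torch.Size([2, 2, 5, 4])
```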
class SqueezeBertModule(nn.Module):
def __init__(self, config):
"""
Set up the layers of this module.
- hidden_size = input channels = output channels of Q, K, V (all the same) = output channels of the module
- intermediate_size = output channels of the intermediate layer
- groups = number of groups for all layers in a BertModule (the interface could later allow different groups per layer)
"""
super().__init__()
c0 = config.hidden_size
c1 = config.hidden_size
c2 = config.intermediate_size
c3 = config.hidden_size
self.attention = SqueezeBertSelfAttention(
config=config, cin=c0, q_groups=config.q_groups, k_groups=config.k_groups, v_groups=config.v_groups
)
self.post_attention = ConvDropoutLayerNorm(
cin=c0, cout=c1, groups=config.post_attention_groups, dropout_prob=config.hidden_dropout_prob
)
self.intermediate = ConvActivation(cin=c1, cout=c2, groups=config.intermediate_groups, act=config.hidden_act)
self.output = ConvDropoutLayerNorm(
cin=c2, cout=c3, groups=config.output_groups, dropout_prob=config.hidden_dropout_prob
)
def forward(self, hidden_states, attention_mask, output_attentions):
"""
Forward pass that computes this module's output.
Args:
- hidden_states: input hidden-state tensor
- attention_mask: attention-mask tensor
- output_attentions: whether to return the attention scores
Returns:
- output_dict: a dict with the module outputs, containing at least the "feature_map" key
"""
att = self.attention(hidden_states, attention_mask, output_attentions)
attention_output = att["context_layer"]
post_attention_output = self.post_attention(attention_output, hidden_states)
intermediate_output = self.intermediate(post_attention_output)
layer_output = self.output(intermediate_output, post_attention_output)
output_dict = {"feature_map": layer_output}
if output_attentions:
output_dict["attention_score"] = att["attention_score"]
return output_dict
class SqueezeBertEncoder(nn.Module):
def __init__(self, config):
"""
Set up the encoder layers.
Args:
- config: object holding the model configuration
"""
super().__init__()
assert config.embedding_size == config.hidden_size, (
"If you want embedding_size != intermediate hidden_size, "
"please insert a Conv1d layer to adjust the number of channels "
"before the first SqueezeBertModule."
)
self.layers = nn.ModuleList(SqueezeBertModule(config) for _ in range(config.num_hidden_layers))
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
"""
Forward pass that computes the encoder output.
Args:
- hidden_states: input hidden-state tensor
- attention_mask: attention-mask tensor (defaults to None)
- head_mask: attention-head mask tensor (defaults to None)
- output_attentions: whether to return attention scores (defaults to False)
- output_hidden_states: whether to return hidden states (defaults to False)
- return_dict: whether to return a dict-style output (defaults to True)
Returns:
- the encoder output; its type depends on the return_dict argument
"""
if head_mask is None:
head_mask_is_all_none = True
elif head_mask.count(None) == len(head_mask):
head_mask_is_all_none = True
else:
head_mask_is_all_none = False
assert head_mask_is_all_none is True, "head_mask is not yet supported in the SqueezeBert implementation."
hidden_states = hidden_states.permute(0, 2, 1)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for layer in self.layers:
if output_hidden_states:
hidden_states = hidden_states.permute(0, 2, 1)
all_hidden_states += (hidden_states,)
hidden_states = hidden_states.permute(0, 2, 1)
layer_output = layer.forward(hidden_states, attention_mask, output_attentions)
hidden_states = layer_output["feature_map"]
if output_attentions:
all_attentions += (layer_output["attention_score"],)
hidden_states = hidden_states.permute(0, 2, 1)
if output_hidden_states:
all_hidden_states += (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class SqueezeBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class SqueezeBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class SqueezeBertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = SqueezeBertPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class SqueezeBertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = SqueezeBertLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class SqueezeBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SqueezeBertConfig
base_model_prefix = "transformer"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, SqueezeBertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SQUEEZEBERT_START_DOCSTRING = r"""
The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural
networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W.
Keutzer
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
For best results finetuning SqueezeBERT on text classification tasks, it is recommended to use the
*squeezebert/squeezebert-mnli-headless* checkpoint as a starting point.
Parameters:
config ([`SqueezeBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
Hierarchy:
```
Internal class hierarchy:
SqueezeBertModel
SqueezeBertEncoder
SqueezeBertModule
SqueezeBertSelfAttention
ConvActivation
ConvDropoutLayerNorm
```
Data layouts:
```
Input data is in [batch, sequence_length, hidden_size] format.
Data inside the encoder is in [batch, hidden_size, sequence_length] format. But, if `output_hidden_states == True`, the data from inside the encoder is returned in [batch, sequence_length, hidden_size] format.
The final output of the encoder is in [batch, sequence_length, hidden_size] format.
```
"""
SQUEEZEBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate the first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids`, you can choose to pass an embedded representation directly. This is useful if you want more control
over how `input_ids` indices are converted into associated vectors than the model's internal embedding lookup matrix provides.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare SqueezeBERT Model transformer outputting raw hidden-states without any specific head on top.",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertModel(SqueezeBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embeddings = SqueezeBertEmbeddings(config)
self.encoder = SqueezeBertEncoder(config)
self.pooler = SqueezeBertPooler(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
# If output_attentions is not specified, fall back to the value in the config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# If output_hidden_states is not specified, fall back to the value in the config
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# If return_dict is not specified, fall back to the value in the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Raise an error if both input_ids and inputs_embeds are given
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
# If only input_ids is given, check for padding issues and record its shape
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
# If only inputs_embeds is given, record its shape (dropping the last dimension)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
# Neither input_ids nor inputs_embeds was provided
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Determine the device from input_ids or inputs_embeds
device = input_ids.device if input_ids is not None else inputs_embeds.device
# If no attention_mask is provided, create an all-ones mask with the same shape as input_shape
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
# If no token_type_ids is provided, create an all-zeros long tensor on the same device
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Build the extended attention mask so that shapes broadcast correctly
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
# Prepare the head mask if needed
# 1.0 in head_mask means keeping the head
# attention_probs has shape batch_size x num_heads x N x N
# the input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
# Pass the inputs through the embedding layer
embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
# Pass the embedding output through the encoder
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output (the last hidden states) from the encoder outputs
sequence_output = encoder_outputs[0]
# Run the sequence output through the pooler
pooled_output = self.pooler(sequence_output)
# If a dict was not requested, return a tuple
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
# Otherwise build and return a BaseModelOutputWithPooling object
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
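A minimal usage sketch of the bare model (weights are randomly initialized from a config, so no checkpoint download is required; `transformers` and `torch` are assumed to be installed):
```
# Run a small randomly initialized SqueezeBertModel on dummy input IDs.
import torch
from transformers import SqueezeBertConfig, SqueezeBertModel

config = SqueezeBertConfig(num_hidden_layers=2)      # small config just for the sketch
model = SqueezeBertModel(config).eval()
input_ids = torch.randint(0, config.vocab_size, (1, 8))
with torch.no_grad():
    out = model(input_ids)
print(out.last_hidden_state.shape, out.pooler_output.shape)  # (1, 8, 768) and (1, 768)
```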
# Attach the model docstring for a SqueezeBERT model with a language modeling head on top
@add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING)
# SqueezeBertForMaskedLM inherits from SqueezeBertPreTrainedModel
class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
# Keys whose weights are tied (shared) with the input embeddings
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# Create the SqueezeBertModel backbone and assign it to self.transformer
self.transformer = SqueezeBertModel(config)
# Create the SqueezeBertOnlyMLMHead and assign it to self.cls
self.cls = SqueezeBertOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embeddings, i.e. the prediction decoder
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Replace the output embeddings with new ones
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# Forward pass that takes multiple inputs and returns the model output
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# If return_dict is None, use the use_return_dict value from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the Transformer outputs
sequence_output = outputs[0]
# Run the sequence output through the MLM head to get the prediction scores
prediction_scores = self.cls(sequence_output)
# Initialize masked_lm_loss to None
masked_lm_loss = None
# If labels are provided, compute the masked language modeling loss
if labels is not None:
# Cross-entropy loss; the -100 index corresponds to padding tokens
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# If return_dict is False, return a tuple of the prediction scores and the remaining outputs
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# If return_dict is True, return a MaskedLMOutput with the loss, logits, hidden states and attentions
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
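A minimal sketch of the masked-LM head (random weights from a config; positions labelled `-100` are excluded from the loss, matching the `CrossEntropyLoss` default `ignore_index`):
```
# Compute a masked-LM loss on dummy inputs with a randomly initialized model.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForMaskedLM

config = SqueezeBertConfig(num_hidden_layers=2)
model = SqueezeBertForMaskedLM(config)
input_ids = torch.randint(0, config.vocab_size, (1, 8))
labels = input_ids.clone()
labels[:, :4] = -100                       # ignore the first four positions in the loss
out = model(input_ids, labels=labels)
print(out.loss.item(), out.logits.shape)   # scalar loss, (1, 8, vocab_size)
```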
@add_start_docstrings(
"""
SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
"""
SqueezeBERT model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output), e.g. for GLUE tasks. Inherits from SqueezeBertPreTrainedModel.
"""
def __init__(self, config):
"""
Set up the model configuration and layers.
Args:
config (:class:`~transformers.SqueezeBertConfig`): the configuration object for the model.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
# SqueezeBERT backbone used for feature extraction
self.transformer = SqueezeBertModel(config)
# Dropout layer to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear classifier mapping the pooled hidden state to the number of labels
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Decide whether to use the return_dict setting from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled representation from the Transformer outputs
pooled_output = outputs[1]
# Apply dropout to the pooled representation
pooled_output = self.dropout(pooled_output)
# Feed the pooled representation to the classifier to get the logits
logits = self.classifier(pooled_output)
# Initialize the loss to None
loss = None
# If labels are provided, compute the loss
if labels is not None:
# If the problem type is not set, infer it from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Choose the loss function according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# If return_dict is False, return a tuple of the logits and any remaining outputs
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return a SequenceClassifierOutput with the loss, logits, hidden states and attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
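A minimal sketch of how `problem_type` gets inferred above: integer labels with `num_labels > 1` select the single-label cross-entropy branch (random weights from a config):
```
# Observe the inferred problem_type and the logits shape on dummy inputs.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForSequenceClassification

config = SqueezeBertConfig(num_hidden_layers=2, num_labels=3)
model = SqueezeBertForSequenceClassification(config)
input_ids = torch.randint(0, config.vocab_size, (2, 8))
labels = torch.tensor([0, 2])                        # integer labels -> cross-entropy branch
out = model(input_ids, labels=labels)
print(model.config.problem_type, out.logits.shape)   # single_label_classification (2, 3)
```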
# Attach the model docstring: SqueezeBERT with a multiple choice classification head on top
@add_start_docstrings(
"""
SqueezeBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
# SqueezeBertForMultipleChoice inherits from SqueezeBertPreTrainedModel
class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# SqueezeBERT backbone
self.transformer = SqueezeBertModel(config)
# Dropout layer using the hidden dropout probability from the config
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Classifier: a linear layer mapping the pooled hidden state to a single score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
# Forward pass
@add_start_docstrings_to_model_forward(
SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# input_ids: input token IDs
# attention_mask: mask indicating which positions are padding
# token_type_ids: segment IDs, useful for single- vs. two-sentence inputs
# position_ids: position index of each token in the sequence
# head_mask: mask controlling which attention heads are active
# inputs_embeds: optional pre-computed input embeddings
# labels: optional labels used to compute the loss during training
# output_attentions: whether to return attention weights
# output_hidden_states: whether to return hidden states
# return_dict: whether to return a dict-style output
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
*input_ids* above)
"""
# If return_dict is None, fall back to the config setting
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# num_choices is the size of the second dimension of the input tensors
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten the (batch, num_choices, ...) inputs to (batch * num_choices, ...); keep None inputs as None
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the flattened inputs through the Transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled output from the Transformer outputs and apply dropout
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
# Feed the pooled output to the classifier to get one logit per choice
logits = self.classifier(pooled_output)
# Reshape the logits to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
# If labels are provided, compute the cross-entropy loss
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# If return_dict is False, return a tuple
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
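A minimal sketch of the flattening performed above: `(batch, num_choices, seq_len)` inputs are reshaped to `(batch * num_choices, seq_len)` before the backbone, and the per-choice logits are reshaped back to `(batch, num_choices)` (random weights from a config):
```
# Run the multiple-choice head on dummy inputs with two examples and four choices each.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForMultipleChoice

config = SqueezeBertConfig(num_hidden_layers=2)
model = SqueezeBertForMultipleChoice(config)
input_ids = torch.randint(0, config.vocab_size, (2, 4, 8))   # (batch, num_choices, seq_len)
labels = torch.tensor([1, 3])
out = model(input_ids, labels=labels)
print(out.logits.shape)                                      # torch.Size([2, 4])
```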
@add_start_docstrings(
"""
SqueezeBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
SQUEEZEBERT_START_DOCSTRING,
)
# SqueezeBertForTokenClassification inherits from SqueezeBertPreTrainedModel
class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
# Constructor taking a configuration object
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# SqueezeBERT backbone used as the transformer
self.transformer = SqueezeBertModel(config)
# Dropout layer that drops units with probability hidden_dropout_prob to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear layer mapping the hidden states to config.num_labels classes
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Forward pass that takes multiple inputs and returns the output or the loss
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Decide whether to return a dict based on self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the transformer outputs
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Map the sequence output to the label space with the linear classifier
logits = self.classifier(sequence_output)
# Initialize the loss to None
loss = None
# If labels are provided, compute the token classification loss
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten the logits to 2D before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# If a dict is not requested, build the output as a tuple
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Return a TokenClassifierOutput with the loss, logits, hidden states and attention weights
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
SqueezeBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD
(linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
SQUEEZEBERT_START_DOCSTRING,
)
class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = SqueezeBertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask indicating which tokens take part in attention
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs used to distinguish segments
position_ids: Optional[torch.Tensor] = None,  # position index of each token
head_mask: Optional[torch.Tensor] = None,  # mask specifying which attention heads to mask out
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings passed instead of input_ids/token_type_ids
start_positions: Optional[torch.Tensor] = None,  # token indices of the span start positions
end_positions: Optional[torch.Tensor] = None,  # token indices of the span end positions
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Initialize return_dict; if not provided, use the setting from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the transformer backbone
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output
sequence_output = outputs[0]
# Compute the question-answering logits with qa_outputs
logits = self.qa_outputs(sequence_output)
# Split the logits into start_logits and end_logits
start_logits, end_logits = logits.split(1, dim=-1)
# Remove the trailing dimension and make the tensors contiguous
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
# If both start and end positions are provided, compute the loss
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, squeeze the extra dimension so the positions are 1D
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Positions outside of the model's input range are clamped and then ignored
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores the clamped index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
# If a dict is not requested, return a tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Return a QuestionAnsweringModelOutput with the loss, start/end logits, hidden states and attentions
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
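A minimal sketch of the span head: `start_logits`/`end_logits` have shape `(batch, seq_len)`, and passing `start_positions`/`end_positions` yields the averaged cross-entropy loss (random weights from a config):
```
# Compute the QA span loss on dummy inputs with a randomly initialized model.
import torch
from transformers import SqueezeBertConfig, SqueezeBertForQuestionAnswering

config = SqueezeBertConfig(num_hidden_layers=2, num_labels=2)
model = SqueezeBertForQuestionAnswering(config)
input_ids = torch.randint(0, config.vocab_size, (1, 12))
out = model(input_ids, start_positions=torch.tensor([3]), end_positions=torch.tensor([5]))
print(out.loss.item(), out.start_logits.shape)   # scalar loss, torch.Size([1, 12])
```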
.\models\squeezebert\tokenization_squeezebert.py
"""Tokenization classes for SqueezeBERT."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"squeezebert/squeezebert-uncased": 512,
"squeezebert/squeezebert-mnli": 512,
"squeezebert/squeezebert-mnli-headless": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"squeezebert/squeezebert-uncased": {"do_lower_case": True},
"squeezebert/squeezebert-mnli": {"do_lower_case": True},
"squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class SqueezeBertTokenizer(PreTrainedTokenizer):
r"""
Construct a SqueezeBERT tokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary is set to be this token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, used when building a sequence from multiple sequences, e.g. for sequence classification
or question answering. It is also the last token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token used for sequence classification tasks. It is the first token of a sequence built with
special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token the model tries to predict when training with masked
language modeling.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, it is determined by the value of
`lowercase` (as in the original SqueezeBERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = SqueezeBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) into a single string.
Args:
tokens (`List[str]`): List of tokens to be joined into a string.
Returns:
`str`: The concatenated string of tokens with "##" markers removed.
"""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Builds model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A SqueezeBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence IDs from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers indicating the presence of special tokens (1) or sequence tokens (0).
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""在文本上执行标点符号的分割。"""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""在CJK字符周围添加空格。"""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""检查CP是否是CJK字符的码位。"""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""对文本执行无效字符移除和空白字符清理。"""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
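For reference, a compact, self-contained re-implementation of the greedy longest-match-first loop above; the toy vocabulary and the word "unaffable" are only illustrative.
```python
def wordpiece(token: str, vocab: set, unk: str = "[UNK]") -> list:
    chars = list(token)
    start, pieces = 0, []
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub          # continuation pieces are prefixed, as above
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:                   # no piece matched: the whole word becomes [UNK]
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", {"un", "##aff", "##able"}))  # ['un', '##aff', '##able']
```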
.\models\squeezebert\tokenization_squeezebert_fast.py
"""SqueezeBERT 的标记化类。"""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_squeezebert import SqueezeBertTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt"
),
"squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"squeezebert/squeezebert-uncased": (
"https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json"
),
"squeezebert/squeezebert-mnli": (
"https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json"
),
"squeezebert/squeezebert-mnli-headless": (
"https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"squeezebert/squeezebert-uncased": 512,
"squeezebert/squeezebert-mnli": 512,
"squeezebert/squeezebert-mnli-headless": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"squeezebert/squeezebert-uncased": {"do_lower_case": True},
"squeezebert/squeezebert-mnli": {"do_lower_case": True},
"squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
}
class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" SqueezeBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original SqueezeBERT).
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
# Hook up the module-level constants defined above
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = SqueezeBertTokenizer
# Initializer; accepts both required and optional arguments
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent initializer with the vocab/tokenizer files, casing behavior, special tokens, Chinese-character handling and accent stripping
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Load the current normalizer state as JSON
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the serialized normalizer settings differ from the arguments passed here, rebuild the normalizer
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
# Look up the normalizer class and update its settings
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# Remember the lower-casing setting on the tokenizer object
self.do_lower_case = do_lower_case
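A minimal sketch of the normalizer swap performed above, assuming the backend tokenizer serializes a `BertNormalizer` (typical for WordPiece-based fast tokenizers); the argument values are illustrative.
```python
from tokenizers import normalizers

# Rebuild the normalizer with the settings passed to __init__.
new_normalizer = normalizers.BertNormalizer(
    lowercase=True,              # do_lower_case
    strip_accents=None,          # strip_accents
    handle_chinese_chars=True,   # tokenize_chinese_chars
)
# tokenizer.backend_tokenizer.normalizer = new_normalizer
```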
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A SqueezeBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# Build the input sequence with special tokens; append a second separator only when a second sequence is provided
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
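Illustrative only: with hypothetical IDs cls_id=2 and sep_id=3, a pair of sequences [5, 6] and [7] is assembled into `[CLS] A [SEP] B [SEP]`.
```python
cls_id, sep_id = 2, 3                      # hypothetical special-token IDs
token_ids_0, token_ids_1 = [5, 6], [7]
output = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]
assert output == [2, 5, 6, 3, 7, 3]
```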
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s). 0 represents the first sequence, and 1 represents the second sequence.
"""
# Define special tokens
sep = [self.sep_token_id] # Separator token ID
cls = [self.cls_token_id] # Classification token ID
# If token_ids_1 is not provided, return mask for token_ids_0 only
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return mask including both token_ids_0 and token_ids_1
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary associated with the tokenizer's model to a specified directory.
Args:
save_directory (str):
Directory where the vocabulary files will be saved.
filename_prefix (str, *optional*):
Optional prefix for the saved files.
Returns:
Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
"""
# Save the vocabulary files using the tokenizer's model
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\squeezebert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_squeezebert": [
"SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SqueezeBertConfig",
"SqueezeBertOnnxConfig",
],
"tokenization_squeezebert": ["SqueezeBertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_squeezebert_fast"] = ["SqueezeBertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_squeezebert"] = [
"SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SqueezeBertForMaskedLM",
"SqueezeBertForMultipleChoice",
"SqueezeBertForQuestionAnswering",
"SqueezeBertForSequenceClassification",
"SqueezeBertForTokenClassification",
"SqueezeBertModel",
"SqueezeBertModule",
"SqueezeBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_squeezebert import (
SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
SqueezeBertConfig,
SqueezeBertOnnxConfig,
)
from .tokenization_squeezebert import SqueezeBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_squeezebert import (
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
SqueezeBertForMaskedLM,
SqueezeBertForMultipleChoice,
SqueezeBertForQuestionAnswering,
SqueezeBertForSequenceClassification,
SqueezeBertForTokenClassification,
SqueezeBertModel,
SqueezeBertModule,
SqueezeBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\stablelm\configuration_stablelm.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"stabilityai/stablelm-3b-4e1t": "https://huggingface.co/stabilityai/stablelm-3b-4e1t/resolve/main/config.json",
}
class StableLmConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~StableLmModel`].
It is used to instantiate a StableLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the StableLM [stabilityai/stablelm-3b-4e1t](https://huggingface.co/stabilityai/stablelm-3b-4e1t) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used
to control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Example:
```
>>> from transformers import StableLmModel, StableLmConfig
>>> # Initializing a StableLM stablelm-3b style configuration
>>> configuration = StableLmConfig()
```
"""
model_type = "stablelm"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50304,
intermediate_size=6912,
hidden_size=2560,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=32,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
layer_norm_eps=1.0e-5,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10_000,
rope_scaling=None,
use_qkv_bias=False,
hidden_dropout=0.0,
attention_dropout=0.0,
partial_rotary_factor=0.25,
bos_token_id=0,
eos_token_id=0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.use_qkv_bias = use_qkv_bias
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\stablelm\modeling_stablelm.py
""" PyTorch StableLM 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_stablelm import StableLmConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "StableLmConfig"
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
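A worked example of the unpadding bookkeeping above (no flash-attn required): for a padding mask with row lengths 3 and 1, the cumulative sequence lengths are [0, 3, 4].
```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 0, 0, 0]])
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                      # tensor([3, 1])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
print(indices.tolist(), cu_seqlens.tolist(), seqlens.max().item())           # [0, 1, 2, 4] [0, 3, 4] 3
```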
class StableLmRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class StableLmLinearScalingRotaryEmbedding(StableLmRotaryEmbedding):
"""StableLmRotaryEmbedding扩展,带有线性缩放功能。由Reddit用户/u/kaiokendev贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class StableLmDynamicNTKScalingRotaryEmbedding(StableLmRotaryEmbedding):
"""StableLmRotaryEmbedding扩展,带有动态NTK缩放功能。由Reddit用户/u/bloc97和/u/emozilla贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""旋转输入的一半隐藏维度。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""
Args:
q (`torch.Tensor`): 查询张量。
k (`torch.Tensor`): 键张量。
cos (`torch.Tensor`): 旋转嵌入的余弦部分。
sin (`torch.Tensor`): 旋转嵌入的正弦部分。
position_ids (`torch.Tensor`):
与查询和键张量对应的位置索引。例如,当与KV缓存一起使用时,可以传递偏移的位置ID。
unsqueeze_dim (`int`, *可选*, 默认为 1):
指定沿着哪个维度展开 cos[position_ids] 和 sin[position_ids],以便它们可以正确地广播到 q 和 k 的维度。
例如,如果 cos[position_ids] 和 sin[position_ids] 的形状是 [batch_size, seq_len, head_dim],
当 q 和 k 的形状是 [batch_size, heads, seq_len, head_dim] 时,设置 unsqueeze_dim=1 可以使 cos[position_ids]
和 sin[position_ids] 能够广播到 q 和 k 的形状。类似地,如果 q 和 k 的形状是 [batch_size, seq_len, heads, head_dim],
则设置 unsqueeze_dim=2。
Returns:
`tuple(torch.Tensor)`: 旋转使用旋转位置嵌入后的查询和键张量。
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
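A minimal shape sketch of the rotary application above: indexing cos/sin with `position_ids` and unsqueezing along the heads dimension makes them broadcast against q and k.
```python
import torch

batch, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(batch, heads, seq_len, head_dim)
cos = torch.randn(seq_len, head_dim)                 # as returned by the rotary embedding
sin = torch.randn(seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)    # (batch, seq_len)

cos_b = cos[position_ids].unsqueeze(1)               # (batch, 1, seq_len, head_dim)
sin_b = sin[position_ids].unsqueeze(1)
x1, x2 = q[..., : head_dim // 2], q[..., head_dim // 2 :]
q_embed = q * cos_b + torch.cat((-x2, x1), dim=-1) * sin_b
print(q_embed.shape)                                 # torch.Size([1, 2, 4, 8])
```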
class StableLmMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
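The forward above is a SiLU-gated MLP ("SwiGLU"-style): the activated gate branch is multiplied element-wise with the up branch before the down projection. A standalone shape check with made-up sizes:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden, inter = 8, 16
gate = nn.Linear(hidden, inter, bias=False)
up = nn.Linear(hidden, inter, bias=False)
down = nn.Linear(inter, hidden, bias=False)

x = torch.randn(2, 3, hidden)
y = down(F.silu(gate(x)) * up(x))
print(y.shape)  # torch.Size([2, 3, 8])
```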
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from
(batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
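A quick check of `repeat_kv`: with 2 key/value heads and `n_rep=4`, the cache is expanded to 8 attention heads, and the first four heads all read from KV head 0.
```python
import torch

kv = torch.randn(1, 2, 5, 16)   # (batch, num_key_value_heads, seq_len, head_dim)
expanded = kv[:, :, None, :, :].expand(1, 2, 4, 5, 16).reshape(1, 8, 5, 16)
print(expanded.shape)                               # torch.Size([1, 8, 5, 16])
assert torch.equal(expanded[:, 0], expanded[:, 3])  # heads 0-3 share KV head 0
```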
class StableLmAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_qkv_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = StableLmRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = StableLmLinearScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = StableLmDynamicNTKScalingRotaryEmbedding(
int(self.partial_rotary_factor * self.head_dim),
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
class StableLmSdpaAttention(StableLmAttention):
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
class StableLmFlashAttention2(StableLmAttention):
"""
StableLM flash attention module. This module inherits from `StableLmAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpads the input, then computes the attention scores and pads the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
ATTENTION_CLASSES = {
"eager": StableLmAttention,
"sdpa": StableLmSdpaAttention,
"flash_attention_2": StableLmFlashAttention2,
}
class StableLmDecoderLayer(nn.Module):
def __init__(self, config: StableLmConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.mlp = StableLmMLP(config)
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
`[0, config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
STABLELM_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`StableLmConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare StableLm Model outputting raw hidden-states without any specific head on top.",
STABLELM_START_DOCSTRING,
)
class StableLmPreTrainedModel(PreTrainedModel):
config_class = StableLmConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["StableLmDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_cache_class = True
_supports_sdpa = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
STABLELM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare StableLm Model outputting raw hidden-states without any specific head on top.",
STABLELM_START_DOCSTRING,
)
class StableLmModel(StableLmPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`StableLmDecoderLayer`]
Args:
config: StableLmConfig
"""
def __init__(self, config: StableLmConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList(
[StableLmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self._attn_implementation = config._attn_implementation
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
class StableLmForCausalLM(StableLmPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = StableLmModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
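A worked example of the `position_ids` derivation above: the cumulative sum of the attention mask minus one, with padded positions clamped to 1.
```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids.tolist())  # [[1, 1, 0, 1, 2]]
```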
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
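A small sketch of the cache reordering above: `beam_idx` selects which beams each layer's cached keys/values should follow after a beam-search step (the single-layer cache here is made up).
```python
import torch

layer_kv = torch.arange(4.0).view(4, 1, 1, 1)        # pretend cache for a batch of 4 beams
past_key_values = ((layer_kv, layer_kv),)             # one layer: (key, value)
beam_idx = torch.tensor([2, 2, 0, 1])

reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer) for layer in past_key_values
)
print(reordered[0][0].flatten().tolist())              # [2.0, 2.0, 0.0, 1.0]
```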
@add_start_docstrings(
"""
The StableLm transformer with a sequence classification head on top (linear layer).
[`StableLmForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
STABLELM_START_DOCSTRING,
)
class StableLmForSequenceClassification(StableLmPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = StableLmModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
@add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass