Transformers 源码解析(三十)
.\models\convnextv2\modeling_convnextv2.py
""" PyTorch ConvNextV2 model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ConvNextV2Config"
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/convnextv2-tiny-1k-224",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class ConvNextV2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class ConvNextV2LayerNorm(nn.Module):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError(f"Unsupported data format: {self.data_format}")
self.normalized_shape = (normalized_shape,)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.data_format == "channels_last":
x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
elif self.data_format == "channels_first":
input_dtype = x.dtype
x = x.float()
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = x.to(dtype=input_dtype)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
class ConvNextV2Embeddings(nn.Module):
"""This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = nn.Conv2d(
config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
)
self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
self.num_channels = config.num_channels
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.layernorm(embeddings)
return embeddings
class ConvNextV2Layer(nn.Module):
"""This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
The authors used (2) as they find it slightly faster in PyTorch.
Args:
config ([`ConvNextV2Config`]): Model configuration class.
dim (`int`): Number of input channels.
drop_path (`float`): Stochastic depth rate. Default: 0.0.
"""
def __init__(self, config, dim, drop_path=0):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(dim, 4 * dim)
self.act = ACT2FN[config.hidden_act]
self.grn = ConvNextV2GRN(4 * dim)
self.pwconv2 = nn.Linear(4 * dim, dim)
self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
input = hidden_states
x = self.dwconv(hidden_states)
x = x.permute(0, 2, 3, 1)
x = self.layernorm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.grn(x)
x = self.pwconv2(x)
x = x.permute(0, 3, 1, 2)
x = input + self.drop_path(x)
return x
class ConvNextV2Stage(nn.Module):
"""Represents a stage in the ConvNeXTV2 model."""
"""ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
Args:
config ([`ConvNextV2Config`]): Model configuration class.
in_channels (`int`): Number of input channels.
out_channels (`int`): Number of output channels.
depth (`int`): Number of residual blocks.
drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
"""
def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
super().__init__()
if in_channels != out_channels or stride > 1:
self.downsampling_layer = nn.Sequential(
ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
)
else:
self.downsampling_layer = nn.Identity()
drop_path_rates = drop_path_rates or [0.0] * depth
self.layers = nn.Sequential(
*[ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
)
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
hidden_states = self.downsampling_layer(hidden_states)
hidden_states = self.layers(hidden_states)
return hidden_states
class ConvNextV2Encoder(nn.Module):
def __init__(self, config):
super().__init__()
self.stages = nn.ModuleList()
drop_path_rates = [
x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
]
prev_chs = config.hidden_sizes[0]
for i in range(config.num_stages):
out_chs = config.hidden_sizes[i]
stage = ConvNextV2Stage(
config,
in_channels=prev_chs,
out_channels=out_chs,
stride=2 if i > 0 else 1,
depth=config.depths[i],
drop_path_rates=drop_path_rates[i],
)
self.stages.append(stage)
prev_chs = out_chs
def forward(
self,
hidden_states: torch.FloatTensor,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutputWithNoAttention]:
all_hidden_states = () if output_hidden_states else None
for i, layer_module in enumerate(self.stages):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
)
class ConvNextV2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ConvNextV2Config
base_model_prefix = "convnextv2"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
CONVNEXTV2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
# 将其作为常规的 PyTorch 模块使用,并参考 PyTorch 文档处理所有与一般使用和行为相关的问题。
Parameters:
config ([`ConvNextV2Config`]): 包含模型所有参数的模型配置类。
使用配置文件初始化模型时,不会加载与模型关联的权重,仅加载配置信息。
可查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型权重。
"""
CONVNEXTV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ConvNextImageProcessor`]. See
[`ConvNextImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare ConvNextV2 model outputting raw features without any specific head on top.",
CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Model(ConvNextV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = ConvNextV2Embeddings(config)
self.encoder = ConvNextV2Encoder(config)
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
"""
ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
"""
CONVNEXTV2_START_DOCSTRING,
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convnextv2 = ConvNextV2Model(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: torch.FloatTensor = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
@add_start_docstrings(
"""
ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
super()._init_backbone(config)
self.embeddings = ConvNextV2Embeddings(config)
self.encoder = ConvNextV2Encoder(config)
self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes
hidden_states_norms = {}
for stage, num_channels in zip(self._out_features, self.channels):
hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
self.post_init()
@add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
返回:模型输出的BackboneOutput对象。
Examples: 示例代码展示了如何使用该函数来处理图像和调用模型。
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
>>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
embedding_output = self.embeddings(pixel_values)
outputs = self.encoder(
embedding_output,
output_hidden_states=True,
return_dict=return_dict,
)
hidden_states = outputs.hidden_states if return_dict else outputs[1]
feature_maps = ()
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
hidden_state = self.hidden_states_norms[stage](hidden_state)
feature_maps += (hidden_state,)
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (hidden_states,)
return output
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\convnextv2\modeling_tf_convnextv2.py
""" TF 2.0 ConvNextV2 model."""
from __future__ import annotations
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithNoAttention,
TFBaseModelOutputWithPooling,
TFBaseModelOutputWithPoolingAndNoAttention,
TFImageClassifierOutputWithNoAttention,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_convnextv2 import ConvNextV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ConvNextV2Config"
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/convnextv2-tiny-1k-224",
]
class TFConvNextV2DropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_path: float, **kwargs):
super().__init__(**kwargs)
self.drop_path = drop_path
def call(self, x: tf.Tensor, training=None):
if training:
keep_prob = 1 - self.drop_path
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
return x
class TFConvNextV2GRN(keras.layers.Layer):
"""GRN (Global Response Normalization) layer"""
def __init__(self, config: ConvNextV2Config, dim: int, **kwargs):
super().__init__(**kwargs)
self.dim = dim
def build(self, input_shape: tf.TensorShape = None):
self.weight = self.add_weight(
name="weight",
shape=(1, 1, 1, self.dim),
initializer=keras.initializers.Zeros(),
)
self.bias = self.add_weight(
name="bias",
shape=(1, 1, 1, self.dim),
initializer=keras.initializers.Zeros(),
)
return super().build(input_shape)
def call(self, hidden_states: tf.Tensor):
global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True)
norm_features = global_features / (tf.reduce_mean(global_features, axis=-1, keepdims=True) + 1e-6)
hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states
return hidden_states
class TFConvNextV2Embeddings(keras.layers.Layer):
"""这个类与src/transformers/models/swin/modeling_swin.py中的SwinEmbeddings类类似(并受其启发)。"""
def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs)
self.patch_embeddings = keras.layers.Conv2D(
filters=config.hidden_sizes[0],
kernel_size=config.patch_size,
strides=config.patch_size,
name="patch_embeddings",
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
)
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
def call(self, pixel_values):
if isinstance(pixel_values, dict):
pixel_values = pixel_values["pixel_values"]
tf.debugging.assert_equal(
shape_list(pixel_values)[1],
self.num_channels,
message="确保像素值的通道维度与配置中设置的一致。",
)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.layernorm(embeddings)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build([None, None, None, self.config.num_channels])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextV2Layer(keras.layers.Layer):
"""这对应于原始实现中的`Block`类。
有两个等效的实现方式:
[DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; 全部在(N, C, H, W)中
[DwConv, 转换到(N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; 再转换回来
作者在PyTorch中发现第二种方式略快。由于我们已经将输入调整为遵循NHWC顺序,因此可以直接应用操作而无需排列。
"""
def __init__(self, config: ConvNextV2Config, dim: int, drop_path: float = 0.0, **kwargs):
super().__init__(**kwargs)
self.dim = dim
self.config = config
self.dwconv = keras.layers.Conv2D(
filters=dim,
kernel_size=7,
padding="same",
groups=dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
name="dwconv",
)
self.layernorm = keras.layers.LayerNormalization(
epsilon=1e-6,
name="layernorm",
)
self.pwconv1 = keras.layers.Dense(
units=4 * dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
name="pwconv1",
)
self.act = get_tf_activation(config.hidden_act)
self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn")
self.pwconv2 = keras.layers.Dense(
units=dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
name="pwconv2",
)
self.drop_path = (
TFConvNextV2DropPath(drop_path, name="drop_path")
if drop_path > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
def call(self, hidden_states, training=False):
input = hidden_states
x = self.dwconv(hidden_states)
x = self.layernorm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.grn(x)
x = self.pwconv2(x)
x = self.drop_path(x, training=training)
x = input + x
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dwconv", None) is not None:
with tf.name_scope(self.dwconv.name):
self.dwconv.build([None, None, None, self.dim])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.dim])
if getattr(self, "pwconv1", None) is not None:
with tf.name_scope(self.pwconv1.name):
self.pwconv1.build([None, None, self.dim])
if getattr(self, "grn", None) is not None:
with tf.name_scope(self.grn.name):
self.grn.build(None)
if getattr(self, "pwconv2", None) is not None:
with tf.name_scope(self.pwconv2.name):
self.pwconv2.build([None, None, 4 * self.dim])
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
class TFConvNextV2Stage(keras.layers.Layer):
"""ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
Args:
config (`ConvNextV2Config`):
Model configuration class.
in_channels (`int`):
Number of input channels.
out_channels (`int`):
Number of output channels.
depth (`int`):
Number of residual blocks.
drop_path_rates(`List[float]`):
Stochastic depth rates for each layer.
"""
def __init__(
self,
config: ConvNextV2Config,
in_channels: int,
out_channels: int,
kernel_size: int = 2,
stride: int = 2,
depth: int = 2,
drop_path_rates: Optional[List[float]] = None,
**kwargs,
):
super().__init__(**kwargs)
if in_channels != out_channels or stride > 1:
self.downsampling_layer = [
keras.layers.LayerNormalization(
epsilon=1e-6,
name="downsampling_layer.0",
),
keras.layers.Conv2D(
filters=out_channels,
kernel_size=kernel_size,
strides=stride,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
name="downsampling_layer.1",
),
]
else:
self.downsampling_layer = [tf.identity]
drop_path_rates = drop_path_rates or [0.0] * depth
self.layers = [
TFConvNextV2Layer(
config,
dim=out_channels,
drop_path=drop_path_rates[j],
name=f"layers.{j}",
)
for j in range(depth)
]
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def call(self, hidden_states):
for layer in self.downsampling_layer:
hidden_states = layer(hidden_states)
for layer in self.layers:
hidden_states = layer(hidden_states)
return hidden_states
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
if self.in_channels != self.out_channels or self.stride > 1:
with tf.name_scope(self.downsampling_layer[0].name):
self.downsampling_layer[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.downsampling_layer[1].name):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextV2Encoder(keras.layers.Layer):
def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs)
self.stages = []
drop_path_rates = tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
drop_path_rates = tf.split(drop_path_rates, config.depths)
drop_path_rates = [x.numpy().tolist() for x in drop_path_rates]
prev_chs = config.hidden_sizes[0]
for i in range(config.num_stages):
out_chs = config.hidden_sizes[i]
stage = TFConvNextV2Stage(
config,
in_channels=prev_chs,
out_channels=out_chs,
stride=2 if i > 0 else 1,
depth=config.depths[i],
drop_path_rates=drop_path_rates[i],
name=f"stages.{i}",
)
self.stages.append(stage)
prev_chs = out_chs
def call(
self,
hidden_states: tf.Tensor,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, TFBaseModelOutputWithNoAttention]:
all_hidden_states = () if output_hidden_states else None
for i, layer_module in enumerate(self.stages):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
def build(self, input_shape=None):
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFConvNextV2MainLayer(keras.layers.Layer):
config_class = ConvNextV2Config
def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embeddings = TFConvNextV2Embeddings(config, name="embeddings")
self.encoder = TFConvNextV2Encoder(config, name="encoder")
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_last")
@unpack_inputs
def call(
self,
pixel_values: TFModelInputType | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values, training=training)
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
last_hidden_state = encoder_outputs[0]
pooled_output = self.pooler(last_hidden_state)
last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2))
pooled_output = self.layernorm(pooled_output)
if output_hidden_states:
hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
if not return_dict:
hidden_states = hidden_states if output_hidden_states else ()
return (last_hidden_state, pooled_output) + hidden_states
return TFBaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, self.config.hidden_sizes[-1]])
"""
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`ConvNextImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to `True`.
"""
@add_start_docstrings(
"The bare ConvNextV2 model outputting raw features without any specific head on top.",
CONVNEXTV2_START_DOCSTRING,
)
"""
class TFConvNextV2Model(TFConvNextV2PreTrainedModel):
"""
@add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
"""
def call(
self,
pixel_values: TFModelInputType | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPoolingAndNoAttention, Tuple[tf.Tensor]]:
"""
Process inputs through the TFConvNextV2 model and return outputs.
Args:
pixel_values: Input pixel values (image tensors).
output_hidden_states: Whether to output hidden states.
return_dict: Whether to return outputs as a dictionary.
training: Whether the model is in training mode.
Returns:
Either a TFBaseModelOutputWithPoolingAndNoAttention object or a tuple with tensors.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
outputs = self.convnextv2(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return outputs[:]
return TFBaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=outputs.last_hidden_state,
pooler_output=outputs.pooler_output,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
"""
Build method for TFConvNextV2Model. Checks if model is already built before constructing layers.
Args:
input_shape: Shape of the input tensors (not used in this implementation).
"""
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
"""
@add_start_docstrings(
"""
ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
CONVNEXTV2_START_DOCSTRING,
)
"""
class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequenceClassificationLoss):
"""
Initialize TFConvNextV2ForImageClassification model.
Args:
config: ConvNextV2 configuration object.
*inputs: Variable length argument list.
**kwargs: Additional keyword arguments.
Attributes:
num_labels: Number of output labels for classification.
convnextv2: TFConvNextV2MainLayer instance for feature extraction.
classifier: Dense layer for classification head.
"""
def __init__(self, config: ConvNextV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=keras.initializers.Zeros(),
name="classifier",
)
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: TFModelInputType | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
outputs = self.convnextv2(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
.\models\convnextv2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_tf_available,
)
_import_structure = {
"configuration_convnextv2": [
"CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"ConvNextV2Config",
]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_convnextv2"] = [
"CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"ConvNextV2ForImageClassification",
"ConvNextV2Model",
"ConvNextV2PreTrainedModel",
"ConvNextV2Backbone",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_convnextv2"] = [
"TFConvNextV2ForImageClassification",
"TFConvNextV2Model",
"TFConvNextV2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_convnextv2 import (
CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
ConvNextV2Config,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_convnextv2 import (
CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST,
ConvNextV2Backbone,
ConvNextV2ForImageClassification,
ConvNextV2Model,
ConvNextV2PreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_convnextv2 import (
TFConvNextV2ForImageClassification,
TFConvNextV2Model,
TFConvNextV2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\cpm\tokenization_cpm.py
"""Tokenization classes."""
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import SPIECE_UNDERLINE, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/spiece.model",
}
}
class CpmTokenizer(PreTrainedTokenizer):
"""Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
):
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
if sp_model_kwargs is None:
sp_model_kwargs = {}
self.sp_model_kwargs = sp_model_kwargs
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string."""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Generate token type IDs from a sequence or a pair of sequences. XLNet uses token type IDs to distinguish
between sequences in a pair.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs where each ID corresponds to a sequence or a pair of sequences.
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def _decode(self, *args, **kwargs):
text = super()._decode(*args, **kwargs)
text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
return text
.\models\cpm\tokenization_cpm_fast.py
"""Tokenization classes."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils_fast import AddedToken, PreTrainedTokenizerFast
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/spiece.model",
},
"tokenizer_file": {
"TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/tokenizer.json",
},
}
class CpmTokenizerFast(PreTrainedTokenizerFast):
"""Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def create_inputs_for_sequence_classification(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
batch_text_or_text_pairs = [
" ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
for text in batch_text_or_text_pairs
]
return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
def _decode(self, *args, **kwargs):
text = super()._decode(*args, **kwargs)
text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
return text
.\models\cpm\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_cpm"] = ["CpmTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_cpm_fast"] = ["CpmTokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_cpm import CpmTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_cpm_fast import CpmTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\cpmant\configuration_cpmant.py
""" CPMAnt model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/config.json"
}
class CpmAntConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CpmAntModel`]. It is used to instantiate an
CPMAnt model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the CPMAnt
[openbmb/cpm-ant-10b](https://huggingface.co/openbmb/cpm-ant-10b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
# 模型类型设定为 "cpmant"
model_type = "cpmant"
# 初始化函数,用于初始化一个 Transformer 模型的参数和配置
def __init__(
self,
vocab_size: int = 30720, # 词汇表大小,默认为 30720
hidden_size: int = 4096, # 隐藏层的尺寸,默认为 4096
num_attention_heads: int = 32, # 注意力头的数量,默认为 32
dim_head: int = 128, # 注意力头的维度,默认为 128
dim_ff: int = 10240, # FeedForward 层的尺寸,默认为 10240
num_hidden_layers: int = 48, # Transformer 层的数量,默认为 48
dropout_p: int = 0.0, # Dropout 概率,默认为 0.0,即无 dropout
position_bias_num_buckets: int = 512, # 位置偏置的哈希桶数量,默认为 512
position_bias_max_distance: int = 2048, # 位置偏置的最大距离,默认为 2048
eps: int = 1e-6, # 避免除零的小数,默认为 1e-6
init_std: float = 1.0, # 参数初始化的标准差,默认为 1.0
prompt_types: int = 32, # 提示类型的数量,默认为 32
prompt_length: int = 32, # 提示长度,默认为 32
segment_types: int = 32, # 段落类型的数量,默认为 32
use_cache: bool = True, # 是否使用缓存,默认为 True
**kwargs, # 其他额外的参数,以字典形式接收
):
# 调用父类的初始化方法,传递额外的关键字参数
super().__init__(**kwargs)
# 初始化特定于 Prompt 的参数
self.prompt_types = prompt_types
self.prompt_length = prompt_length
self.segment_types = segment_types
# 初始化通用的 Transformer 参数
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.dim_head = dim_head
self.dim_ff = dim_ff
self.num_hidden_layers = num_hidden_layers
self.position_bias_num_buckets = position_bias_num_buckets
self.position_bias_max_distance = position_bias_max_distance
self.dropout_p = dropout_p
self.eps = eps
self.use_cache = use_cache
self.vocab_size = vocab_size
self.init_std = init_std
.\models\cpmant\modeling_cpmant.py
""" PyTorch CPMAnt"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_cpmant import CpmAntConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openbmb/cpm-ant-10b"
_CONFIG_FOR_DOC = "CpmAntConfig"
CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openbmb/cpm-ant-10b",
]
class CpmAntLayerNorm(nn.Module):
"""
We use Root Mean Square (RMS) Layer Normalization, please see https://arxiv.org/abs/1910.07467 for details."
"""
def __init__(self, config: CpmAntConfig):
super().__init__()
self.eps = config.eps
self.dim_norm = config.hidden_size
self.weight = nn.Parameter(torch.empty(config.hidden_size))
def forward(self, hidden_states: torch.Tensor):
"""
Args:
hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
"""
if hidden_states.size(-1) != self.dim_norm:
raise AssertionError("hidden_states.size(-1) != self.dim_norm")
old_dtype = hidden_states.dtype
variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
hidden_states = (hidden_states * torch.rsqrt(variance + self.eps)).to(old_dtype) * self.weight
return hidden_states
class CpmAntAttention(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.dim_model = config.hidden_size
self.num_heads = config.num_attention_heads
self.dim_head = config.dim_head
self.project_q = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
self.project_k = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
self.project_v = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
self.attention_out = nn.Linear(self.num_heads * self.dim_head, self.dim_model, bias=False)
self.softmax = torch.nn.Softmax(dim=-1)
if config.dropout_p is not None:
self.dropout = torch.nn.Dropout(p=config.dropout_p)
else:
self.dropout = None
class CpmAntSelfAttentionBlock(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.layernorm_before_attention = CpmAntLayerNorm(config)
self.self_attention = CpmAntAttention(config)
if config.dropout_p:
self.dropout = torch.nn.Dropout(config.dropout_p)
else:
self.dropout = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
position_bias: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: Optional[bool] = None,
):
"""
Args:
hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
自注意力模块的输入,可以是一批序列的原始嵌入。
attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
遮罩矩阵,避免无效区域参与自注意力计算。
position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
位置偏置,提供给自注意力模块的位置信息。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。
past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
缓存的过去键和值投影状态。
use_cache (`bool`, *optional*):
如果设置为 `True`,则返回 `past_key_values` 键值状态,可用于加速解码过程(参见 `past_key_values`)。
"""
outputs = self.layernorm_before_attention(hidden_states)
outputs = self.self_attention(
outputs, outputs, attention_mask, position_bias, output_attentions, past_key_values, use_cache
)
outputs, attn_weights, current_key_value = outputs
if self.dropout is not None:
outputs = self.dropout(outputs)
hidden_states = hidden_states + outputs
return hidden_states, attn_weights, current_key_value
class CpmAntDenseGatedACT(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
self.act = torch.nn.GELU()
def forward(self, hidden_states: torch.Tensor):
"""通过非线性操作将输入张量从一个特征空间转换到另一个特征空间
Args:
hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
"""
gate_score = self.act(self.w_0(hidden_states))
hidden_states = self.w_1(hidden_states)
hidden_states = gate_score * hidden_states
return hidden_states
class CpmAntTransformerBlock(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.self_att = CpmAntSelfAttentionBlock(config)
self.ffn = CpmAntFFNBlock(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
position_bias: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: Optional[bool] = None,
):
"""
Args:
hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
输入的隐藏状态,形状为 (batch, len_seq, dim_model)。
attention_mask (`torch.Tensor`):
注意力掩码,形状可以根据具体应用而变化。
position_bias (`Optional[torch.Tensor]`, optional):
位置偏置张量,形状可以根据具体应用而变化,默认为 None。
output_attentions (`Optional[bool]`, optional):
是否输出注意力权重,默认为 False。
past_key_values (`Optional[Tuple[torch.Tensor, torch.Tensor]]`, optional):
过去的键-值对,用于缓存,形状为 (key, value),默认为 None。
use_cache (`Optional[bool]`, optional):
是否使用缓存,默认为 None。
Returns:
`torch.Tensor`: 经过自注意力和前馈网络后的隐藏状态张量。
"""
ln_outputs = self.self_att(hidden_states, attention_mask, position_bias,
output_attentions, past_key_values, use_cache)
outputs = self.ffn(ln_outputs)
if self.dropout is not None:
outputs = self.dropout(outputs)
hidden_states = hidden_states + outputs
return hidden_states
):
"""
Args:
hidden_states (`torch.Tensor`):
输入到层的张量,形状为 `(batch, seq_len, dim_model)`
attention_mask (`torch.Tensor`):
避免无效区域参与计算的张量,形状为 `(batch, seq_len, seq_len)`
position_bias (`torch.Tensor`):
提供位置信息给注意力机制的张量,形状为 `(num_heads, seq_len, seq_len)`
output_attentions (`bool`, *可选*):
是否返回所有注意力层的注意力张量。
past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *可选*):
缓存的过去键和值投影状态
use_cache (`bool`, *可选*):
如果设置为 `True`,则返回 `past_key_values` 键值状态,可用于加速解码 (参见 `past_key_values`)。
"""
hidden_states = self.self_att(
hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
output_attentions=output_attentions,
past_key_values=past_key_values,
use_cache=use_cache,
)
hidden_states, attn_weights, current_key_value = hidden_states
hidden_states = self.ffn(hidden_states)
return hidden_states, attn_weights, current_key_value
class CpmAntEncoder(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.num_layers = config.num_hidden_layers
self.layers = nn.ModuleList([CpmAntTransformerBlock(config) for ith in range(self.num_layers)])
self.output_layernorm = CpmAntLayerNorm(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
position_bias: torch.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: Optional[bool] = None,
):
"""
Args:
hidden_states (`torch.Tensor`):
输入的张量,形状为 `(batch, seq_len, dim_model)`
attention_mask (`torch.Tensor`):
注意力掩码张量,形状为 `(batch, seq_len, seq_len)`
position_bias (`torch.Tensor`):
位置偏置张量,提供位置信息给注意力机制,形状为 `(num_heads, seq_len, seq_len)`
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态
past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
缓存的过去键和值投影状态
use_cache (`bool`, *optional*):
如果为 `True`,返回 `past_key_values` 键值状态以加速解码
"""
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
current_key_values = () if use_cache else None
for i, layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
layer_outputs = layer(
hidden_states,
attention_mask,
position_bias,
output_attentions=output_attentions,
past_key_values=past_key_values[i] if past_key_values else None,
use_cache=use_cache,
)
hidden_states, attn_weights, current_key_value = layer_outputs
if output_attentions:
all_self_attns += (attn_weights,)
if current_key_value is not None:
current_key_values = current_key_values + (current_key_value,)
hidden_states = self.output_layernorm(hidden_states)
if output_hidden_states:
all_hidden_states += (hidden_states,)
return hidden_states, current_key_values, all_hidden_states, all_self_attns
class CpmAntIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class CpmAntSegmentPositionEmbedding(nn.Module):
def __init__(self, config: CpmAntConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.num_buckets = config.position_bias_num_buckets
self.max_distance = config.position_bias_max_distance
self.num_segments = config.segment_types
self.relative_attention_bias = nn.Parameter(
torch.empty(
config.segment_types * config.segment_types + config.position_bias_num_buckets,
config.num_attention_heads,
)
)
def forward(
self,
key_pos: torch.Tensor,
query_pos: torch.Tensor,
key_segment: torch.Tensor,
query_segment: torch.Tensor,
):
with torch.no_grad():
batch = key_pos.size(0)
keylen = key_pos.size(1)
querylen = query_pos.size(1)
if key_pos.size(0) != query_pos.size(0):
raise AssertionError(
f"key_pos.size(0) should be equal to query_pos.size(0), but got {key_pos.size(0)} and {query_pos.size(0)}!"
)
if keylen != key_segment.size(1) or querylen != query_segment.size(1):
raise AssertionError(
f"keylen should be equal to key_segment.size(1), but got {keylen} and {key_segment.size(1)}!"
)
if querylen != query_segment.size(1):
raise AssertionError(
f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.szie(1)}!"
)
key_pos = key_pos.view(batch, -1, keylen)
query_pos = query_pos.view(batch, querylen, -1)
key_segment = key_segment.view(batch, -1, keylen)
query_segment = query_segment.view(batch, querylen, -1)
relative_position_bucket = self._segment_relative_position_bucket(query_segment, key_segment)
relative_position_bucket = relative_position_bucket + self.num_buckets
absolute_position_bucket = self._position_bucket(
torch.arange(keylen, dtype=torch.int32, device=relative_position_bucket.device)[None, :]
- torch.arange(querylen, dtype=torch.int32, device=relative_position_bucket.device)[:, None],
num_buckets=self.num_buckets,
max_distance=self.max_distance,
)
relative_position_bucket = torch.where(
(key_segment == query_segment),
absolute_position_bucket[None, :, :],
relative_position_bucket,
)
embeds = F.embedding(relative_position_bucket, self.relative_attention_bias)
embeds = embeds.permute(0, 3, 1, 2).contiguous()
return embeds
def _segment_relative_position_bucket(self, query_segment, key_segment):
return query_segment * self.num_segments + key_segment
def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
num_buckets //= 2
relative_buckets = (relative_position > 0).to(torch.int32) * num_buckets
relative_position = torch.abs(relative_position)
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_postion_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.int32)
relative_postion_if_large = torch.min(
relative_postion_if_large,
torch.full_like(relative_postion_if_large, num_buckets - 1),
)
relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large)
return relative_buckets
class CpmAntOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class CpmAntPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CpmAntConfig
base_model_prefix = "cpmant"
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, CpmAntLayerNorm):
module.weight.data.fill_(1.0)
elif isinstance(module, CpmAntSegmentPositionEmbedding):
module.relative_attention_bias.data.normal_(mean=0.0, std=self.config.init_std)
CPMANT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters
config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CPMANT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
# 输入序列标记在词汇表中的索引。
# 可以使用 `CPMAntTokenizer` 获得这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 获取更多细节。
# [什么是输入ID?](../glossary#input-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
# 包含预先计算的隐藏状态(自注意力块和交叉注意力块中的键和值),可以用于加速序列解码。
# 当 `use_cache=True` 或 `config.use_cache=True` 时返回。
use_cache (`bool`, *optional*):
# 如果设置为 `True`,则返回 `past_key_values` 中的键值状态,可用于加速解码。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。
return_dict (`bool`, *optional*):
# 是否返回一个 `~utils.ModelOutput` 而不是一个普通的元组。
"""
@add_start_docstrings(
"""
The CPMAnt Model outputting raw hidden-states without any specific head on top.
""",
CPMANT_START_DOCSTRING,
)
"""
定义一个 CPMAnt 模型类,用于生成不带特定输出头的原始隐藏状态。
class CpmAntModel(CpmAntPreTrainedModel):
"""
CPMAnt 模型类,继承自 CpmAntPreTrainedModel。
"""
def __init__(self, config: CpmAntConfig):
"""
初始化方法,接受一个 CpmAntConfig 对象作为参数。
"""
super().__init__(config)
# 初始化编码器
self.encoder = CpmAntEncoder(config)
# 初始化分段嵌入
self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size)
# 初始化输入嵌入
self.input_embedding = nn.Embedding(
config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size
)
# 初始化位置偏置
self.position_bias = CpmAntSegmentPositionEmbedding(config)
# 设置提示长度和词汇表大小
self.prompt_length = config.prompt_length
self.vocab_size = config.vocab_size
# 执行初始化后的附加步骤
self.post_init()
def get_input_embeddings(self):
"""
返回输入嵌入层。
"""
return self.input_embedding
def set_input_embeddings(self, embeddings, **kwargs):
"""
设置输入嵌入层。
"""
self.input_embedding = embeddings
def _prepare_attention_mask(self, input_ids, span, context, length):
"""
准备注意力掩码。
"""
batch = input_ids.size(0)
seqlen = input_ids.size(1)
device = input_ids.device
# 创建方向性掩码
directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1)
attention_mask = context[:, None, :] | (
context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
)
attention_mask = attention_mask & (span[:, None, :] == span[:, :, None])
# 创建左填充掩码
mask_1d = (
torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1)
< length[:, None]
)
mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1)
attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask
return attention_mask
@add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
use_cache: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
"""
CPMAnt 模型的前向传播方法,接受多个输入参数并返回输出。
Args:
input_ids (Optional[torch.Tensor], optional): 输入张量,默认为 None。
output_attentions (Optional[bool], optional): 是否输出注意力,默认为 None。
output_hidden_states (Optional[bool], optional): 是否输出隐藏状态,默认为 None。
past_key_values (Optional[Tuple[Tuple[torch.Tensor]]], optional): 过去键值元组,默认为 None。
use_cache (Optional[bool], optional): 是否使用缓存,默认为 None。
return_dict (Optional[bool], optional): 是否返回字典,默认为 None。
**kwargs: 其他关键字参数。
Returns:
模型的输出,包含过去的键值对。
"""
# 实际前向传播逻辑在子类中实现
pass
@add_start_docstrings(
"""
The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
CPMANT_START_DOCSTRING,
)
"""
定义一个带有语言建模头的 CPMAnt 模型类,使用输入嵌入层权重来绑定线性层。
class CpmAntForCausalLM(CpmAntPreTrainedModel):
"""
CPMAnt 用于因果语言建模的模型类,继承自 CpmAntPreTrainedModel。
"""
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: CpmAntConfig):
super().__init__(config)
self.cpmant = CpmAntModel(config)
self.lm_head = nn.Linear(
config.hidden_size, config.vocab_size + config.prompt_types * config.prompt_length, bias=False
)
self.post_init()
@add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
):
pass
def get_input_embeddings(self):
return self.cpmant.input_embedding
def set_input_embeddings(self, embeddings):
self.cpmant.input_embedding = embeddings
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, **kwargs):
input_ids = input_ids.int()
if "attention_mask" in kwargs:
kwargs["attention_mask"] = torch.zeros(1, 1)
return {
"input_ids": input_ids,
"use_cache": kwargs["use_cache"],
"past_key_values": kwargs.get("past_key_values", None),
}
def _reorder_cache(self, past_key_values, beam_idx):
past_key_values = [list(each) if each is not None else each for each in past_key_values]
for key_value_layer in past_key_values:
key_value_layer[0] = key_value_layer[0][beam_idx]
key_value_layer[1] = key_value_layer[1][beam_idx]
return past_key_values
.\models\cpmant\tokenization_cpmant.py
"""CPMAnt 的标记化类。"""
import collections
import os
from typing import List, Optional, Tuple
from transformers.utils import is_jieba_available, requires_backends
if is_jieba_available():
import jieba
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/vocab.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openbmb/cpm-ant-10b": 1024,
}
def load_vocab(vocab_file):
"""加载词汇文件到字典中。"""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class WordpieceTokenizer(object):
"""基于词片段的标记化器。"""
def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, token):
"""将单词标记化为词片段列表。"""
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
return [self.unk_token]
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
sub_tokens.append(self.unk_token)
start += 1
else:
sub_tokens.append(cur_substr)
start = end
return sub_tokens
class CpmAntTokenizer(PreTrainedTokenizer):
"""
构造一个 CPMAnt 标记化器。基于字节级别的字节对编码。
继承自 PreTrainedTokenizer 类。
"""
pass
class BartTokenizer(BertTokenizer):
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
bod_token (`str`, *optional*, defaults to `"<d>"`):
The beginning of document token.
eod_token (`str`, *optional*, defaults to `"</d>"`):
The end of document token.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token.
line_token (`str`, *optional*, defaults to `"</n>"`):
The line token.
space_token (`str`, *optional*, defaults to `"</_>"`):
The space token.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
add_prefix_space = False
def __init__(
self,
vocab_file,
bod_token="<d>",
eod_token="</d>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
unk_token="<unk>",
line_token="</n>",
space_token="</_>",
padding_side="left",
**kwargs,
):
requires_backends(self, ["jieba"])
self.bod_token = bod_token
self.eod_token = eod_token
self.encoder = load_vocab(vocab_file)
self.encoder[" "] = self.encoder[space_token]
self.encoder["\n"] = self.encoder[line_token]
del self.encoder[space_token]
del self.encoder[line_token]
self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
self.decoder = {v: k for k, v in self.encoder.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)
super().__init__(
bod_token=bod_token,
eod_token=eod_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
unk_token=unk_token,
line_token=line_token,
space_token=space_token,
padding_side=padding_side,
**kwargs,
)
@property
def bod_token_id(self):
return self.encoder[self.bod_token]
@property
def eod_token_id(self):
return self.encoder[self.eod_token]
@property
def newline_id(self):
return self.encoder["\n"]
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def _tokenize(self, text):
output_tokens = []
for x in jieba.cut(text, cut_all=False):
output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
return output_tokens
def _decode(self, token_ids, **kwargs):
token_ids = [i for i in token_ids if i >= 0]
token_ids = [
x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
]
return super()._decode(token_ids, **kwargs)
def check(self, token):
return token in self.encoder
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return "".join(tokens)
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index, self.unk_token)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
index = 0
if " " in self.encoder:
self.encoder["</_>"] = self.encoder[" "]
del self.encoder[" "]
if "\n" in self.encoder:
self.encoder["</n>"] = self.encoder["\n"]
del self.encoder["\n"]
self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in self.encoder.items():
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A CPMAnt sequence has the following format:
- single sequence: `[BOS] Sequence`.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.
Returns:
`List[int]`: The model input with special tokens.
"""
if token_ids_1 is None:
return [self.bos_token_id] + token_ids_0
return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
return [1] + ([0] * len(token_ids_0))
.\models\cpmant\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_cpmant": ["CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CpmAntConfig"],
"tokenization_cpmant": ["CpmAntTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_cpmant"] = [
"CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST",
"CpmAntForCausalLM",
"CpmAntModel",
"CpmAntPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_cpmant import CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP, CpmAntConfig
from .tokenization_cpmant import CpmAntTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_cpmant import (
CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST,
CpmAntForCausalLM,
CpmAntModel,
CpmAntPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\ctrl\configuration_ctrl.py
"""
Salesforce CTRL configuration
This module defines the configuration class `CTRLConfig` for the CTRL model. It provides a mapping of pretrained model names
to their corresponding configuration files.
The `CTRLConfig` class inherits from `PretrainedConfig` and defines parameters that control the architecture and behavior
of the CTRL model. It provides defaults that align with the Salesforce/ctrl architecture.
For more details on how to use configuration objects like `CTRLConfig` to instantiate CTRL models, refer to the
documentation of `PretrainedConfig`.
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Salesforce/ctrl": "https://huggingface.co/Salesforce/ctrl/resolve/main/config.json"
}
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a `CTRLModel` or a `TFCTRLModel`.
It defines parameters that control the architecture and behavior of the model when instantiated.
Instantiating a configuration with the defaults will yield a similar configuration to that of
the Salesforce/ctrl architecture from SalesForce.
Configuration objects inherit from `PretrainedConfig` and can be used to control the model outputs.
For more detailed information about configuring CTRL models, refer to the documentation of `PretrainedConfig`.
"""
pass
model_type = "ctrl"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=246534,
n_positions=256,
n_embd=1280,
dff=8192,
n_layer=48,
n_head=16,
resid_pdrop=0.1,
embd_pdrop=0.1,
layer_norm_epsilon=1e-6,
initializer_range=0.02,
use_cache=True,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.dff = dff
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
super().__init__(**kwargs)
.\models\ctrl\modeling_ctrl.py
""" PyTorch CTRL model."""
from typing import Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_ctrl import CTRLConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CTRLConfig"
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/ctrl"
]
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
return pos * angle_rates
def positional_encoding(position, d_model_size, dtype):
angle_rads = angle_defn(
torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1),
torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0),
d_model_size,
)
sines = torch.sin(angle_rads[:, 0::2])
cosines = torch.cos(angle_rads[:, 1::2])
pos_encoding = torch.cat([sines, cosines], dim=-1)
return pos_encoding
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
dk = k.shape[-1]
scaled_attention_logits = matmul_qk / np.sqrt(dk)
if mask is not None:
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
if attention_mask is not None:
scaled_attention_logits = scaled_attention_logits + attention_mask
attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
if head_mask is not None:
attention_weights = attention_weights * head_mask
output = torch.matmul(attention_weights, v)
return output, attention_weights
class MultiHeadAttention(nn.Module):
def __init__(self, d_model_size, num_heads):
super().__init__()
self.num_heads = num_heads
self.d_model_size = d_model_size
self.depth = int(d_model_size / self.num_heads)
self.Wq = nn.Linear(d_model_size, d_model_size)
self.Wk = nn.Linear(d_model_size, d_model_size)
self.Wv = nn.Linear(d_model_size, d_model_size)
self.dense = nn.Linear(d_model_size, d_model_size)
self.pruned_heads = set()
def prune_heads(self, heads):
attention_head_size = self.d_model_size // self.num_heads
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads)
self.Wq = prune_linear_layer(self.Wq, index)
self.Wk = prune_linear_layer(self.Wk, index)
self.Wv = prune_linear_layer(self.Wv, index)
self.dense = prune_linear_layer(self.dense, index, dim=1)
self.num_heads = self.num_heads - len(heads)
self.d_model_size = attention_head_size * self.num_heads
self.pruned_heads = self.pruned_heads.union(heads)
def split_into_heads(self, x, batch_size):
x = x.reshape(batch_size, -1, self.num_heads, self.depth)
return x.permute([0, 2, 1, 3])
def forward(
self,
v,
k,
q,
mask,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
batch_size = q.shape[0]
q = self.Wq(q)
k = self.Wk(k)
v = self.Wv(v)
q = self.split_into_heads(q, batch_size)
k = self.split_into_heads(k, batch_size)
v = self.split_into_heads(v, batch_size)
if layer_past is not None:
past_key, past_value = layer_past[0], layer_past[1]
k = torch.cat((past_key, k), dim=-2)
v = torch.cat((past_value, v), dim=-2)
if use_cache is True:
present = torch.stack((k, v))
else:
present = (None,)
output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
scaled_attention = output[0].permute([0, 2, 1, 3])
attn = output[1]
original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
output = self.dense(original_size_attention)
outputs = (output, present)
if output_attentions:
outputs = outputs + (attn,)
return outputs
def point_wise_feed_forward_network(d_model_size, dff):
return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size))
class EncoderLayer(nn.Module):
def __init__(self, d_model_size, num_heads, dff, rate=0.1):
super().__init__()
self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads)
self.ffn = point_wise_feed_forward_network(d_model_size, dff)
self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6)
self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6)
self.dropout1 = nn.Dropout(rate)
self.dropout2 = nn.Dropout(rate)
def forward(
self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False
):
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention(
normed,
normed,
normed,
mask,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output)
out1 = x + attn_output
out2 = self.layernorm2(out1)
ffn_output = self.ffn(out2)
ffn_output = self.dropout2(ffn_output)
out2 = out1 + ffn_output
outputs = (out2,) + attn_outputs[1:]
return outputs
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
)
class CTRLModel(CTRLPreTrainedModel):
"""CTRL 模型类,继承自 CTRLPreTrainedModel。用于生成原始隐藏状态,没有特定的输出头部。"""
def __init__(self, config):
"""CTRL 模型的初始化函数。
Args:
config (`CTRLConfig`): 包含模型所有参数的配置类对象。
通过配置文件初始化模型时不会加载与模型关联的权重,只加载配置。
可以查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型权重。
"""
super().__init__(config)
self.d_model_size = config.n_embd
self.num_layers = config.n_layer
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)
self.w = nn.Embedding(config.vocab_size, config.n_embd)
self.dropout = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList(
[EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop) for _ in range(config.n_layer)]
)
self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.post_init()
def get_input_embeddings(self):
"""返回输入词嵌入层 `w`。"""
return self.w
def set_input_embeddings(self, new_embeddings):
self.w = new_embeddings
def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.h[layer].multi_head_attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
CTRL_START_DOCSTRING,
)
"""
定义了一个带有语言建模头部的CTRL模型变换器。语言建模头部是一个线性层,其权重与输入的嵌入层相绑定。
"""
class CTRLLMHeadModel(CTRLPreTrainedModel):
"""
CTRL语言模型的头部模型,继承自CTRL预训练模型。
"""
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
"""
初始化函数,接受一个配置参数config,并调用父类的初始化函数。
创建了CTRL模型的transformer部分和语言建模头部的线性层。
"""
super().__init__(config)
self.transformer = CTRLModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
self.post_init()
def get_output_embeddings(self):
"""
返回语言建模头部的嵌入层。
"""
return self.lm_head
def set_output_embeddings(self, new_embeddings):
"""
设置语言建模头部的新嵌入层。
"""
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
"""
根据输入准备生成过程中的输入。
如果past_key_values不为None,则只保留输入ids的最后一个token。
返回一个包含输入信息的字典。
"""
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播函数,接受多种输入参数,执行CTRL模型的前向计算。
返回一个CausalLMOutputWithPast对象,其中包含模型的输出和过去的关键值。
"""
pass
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
静态方法,用于重新排序past_key_values缓存,以匹配每个生成步骤的正确beam_idx。
返回重新排序后的past_key_values。
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"""
The CTRL Model transformer with a sequence classification head on top (linear layer).
[`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
value in each row of the batch).
"""
CTRL_START_DOCSTRING,
)
class CTRLForSequenceClassification(CTRLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = CTRLModel(config)
self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\ctrl\modeling_tf_ctrl.py
""" TF 2.0 CTRL model."""
from __future__ import annotations
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ctrl import CTRLConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/ctrl"
_CONFIG_FOR_DOC = "CTRLConfig"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/ctrl"
]
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model_size)
return pos * angle_rates
def positional_encoding(position, d_model_size):
angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
sines = np.sin(angle_rads[:, 0::2])
cosines = np.cos(angle_rads[:, 1::2])
pos_encoding = tf.convert_to_tensor(np.concatenate([sines, cosines], axis=-1))
return pos_encoding
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
matmul_qk = tf.matmul(q, k, transpose_b=True)
dk = tf.cast(shape_list(k)[-1], dtype=matmul_qk.dtype)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
scaled_attention_logits += tf.cast(mask * -1e4, dtype=scaled_attention_logits.dtype)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype)
scaled_attention_logits = scaled_attention_logits + attention_mask
attention_weights = stable_softmax(scaled_attention_logits, axis=-1)
if head_mask is not None:
attention_weights = attention_weights * head_mask
output = tf.matmul(attention_weights, v)
return output, attention_weights
class TFMultiHeadAttention(keras.layers.Layer):
def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
super().__init__(**kwargs)
self.num_heads = num_heads
self.d_model_size = d_model_size
self.output_attentions = output_attentions
self.depth = int(d_model_size / self.num_heads)
self.Wq = keras.layers.Dense(d_model_size, name="Wq")
self.Wk = keras.layers.Dense(d_model_size, name="Wk")
self.Wv = keras.layers.Dense(d_model_size, name="Wv")
self.dense = keras.layers.Dense(d_model_size, name="dense")
def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
batch_size = shape_list(q)[0]
q = self.Wq(q)
k = self.Wk(k)
v = self.Wv(v)
q = self.split_into_heads(q, batch_size)
k = self.split_into_heads(k, batch_size)
v = self.split_into_heads(v, batch_size)
if layer_past is not None:
past_key, past_value = tf.unstack(layer_past, axis=0)
k = tf.concat((past_key, k), axis=-2)
v = tf.concat((past_value, v), axis=-2)
if use_cache:
present = tf.stack((k, v), axis=0)
else:
present = (None,)
output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
attn = output[1]
original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
output = self.dense(original_size_attention)
outputs = (output, present)
if output_attentions:
outputs = outputs + (attn,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "Wq", None) is not None:
with tf.name_scope(self.Wq.name):
self.Wq.build([None, None, self.d_model_size])
if getattr(self, "Wk", None) is not None:
with tf.name_scope(self.Wk.name):
self.Wk.build([None, None, self.d_model_size])
if getattr(self, "Wv", None) is not None:
with tf.name_scope(self.Wv.name):
self.Wv.build([None, None, self.d_model_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.d_model_size])
def __init__(self, d_model_size, dff, **kwargs):
super().__init__(**kwargs)
self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = keras.layers.Dense(d_model_size, name="2")
self.d_model_size = d_model_size
self.dff = dff
def call(self, inputs, trainable=False):
dense_0_output = self.dense_0(inputs)
dense_2_output = self.dense_2(dense_0_output)
return dense_2_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense_0", None) is not None:
with tf.name_scope(self.dense_0.name):
self.dense_0.build([None, None, self.d_model_size])
if getattr(self, "dense_2", None) is not None:
with tf.name_scope(self.dense_2.name):
self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(keras.layers.Layer):
def __init__(
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
):
super().__init__(**kwargs)
self.output_attentions = output_attentions
self.multi_head_attention = TFMultiHeadAttention(
d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention"
)
self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
self.d_model_size = d_model_size
def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention(
normed,
normed,
normed,
mask,
layer_past,
attention_mask,
head_mask,
use_cache,
output_attentions,
training=training,
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output, training=training)
out1 = x + attn_output
out2 = self.layernorm2(out1)
ffn_output = self.ffn(out2)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = out1 + ffn_output
outputs = (out2,) + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "multi_head_attention", None) is not None:
with tf.name_scope(self.multi_head_attention.name):
self.multi_head_attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "layernorm1", None) is not None:
with tf.name_scope(self.layernorm1.name):
self.layernorm1.build([None, None, self.d_model_size])
if getattr(self, "layernorm2", None) is not None:
with tf.name_scope(self.layernorm2.name):
self.layernorm2.build([None, None, self.d_model_size])
@keras_serializable
class TFCTRLMainLayer(keras.layers.Layer):
config_class = CTRLConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.use_cache = config.use_cache
self.return_dict = config.use_return_dict
self.d_model_size = config.n_embd
self.num_layers = config.n_layer
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
self.w = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name="w",
)
self.dropout = keras.layers.Dropout(config.embd_pdrop)
self.h = [
TFEncoderLayer(
config.n_embd,
config.n_head,
config.dff,
config.resid_pdrop,
config.layer_norm_epsilon,
self.output_attentions,
name=f"h_._{i}",
)
for i in range(config.n_layer)
]
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self):
return self.w
def set_input_embeddings(self, new_embeddings):
self.w = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
...
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "w", None) is not None:
with tf.name_scope(self.w.name):
self.w.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.n_embd])
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
class TFCTRLPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CTRLConfig
base_model_prefix = "transformer"
CTRL_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CTRL_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
)
class TFCTRLModel(TFCTRLPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutputWithPast]:
outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFCTRLBiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
def __init__(self, shape, initializer, trainable, name, **kwargs):
super().__init__(name=name, **kwargs)
self.shape = shape
self.initializer = initializer
self.trainable = trainable
def build(self, input_shape):
self.bias = self.add_weight(
name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable
)
super().build(input_shape)
def call(self, x):
return x + self.bias
@add_start_docstrings(
"""
The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
CTRL_START_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.bias_layer = TFCTRLBiasLayer(
name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True
)
def get_output_embeddings(self):
return self.get_input_embeddings()
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
def get_bias(self):
return {"lm_head.bias": self.bias_layer.bias}
def set_bias(self, value):
vocab_size = value["lm_head.bias"].shape[-1]
self.bias_layer = TFCTRLBiasLayer(
name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True
)
self.bias_layer.build(None)
self.bias_layer.bias.assign(value["lm_head.bias"])
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
if token_type_ids is not None:
token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
position_ids = kwargs.get("position_ids", None)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None and position_ids is None:
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
if past_key_values:
position_ids = tf.expand_dims(position_ids[:, -1], -1)
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"token_type_ids": token_type_ids,
}
@unpack_inputs
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFCausalLMOutputWithPast]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True)
logits = self.bias_layer(logits)
loss = None
if labels is not None:
shifted_logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.hf_compute_loss(labels, shifted_logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@add_start_docstrings(
"""
The CTRL Model transformer with a sequence classification head on top (linear layer).
[`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1, GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
CTRL_START_DOCSTRING,
)
class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.classifier = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
use_bias=False,
)
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.config = config
def get_output_embeddings(self):
logger.warning(
"Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
"in transformers v4.32."
)
return self.transformer.w
@unpack_inputs
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
):
pass
) -> Union[Tuple, TFSequenceClassifierOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
logits = self.classifier(hidden_states)
in_logits = None
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
loss = None
if labels is not None:
if input_ids is not None:
batch_size, sequence_length = shape_list(input_ids)[:2]
else:
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
if self.config.pad_token_id is None and batch_size != 1:
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0:batch_size, sequence_lengths]
loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
pooled_logits = in_logits if in_logits is not None else logits
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=pooled_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.n_embd])
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
.\models\ctrl\tokenization_ctrl.py
"""Tokenization classes for Salesforce CTRL."""
import json
import os
from typing import Optional, Tuple
import regex as re
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
"merges_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"Salesforce/ctrl": 256,
}
CONTROL_CODES = {
"Pregnancy": 168629,
"Christianity": 7675,
"Explain": 106423,
"Fitness": 63440,
"Saving": 63163,
"Ask": 27171,
"Ass": 95985,
"Joke": 163509,
"Questions": 45622,
"Thoughts": 49605,
"Retail": 52342,
"Feminism": 164338,
"Writing": 11992,
"Atheism": 192263,
"Netflix": 48616,
"Computing": 39639,
"Opinion": 43213,
"Alone": 44967,
"Funny": 58917,
"Gaming": 40358,
"Human": 4088,
"India": 1331,
"Joker": 77138,
"Diet": 36206,
"Legal": 11859,
"Norman": 4939,
"Tip": 72689,
"Weight": 52343,
"Movies": 46273,
"Running": 23425,
"Science": 2090,
"Horror": 37793,
"Confession": 60572,
"Finance": 12250,
"Politics": 16360,
"Scary": 191985,
"Support": 12654,
"Technologies": 32516,
"Teenage": 66160,
"Event": 32769,
"Learned": 67460,
"Notion": 182770,
"Wikipedia": 37583,
"Books": 6665,
"Extract": 76050,
"Confessions": 102701,
"Conspiracy": 75932,
"Links": 63674,
"Narcissus": 150425,
"Relationship": 54766,
"Relationships": 134796,
"Reviews": 41671,
"News": 4256,
"Translation": 26820,
"multilingual": 128406,
}
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
pairs = set(pairs)
return pairs
class CTRLTokenizer(PreTrainedTokenizer):
"""
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
构造一个CTRL分词器,基于字节对编码(Byte-Pair-Encoding)。
"""
"""
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
control_codes = CONTROL_CODES
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, **kwargs)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
return word
def _tokenize(self, text):
split_tokens = []
words = re.findall(r"\S+\n?", text)
for token in words:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file