Transformers 源码解析(三十六)
.\models\deprecated\mmbt\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_mmbt": ["MMBTConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mmbt"] = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]
if TYPE_CHECKING:
from .configuration_mmbt import MMBTConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\open_llama\configuration_open_llama.py
""" Open-Llama model configuration"""
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"s-JoL/Open-Llama-V1": "https://huggingface.co/s-JoL/Open-Llama-V1/blob/main/config.json",
}
class OpenLlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an
Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the
[s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Open-Llama 模型的词汇表大小。定义可以表示的不同标记数量,当调用 [`OpenLlamaModel`] 时传递 `inputs_ids`。
hidden_size (`int`, *optional*, defaults to 4096):
隐藏表示的维度。
intermediate_size (`int`, *optional*, defaults to 11008):
MLP 表示的维度。
num_hidden_layers (`int`, *optional*, defaults to 32):
Transformer 编码器中的隐藏层数量。
num_attention_heads (`int`, *optional*, defaults to 32):
Transformer 编码器中每个 attention 层的注意力头数量。
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
解码器中的非线性激活函数(函数或字符串)。
max_position_embeddings (`int`, *optional*, defaults to 2048):
此模型可能使用的最大序列长度。通常设置一个较大的值(例如 512、1024 或 2048)以防万一。
initializer_range (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准偏差。
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
rms 归一化层使用的 epsilon。
use_cache (`bool`, *optional*, defaults to `True`):
模型是否应返回最后的关键/值注意力(并非所有模型都使用)。只在 `config.is_decoder=True` 时相关。
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
是否绑定权重嵌入
rope_scaling (`Dict`, *optional*):
包含 RoPE 嵌入的缩放配置的字典。目前支持两种缩放策略:线性和动态。它们的缩放因子必须是大于1的浮点数。
期望格式是 `{"type": 策略名称, "factor": 缩放因子}`。在使用此标志时,不更新 `max_position_embeddings` 到预期的新最大值。
更多关于这些缩放策略行为的信息,请参阅以下主题:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/。这是一个实验性功能,可能在未来版本中发生重大 API 更改。
Example:
```
>>> from transformers import OpenLlamaModel, OpenLlamaConfig
>>>
>>> configuration = OpenLlamaConfig()
>>>
>>> model = OpenLlamaModel(configuration)
>>>
>>> configuration = model.config
```"""
# 设定模型类型为 "open-llama"
model_type = "open-llama"
# 定义初始化方法,接受多个配置参数
def __init__(
self,
vocab_size=100000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
use_memory_efficient_attention=True,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
rope_scaling=None,
**kwargs,
):
# 初始化实例变量
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
# 设置注意事项中的 "use_memory_efficient_attention" 参数,若未提供则使用默认值
self.use_memory_efficient_attention = kwargs.pop(
"use_memorry_efficient_attention", use_memory_efficient_attention
)
# 设置隐藏层和注意力层的 dropout 概率
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_dropout_prob = attention_dropout_prob
# 稳定嵌入使用标志
self.use_stable_embedding = use_stable_embedding
# 共享输入输出嵌入标志
self.shared_input_output_embedding = shared_input_output_embedding
# "rope_scaling" 是可选参数,用于某些定制操作
self.rope_scaling = rope_scaling
# 调用内部方法,验证 "rope_scaling" 参数的有效性
self._rope_scaling_validation()
# 调用父类的初始化方法,传递必要的参数和可能的其他关键字参数
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
# 以下注释标识来自于 transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
# (此处应该有 _rope_scaling_validation 方法的具体实现,但在这段代码中未提供)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
# 检查是否设置了 `rope_scaling`,如果没有设置则直接返回
if self.rope_scaling is None:
return
# 检查 `rope_scaling` 是否为字典类型且包含两个字段
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
# 如果不是符合要求的字典类型,则抛出数值错误异常
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
# 获取 `rope_scaling` 字典中的 `type` 和 `factor` 字段
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
# 检查 `type` 字段是否为 `linear` 或 `dynamic`
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
# 如果不是预期的类型,则抛出数值错误异常
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
# 检查 `factor` 字段是否为大于 1 的浮点数
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
# 如果不是预期的浮点数或者小于等于 1,则抛出数值错误异常
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\deprecated\open_llama\modeling_open_llama.py
""" PyTorch Open-Llama 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....activations import ACT2FN
from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ....modeling_utils import PreTrainedModel
from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_open_llama import OpenLlamaConfig
logger = logging.get_logger(__name__)
try:
from xformers import ops as xops
except ImportError:
xops = None
_CONFIG_FOR_DOC = "OpenLlamaConfig"
class OpenLlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
OpenLlamaRMSNorm 等效于 T5LayerNorm。
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
class OpenLlamaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
"""OpenLlamaRotaryEmbedding扩展了线性缩放。鸣谢Reddit用户/u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
"""OpenLlamaRotaryEmbedding扩展了动态NTK缩放。鸣谢Reddit用户/u/bloc97和/u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""旋转输入的一半隐藏维度。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class OpenLlamaMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
dropout_prob: float,
):
"""Initialize the OpenLlamaMLP module.
Args:
hidden_size (int): The size of the input and output hidden layers.
intermediate_size (int): The size of the intermediate layer.
hidden_act (str): The activation function to be used in the hidden layers.
dropout_prob (float): The dropout probability for regularization.
"""
super().__init__()
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.act_fn = ACT2FN[hidden_act]
self.dropout = nn.Dropout(dropout_prob)
def forward(self, x):
out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return self.dropout(out)
class OpenLlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: OpenLlamaConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings
self.dropout_prob = config.attention_dropout_prob
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = OpenLlamaRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
class OpenLlamaDecoderLayer(nn.Module):
def __init__(self, config: OpenLlamaConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = OpenLlamaAttention(config=config)
self.mlp = OpenLlamaMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
dropout_prob=config.hidden_dropout_prob,
)
self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
OPEN_LLAMA_START_DOCSTRING = r"""
# 这个模型继承自`PreTrainedModel`。查看超类文档可以了解到库实现的通用方法,例如下载或保存模型、调整输入嵌入大小、修剪头部等。
# 这个模型也是一个PyTorch的`torch.nn.Module`子类。可以像使用常规PyTorch模块一样使用它,并参考PyTorch文档了解有关一般用法和行为的所有内容。
# 参数:
# config ([`OpenLlamaConfig`]):
# 包含模型所有参数的模型配置类。使用配置文件初始化模型不会加载与模型关联的权重,仅加载配置。查看[`~PreTrainedModel.from_pretrained`]方法以加载模型权重。
"""
"""
@add_start_docstrings(
"The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
OPEN_LLAMA_START_DOCSTRING,
)
class OpenLlamaPreTrainedModel(PreTrainedModel):
# 使用特定的文档字符串为类添加描述
config_class = OpenLlamaConfig
# 模型中基础模型的名称前缀
base_model_prefix = "model"
# 支持梯度检查点的标志
supports_gradient_checkpointing = True
# 不需要分割的模块名称列表
_no_split_modules = ["OpenLlamaDecoderLayer"]
def _init_weights(self, module):
# 初始化模型权重的函数
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
if self.config.use_stable_embedding:
torch.nn.init.xavier_normal_(module.weight.data)
else:
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
OPEN_LLAMA_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
OPEN_LLAMA_START_DOCSTRING,
)
class OpenLlamaModel(OpenLlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`]
Args:
config: OpenLlamaConfig
"""
def __init__(self, config: OpenLlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
# 词嵌入层,根据配置初始化不同的稳定或非稳定嵌入方式
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
if config.use_stable_embedding:
self.embed_layer_norm = nn.LayerNorm(config.hidden_size)
else:
self.embed_layer_norm = None
# 使用配置中的层数初始化解码器层列表
self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
# 使用自定义的 RMS 标准化器
self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# 初始化权重并应用最终处理
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# OpenLlamaForCausalLM 类的 forward 方法尚未完全添加
# 初始化函数,接受一个配置对象作为参数
def __init__(self, config):
# 调用父类的初始化方法,传入配置对象
super().__init__(config)
# 使用配置对象创建一个 OpenLlamaModel 实例,并赋值给 self.model
self.model = OpenLlamaModel(config)
# 根据配置参数决定是否创建 lm_head 层
if config.shared_input_output_embedding:
self.lm_head = None # 如果配置中设置共享输入输出嵌入,则 lm_head 为 None
else:
# 否则,创建一个线性层,将隐藏大小为 config.hidden_size,输出大小为 config.vocab_size,无偏置
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 初始化权重并应用最终处理
self.post_init()
# 返回模型的输入嵌入层
def get_input_embeddings(self):
return self.model.embed_tokens
# 设置模型的输入嵌入层
def set_input_embeddings(self, value):
self.model.embed_tokens = value
# 返回模型的输出嵌入层(lm_head)
def get_output_embeddings(self):
return self.lm_head
# 设置模型的输出嵌入层(lm_head)
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 设置模型的解码器
def set_decoder(self, decoder):
self.model = decoder
# 返回模型的解码器
def get_decoder(self):
return self.model
# 前向传播函数,用装饰器添加了文档字符串和返回值替换
@add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 前向传播函数具体实现在后续的方法和类中定义,此处不直接实现
# 为生成准备输入的方法,接受多个参数,包括 input_ids, past_key_values, attention_mask, inputs_embeds
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
# 具体的输入准备过程在后续的方法和类中定义,此处不直接实现
):
# 如果之前的键值对不为 None,则获取其长度
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 一些生成方法可能只传递了最后一个输入 ID
if input_ids.shape[1] > past_length:
# 移除前缀的长度设为过去长度
remove_prefix_length = past_length
else:
# 默认旧行为:仅保留最后一个 ID
remove_prefix_length = input_ids.shape[1] - 1
# 更新输入 IDs 为移除前缀后的部分
input_ids = input_ids[:, remove_prefix_length:]
# 从 kwargs 中获取位置 IDs
position_ids = kwargs.get("position_ids", None)
# 如果存在注意力遮罩但没有位置 IDs
if attention_mask is not None and position_ids is None:
# 为批量生成动态创建位置 IDs
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
# 如果存在过去的键值对,则仅保留与输入 IDs 形状相匹配的位置 IDs
position_ids = position_ids[:, -input_ids.shape[1] :]
# 如果传递了 `inputs_embeds`,则仅在第一个生成步骤中使用它们
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# 更新 model_inputs 字典
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
# 重新排序过去的键值对
for layer_past in past_key_values:
reordered_past += (
# 对每个层的过去状态根据 beam_idx 重新排序
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
# 使用装饰器添加文档字符串到类上,描述LLaMa模型带有顶部序列分类头部的转换器
# 这里是使用了OpenLlamaForSequenceClassification类,它在顺序分类时使用最后一个token,类似于其他因果模型(如GPT-2)的做法。
# 根据配置的pad_token_id,确定最后一个非填充token的位置以进行分类。如果未定义pad_token_id,则直接取批次中每行的最后一个值。
# 当传入inputs_embeds而非input_ids时,无法猜测填充token,因此同样采用这种方式(取批次中每行的最后一个值)。
class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
def __init__(self, config):
# 调用父类构造函数,传入配置
super().__init__(config)
# 设置类别数目
self.num_labels = config.num_labels
# 初始化OpenLlama模型
self.model = OpenLlamaModel(config)
# 定义线性层,将隐藏状态映射到类别数目,无偏置
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入嵌入
def get_input_embeddings(self):
return self.model.embed_tokens
# 设置输入嵌入
def set_input_embeddings(self, value):
self.model.embed_tokens = value
# 使用装饰器添加文档字符串到模型前向传播函数上,描述输入参数的文档字符串
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\deprecated\open_llama\__init__.py
from typing import TYPE_CHECKING
from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_open_llama": ["OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenLlamaConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_open_llama"] = ["LlamaTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_open_llama_fast"] = ["LlamaTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_open_llama"] = [
"OpenLlamaForCausalLM",
"OpenLlamaModel",
"OpenLlamaPreTrainedModel",
"OpenLlamaForSequenceClassification",
]
if TYPE_CHECKING:
from .configuration_open_llama import OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenLlamaConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from transformers import LlamaTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from transformers import LlamaTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_open_llama import (
OpenLlamaForCausalLM,
OpenLlamaForSequenceClassification,
OpenLlamaModel,
OpenLlamaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\retribert\configuration_retribert.py
""" RetriBERT model configuration"""
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/config.json"
),
}
class RetriBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used to instantiate a
RetriBertModel model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the RetriBERT
[yjernite/retribert-base-uncased](https://huggingface.co/yjernite/retribert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "retribert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=8,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
share_encoders=True,
projection_dim=128,
pad_token_id=0,
**kwargs,
):
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.share_encoders = share_encoders
self.projection_dim = projection_dim
.\models\deprecated\retribert\modeling_retribert.py
"""
RetriBERT model
"""
import math
from typing import Optional
import torch
import torch.utils.checkpoint as checkpoint
from torch import nn
from ....modeling_utils import PreTrainedModel
from ....utils import add_start_docstrings, logging
from ...bert.modeling_bert import BertModel
from .configuration_retribert import RetriBertConfig
logger = logging.get_logger(__name__)
RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"yjernite/retribert-base-uncased",
]
class RetriBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RetriBertConfig
load_tf_weights = None
base_model_prefix = "retribert"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
RETRIBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Parameters:
config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"""Bert Based model to embed queries or document for document retrieval.""",
RETRIBERT_START_DOCSTRING,
)
"""
class RetriBertModel(RetriBertPreTrainedModel):
def __init__(self, config: RetriBertConfig) -> None:
super().__init__(config)
self.projection_dim = config.projection_dim
self.bert_query = BertModel(config)
self.bert_doc = None if config.share_encoders else BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.project_query = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
self.project_doc = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
self.ce_loss = nn.CrossEntropyLoss(reduction="mean")
self.post_init()
def embed_sentences_checkpointed(
self,
input_ids,
attention_mask,
sent_encoder,
checkpoint_batch_size=-1,
):
if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
return sent_encoder(input_ids, attention_mask=attention_mask)[1]
else:
device = input_ids.device
input_shape = input_ids.size()
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
head_mask = [None] * sent_encoder.config.num_hidden_layers
extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
attention_mask, input_shape
)
def partial_encode(*inputs):
encoder_outputs = sent_encoder.encoder(
inputs[0],
attention_mask=inputs[1],
head_mask=head_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = sent_encoder.pooler(sequence_output)
return pooled_output
embedding_output = sent_encoder.embeddings(
input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None
)
pooled_output_list = []
for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)):
b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask)
pooled_output_list.append(pooled_output)
return torch.cat(pooled_output_list, dim=0)
def embed_questions(
self,
input_ids,
attention_mask=None,
checkpoint_batch_size=-1,
):
q_reps = self.embed_sentences_checkpointed(
input_ids,
attention_mask,
self.bert_query,
checkpoint_batch_size,
)
return self.project_query(q_reps)
def embed_answers(
self,
input_ids,
attention_mask=None,
checkpoint_batch_size=-1,
):
a_reps = self.embed_sentences_checkpointed(
input_ids,
attention_mask,
self.bert_query if self.bert_doc is None else self.bert_doc,
checkpoint_batch_size,
)
return self.project_doc(a_reps)
) -> torch.FloatTensor:
r"""
Args:
input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the queries in a batch.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the documents in a batch.
attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on documents padding token indices.
checkpoint_batch_size (`int`, *optional*, defaults to `-1`):
If greater than 0, uses gradient checkpointing to only compute sequence representation on
`checkpoint_batch_size` examples at a time on the GPU. All query representations are still compared to
all document representations in the batch.
Return:
`torch.FloatTensor``: The bidirectional cross-entropy loss obtained while trying to match each query to its
corresponding document and each document to its corresponding query in the batch
"""
device = input_ids_query.device
q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
a_reps = self.embed_answers(input_ids_doc, attention_mask_doc, checkpoint_batch_size)
compare_scores = torch.mm(q_reps, a_reps.t())
loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device))
loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device))
loss = (loss_qa + loss_aq) / 2
return loss
.\models\deprecated\retribert\tokenization_retribert.py
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"yjernite/retribert-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"yjernite/retribert-base-uncased": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class RetriBertTokenizer(PreTrainedTokenizer):
r"""
Constructs a RetriBERT tokenizer.
[`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting
and wordpiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
to: this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. It adds the special tokens according
to the BERT-like model requirements.
Args:
token_ids_0 (`List[int]`):
List of IDs corresponding to the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for BERT.
Returns:
`List[int]`: List of `1` and `0`, with `1` indicating a special token, `0` indicating a regular token.
"""
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
构造一个BasicTokenizer对象,用于运行基本的分词操作(如分割标点符号、转换为小写等)。
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在分词时将输入文本转换为小写。
never_split (`Iterable`, *optional*):
在分词时永远不会被分割的标记集合。仅在`do_basic_tokenize=True`时生效。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日语,这应该被禁用(参见这个
[问题](https://github.com/huggingface/transformers/issues/328))。
strip_accents (`bool`, *optional*):
是否去除所有的重音符号。如果未指定此选项,则由`lowercase`的值决定(与原始BERT相同)。
do_split_on_punc (`bool`, *optional*, defaults to `True`):
在某些情况下,我们希望跳过基本的标点分割,以便后续的分词可以捕捉到完整的词语上下文,比如缩写词。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
.\models\deprecated\retribert\tokenization_retribert_fast.py
"""RetriBERT 的分词类。"""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ....tokenization_utils_fast import PreTrainedTokenizerFast
from ....utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"yjernite/retribert-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"yjernite/retribert-base-uncased": {"do_lower_case": True},
}
class RetriBertTokenizerFast(PreTrainedTokenizerFast):
r"""
构建一个“快速”RetriBERT 分词器(基于 HuggingFace 的 *tokenizers* 库)。
[`RetriBertTokenizerFast`] 与 [`BertTokenizerFast`] 相同,并且支持端到端的分词:标点符号拆分和 wordpiece。
此分词器继承自 [`PreTrainedTokenizerFast`],其中包含大多数主要方法。用户应参考此超类获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
# 以下代码段是变量声明,用于配置和初始化BERT类型的分词器的各种设置和参数
# 文件名列表,指定了与模型相关的词汇表文件名
vocab_files_names = VOCAB_FILES_NAMES
# 预训练模型的词汇表文件映射,指定了不同预训练模型的词汇表文件路径
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 预训练模型输入的最大长度,指定了不同预训练模型的最大输入长度
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 预训练模型的初始化配置,包含了不同预训练模型的初始化参数
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# 用于慢速分词器的类,指定了用于BERT类型模型的慢速分词器类
slow_tokenizer_class = RetriBertTokenizer
# 模型输入名称列表,指定了模型的输入ID和注意力掩码
model_input_names = ["input_ids", "attention_mask"]
# 以下代码段的功能与transformers库中的BertTokenizerFast.__init__方法相同,但未完全展示
# 初始化方法,用于创建一个新的实例对象
def __init__(
self,
vocab_file=None, # 词汇表文件路径,默认为None
tokenizer_file=None, # 分词器文件路径,默认为None
do_lower_case=True, # 是否将输入文本转换为小写,默认为True
unk_token="[UNK]", # 未知标记的字符串表示,默认为"[UNK]"
sep_token="[SEP]", # 分隔标记的字符串表示,默认为"[SEP]"
pad_token="[PAD]", # 填充标记的字符串表示,默认为"[PAD]"
cls_token="[CLS]", # 类别标记的字符串表示,默认为"[CLS]"
mask_token="[MASK]", # 掩码标记的字符串表示,默认为"[MASK]"
tokenize_chinese_chars=True, # 是否分词中文字符,默认为True
strip_accents=None, # 是否去除重音,默认为None
**kwargs, # 其他关键字参数
):
# 调用父类的初始化方法
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# 获取后端分词器的正常化状态
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# 检查正常化状态是否与当前初始化参数匹配,若不匹配则更新分词器的正常化类
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# 设置实例对象的小写处理状态
self.do_lower_case = do_lower_case
# 从给定的token_ids_0和可选的token_ids_1构建包含特殊标记的模型输入序列,用于序列分类任务
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
# 构建包含特殊标记的输入序列,以用于模型输入
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# 若存在第二个token_ids_1,则构建双序列的输入格式
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
# 根据token_ids_0和可选的token_ids_1创建token类型ID序列
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def create_bert_seq_classification_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of token IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of token IDs for the second sequence in sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
# Define the separator and classification tokens
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If there is no second sequence, return a mask with all zeros for the first sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return a mask that identifies the token type IDs for both sequences
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to files in the specified directory.
Args:
save_directory (str):
Directory where vocabulary files will be saved.
filename_prefix (str, *optional*):
Optional prefix for the saved vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
"""
# Save the vocabulary files using the underlying tokenizer model
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\deprecated\retribert\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig"],
"tokenization_retribert": ["RetriBertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_retribert_fast"] = ["RetriBertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_retribert"] = [
"RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"RetriBertModel",
"RetriBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
from .tokenization_retribert import RetriBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_retribert_fast import RetriBertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_retribert import (
RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
RetriBertModel,
RetriBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\tapex\tokenization_tapex.py
"""TAPEX 的标记类。"""
import json
import os
import random
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
import regex as re
from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
from ....tokenization_utils import AddedToken, PreTrainedTokenizer
from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
from ....utils import logging
if is_pandas_available():
import pandas as pd
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/vocab.json",
},
"merges_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/tapex-base": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/tapex-base": {"do_lower_case": True},
}
class TapexTruncationStrategy(ExplicitEnum):
"""
[`~TapasTokenizer.__call__`] 的 `truncation` 参数的可能取值。在 IDE 中进行代码补全时非常有用。
"""
DROP_ROWS_TO_FIT = "drop_rows_to_fit"
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其对应的 unicode 字符映射。我们特别避免映射到空格/控制字符,以免引起 BPE 编码错误。
可逆的 BPE 编码工作在 unicode 字符串上。这意味着如果要避免 UNK 标记,词汇表中需要大量的 unicode 字符。
当处理类似于 10B 令牌的数据集时,您大约需要 5K 个字符才能实现良好的覆盖率。这在常规的 32K BPE 词汇表中占据了相当大的比例。
为了避免这种情况,我们希望在 utf-8 字节和 unicode 字符串之间建立查找表。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
返回单词中的符号对集合。单词被表示为符号元组(符号是可变长度的字符串)。
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class IndexedRowTableLinearize:
"""
FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
"""
def process_table(self, table_content: Dict):
"""
Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE
table_str = self.process_header(table_content["header"]) + " "
for i, row_example in enumerate(table_content["rows"]):
table_str += self.process_row(row_example, row_index=i + 1) + " "
return table_str.strip()
def process_header(self, headers: List):
"""
Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
return "col : " + " | ".join(headers)
def process_row(self, row: List, row_index: int):
"""
Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
row_str = ""
row_cell_values = []
for cell_value in row:
if isinstance(cell_value, int):
row_cell_values.append(str(cell_value))
else:
row_cell_values.append(cell_value)
row_str += " | ".join(row_cell_values)
return "row " + str(row_index) + " : " + row_str
class TapexTokenizer(PreTrainedTokenizer):
r"""
Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
the tokenizer for instance to prepare them for the model.
Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
def __init__(
vocab_file: str,
merges_file: str,
do_lower_case: bool = True,
errors: str = "replace",
bos_token: str = "<s>",
eos_token: str = "</s>",
sep_token: str = "</s>",
cls_token: str = "<s>",
unk_token: str = "<unk>",
pad_token: str = "<pad>",
mask_token: str = "<mask>",
add_prefix_space: bool = False,
max_cell_length: int = 15
):
"""
Args:
vocab_file (`str`):
词汇文件的路径。
merges_file (`str`):
合并文件的路径。
do_lower_case (`bool`, *optional*, defaults to `True`):
在分词时是否将输入转换为小写。
errors (`str`, *optional*, defaults to `"replace"`):
解码字节为 UTF-8 时使用的策略。参见 [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) 获取更多信息。
bos_token (`str`, *optional*, defaults to `"<s>"`):
在预训练期间用作序列开头的特殊标记。可用作序列分类器的标记。
<Tip>
当使用特殊标记构建序列时,此处的标记并非序列开头的标记。序列开头的标记是 `cls_token`。
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
用作序列结尾的特殊标记。
<Tip>
当使用特殊标记构建序列时,此处的标记并非序列结尾的标记。序列结尾的标记是 `sep_token`。
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
分隔标记,在构建多序列组合序列时使用,例如用于序列分类或问答任务的文本和问题的组合序列。也用作使用特殊标记构建的序列的最后一个标记。
cls_token (`str`, *optional*, defaults to `"<s>"`):
分类器标记,在进行序列分类(整体序列而非逐标记分类)时使用。在使用特殊标记构建序列时是序列的第一个标记。
unk_token (`str`, *optional*, defaults to `"<unk>"`):
未知标记。词汇表中不存在的标记将被设置为此标记。
pad_token (`str`, *optional*, defaults to `"<pad>"`):
用于填充的标记,例如在批处理不同长度序列时使用。
mask_token (`str`, *optional*, defaults to `"<mask>"`):
用于掩码值的标记。在使用掩码语言建模训练模型时使用。模型将尝试预测此标记。
add_prefix_space (`bool`, *optional*, defaults to `False`):
是否在输入的开头添加空格。这允许将开头的单词视为任何其他单词。 (BART 分词器通过前导空格检测单词的开始)。
max_cell_length (`int`, *optional*, defaults to 15):
线性化表格时每个单元格的最大字符数。如果超过此数字,则进行截断。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
do_lower_case=True,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
max_cell_length=15,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.do_lower_case = do_lower_case
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
do_lower_case=do_lower_case,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
max_cell_length=max_cell_length,
**kwargs,
)
self.max_cell_length = max_cell_length
self.table_linearize = IndexedRowTableLinearize()
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A TAPEX sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs tensor from a list of token ids. This is used for sequence classification tasks where each
sequence pair gets a different token type ID (0 or 1).
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs representing the second sequence in a pair.
Returns:
`List[int]`: A list of token type IDs where each ID corresponds to a token in the input sequences.
"""
) -> List[int]:
"""
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
"""
Args:
text (str): The input text to be tokenized.
is_split_into_words (bool): Whether the input text is already split into words.
**kwargs: Additional keyword arguments.
add_prefix_space (bool): Whether to add a prefix space to the text if necessary.
Returns:
tuple: A tuple containing the modified text and remaining keyword arguments.
"""
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
"""
Args:
token (str): The token to apply BPE encoding.
Returns:
str: The token after BPE encoding.
"""
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""
Tokenize a string using Byte-Pair Encoding (BPE).
Args:
text (str): The input text to tokenize.
Returns:
List[str]: List of tokens after tokenization.
"""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]] = None,
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Union[str, List[str]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several table-sequence pair(s).
Args:
table (`pd.DataFrame`, `List[pd.DataFrame]`):
Table(s) containing tabular data.
query (`str` or `List[str]`, *optional*):
Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
sentences must match the number of tables.
answer (`str` or `List[str]`, *optional*):
Optionally, the corresponding answer to the questions as supervision.
"""
if table is not None:
return self.source_call_func(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
elif answer is not None:
return self.target_call_func(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
raise ValueError("You need to provide either a `table` or an `answer`.")
def source_call_func(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Union[str, List[str]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
valid_table = False
valid_query = False
if isinstance(table, pd.DataFrame):
valid_table = True
elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
valid_table = True
if query is None or isinstance(query, str):
valid_query = True
elif isinstance(query, (list, tuple)):
if len(query) == 0 or isinstance(query[0], str):
valid_query = True
if not valid_table:
raise ValueError(
"table input must of type `pd.DataFrame` (single example), `List[pd.DataFrame]` (batch of examples). "
)
if not valid_query:
raise ValueError("query input must of type `str` (single example), `List[str]` (batch of examples). ")
is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
if is_batched:
return self.batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: List[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: Optional[List[str]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
table = [table] * len(query)
if isinstance(table, (list, tuple)) and isinstance(query, str):
query = [query] * len(table)
batch_outputs = self._batch_prepare_for_model(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Optional[Union[str, List[str]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
This method adds special tokens, truncates sequences if overflowing while taking into account the special
tokens and manages a moving window (with user defined stride) for overflowing tokens.
"""
batch_outputs = {}
if answer is None:
answer = [None] * len(table)
for _table, _query, _answer in zip(table, query, answer):
text = self.prepare_table_query(
_table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
)
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
outputs = self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Prepare a table, a string and possible answer for the model. This method does not return token type IDs,
attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build
your processing on your own, otherwise refer to `__call__`.
"""
encoded_inputs = self.encode_plus(
table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
return encoded_inputs["input_ids"]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
text = self.prepare_table_query(
table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
)
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
return self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
def target_call_func(
self,
answer: Union[str, List[str]],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
The method tokenizes and prepares the answer label for the model.
Args:
answer (`str` or `List[str]`):
Corresponding answer supervision to the queries for training the model.
"""
is_batched = isinstance(answer, (list, tuple))
if is_batched:
return self.target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Prepare answer strings for the model.
Args:
answer `List[str]`:
Corresponding answer supervision to the queries for training the model.
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
"""
Internal method to perform batch encoding of answers.
Args:
answer `List[str]`:
List of answer strings to encode.
add_special_tokens `bool`:
Whether to add special tokens.
padding_strategy `PaddingStrategy`:
Strategy for padding sequences.
truncation_strategy `TruncationStrategy`:
Strategy for truncating sequences.
max_length `Optional[int]`:
Maximum length of the sequences.
stride `int`:
Stride for tokenization.
pad_to_multiple_of `Optional[int]`:
Pad to a multiple of this value.
return_tensors `Optional[Union[str, TensorType]]`:
Optionally return tensors.
return_token_type_ids `Optional[bool]`:
Whether to return token type IDs.
return_attention_mask `Optional[bool]`:
Whether to return attention masks.
return_overflowing_tokens `bool`:
Whether to return overflowing tokens.
return_special_tokens_mask `bool`:
Whether to return special tokens mask.
return_offsets_mapping `bool`:
Whether to return offsets mapping.
return_length `bool`:
Whether to return sequence lengths.
verbose `bool`:
Whether to print verbose information.
**kwargs:
Additional keyword arguments.
Returns:
`BatchEncoding`: Batch encoding containing encoded answers.
"""
pass
) -> BatchEncoding:
batch_outputs = {}
for text in answer:
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
outputs = self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return BatchEncoding(batch_outputs)
def target_encode(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
which are necessary for the model to work correctly. Use this method if you want to build your processing on
your own, otherwise refer to `__call__`.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model
"""
encoded_outputs = self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
return encoded_outputs["input_ids"]
def target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Prepare a answer string for the model.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model.
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
pass
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
text = answer
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
return self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
):
"""
This method can be used to linearize a table and add a corresponding query.
Optionally, it also handles truncation of the table (cells).
An answer can be provided for more precise truncation.
"""
if not table.empty:
table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}
self.truncate_table_cells(table_content, query, answer)
if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
self.truncate_table_rows(table_content, query, answer, max_length=max_length)
linear_table = self.table_linearize.process_table(table_content)
else:
linear_table = ""
if linear_table == "":
logger.warning(
"You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). "
+ f"Please carefully check the corresponding table with the query : {query}."
)
if query == "":
logger.warning("You provide nothing to query with respect to the table.")
separator = " " if query and linear_table else ""
joint_input = (query + separator + linear_table) if query else linear_table
return joint_input
def truncate_table_cells(self, table_content: Dict, question: str, answer: List):
cell_mapping = {}
for row in table_content["rows"]:
for i, cell in enumerate(row):
truncate_cell = self.truncate_cell(cell)
if truncate_cell is not None:
cell_mapping[cell] = truncate_cell
row[i] = truncate_cell
if answer is not None:
for i, case in enumerate(answer):
if case in cell_mapping.keys():
answer[i] = cell_mapping[case]
def truncate_cell(self, cell_value):
if isinstance(cell_value, int) or isinstance(cell_value, float):
return cell_value
if cell_value.strip() != "":
try_tokens = self.tokenize(cell_value)
if len(try_tokens) >= self.max_cell_length:
retain_tokens = try_tokens[: self.max_cell_length]
retain_cell_value = self.convert_tokens_to_string(retain_tokens)
return retain_cell_value
else:
return None
else:
return cell_value
def truncate_table_rows(
self, table_content: Dict, question: str, answer: Optional[Union[str, List[str]]] = None, max_length=None
):
"""
Args:
table_content:
{"header": xxx, "rows": xxx, "id" (Optionally): xxx}
question:
natural language sentence
answer:
if for training, is the supervision; otherwise will be empty
"""
delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
maximum_keep_rows = 0
for ind, row_example in enumerate(table_content["rows"]):
value_string = self.table_linearize.process_row(row_example, ind + 1)
value_token_len = len(self.tokenize(value_string))
if value_token_len > remain_token_len:
break
remain_token_len -= value_token_len
maximum_keep_rows += 1
del table_content["rows"][maximum_keep_rows:]
def estimate_delete_ratio(self, table_content: Dict, question: str, max_length=None):
if "header" not in table_content or "rows" not in table_content:
raise ValueError("The table content should contain both 'header' and 'rows' keys.")
question_tokens = self.tokenize(question, add_special_tokens=True)
header_string = self.table_linearize.process_header(table_content["header"])
header_tokens = self.tokenize(header_string, add_special_tokens=False)
used_token_len = len(question_tokens) + len(header_tokens)
remain_token_len = max_length - used_token_len
value_string = ""
for _, row_example in enumerate(table_content["rows"]):
value_string += self.table_linearize.process_row(row_example, 100) + " "
value_token_len = len(self.tokenize(value_string))
if value_token_len < remain_token_len:
return 0.0, remain_token_len
else:
return 1.0 - remain_token_len / value_token_len, remain_token_len
def delete_unrelated_rows(self, table_content: Dict, question: str, answer: List, delete_ratio: float):
"""
The argument answer is used only during training.
参数 answer 仅在训练过程中使用。
"""
truncated_unrelated_indices = []
related_indices = []
if answer is None or len(answer) == 0:
answer_set = set()
else:
answer_set = {ans_ex.lower() for ans_ex in answer}
if question is not None:
answer_set.update(question.split())
question_set = set(question.strip("?!.,").split(" "))
row_max_len = len(table_content["rows"])
for _row_idx, row in enumerate(table_content["rows"]):
lower_row = {str(cell).lower() for cell in row}
if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
truncated_unrelated_indices.append(_row_idx)
else:
related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])
truncated_unrelated_indices = [
_row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
]
drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items)
for _row_idx in reversed(range(row_max_len)):
if _row_idx in drop_row_indices:
del table_content["rows"][_row_idx]
if "id" in table_content and len(drop_row_indices) > 0:
logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"]))
.\models\deprecated\tapex\__init__.py
from typing import TYPE_CHECKING
from ....utils import _LazyModule
_import_structure = {"tokenization_tapex": ["TapexTokenizer"]}
if TYPE_CHECKING:
from .tokenization_tapex import TapexTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\deprecated\trajectory_transformer\configuration_trajectory_transformer.py
""" TrajectoryTransformer模型配置"""
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"CarlCochet/trajectory-transformer-halfcheetah-medium-v2": (
"https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2/resolve/main/config.json"
),
}
class TrajectoryTransformerConfig(PretrainedConfig):
r"""
这是一个配置类,用于存储[`TrajectoryTransformerModel`]的配置。根据指定的参数实例化TrajectoryTransformer模型,
定义模型架构。使用默认参数实例化一个配置将产生类似于TrajectoryTransformer
[CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2)
架构的配置。
配置对象继承自[`PretrainedConfig`],可用于控制模型的输出。有关更多信息,请阅读[`PretrainedConfig`]的文档。
```
>>> from transformers import TrajectoryTransformerConfig, TrajectoryTransformerModel
>>> # 初始化一个TrajectoryTransformer模型,以CarlCochet/trajectory-transformer-halfcheetah-medium-v2风格的配置
>>> configuration = TrajectoryTransformerConfig()
>>> # 使用随机权重从CarlCochet/trajectory-transformer-halfcheetah-medium-v2风格的配置初始化一个模型
>>> model = TrajectoryTransformerModel(configuration)
>>> # 访问模型的配置
>>> configuration = model.config
```
"""
model_type = "trajectory_transformer"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=100,
action_weight=5,
reward_weight=1,
value_weight=1,
block_size=249,
action_dim=6,
observation_dim=17,
transition_dim=25,
n_layer=4,
n_head=4,
n_embd=128,
embd_pdrop=0.1,
attn_pdrop=0.1,
resid_pdrop=0.1,
learning_rate=0.0006,
max_position_embeddings=512,
initializer_range=0.02,
layer_norm_eps=1e-12,
kaiming_initializer_range=1,
use_cache=True,
pad_token_id=1,
bos_token_id=50256,
eos_token_id=50256,
**kwargs,
):
self.vocab_size = vocab_size
self.action_weight = action_weight
self.reward_weight = reward_weight
self.value_weight = value_weight
self.max_position_embeddings = max_position_embeddings
self.block_size = block_size
self.action_dim = action_dim
self.observation_dim = observation_dim
self.transition_dim = transition_dim
self.learning_rate = learning_rate
self.n_layer = n_layer
self.n_head = n_head
self.n_embd = n_embd
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.resid_pdrop = resid_pdrop
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.kaiming_initializer_range = kaiming_initializer_range
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
.\models\deprecated\trajectory_transformer\convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
""" TrajectoryTransformer pytorch checkpoint conversion"""
import torch
import trajectory.utils as utils
from transformers import TrajectoryTransformerModel
class Parser(utils.Parser):
dataset: str = "halfcheetah-medium-expert-v2"
config: str = "config.offline"
def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device):
"""Converting Sequential blocks to ModuleList"""
gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device)
trajectory_transformer = TrajectoryTransformerModel(gpt.config)
trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict())
trajectory_transformer.pos_emb = gpt.pos_emb
trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict())
trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict())
trajectory_transformer.head.load_state_dict(gpt.head.state_dict())
for i, block in enumerate(gpt.blocks):
trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict())
trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict())
trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict())
trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict())
trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict())
trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict())
trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict())
torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin")
if __name__ == "__main__":
"""
To run this script you will need to install the original repository to run the original model. You can find it
here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the
original pytorch checkpoints.
Run with the command:
```
>>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset <dataset_name>
... --gpt_loadpath <path_to_original_pytorch_checkpoint>
```
"""
args = Parser().parse_args("plan")
convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(
args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device
)
.\models\deprecated\trajectory_transformer\modeling_trajectory_transformer.py
""" PyTorch TrajectoryTransformer model."""
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F
from ....modeling_utils import PreTrainedModel
from ....utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_trajectory_transformer import TrajectoryTransformerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
_CONFIG_FOR_DOC = "TrajectoryTransformerConfig"
TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"CarlCochet/trajectory-transformer-halfcheetah-medium-v2",
]
def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
@dataclass
class TrajectoryTransformerOutput(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss. 语言建模损失(可选),在提供标签时返回。
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 语言建模头的预测分数(SoftMax 之前每个词汇标记的分数)。
past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the
attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
长度为 `config.n_layers` 的元组,包含形状为 `(batch_size, num_heads, sequence_length, embed_size_per_head)` 的张量元组。
包含预先计算的隐藏状态(注意力块中的键和值),可用于加速顺序解码。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
包含模型每一层输出的隐藏状态的元组(每层一个),以及初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average
in the self-attention heads.
包含注意力权重的元组(每层一个),用于计算自注意力头中的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class TrajectoryTransformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TrajectoryTransformerConfig
load_tf_weights = load_tf_weights_in_trajectory_transformer
base_model_prefix = "trajectory_transformer"
main_input_name = "trajectories"
supports_gradient_checkpointing = True
def _init_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, EinLinear):
for i in range(module.n_models):
nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range)
if module.bias is not None:
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i])
bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range
nn.init.uniform_(module.bias[i], -bound, bound)
TRAJECTORY_TRANSFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r"""
Args:
trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Batch of trajectories, where a trajectory is a sequence of states, actions and rewards.
past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`, *optional*):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
`past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Desired targets used to compute the loss.
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def __init__(self, n_models, in_features, out_features, bias):
super().__init__()
self.n_models = n_models
self.out_features = out_features
self.in_features = in_features
self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(n_models, out_features))
else:
self.register_parameter("bias", None)
super().__init__()
self.n_models = n_models
self.out_features = out_features
self.in_features = in_features
self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(n_models, out_features))
else:
self.register_parameter("bias", None)
def reset_parameters(self):
for i in range(self.n_models):
nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i])
bound = 1 / math.sqrt(fan_in)
nn.init.uniform_(self.bias[i], -bound, bound)
for i in range(self.n_models):
nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i])
bound = 1 / math.sqrt(fan_in)
nn.init.uniform_(self.bias[i], -bound, bound)
def forward(self, input):
"""
Args:
input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`):
The input to the layer.
"""
output = torch.einsum("eoi,bei->beo", self.weight, input)
if self.bias is not None:
raise RuntimeError()
return output
output = torch.einsum("eoi,bei->beo", self.weight, input)
if self.bias is not None:
raise RuntimeError()
return output
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.n_embd % config.n_head != 0:
raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})")
self.key = nn.Linear(config.n_embd, config.n_embd)
self.query = nn.Linear(config.n_embd, config.n_embd)
self.value = nn.Linear(config.n_embd, config.n_embd)
self.attn_drop = nn.Dropout(config.attn_pdrop)
self.resid_drop = nn.Dropout(config.resid_pdrop)
self.proj = nn.Linear(config.n_embd, config.n_embd)
self.register_buffer(
"mask",
torch.tril(torch.ones(config.block_size, config.block_size)).view(
1, 1, config.block_size, config.block_size
),
persistent=False,
)
joined_dim = config.observation_dim + config.action_dim + 2
self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0
self.n_head = config.n_head
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
):
batch_size, sequence_length, embedding_dim = hidden_states.size()
key = (
self.key(hidden_states)
.view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
.transpose(1, 2)
)
query = (
self.query(hidden_states)
.view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
.transpose(1, 2)
)
value = (
self.value(hidden_states)
.view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
.transpose(1, 2)
)
if layer_past is not None:
past_key, past_value = layer_past
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1)))
attn_weights = attn_weights.masked_fill(
self.mask[:, :, :sequence_length, :sequence_length] == 0, torch.finfo(attn_weights.dtype).min
)
attn_weights = F.softmax(attn_weights, dim=-1)
self._attn_map = attn_weights.clone()
attn_weights = self.attn_drop(attn_weights)
output = torch.matmul(attn_weights, value)
output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim)
output = self.resid_drop(self.proj(output))
outputs = (output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
class Block(nn.Module):
def __init__(self, config):
super().__init__()
self.ln1 = nn.LayerNorm(config.n_embd)
self.ln2 = nn.LayerNorm(config.n_embd)
self.attn = CausalSelfAttention(config)
self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd)
self.act = nn.GELU()
self.l2 = nn.Linear(4 * config.n_embd, config.n_embd)
self.drop = nn.Dropout(config.resid_pdrop)
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
):
residual = hidden_states
hidden_states = self.ln1(hidden_states)
attn_outputs = self.attn(
hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
residual = hidden_states
hidden_states = self.ln2(hidden_states)
hidden_states = self.l1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.l2(hidden_states)
hidden_states = residual + self.drop(hidden_states)
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
@add_start_docstrings(
"The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.",
TRAJECTORY_TRANSFORMER_START_DOCSTRING,
)
class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel):
"""the full GPT language model, with a context size of block_size"""
def __init__(self, config):
super().__init__(config)
self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd)
self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
self.drop = nn.Dropout(config.embd_pdrop)
self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
self.ln_f = nn.LayerNorm(config.n_embd)
self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False)
self.vocab_size = config.vocab_size
self.stop_token = config.vocab_size * config.transition_dim
self.block_size = config.block_size
self.observation_dim = config.observation_dim
self.action_dim = config.action_dim
self.transition_dim = config.transition_dim
self.embedding_dim = config.n_embd
self.action_weight = config.action_weight
self.reward_weight = config.reward_weight
self.value_weight = config.value_weight
self.gradient_checkpointing = False
self.post_init()
def get_block_size(self):
return self.block_size
def offset_tokens(self, trajectories):
_, sequence_length = trajectories.shape
n_states = int(np.ceil(sequence_length / self.transition_dim))
offsets = torch.arange(self.transition_dim) * self.vocab_size
offsets = offsets.repeat(n_states).to(trajectories.device)
offset_trajectories = trajectories + offsets[:sequence_length]
offset_trajectories[trajectories == self.vocab_size] = self.stop_token
return offset_trajectories
def pad_to_full_observation(self, hidden_states):
batch_size, sequence_length, _ = hidden_states.shape
n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim
padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device)
hidden_states_pad = torch.cat([hidden_states, padding], dim=1)
hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim)
return hidden_states_pad, n_pad
def forward(
trajectories: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
targets: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\deprecated\trajectory_transformer\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_trajectory_transformer": [
"TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TrajectoryTransformerConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_trajectory_transformer"] = [
"TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TrajectoryTransformerModel",
"TrajectoryTransformerPreTrainedModel",
"load_tf_weights_in_trajectory_transformer",
]
if TYPE_CHECKING:
from .configuration_trajectory_transformer import (
TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
TrajectoryTransformerConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_trajectory_transformer import (
TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TrajectoryTransformerModel,
TrajectoryTransformerPreTrainedModel,
load_tf_weights_in_trajectory_transformer,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\transfo_xl\configuration_transfo_xl.py
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/config.json",
}
class TransfoXLConfig(PretrainedConfig):
"""
这是一个配置类,用于存储[`TransfoXLModel`]或[`TFTransfoXLModel`]的配置。它被用来根据指定的参数实例化
Transformer-XL模型,定义模型架构。使用默认参数实例化配置将得到类似于TransfoXL
[transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103)架构的配置。
配置对象继承自[`PretrainedConfig`],可以用来控制模型的输出。阅读[`PretrainedConfig`]的文档获取更多信息。
示例:
```
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # 初始化一个Transformer XL配置
>>> configuration = TransfoXLConfig()
>>> # 使用配置初始化一个模型(随机权重)
>>> model = TransfoXLModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "transfo-xl"
keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size",
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=267735,
cutoffs=[20000, 40000, 200000],
d_model=1024,
d_embed=1024,
n_head=16,
d_head=64,
d_inner=4096,
div_val=4,
pre_lnorm=False,
n_layer=18,
mem_len=1600,
clamp_len=1000,
same_length=True,
proj_share_all_but_first=True,
attn_type=0,
sample_softmax=-1,
adaptive=True,
dropout=0.1,
dropatt=0.0,
untie_r=True,
init="normal",
init_range=0.01,
proj_init_std=0.01,
init_std=0.02,
layer_norm_epsilon=1e-5,
eos_token_id=0,
**kwargs,
):
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
if proj_share_all_but_first:
self.tie_projs = [False] + [True] * len(self.cutoffs)
else:
self.tie_projs = [False] + [False] * len(self.cutoffs)
self.d_model = d_model
self.d_embed = d_embed
self.d_head = d_head
self.d_inner = d_inner
self.div_val = div_val
self.pre_lnorm = pre_lnorm
self.n_layer = n_layer
self.n_head = n_head
self.mem_len = mem_len
self.same_length = same_length
self.attn_type = attn_type
self.clamp_len = clamp_len
self.sample_softmax = sample_softmax
self.adaptive = adaptive
self.dropout = dropout
self.dropatt = dropatt
self.untie_r = untie_r
self.init = init
self.init_range = init_range
self.proj_init_std = proj_init_std
self.init_std = init_std
self.layer_norm_epsilon = layer_norm_epsilon
super().__init__(eos_token_id=eos_token_id, **kwargs)
@property
def max_position_embeddings(self):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@max_position_embeddings.setter
def max_position_embeddings(self, value):
raise NotImplementedError(
f"The model {self.model_type} is one of the few models that has no sequence length limit."
)