Transformers 源码解析(九十一)
.\models\pvt_v2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_pvt_v2": ["PVT_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "PvtV2Config"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pvt_v2"] = [
"PVT_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
"PvtV2ForImageClassification",
"PvtV2Model",
"PvtV2PreTrainedModel",
"PvtV2Backbone",
]
if TYPE_CHECKING:
from .configuration_pvt_v2 import PVT_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtV2Config
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pvt_v2 import (
PVT_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
PvtV2Backbone,
PvtV2ForImageClassification,
PvtV2Model,
PvtV2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\qdqbert\configuration_qdqbert.py
"""
QDQBERT model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/config.json",
}
class QDQBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to instantiate an
QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the BERT
[google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
# 定义模型类型为 "qdqbert"
model_type = "qdqbert"
# 初始化函数,用于创建一个新的实例
def __init__(
self,
vocab_size=30522, # 词汇表大小,默认为30522
hidden_size=768, # 隐藏层大小,默认为768
num_hidden_layers=12, # 隐藏层的数量,默认为12
num_attention_heads=12, # 注意力头的数量,默认为12
intermediate_size=3072, # 中间层大小,默认为3072
hidden_act="gelu", # 隐藏层激活函数,默认为gelu
hidden_dropout_prob=0.1, # 隐藏层的Dropout概率,默认为0.1
attention_probs_dropout_prob=0.1, # 注意力概率的Dropout概率,默认为0.1
max_position_embeddings=512, # 最大位置嵌入数,默认为512
type_vocab_size=2, # 类型词汇表大小,默认为2
initializer_range=0.02, # 初始化范围,默认为0.02
layer_norm_eps=1e-12, # Layer Norm的epsilon值,默认为1e-12
use_cache=True, # 是否使用缓存,默认为True
pad_token_id=1, # 填充token的ID,默认为1
bos_token_id=0, # 开始token的ID,默认为0
eos_token_id=2, # 结束token的ID,默认为2
**kwargs,
):
# 调用父类的初始化函数,设置特殊token的ID和额外的关键字参数
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# 初始化类的成员变量,设置模型的各种参数
self.vocab_size = vocab_size # 设置词汇表大小
self.max_position_embeddings = max_position_embeddings # 设置最大位置嵌入数
self.hidden_size = hidden_size # 设置隐藏层大小
self.num_hidden_layers = num_hidden_layers # 设置隐藏层的数量
self.num_attention_heads = num_attention_heads # 设置注意力头的数量
self.intermediate_size = intermediate_size # 设置中间层大小
self.hidden_act = hidden_act # 设置隐藏层激活函数
self.hidden_dropout_prob = hidden_dropout_prob # 设置隐藏层的Dropout概率
self.attention_probs_dropout_prob = attention_probs_dropout_prob # 设置注意力概率的Dropout概率
self.initializer_range = initializer_range # 设置初始化范围
self.type_vocab_size = type_vocab_size # 设置类型词汇表大小
self.layer_norm_eps = layer_norm_eps # 设置Layer Norm的epsilon值
self.use_cache = use_cache # 设置是否使用缓存
.\models\qdqbert\modeling_qdqbert.py
"""
PyTorch QDQBERT model.
"""
import math
import os
import warnings
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_pytorch_quantization_available,
logging,
replace_return_docstrings,
requires_backends,
)
from .configuration_qdqbert import QDQBertConfig
logger = logging.get_logger(__name__)
if is_pytorch_quantization_available():
try:
from pytorch_quantization import nn as quant_nn
from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer
except OSError:
logger.error(
"QDQBERT model are not usable since `pytorch_quantization` can't be loaded. Please try to reinstall it"
" following the instructions here:"
" https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization."
)
_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
_CONFIG_FOR_DOC = "QDQBertConfig"
QDQBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google-bert/bert-base-uncased",
]
def load_tf_weights_in_qdqbert(model, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
class QDQBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: int = 0,
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class QDQBertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = quant_nn.QuantLinear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
add_local = self.add_local_input_quantizer(hidden_states)
add_residual = self.add_residual_input_quantizer(input_tensor)
hidden_states = self.LayerNorm(add_local + add_residual)
return hidden_states
class QDQBertAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = QDQBertSelfAttention(config)
self.output = QDQBertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class QDQBertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = quant_nn.QuantLinear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class QDQBertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = quant_nn.QuantLinear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
add_local = self.add_local_input_quantizer(hidden_states)
add_residual = self.add_residual_input_quantizer(input_tensor)
hidden_states = self.LayerNorm(add_local + add_residual)
return hidden_states
class QDQBertLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.seq_len_dim = 1
self.attention = QDQBertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = QDQBertAttention(config)
self.intermediate = QDQBertIntermediate(config)
self.output = QDQBertOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = self.feed_forward_chunk(attention_output)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class QDQBertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([QDQBertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class QDQBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class QDQBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class QDQBertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = QDQBertPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class QDQBertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = QDQBertLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class QDQBertOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class QDQBertPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = QDQBertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class QDQBertPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化以及下载和加载预训练模型的简单接口。
"""
config_class = QDQBertConfig
load_tf_weights = load_tf_weights_in_qdqbert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
QDQBERT_START_DOCSTRING = r"""
此模型继承自 [`PreTrainedModel`]。查看超类文档以了解库实现的通用方法(例如下载或保存模型、调整输入嵌入、修剪头等)。
此模型还是 PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 的子类。
将其用作常规的 PyTorch 模块,并参考 PyTorch 文档以获取所有与一般使用和行为相关的事项。
参数:
config ([`QDQBertConfig`]): 包含模型所有参数的配置类。
使用配置文件初始化不会加载与模型相关的权重,只加载配置。查看 [`~PreTrainedModel.from_pretrained`] 方法以加载模型权重。
"""
QDQBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列中的token索引,用于词汇表中的位置。
# 可以使用`AutoTokenizer`获得这些索引。详情请参见`PreTrainedTokenizer.encode`和`PreTrainedTokenizer.__call__`。
# [什么是input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 遮盖掩模,用于避免在填充token索引上进行注意力计算。遮盖值为0或1:
# - 1 表示对**未遮盖**的token进行注意力计算,
# - 0 表示对**遮盖**的token进行注意力计算。
# [什么是attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段落token索引,指示输入中第一部分和第二部分。索引为0或1:
# - 0 对应*句子A*的token,
# - 1 对应*句子B*的token。
# [什么是token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 输入序列token在位置嵌入中的位置索引。选择范围为`[0, config.max_position_embeddings - 1]`。
# [什么是position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于屏蔽自注意力模块中选定头部的遮盖。遮盖值为0或1:
# - 1 表示该头部**未遮盖**,
# - 0 表示该头部**遮盖**。
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选地,您可以直接传递嵌入表示而不是`input_ids`。如果您想对如何将`input_ids`索引转换为相关向量有更多控制权,这很有用。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。返回的张量中的`attentions`部分会有更多细节。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。返回的张量中的`hidden_states`部分会有更多细节。
return_dict (`bool`, *optional*):
# 是否返回[`~utils.ModelOutput`]而不是普通元组。
"""
@add_start_docstrings(
"The bare QDQBERT Model transformer outputting raw hidden-states without any specific head on top.",
QDQBERT_START_DOCSTRING,
)
class QDQBertModel(QDQBertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
"""
def __init__(self, config, add_pooling_layer: bool = True):
requires_backends(self, "pytorch_quantization")
super().__init__(config)
self.config = config
self.embeddings = QDQBertEmbeddings(config)
self.encoder = QDQBertEncoder(config)
self.pooler = QDQBertPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
```
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING
)
class QDQBertLMHeadModel(QDQBertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `QDQBertLMHeadModel` as a standalone, add `is_decoder=True.`")
self.bert = QDQBertModel(config, add_pooling_layer=False)
self.cls = QDQBertOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.LongTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(
self,
input_ids: Optional[torch.LongTensor],
past_key_values=None,
attention_mask: Optional[torch.Tensor] = None,
**model_kwargs,
):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING)
class QDQBertForMaskedLM(QDQBertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `QDQBertForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.bert = QDQBertModel(config, add_pooling_layer=False)
self.cls = QDQBertOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids: torch.LongTensor, attention_mask: Optional[torch.FloatTensor] = None, **model_kwargs
):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
if self.config.pad_token_id is None:
raise ValueError("The PAD token should be defined for generation")
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
return {"input_ids": input_ids, "attention_mask": attention_mask}
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top.""",
QDQBERT_START_DOCSTRING,
)
class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.bert = QDQBertModel(config)
self.cls = QDQBertOnlyNSPHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, NextSentencePredictorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
If `return_dict=True`, returns a `NextSentencePredictorOutput` object containing:
- loss (`torch.FloatTensor`, *optional*): Next sentence prediction loss.
- logits (`torch.FloatTensor` of shape `(batch_size, 2)`): Scores for next sentence prediction.
- hidden_states (`Optional[Tuple[torch.FloatTensor]]`): Tuple of hidden states at each layer of the model.
- attentions (`Optional[Tuple[torch.FloatTensor]]`): Tuple of attention tensors at each layer of the model.
If `return_dict=False`, returns a tuple:
- next_sentence_loss (`Optional[torch.FloatTensor]`): Next sentence prediction loss.
- seq_relationship_scores (`torch.FloatTensor`): Scores for next sentence prediction.
- hidden_states (`Optional[Tuple[torch.FloatTensor]]`): Tuple of hidden states at each layer of the model.
- attentions (`Optional[Tuple[torch.FloatTensor]]`): Tuple of attention tensors at each layer of the model.
Example:
```
>>> from transformers import AutoTokenizer, QDQBertForNextSentencePrediction
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
```
Check if `next_sentence_label` is provided in `kwargs`; issue a warning and use `labels` instead if found.
"""
if "next_sentence_label" in kwargs:
warnings.warn(
"The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
" `labels` instead.",
FutureWarning,
)
labels = kwargs.pop("next_sentence_label")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
seq_relationship_scores = self.cls(pooled_output)
next_sentence_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
if not return_dict:
output = (seq_relationship_scores,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
return NextSentencePredictorOutput(
loss=next_sentence_loss,
logits=seq_relationship_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
QDQBERT_START_DOCSTRING,
)
class QDQBertForSequenceClassification(QDQBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.bert = QDQBertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
QDQBERT_START_DOCSTRING,
)
class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.bert = QDQBertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
QDQBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
QDQBERT_START_DOCSTRING,
)
class QDQBertForTokenClassification(QDQBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# Initialize QDQBertModel with provided configuration
self.bert = QDQBertModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform forward pass through QDQBertModel
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# Calculate CrossEntropyLoss if labels are provided
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
# Prepare output tuple if return_dict is False
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Return TokenClassifierOutput if return_dict is True
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
QDQBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
QDQBERT_START_DOCSTRING,
)
class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.bert = QDQBertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\qdqbert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_qdqbert": ["QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "QDQBertConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_qdqbert"] = [
"QDQBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"QDQBertForMaskedLM",
"QDQBertForMultipleChoice",
"QDQBertForNextSentencePrediction",
"QDQBertForQuestionAnswering",
"QDQBertForSequenceClassification",
"QDQBertForTokenClassification",
"QDQBertLayer",
"QDQBertLMHeadModel",
"QDQBertModel",
"QDQBertPreTrainedModel",
"load_tf_weights_in_qdqbert",
]
if TYPE_CHECKING:
from .configuration_qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_qdqbert import (
QDQBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
QDQBertForMaskedLM,
QDQBertForMultipleChoice,
QDQBertForNextSentencePrediction,
QDQBertForQuestionAnswering,
QDQBertForSequenceClassification,
QDQBertForTokenClassification,
QDQBertLayer,
QDQBertLMHeadModel,
QDQBertModel,
QDQBertPreTrainedModel,
load_tf_weights_in_qdqbert,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\qwen2\configuration_qwen2.py
""" Qwen2 model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
}
class Qwen2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of
Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
```
>>> from transformers import Qwen2Model, Qwen2Config
>>> # Initializing a Qwen2 style configuration
>>> configuration = Qwen2Config()
>>> # Initializing a model from the Qwen2-7B style configuration
>>> model = Qwen2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
model_type = "qwen2" # 模型类型为 Qwen2
keys_to_ignore_at_inference = ["past_key_values"] # 推断时忽略的键列表
def __init__(
self,
vocab_size=151936, # 词汇表大小,默认为 151936
hidden_size=4096, # 隐藏层大小,默认为 4096
intermediate_size=22016, # 中间层大小,默认为 22016
num_hidden_layers=32, # 隐藏层层数,默认为 32
num_attention_heads=32, # 注意力头数,默认为 32
num_key_value_heads=32, # 键值头数,默认为 32
hidden_act="silu", # 隐藏层激活函数,默认为 silu
max_position_embeddings=32768, # 最大位置嵌入数,默认为 32768
initializer_range=0.02, # 初始化范围,默认为 0.02
rms_norm_eps=1e-6, # RMS 归一化参数,默认为 1e-6
use_cache=True, # 是否使用缓存,默认为 True
tie_word_embeddings=False, # 是否绑定词嵌入,默认为 False
rope_theta=10000.0, # ROPE 参数,默认为 10000.0
use_sliding_window=False, # 是否使用滑动窗口,默认为 False
sliding_window=4096, # 滑动窗口大小,默认为 4096
max_window_layers=28, # 最大窗口层数,默认为 28
attention_dropout=0.0, # 注意力机制的 dropout,默认为 0.0
**kwargs, # 其他关键字参数
):
# 设置模型的超参数
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.use_sliding_window = use_sliding_window
self.sliding_window = sliding_window
self.max_window_layers = max_window_layers
# 为了向后兼容性
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
# 设置键值头的数量
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_dropout = attention_dropout
# 调用父类的初始化方法,传入参数和关键字参数
super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
.\models\qwen2\modeling_qwen2.py
""" PyTorch Qwen2 模型。"""
import inspect
import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_qwen2 import Qwen2Config
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
_CONFIG_FOR_DOC = "Qwen2Config"
QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Qwen/Qwen2-7B-beta",
]
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
class Qwen2RMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Qwen2RMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
class Qwen2RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors."""
Args:
q (`torch.Tensor`): 查询张量。
k (`torch.Tensor`): 键张量。
cos (`torch.Tensor`): 旋转嵌入的余弦部分。
sin (`torch.Tensor`): 旋转嵌入的正弦部分。
position_ids (`torch.Tensor`):
查询和键张量对应的位置索引。例如,当使用 KV 缓存时,可以传递偏移的位置 id。
unsqueeze_dim (`int`, *optional*, 默认为 1):
'unsqueeze_dim' 参数指定沿其展开 cos[position_ids] 和 sin[position_ids] 的维度,以便它们可以正确地广播到 q 和 k 的维度。
例如,cos[position_ids] 和 sin[position_ids] 的形状为 [batch_size, seq_len, head_dim]。
如果 q 和 k 的形状为 [batch_size, heads, seq_len, head_dim],则设置 unsqueeze_dim=1 使得 cos[position_ids] 和 sin[position_ids] 可以广播到 q 和 k 的形状。
类似地,如果 q 和 k 的形状为 [batch_size, seq_len, heads, head_dim],则设置 unsqueeze_dim=2。
Returns:
`tuple(torch.Tensor)`: 包含使用旋转位置嵌入旋转后的查询和键张量的元组。
"""
# 按照位置索引从 cos 中选择并展开维度
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
# 按照位置索引从 sin 中选择并展开维度
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
# 计算旋转后的查询嵌入
q_embed = (q * cos) + (rotate_half(q) * sin)
# 计算旋转后的键嵌入
k_embed = (k * cos) + (rotate_half(k) * sin)
# 返回旋转后的查询和键张量
return q_embed, k_embed
# 从 transformers.models.mistral.modeling_mistral.MistralMLP 复制并修改为 Qwen2MLP
class Qwen2MLP(nn.Module):
# 初始化方法,接收一个配置对象 config
def __init__(self, config):
super().__init__()
# 将配置对象保存在实例中
self.config = config
# 从配置中获取隐藏层大小和中间层大小
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
# 创建一个线性层,用于门控投影,输入维度是隐藏层大小,输出维度是中间层大小,无偏置
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
# 创建一个线性层,用于上游投影,输入维度是隐藏层大小,输出维度是中间层大小,无偏置
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
# 创建一个线性层,用于下游投影,输入维度是中间层大小,输出维度是隐藏层大小,无偏置
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
# 根据配置中的激活函数名,选择相应的激活函数,并保存在实例中
self.act_fn = ACT2FN[config.hidden_act]
# 前向传播方法,接收输入张量 x
def forward(self, x):
# 对输入张量进行门控投影,然后应用激活函数,再乘以上游投影结果,最后下游投影得到输出
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# 从 transformers.models.llama.modeling_llama.repeat_kv 复制过来
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
这相当于 torch.repeat_interleave(x, dim=1, repeats=n_rep)。将隐藏状态从 (batch,
num_key_value_heads, seqlen, head_dim) 扩展为 (batch, num_attention_heads, seqlen, head_dim)
"""
# 获取隐藏状态张量的形状信息
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
# 如果重复次数 n_rep 为 1,直接返回原始隐藏状态张量
if n_rep == 1:
return hidden_states
# 在第二维度上扩展隐藏状态张量,重复 n_rep 次
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
# 重新整形扩展后的张量,将第二和第三维度合并为新的第二维度
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class Qwen2Attention(nn.Module):
"""
从 'Attention Is All You Need' 论文中的多头注意力机制修改而来。修改为使用滑动窗口注意力:Longformer
和 "Generating Long Sequences with Sparse Transformers"。
"""
def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config # 设置实例的配置参数对象
self.layer_idx = layer_idx # 设置实例的层索引,可选
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
"lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size # 从配置中获取隐藏层大小
self.num_heads = config.num_attention_heads # 从配置中获取注意力头的数量
self.head_dim = self.hidden_size // self.num_heads # 计算每个注意力头的维度
self.num_key_value_heads = config.num_key_value_heads # 从配置中获取键值头的数量
self.num_key_value_groups = self.num_heads // self.num_key_value_heads # 计算键值头的组数
self.max_position_embeddings = config.max_position_embeddings # 从配置中获取最大位置嵌入数
self.rope_theta = config.rope_theta # 从配置中获取绳索旋转角度
self.is_causal = True # 设置实例是否因果
self.attention_dropout = config.attention_dropout # 从配置中获取注意力丢弃率
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
# 初始化查询投影层线性变换
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
# 初始化键投影层线性变换
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
# 初始化值投影层线性变换
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
# 初始化输出投影层线性变换,没有偏置项
self.rotary_emb = Qwen2RotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
# 初始化旋转嵌入层对象,用于处理注意力旋转操作
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
# Qwen2FlashAttention2 类,继承自 Qwen2Attention,实现了 Qwen2 闪存注意力模块。
# 该模块主要的改动在于前向传播过程中需要正确调用闪存注意力的公共 API,并处理可能包含的填充标记。
# 另外,对于滑动窗口注意力,仅应用于底部 config.max_window_layers 层。
# 从 transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ 复制而来
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: 当 Flash Attention 版本升级到 2.1 后应移除此部分。
# flash_attn<2.1 生成左上对齐的因果蒙版,而需要的是右下对齐,默认在 flash_attn>=2.1 中已实现。该属性用于处理此差异。
# 参考: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0。
# 注意,对于 flash_attn<2.1,当 q_seqlen != k_seqlen(除了 q_seqlen == 1 的情况)会产生错误的蒙版(左上)。
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
# Qwen2FlashAttention2 类的前向传播函数
# hidden_states: 输入的隐藏状态张量
# attention_mask: 可选的注意力蒙版张量
# position_ids: 可选的位置 ID 张量
# past_key_value: 可选的缓存键值对
# output_attentions: 是否输出注意力权重
# use_cache: 是否使用缓存
# **kwargs: 其他关键字参数
def _flash_attention_forward(
self,
query_states,
key_states,
value_states,
attention_mask,
query_length,
dropout=0.0,
softmax_scale=None,
use_sliding_windows=False,
):
# Qwen2FlashAttention2 类的闪存注意力前向传播函数
# query_states: 查询状态
# key_states: 键状态
# value_states: 值状态
# attention_mask: 注意力蒙版
# query_length: 查询长度
# dropout: dropout 比率,默认为 0.0
# softmax_scale: softmax 缩放参数,可选
# use_sliding_windows: 是否使用滑动窗口, 默认为 False
# 从 transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input 复制而来
# 定义一个方法 _upad_input,接受查询层、键层、值层、注意力掩码和查询长度作为输入参数
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
# 获取批处理大小、键值序列长度、头数、头维度
batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
# 如果键值序列长度不等于注意力掩码的最后一个维度长度
if kv_seq_len != attention_mask.shape[-1]:
# 调整注意力掩码,使其匹配键值序列的长度
attention_mask_num_tokens = attention_mask.shape[-1]
attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
# 从注意力掩码中获取未填充数据的索引、当前序列长度、批处理中最大序列长度
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
# 根据获取的索引重新排序键层和值层
key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
# 如果查询长度等于键值序列长度
if query_length == kv_seq_len:
# 重新排序查询层
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
# 如果查询长度为1
elif query_length == 1:
# 将查询层缩减为一维
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # 这里有一个内存复制操作,效率不高。
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# 根据查询长度调整注意力掩码,获取未填充的输入
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
# 返回重新排序后的查询层、键层、值层,以及查询层索引、当前序列长度元组和最大序列长度元组
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2
class Qwen2SdpaAttention(Qwen2Attention):
"""
Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
# Adapted from Qwen2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
):
"""
Override of forward method from Qwen2Attention to adapt to SDPA API.
Parameters:
- hidden_states (torch.Tensor): Input tensor to the attention module.
- attention_mask (Optional[torch.Tensor]): Mask tensor indicating which elements should be attended to.
- position_ids (Optional[torch.LongTensor]): Tensor containing positional ids.
- past_key_value (Optional[Cache]): Cached key value pairs from previous computations.
- output_attentions (bool): Whether to output attention weights.
- use_cache (bool): Whether to use cached key value pairs for future computations.
Returns:
- Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: Tuple containing:
- torch.Tensor: Output tensor from the attention module.
- Optional[torch.Tensor]: Attention weights if `output_attentions` is `True`.
- Optional[Tuple[torch.Tensor]]: Cached key value pairs if `use_cache` is `True`.
"""
raise NotImplementedError
QWEN2_ATTENTION_CLASSES = {
"eager": Qwen2Attention,
"flash_attention_2": Qwen2FlashAttention2,
"sdpa": Qwen2SdpaAttention,
}
class Qwen2DecoderLayer(nn.Module):
def __init__(self, config: Qwen2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
logger.warning_once(
f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
"unexpected results may be encountered."
)
# Initialize self attention mechanism based on configuration
self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
# Initialize MLP layer
self.mlp = Qwen2MLP(config)
# Layer normalization for input to the layer
self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# Layer normalization after attention mechanism
self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
**kwargs,
):
"""
Forward pass for Qwen2 decoder layer.
Parameters:
- hidden_states (torch.Tensor): Input tensor to the decoder layer.
- attention_mask (Optional[torch.Tensor]): Mask tensor indicating which elements should be attended to.
- position_ids (Optional[torch.LongTensor]): Tensor containing positional ids.
- past_key_value (Optional[Tuple[torch.Tensor]]): Cached key value pairs from previous computations.
- output_attentions (Optional[bool]): Whether to output attention weights.
- use_cache (Optional[bool]): Whether to use cached key value pairs for future computations.
- **kwargs: Additional keyword arguments for future expansion.
Returns:
- Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: Tuple containing:
- torch.Tensor: Output tensor from the decoder layer.
- Optional[torch.Tensor]: Attention weights if `output_attentions` is `True`.
- Optional[Tuple[torch.Tensor]]: Cached key value pairs if `use_cache` is `True`.
"""
raise NotImplementedError
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
# 如果传入了 `padding_mask` 参数,则发出警告,提示该参数在 v4.37 版本中将被移除
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. "
"Please make sure use `attention_mask` instead.`"
)
"""
Args:
hidden_states (`torch.FloatTensor`): 输入层的输入,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *可选*): 注意力掩码,形状为 `(batch, sequence_length)`,其中填充元素为 0
output_attentions (`bool`, *可选*):
是否返回所有注意力层的注意力张量。查看返回的张量中 `attentions` 以获取更多细节。
use_cache (`bool`, *可选*):
如果设置为 `True`,将返回 `past_key_values` 键值状态,可用于加速解码(参见 `past_key_values`)。
past_key_value (`Tuple(torch.FloatTensor)`, *可选*): 缓存的过去键和值投影状态
"""
# 记录输入的残差连接
residual = hidden_states
# 应用输入层归一化
hidden_states = self.input_layernorm(hidden_states)
# 自注意力机制
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
# 将残差连接应用到自注意力输出上
hidden_states = residual + hidden_states
# 全连接层
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
# 构建输出
outputs = (hidden_states,)
# 如果需要输出注意力权重,则添加到输出中
if output_attentions:
outputs += (self_attn_weights,)
# 如果需要使用缓存,则添加当前的键值状态到输出中
if use_cache:
outputs += (present_key_value,)
return outputs
# QWEN2_START_DOCSTRING 是一个原始字符串文档,描述了该模型的继承和基本使用说明
QWEN2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Qwen2Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# 使用装饰器 @add_start_docstrings 添加文档注释,说明该类是基于 Qwen2PreTrainedModel 的一个裸模型
@add_start_docstrings(
"The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
QWEN2_START_DOCSTRING,
)
class Qwen2PreTrainedModel(PreTrainedModel):
# 设置模型的配置类和模型名称前缀
config_class = Qwen2Config
base_model_prefix = "model"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 不拆分的模块列表
_no_split_modules = ["Qwen2DecoderLayer"]
# 跳过设备放置的键
_skip_keys_device_placement = "past_key_values"
# 支持快闪注意力机制 2
_supports_flash_attn_2 = True
# 支持自我注意力分配
_supports_sdpa = True
# 支持缓存类
_supports_cache_class = True
# 初始化权重函数,根据配置的初始化范围初始化线性层和嵌入层
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# QWEN2_INPUTS_DOCSTRING 是一个原始字符串文档,目前为空
QWEN2_INPUTS_DOCSTRING = r"""
"""
# 使用装饰器 @add_start_docstrings 添加文档注释,说明该类是 Qwen2PreTrainedModel 的一个具体实现,用于 Transformer 解码器
@add_start_docstrings(
"The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
QWEN2_START_DOCSTRING,
)
class Qwen2Model(Qwen2PreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
Args:
config: Qwen2Config
"""
# 初始化方法,接受一个 Qwen2Config 类型的参数 config
def __init__(self, config: Qwen2Config):
super().__init__(config)
self.padding_idx = config.pad_token_id # 设置填充索引
self.vocab_size = config.vocab_size # 设置词汇表大小
# 创建词嵌入层,使用 config 中的参数进行初始化
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
# 创建多层解码器层的列表,每层都是 Qwen2DecoderLayer 类的实例
self.layers = nn.ModuleList(
[Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self._attn_implementation = config._attn_implementation # 设置注意力实现方式
self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) # 设置 RMS 归一化器
self.gradient_checkpointing = False # 设置是否使用梯度检查点
# 初始化权重并应用最终处理
self.post_init()
# 返回词嵌入层对象
def get_input_embeddings(self):
return self.embed_tokens
# 定义一个方法,用于设置输入的嵌入向量
def set_input_embeddings(self, value):
# 将输入的嵌入向量赋给对象的embed_tokens属性
self.embed_tokens = value
# 使用装饰器将下面的方法添加文档字符串,文档字符串内容在外部定义为QWEN2_INPUTS_DOCSTRING
@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
# 定义前向传播方法,接受多个输入参数
def forward(
self,
input_ids: torch.LongTensor = None, # 输入的token ids,数据类型为torch中的LongTensor
attention_mask: Optional[torch.Tensor] = None, # 可选的注意力遮罩张量,数据类型为torch.Tensor
position_ids: Optional[torch.LongTensor] = None, # 可选的位置ids张量,数据类型为torch中的LongTensor
past_key_values: Optional[List[torch.FloatTensor]] = None, # 可选的过去的键值张量列表,数据类型为包含torch中的FloatTensor的列表
inputs_embeds: Optional[torch.FloatTensor] = None, # 可选的输入嵌入张量,数据类型为torch.FloatTensor
use_cache: Optional[bool] = None, # 是否使用缓存的标志,数据类型为bool型,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重的标志,数据类型为bool型,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态的标志,数据类型为bool型,可选
return_dict: Optional[bool] = None, # 是否返回字典格式的输出,数据类型为bool型,可选
class Qwen2ForCausalLM(Qwen2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
# 调用父类的初始化方法,传入配置对象
super().__init__(config)
# 使用配置对象初始化 Qwen2Model 模型
self.model = Qwen2Model(config)
# 设置词汇表大小
self.vocab_size = config.vocab_size
# 使用线性层初始化 lm_head,连接隐藏状态和词汇表大小,不带偏置
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 初始化权重并进行最终处理
self.post_init()
def get_input_embeddings(self):
# 返回模型中的 embed_tokens 属性,用作输入嵌入层
return self.model.embed_tokens
def set_input_embeddings(self, value):
# 设置模型中的 embed_tokens 属性为给定的 value,用作输入嵌入层
self.model.embed_tokens = value
def get_output_embeddings(self):
# 返回模型的输出嵌入层 lm_head
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# 设置模型的输出嵌入层 lm_head 为给定的 new_embeddings
self.lm_head = new_embeddings
def set_decoder(self, decoder):
# 设置模型的 decoder 属性为给定的 decoder
self.model = decoder
def get_decoder(self):
# 返回模型的 decoder 属性
return self.model
@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
实现模型的前向传播逻辑,支持文档化字符串和返回字符串替换。
"""
# 具体的前向传播逻辑在模型的实现中处理,这里只是声明
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
"""
准备用于生成的输入,根据需要定制输入。
"""
# 具体的输入准备逻辑在模型的实现中处理,这里只是声明
pass
# 如果传入的 past_key_values 不为空,则进行处理
if past_key_values is not None:
# 如果 past_key_values 是 Cache 类型的实例
if isinstance(past_key_values, Cache):
# 获取缓存的序列长度、已看到的 token 数量和最大缓存长度
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
# 否则从 past_key_values 中获取缓存长度和已看到的 token 数量
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# 仅保留未处理的 token:
# 1 - 如果 attention_mask 的长度超过 input_ids 的长度,则说明部分输入仅作为缓存传递
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - 如果 past_length 小于 input_ids 的长度,则说明 input_ids 包含所有输入 token,可以根据 past_length 舍弃部分 input_ids
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - 否则 (past_length >= input_ids.shape[1]),假设 input_ids 仅包含未处理的 token
# 如果将超过最大缓存长度,需要裁剪输入 attention_mask
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
# 获取 kwargs 中的 position_ids,如果 attention_mask 不为空且 position_ids 为空,则动态生成 position_ids
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# 如果传入 inputs_embeds,则仅在第一代生成步骤中使用它们
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# 更新 model_inputs 字典,包括 position_ids、past_key_values、use_cache 和 attention_mask 等
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
# 定义一个函数 _reorder_cache,用于重新排序缓存中的过去键值对
def _reorder_cache(past_key_values, beam_idx):
# 初始化一个空的重新排序后的过去键值对元组
reordered_past = ()
# 遍历过去键值对列表中的每一层
for layer_past in past_key_values:
# 对每一层的过去状态,按照给定的 beam_idx 重新排序,并转移到相同的设备上
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# 返回重新排序后的过去键值对元组
return reordered_past
"""
Qwen2模型变换器,顶部带有序列分类头(线性层)。
[`Qwen2ForSequenceClassification`] 使用最后一个标记进行分类,类似其他因果模型(例如GPT-2)的做法。
由于它在最后一个标记上进行分类,因此需要知道最后一个标记的位置。如果配置中定义了 `pad_token_id`,它会找到每行中不是填充标记的最后一个标记。如果没有定义 `pad_token_id`,则简单地取每个批次行中的最后一个值。当传递 `inputs_embeds` 而不是 `input_ids` 时,由于无法猜测填充标记,它也会采取同样的做法(取每个批次行中的最后一个值)。
"""
@add_start_docstrings(
"""
`forward` 方法用于执行前向传播。
参数:
- `input_ids` (torch.LongTensor, optional): 输入的token ID序列.
- `attention_mask` (torch.Tensor, optional): 注意力遮罩,指示哪些元素是填充的.
- `position_ids` (torch.LongTensor, optional): 指示每个token的位置ID.
- `past_key_values` (List[torch.FloatTensor], optional): 过去的键值对,用于缓存计算.
- `inputs_embeds` (torch.FloatTensor, optional): 替代 `input_ids` 的嵌入表示.
- `labels` (torch.LongTensor, optional): 分类标签.
- `use_cache` (bool, optional): 是否使用缓存.
- `output_attentions` (bool, optional): 是否输出注意力权重.
- `output_hidden_states` (bool, optional): 是否输出隐藏状态.
- `return_dict` (bool, optional): 是否返回字典格式的输出.
返回:
- 根据模型配置返回不同的输出,包括分类结果、注意力权重等.
""",
QWEN2_FORWARD_DOCSTRING,
)
class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = Qwen2Model(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
"""
返回模型中的输入嵌入层。
"""
return self.model.embed_tokens
def set_input_embeddings(self, value):
"""
设置模型的输入嵌入层。
"""
self.model.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Qwen2模型的前向传播方法。
"""
.\models\qwen2\tokenization_qwen2.py
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
"merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
}
MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其与 Unicode 字符的映射表。避免映射到空白字符或控制字符,以避免 BPE 代码错误。
可逆的 BPE 代码在 Unicode 字符串上工作。这意味着如果要避免 UNK(未知标记),需要在词汇表中包含大量的 Unicode 字符。
例如,对于约 100 亿个标记的数据集,您大约需要包含 5000 个 Unicode 字符才能获得良好的覆盖率。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
返回单词中的符号对集合。
单词表示为符号元组(符号是长度可变的字符串)。
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class Qwen2Tokenizer(PreTrainedTokenizer):
"""
Qwen2 的 tokenizer 类,继承自 PreTrainedTokenizer 类。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = MAX_MODEL_INPUT_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token=None,
eos_token="<|endoftext|>",
pad_token="<|endoftext|>",
clean_up_tokenization_spaces=False,
split_special_tokens=False,
**kwargs,
):
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(pad_token, str)
else pad_token
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_merges = []
with open(merges_file, encoding="utf-8") as merges_handle:
for line in merges_handle:
line = line.strip()
if not line or line.startswith("#"):
continue
bpe_merges.append(tuple(line.split()))
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.pat = re.compile(PRETOKENIZE_REGEX)
if kwargs.get("add_prefix_space", False):
logger.warning_once(
f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
)
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
unk_token=unk_token,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
split_special_tokens=split_special_tokens,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def decode(
self,
token_ids,
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: Optional[bool] = False,
spaces_between_special_tokens: bool = False,
**kwargs,
) -> str:
return super().decode(
token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
spaces_between_special_tokens=spaces_between_special_tokens,
**kwargs,
)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def prepare_for_tokenization(self, text, **kwargs):
text = unicodedata.normalize("NFC", text)
return (text, kwargs)
.\models\qwen2\tokenization_qwen2_fast.py
"""Qwen2 的标记化类。"""
from typing import Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_qwen2 import Qwen2Tokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_file": "tokenizer.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
"merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
"tokenizer_file": {
"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json"
},
}
MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
class Qwen2TokenizerFast(PreTrainedTokenizerFast):
"""
构建一个“快速”的 Qwen2 分词器(基于 HuggingFace 的 *tokenizers* 库)。基于字节级的 Byte-Pair-Encoding。
与 GPT2Tokenizer 类似,此分词器经过训练,将空格视为标记的一部分,因此一个单词在句子开头(没有空格)和其他位置将被编码为不同的标记:
```
>>> from transformers import Qwen2TokenizerFast
>>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
>>> tokenizer("Hello world")["input_ids"]
[9707, 1879]
>>> tokenizer(" Hello world")["input_ids"]
[21927, 1879]
```
这是预期的行为。
此分词器继承自 [`PreTrainedTokenizerFast`],其中包含大多数主要方法。用户应参考该超类以获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`, *optional*):
Path to the vocabulary file.
merges_file (`str`, *optional*):
Path to the merges file.
tokenizer_file (`str`, *optional*):
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. Not applicable to this tokenizer.
bos_token (`str`, *optional`):
The beginning of sequence token. Not applicable for this tokenizer.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
# These variables define certain constants for the tokenizer configuration
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = MAX_MODEL_INPUT_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = Qwen2Tokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token=None,
eos_token="<|endoftext|>",
pad_token="<|endoftext|>",
**kwargs,
):
"""
Initializes a new instance of the Qwen2Tokenizer class.
Args:
vocab_file (str, optional): Path to the vocabulary file.
merges_file (str, optional): Path to the merges file.
tokenizer_file (str, optional): Path to tokenizers file.
unk_token (str, optional, default="<|endoftext|>"): The unknown token.
bos_token (str, optional): The beginning of sequence token.
eos_token (str, optional, default="<|endoftext|>"): The end of sequence token.
pad_token (str, optional, default="<|endoftext|>"): The padding token.
**kwargs: Additional keyword arguments passed to the base class constructor.
"""
# Set bos_token, eos_token, unk_token, and pad_token as AddedToken objects if they are strings
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
if isinstance(pad_token, str)
else pad_token
)
# Call the base class constructor with the provided arguments
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)
# 从 transformers 库中 GPT2TokenizerFast 类的 save_vocabulary 方法复制而来
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 调用内部的 tokenizer 模块的 save 方法,将模型保存到指定的目录中,并使用给定的前缀作为文件名前缀
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
# 返回保存的文件名组成的元组
return tuple(files)
.\models\qwen2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_qwen2": ["QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Qwen2Config"],
"tokenization_qwen2": ["Qwen2Tokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_qwen2"] = [
"Qwen2ForCausalLM",
"Qwen2Model",
"Qwen2PreTrainedModel",
"Qwen2ForSequenceClassification",
]
if TYPE_CHECKING:
from .configuration_qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config
from .tokenization_qwen2 import Qwen2Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_qwen2_fast import Qwen2TokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_qwen2 import (
Qwen2ForCausalLM,
Qwen2ForSequenceClassification,
Qwen2Model,
Qwen2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\rag\configuration_rag.py
from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings
RAG_CONFIG_DOC = r"""
[`RagConfig`] 存储了 *RagModel* 的配置。配置对象继承自 [`PretrainedConfig`],
可以用于控制模型的输出。更多信息请参阅 [`PretrainedConfig`] 的文档。
"""
@add_start_docstrings(RAG_CONFIG_DOC)
class RagConfig(PretrainedConfig):
model_type = "rag"
is_composition = True
def __init__(
self,
vocab_size=None,
is_encoder_decoder=True,
prefix=None,
bos_token_id=None,
pad_token_id=None,
eos_token_id=None,
decoder_start_token_id=None,
title_sep=" / ",
doc_sep=" // ",
n_docs=5,
max_combined_length=300,
retrieval_vector_size=768,
retrieval_batch_size=8,
dataset="wiki_dpr",
dataset_split="train",
index_name="compressed",
index_path=None,
passages_path=None,
use_dummy_dataset=False,
reduce_loss=False,
label_smoothing=0.0,
do_deduplication=True,
exclude_bos_score=False,
do_marginalize=False,
output_retrieved=False,
use_cache=True,
forced_eos_token_id=None,
dataset_revision=None,
**kwargs,
):
super().__init__(
vocab_size=vocab_size,
is_encoder_decoder=is_encoder_decoder,
prefix=prefix,
bos_token_id=bos_token_id,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
):
super().__init__(
bos_token_id=bos_token_id,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
is_encoder_decoder=is_encoder_decoder,
prefix=prefix,
vocab_size=vocab_size,
**kwargs,
)
assert (
"question_encoder" in kwargs and "generator" in kwargs
), "Config has to be initialized with question_encoder and generator config"
question_encoder_config = kwargs.pop("question_encoder")
question_encoder_model_type = question_encoder_config.pop("model_type")
decoder_config = kwargs.pop("generator")
decoder_model_type = decoder_config.pop("model_type")
from ..auto.configuration_auto import AutoConfig
self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)
self.reduce_loss = reduce_loss
self.label_smoothing = label_smoothing
self.exclude_bos_score = exclude_bos_score
self.do_marginalize = do_marginalize
self.title_sep = title_sep
self.doc_sep = doc_sep
self.n_docs = n_docs
self.max_combined_length = max_combined_length
self.dataset = dataset
self.dataset_split = dataset_split
self.index_name = index_name
self.retrieval_vector_size = retrieval_vector_size
self.retrieval_batch_size = retrieval_batch_size
self.passages_path = passages_path
self.index_path = index_path
self.use_dummy_dataset = use_dummy_dataset
self.dataset_revision = dataset_revision
self.output_retrieved = output_retrieved
self.do_deduplication = do_deduplication
self.use_cache = use_cache
if self.forced_eos_token_id is None:
self.forced_eos_token_id = getattr(self.generator, "forced_eos_token_id", None)
@classmethod
def from_question_encoder_generator_configs(
cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
r"""
Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
decoder model configuration.
Returns:
[`EncoderDecoderConfig`]: An instance of a configuration object
"""
return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)
.\models\rag\modeling_rag.py
import copy
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union
import torch
from torch import nn
from ...configuration_utils import PretrainedConfig
from ...generation import BeamSearchScorer, GenerationConfig, LogitsProcessorList, StoppingCriteriaList
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_rag import RagConfig
from .retrieval_rag import RagRetriever
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "RagConfig"
@dataclass
class RetrievAugLMMarginOutput(ModelOutput):
"""
检索增强的边际化模型输出的基类。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
doc_scores: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
retrieved_doc_embeds: Optional[torch.FloatTensor] = None
retrieved_doc_ids: Optional[torch.LongTensor] = None
context_input_ids: Optional[torch.LongTensor] = None
context_attention_mask: Optional[torch.LongTensor] = None
question_encoder_last_hidden_state: Optional[torch.FloatTensor] = None
question_enc_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
question_enc_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_enc_last_hidden_state: Optional[torch.FloatTensor] = None
generator_enc_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_enc_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_dec_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_dec_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class RetrievAugLMOutput(ModelOutput):
"""
检索增强的语言模型输出基类。
"""
logits: torch.FloatTensor = None
doc_scores: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
retrieved_doc_embeds: Optional[torch.FloatTensor] = None
retrieved_doc_ids: Optional[torch.LongTensor] = None
context_input_ids: Optional[torch.LongTensor] = None
context_attention_mask: Optional[torch.LongTensor] = None
question_encoder_last_hidden_state: Optional[torch.FloatTensor] = None
question_enc_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
question_enc_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_enc_last_hidden_state: Optional[torch.FloatTensor] = None
generator_enc_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_enc_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_dec_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_dec_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
generator_cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class RagPreTrainedModel(PreTrainedModel):
r"""
RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
"""
config_class = RagConfig
base_model_prefix = "rag"
@classmethod
def from_pretrained(cls, *args, **kwargs):
kwargs["_fast_init"] = False
return super().from_pretrained(*args, **kwargs)
@classmethod
def from_pretrained_question_encoder_generator(
cls,
question_encoder_pretrained_model_name_or_path: str = None,
generator_pretrained_model_name_or_path: str = None,
retriever: RagRetriever = None,
**kwargs,
):
RAG_START_DOCSTRING = r"""
RAG is a seq2seq model which encapsulates two core components: a question encoder and a generator. During a forward
pass, we encode the input with the question encoder and pass it to the retriever to extract relevant context
documents. The documents are then prepended to the input. Such contextualized inputs is passed to the generator.
The question encoder can be any *autoencoding* model, preferably [`DPRQuestionEncoder`], and the generator can be
any *seq2seq* model, preferably [`BartForConditionalGeneration`].
The model can be initialized with a [`RagRetriever`] for end-to-end generation or used in combination with the
outputs of a retriever in multiple steps---see examples for more details. The model is compatible any
*autoencoding* model as the `question_encoder` and any *seq2seq* model with language model head as the `generator`.
It has been tested with [`DPRQuestionEncoder`] as the `question_encoder` and [`BartForConditionalGeneration`] or
[`T5ForConditionalGeneration`] as the `generator`.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Args:
config ([`RagConfig`]):
模型配置类,包含模型的所有参数。通过配置文件初始化不会加载与模型相关的权重,只加载配置信息。
若要加载模型权重,请查看 [`~PreTrainedModel.from_pretrained`] 方法。
question_encoder ([`PreTrainedModel`]):
编码器模型,与由 `retriever` 封装的 faiss 索引兼容。
generator ([`PreTrainedModel`]):
在 RAG 结构中用作生成器的 seq2seq 模型。
retriever ([`RagRetriever`]):
检索器类,封装了一个 faiss 索引,用于查询获取当前输入的上下文文档。
"""
"""
RAG_FORWARD_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings_to_model_forward(RAG_START_DOCSTRING)
class RagModel(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an question_encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
else:
assert isinstance(config, self.config_class), f"config: {config} has to be of type {self.config_class}"
super().__init__(config)
if question_encoder is None:
from ..auto.modeling_auto import AutoModel
question_encoder = AutoModel.from_config(config.question_encoder)
if generator is None:
from ..auto.modeling_auto import AutoModelForSeq2SeqLM
generator = AutoModelForSeq2SeqLM.from_config(config.generator)
self.retriever = retriever
if self.retriever is not None:
assert isinstance(
retriever, RagRetriever
), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
self.retriever = retriever
self.question_encoder = question_encoder
self.generator = generator
self.ctx_encoder = None
self.context_encoder_training = False
@add_start_docstrings_to_model_forward(
"""
A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
"""
)
@replace_return_docstrings(output_type=RetrievAugLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
doc_scores: Optional[torch.FloatTensor] = None,
context_input_ids: Optional[torch.LongTensor] = None,
context_attention_mask: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_retrieved: Optional[bool] = None,
n_docs: Optional[int] = None,
):
"""
Perform a forward pass through the RAG model.
This method implements specific marginalization for RAG-sequence models.
Args:
input_ids (Optional[torch.LongTensor]): Input tensor of token indices.
attention_mask (Optional[torch.Tensor]): Mask tensor indicating which elements in the input should be attended to.
encoder_outputs (Optional[Tuple[Tuple[torch.FloatTensor]]]): Outputs of the encoder.
decoder_input_ids (Optional[torch.LongTensor]): Input tensor for decoder.
decoder_attention_mask (Optional[torch.BoolTensor]): Mask tensor for decoder attention.
past_key_values (Optional[Tuple[Tuple[torch.FloatTensor]]]): Cached key-values for faster decoding.
doc_scores (Optional[torch.FloatTensor]): Scores indicating relevance of retrieved documents.
context_input_ids (Optional[torch.LongTensor]): Tensor of token indices for context.
context_attention_mask (Optional[torch.LongTensor]): Mask tensor for context attention.
use_cache (Optional[bool]): Whether to use cached values.
output_attentions (Optional[bool]): Whether to output attention weights.
output_hidden_states (Optional[bool]): Whether to output hidden states.
output_retrieved (Optional[bool]): Whether to output retrieved documents.
n_docs (Optional[int]): Number of documents to retrieve.
Returns:
RetrievAugLMOutput: Object containing the model outputs.
"""
pass
RAG_START_DOCSTRING,
class RagSequenceForGeneration(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
super().__init__(config)
self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)
def set_retriever(self, retriever: RagRetriever):
self.rag.retriever = retriever
def set_context_encoder_for_training(self, ctx_encoder: PreTrainedModel):
self.rag.context_encoder_training = True
self.rag.ctx_encoder = ctx_encoder
@add_start_docstrings_to_model_forward(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=RetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
context_input_ids: Optional[torch.LongTensor] = None,
context_attention_mask: Optional[torch.LongTensor] = None,
doc_scores: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_retrieved: Optional[bool] = None,
exclude_bos_score: Optional[bool] = None,
reduce_loss: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
n_docs: Optional[int] = None,
**kwargs,
):
pass
@property
def retriever(self):
return self.rag.retriever
@property
def generator(self):
return self.rag.generator
@property
def question_encoder(self):
return self.rag.question_encoder
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
context_input_ids: Optional[torch.LongTensor] = None,
context_attention_mask: Optional[torch.LongTensor] = None,
doc_scores: Optional[torch.FloatTensor] = None,
do_deduplication: Optional[bool] = None,
num_return_sequences: Optional[int] = None,
num_beams: Optional[int] = None,
n_docs: Optional[int] = None,
**model_kwargs,
):
def get_nll(
self,
seq_logits,
doc_scores,
target,
reduce_loss=False,
epsilon=0.0,
exclude_bos_score=False,
n_docs=None
target = torch.cat(
[target[:, 1:], target.new(target.shape[0], 1).fill_(self.config.generator.pad_token_id)], 1
)
n_docs = n_docs if n_docs is not None else self.config.n_docs
bos_token_id = self.config.bos_token_id or self.config.generator.bos_token_id
use_bos = bos_token_id is not None and target[:, 0].eq(bos_token_id).all()
def _mask_pads(ll, smooth_obj):
pad_mask = target.eq(self.config.generator.pad_token_id)
if pad_mask.any():
ll.masked_fill_(pad_mask, 0.0)
smooth_obj.masked_fill_(pad_mask, 0.0)
return ll.squeeze(-1), smooth_obj.squeeze(-1)
seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view(
seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1)
)
doc_logprobs = nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
first_token_scores = seq_logprobs[:, :, :1, :]
second_token_scores = seq_logprobs[:, :, 1:2, :]
remainder = seq_logprobs[:, :, 2:, :]
rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
assert target.dim() == rag_logprobs.dim()
ll = rag_logprobs.gather(dim=-1, index=target)
smooth_obj = rag_logprobs.sum(dim=-1, keepdim=True)
ll, smooth_obj = _mask_pads(ll, smooth_obj)
ll = ll[:, :, 1:].sum(2) if exclude_bos_score and use_bos else ll.sum(2)
smooth_obj = smooth_obj.sum(2)
ll = ll.logsumexp(1)
smooth_obj = smooth_obj.logsumexp(1)
nll_loss = -ll
smooth_loss = -smooth_obj
if reduce_loss:
nll_loss = nll_loss.sum()
smooth_loss = smooth_loss.sum()
eps_i = epsilon / rag_logprobs.size(-1)
loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
return loss
@staticmethod
def _cat_and_pad(tensors, pad_token_id):
output = (
tensors[0].new(sum([t.shape[0] for t in tensors]), max([t.shape[1] for t in tensors])).fill_(pad_token_id)
)
ind = 0
for t in tensors:
output[ind : ind + t.shape[0], : t.shape[1]] = t
ind += t.shape[0]
return output
"""
一个实现了RAG-token模型的类。在前向传播中执行了RAG-token特定的边缘化操作。
"""
@add_start_docstrings_to_model_forward(
"""
A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
class RagTokenForGeneration(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
super().__init__(config)
self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)
def set_retriever(self, retriever: RagRetriever):
self.rag.retriever = retriever
def set_context_encoder_for_training(self, ctx_encoder: PreTrainedModel):
self.rag.context_encoder_training = True
self.rag.ctx_encoder = ctx_encoder
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
doc_scores=None,
n_docs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"doc_scores": doc_scores,
"context_attention_mask": attention_mask,
"decoder_input_ids": decoder_input_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"do_marginalize": True,
"n_docs": n_docs,
}
@property
def retriever(self):
return self.rag.retriever
@property
def generator(self):
return self.rag.generator
@property
def question_encoder(self):
return self.rag.question_encoder
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
"""Reorders cache for generation. BART-inspired but we need to take care of the extra dimension for docs"""
def _reorder_stacked(hidden_states, new_order):
n_docs = hidden_states.shape[0] // new_order.shape[0]
hidden_states = hidden_states.view(-1, n_docs, *hidden_states.shape[1:])
hidden_states = hidden_states.index_select(0, new_order)
result = hidden_states.view(-1, *hidden_states.shape[2:])
return result
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(_reorder_stacked(past_state, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
def marginalize(self, seq_logits, doc_scores, n_docs=None):
n_docs = n_docs if n_docs is not None else self.config.n_docs
seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view(
seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1)
)
doc_logprobs = torch.log_softmax(doc_scores, dim=1)
log_prob_sum = seq_logprobs + doc_logprobs.unsqueeze(-1).unsqueeze(-1)
return torch.logsumexp(log_prob_sum, dim=1)
@add_start_docstrings_to_model_forward(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=RetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
context_input_ids: Optional[torch.LongTensor] = None,
context_attention_mask: Optional[torch.LongTensor] = None,
doc_scores: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_retrieved: Optional[bool] = None,
do_marginalize: Optional[bool] = None,
reduce_loss: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
n_docs: Optional[int] = None,
**kwargs,
):
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
context_input_ids: Optional[torch.LongTensor] = None,
context_attention_mask: Optional[torch.LongTensor] = None,
doc_scores: Optional[torch.FloatTensor] = None,
n_docs: Optional[int] = None,
generation_config: Optional[GenerationConfig] = None,
prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]] = None,
logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
**kwargs,
):
"""
Generate function for the model to generate text outputs based on given inputs.
"""
def get_input_embeddings(self):
"""
Retrieve input embeddings from the RAG generator.
"""
return self.rag.generator.get_input_embeddings()
def get_output_embeddings(self):
"""
Retrieve output embeddings from the RAG generator.
"""
return self.rag.generator.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
"""
Set new output embeddings for the RAG generator.
"""
return self.rag.generator.set_output_embeddings(new_embeddings)
def shift_tokens_right(self, input_ids, start_token_id=None):
"""
Shift input ids one token to the right, and pad with start_token_id.
"""
if start_token_id is None:
start_token_id = self.config.decoder_start_token_id
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = start_token_id
return shifted_input_ids
def get_nll(self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, n_docs=None):
"""
Calculate negative log likelihood loss for sequence logits and document scores.
"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
target = torch.cat(
[target[:, 1:], target.new(target.shape[0], 1).fill_(self.config.generator.pad_token_id)], 1
)
def _mask_pads(ll, smooth_obj):
"""
Mask padding tokens in loss calculations.
"""
pad_mask = target.eq(self.config.generator.pad_token_id)
if pad_mask.any():
ll.masked_fill_(pad_mask, 0.0)
smooth_obj.masked_fill_(pad_mask, 0.0)
return ll.squeeze(-1), smooth_obj.squeeze(-1)
rag_logprobs = self.marginalize(seq_logits, doc_scores, n_docs)
target = target.unsqueeze(-1)
assert target.dim() == rag_logprobs.dim()
ll = rag_logprobs.gather(dim=-1, index=target)
smooth_obj = rag_logprobs.sum(dim=-1, keepdim=True)
ll, smooth_obj = _mask_pads(ll, smooth_obj)
ll = ll.sum(1)
smooth_obj = smooth_obj.sum(1)
nll_loss = -ll
smooth_loss = -smooth_obj
if reduce_loss:
nll_loss = nll_loss.sum()
smooth_loss = smooth_loss.sum()
eps_i = epsilon / rag_logprobs.size(-1)
loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
return loss
.\models\rag\modeling_tf_rag.py
"""TFRAG 模型实现。"""
from __future__ import annotations
import copy
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...configuration_utils import PretrainedConfig
from ...generation import TFLogitsProcessorList
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
keras,
shape_list,
unpack_inputs,
)
from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_rag import RagConfig
from .retrieval_rag import RagRetriever
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "RagConfig"
@dataclass
class TFRetrievAugLMMarginOutput(ModelOutput):
"""
用于检索增强的边缘化模型输出的基类。
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
doc_scores: tf.Tensor | None = None
retrieved_doc_embeds: tf.Tensor | None = None
retrieved_doc_ids: tf.Tensor | None = None
context_input_ids: tf.Tensor | None = None
context_attention_mask: tf.Tensor | None = None
question_encoder_last_hidden_state: tf.Tensor | None = None
question_enc_hidden_states: Tuple[tf.Tensor, ...] | None = None
question_enc_attentions: Tuple[tf.Tensor, ...] | None = None
generator_enc_last_hidden_state: tf.Tensor | None = None
generator_enc_hidden_states: Tuple[tf.Tensor, ...] | None = None
generator_enc_attentions: Tuple[tf.Tensor, ...] | None = None
generator_dec_hidden_states: Tuple[tf.Tensor, ...] | None = None
generator_dec_attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFRetrievAugLMOutput(ModelOutput):
"""
"""
logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
doc_scores: tf.Tensor | None = None
retrieved_doc_embeds: tf.Tensor | None = None
retrieved_doc_ids: tf.Tensor | None = None
context_input_ids: tf.Tensor | None = None
context_attention_mask: tf.Tensor | None = None
question_encoder_last_hidden_state: tf.Tensor | None = None
question_enc_hidden_states: Tuple[tf.Tensor, ...] | None = None
question_enc_attentions: Tuple[tf.Tensor, ...] | None = None
generator_enc_last_hidden_state: tf.Tensor | None = None
generator_enc_hidden_states: Tuple[tf.Tensor, ...] | None = None
generator_enc_attentions: Tuple[tf.Tensor, ...] | None = None
generator_dec_hidden_states: Tuple[tf.Tensor, ...] | None = None
generator_dec_attentions: Tuple[tf.Tensor, ...] | None = None
class TFRagPreTrainedModel(TFPreTrainedModel):
r"""
RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
"""
config_class = RagConfig
base_model_prefix = "rag"
_keys_to_ignore_on_load_missing = [r"position_ids"]
@classmethod
def from_pretrained_question_encoder_generator(
cls,
question_encoder_pretrained_model_name_or_path: str = None,
generator_pretrained_model_name_or_path: str = None,
retriever: RagRetriever = None,
*model_args,
**kwargs,
RAG_START_DOCSTRING = r"""
Args:
config ([`RagConfig`]):
# 模型配置类,包含模型的所有参数。使用配置文件初始化时不会加载模型的权重,只加载配置信息。
# 若要加载模型权重,请参考 [`~TFPreTrainedModel.from_pretrained`] 方法。
question_encoder ([`TFPreTrainedModel`]):
# 编码器模型,与由 `retriever` 封装的 faiss 索引兼容。
generator ([`TFPreTrainedModel`]):
# 在 RAG 架构中用作生成器的 seq2seq 模型。
retriever ([`RagRetriever`]):
# 检索器类,封装了一个 faiss 索引,用于获取当前输入的上下文文档。
"""
"""
RAG_FORWARD_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings_to_model_forward(RAG_START_DOCSTRING)
class TFRagModel(TFRagPreTrainedModel):
load_weight_prefix = "tf_rag_model_1"
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[TFPreTrainedModel] = None,
generator: Optional[TFPreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
load_weight_prefix: Optional[str] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an question_encoder and a generator has to be provided."
if config is None:
# 从问题编码器和生成器的配置中创建一个 RagConfig 对象
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
else:
assert isinstance(config, self.config_class), f"config: {config} has to be of type {self.config_class}"
super().__init__(config, **kwargs)
if question_encoder is None:
# 如果没有提供问题编码器,则使用自动加载的 TFAutoModel 创建一个
from ..auto.modeling_tf_auto import TFAutoModel
question_encoder = TFAutoModel.from_config(config.question_encoder, name="question_encoder")
if generator is None:
# 如果没有提供生成器,则使用自动加载的 TFAutoModelForSeq2SeqLM 创建一个
from ..auto.modeling_tf_auto import TFAutoModelForSeq2SeqLM
load_weight_prefix = load_weight_prefix if load_weight_prefix is not None else self.load_weight_prefix
generator = TFAutoModelForSeq2SeqLM.from_config(
config.generator, name="generator", load_weight_prefix=load_weight_prefix + "/generator"
)
self.retriever = retriever
if self.retriever is not None:
# 如果提供了检索器,确保它是 RagRetriever 类型的对象
assert isinstance(
retriever, RagRetriever
), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
self.retriever = retriever
self.question_encoder = question_encoder
self.generator = generator
def set_retriever(self, retriever: RagRetriever):
# 设置检索器
self.retriever = retriever
@unpack_inputs
@add_start_docstrings_to_model_forward(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFRetrievAugLMOutput, config_class=_CONFIG_FOR_DOC)
# 定义类方法 `call`,用于模型调用和推理
def call(
self,
input_ids: TFModelInputType | None = None, # 输入序列的标识符(TensorFlow 模型输入类型或 None)
attention_mask: np.ndarray | tf.Tensor | None = None, # 注意力掩码,用于指定哪些位置的输入要被关注
encoder_outputs: np.ndarray | tf.Tensor | None = None, # 编码器的输出,可能是 numpy 数组或 TensorFlow 张量
decoder_input_ids: np.ndarray | tf.Tensor | None = None, # 解码器的输入标识符序列,可能是 numpy 数组或 TensorFlow 张量
decoder_attention_mask: np.ndarray | tf.Tensor | None = None, # 解码器的注意力掩码
past_key_values: Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] | None = None, # 过去的键值对,用于 Transformer 模型的存储和重用
doc_scores: np.ndarray | tf.Tensor | None = None, # 文档评分,可能是 numpy 数组或 TensorFlow 张量
context_input_ids: np.ndarray | tf.Tensor | None = None, # 上下文输入的标识符序列
context_attention_mask: np.ndarray | tf.Tensor | None = None, # 上下文输入的注意力掩码
use_cache: bool | None = None, # 是否使用缓存来加速解码过程
output_attentions: bool | None = None, # 是否输出注意力权重
output_hidden_states: bool | None = None, # 是否输出隐藏状态
output_retrieved: bool | None = None, # 是否输出检索到的信息(如检索式推理)
n_docs: int | None = None, # 文档数量
return_dict: bool | None = None, # 是否返回字典格式的输出
training: bool = False, # 是否在训练模式下
**kwargs, # 其他关键字参数,用于接收任何未指定的额外参数
):
# 如果模型已经构建好,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 使用 TensorFlow 的名称作用域,构建生成器部分的模型
with tf.name_scope(self.generator.name):
self.generator.build(None)
# 使用 TensorFlow 的名称作用域,构建问题编码器部分的模型
with tf.name_scope(self.question_encoder.name):
self.question_encoder.build(None)
@add_start_docstrings_to_model_forward(
"""
A TF RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss):
load_weight_prefix = "tf_rag_token_for_generation_1/rag"
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[TFPreTrainedModel] = None,
generator: Optional[TFPreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
if config is None:
# 如果未提供配置,根据提供的问题编码器和生成器配置生成一个新的RagConfig对象
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
super().__init__(config)
# 实例化RAG模型
self.rag = TFRagModel(
config=config,
question_encoder=question_encoder,
generator=generator,
retriever=retriever,
load_weight_prefix=self.load_weight_prefix,
name="rag",
)
def set_retriever(self, retriever: RagRetriever):
# 设置RAG模型的检索器
self.rag.retriever = retriever
# 从 https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_bart.py 改编而来
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
doc_scores=None,
n_docs=None,
**kwargs,
):
if past_key_values is not None:
# 如果定义了过去的键值,只使用最后一个decoder_input_ids
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"doc_scores": doc_scores,
"context_attention_mask": attention_mask,
"decoder_input_ids": decoder_input_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"do_marginalize": True,
"n_docs": n_docs,
}
@property
def retriever(self):
# 返回RAG模型的检索器
return self.rag.retriever
@property
def generator(self):
# 返回RAG模型的生成器
return self.rag.generator
@property
def question_encoder(self):
# 返回RAG模型的问题编码器
return self.rag.question_encoder
@staticmethod
def _gather_beams(nested, beam_indices, batch_axis=0):
"""
RAG-specific `_gather_beams`: gathers the beam slices indexed by beam_indices into new beam array. If the
nested tensor has a shape mismatch with the beam indices, then it means it is the cache. In that case, isolates
and takes care of the extra dimension for ndocs.
"""
def gather_fn(tensor):
# 判断是否为 RAG 的缓存数据
is_rag_cache = tensor.shape[0] != beam_indices.shape[0]
if is_rag_cache:
# 如果是缓存数据,则计算每个文档的数量和批次大小
n_docs = tensor.shape[0] // beam_indices.shape[0]
batch_size = beam_indices.shape[0]
# 重塑张量为 (批次大小, num beams, n_docs, ...) 的格式,这是 RAG 期望的缓存格式
tensor = tf.reshape(tensor, (batch_size, -1, n_docs, *tensor.shape[2:]))
# 使用给定的索引从张量中收集数据
gathered_tensor = tf.gather(params=tensor, indices=beam_indices, axis=1, batch_dims=1)
if is_rag_cache:
# 如果是缓存数据,则重新塑造成 beam search 期望的形状
gathered_tensor = tf.reshape(gathered_tensor, (batch_size * n_docs, -1, *gathered_tensor.shape[3:]))
return gathered_tensor
# 对嵌套结构应用 gather_fn 函数,用于收集索引的数据
return tf.nest.map_structure(gather_fn, nested)
def marginalize(self, seq_logits, doc_scores, n_docs=None):
n_docs = n_docs if n_docs is not None else self.config.n_docs
# RAG-token marginalization
# 对序列 logits 应用 log_softmax,在指定轴上进行归一化
seq_logprobs = tf.nn.log_softmax(seq_logits, axis=-1)
# 重新塑造成 [batch_size // n_docs, n_docs, -1, seq_logits.shape[-1]] 的形状
seq_logprobs = tf.reshape(seq_logprobs, [seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.shape[-1]])
# 对文档分数应用 log_softmax,在第 1 轴上进行归一化
doc_logprobs = tf.nn.log_softmax(doc_scores, axis=1)
# 在最后添加两个维度
doc_logprobs = tf.expand_dims(doc_logprobs, axis=-1)
doc_logprobs = tf.expand_dims(doc_logprobs, axis=-1) # 两次
# 计算序列和文档 log-probabilities 的总和
log_prob_sum = seq_logprobs + doc_logprobs
# 在第 1 轴上计算 logsumexp
return tf.reduce_logsumexp(log_prob_sum, axis=1)
@unpack_inputs
@add_start_docstrings_to_model_forward(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFRetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
# 定义一个方法 `call`,用于模型调用。接受多个输入参数,包括输入的编码 `input_ids`,
# 注意力掩码 `attention_mask`,解码器的输入编码 `decoder_input_ids` 和注意力掩码 `decoder_attention_mask`,
# 编码器的输出 `encoder_outputs`,过去的键值对 `past_key_values`,文档分数 `doc_scores`,
# 上下文输入编码 `context_input_ids` 和上下文注意力掩码 `context_attention_mask` 等等。
# 其他参数包括是否使用缓存 `use_cache`,是否输出注意力 `output_attentions` 和隐藏状态 `output_hidden_states`,
# 是否输出检索结果 `output_retrieved`,文档数量 `n_docs`,是否边际化 `do_marginalize`,
# 标签 `labels`,是否减少损失 `reduce_loss`,是否返回字典 `return_dict`,
# 是否处于训练模式 `training` 等等。
# 方法允许传递任意其他关键字参数 `kwargs`。
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] | None = None,
doc_scores: np.ndarray | tf.Tensor | None = None,
context_input_ids: np.ndarray | tf.Tensor | None = None,
context_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
output_retrieved: bool | None = None,
n_docs: int | None = None,
do_marginalize: bool | None = None,
labels: np.ndarray | tf.Tensor | None = None,
reduce_loss: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
**kwargs, # needs kwargs for generation
):
pass # 方法主体未提供
# 定义一个生成方法 `generate`,用于生成文本。接受多个输入参数,包括输入的编码 `input_ids`,
# 注意力掩码 `attention_mask`,上下文输入编码 `context_input_ids` 和上下文注意力掩码 `context_attention_mask`,
# 文档分数 `doc_scores`,文档数量 `n_docs`,生成配置 `generation_config`,
# 对 logits 进行处理的处理器 `logits_processor`,以及其他关键字参数 `kwargs`。
def generate(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
context_input_ids=None,
context_attention_mask=None,
doc_scores=None,
n_docs=None,
generation_config=None,
logits_processor=TFLogitsProcessorList(),
**kwargs,
):
pass # 方法主体未提供
# 返回 RAG 模型中生成器的输入嵌入
def get_input_embeddings(self):
return self.rag.generator.get_input_embeddings()
# 返回 RAG 模型中生成器的输出嵌入
def get_output_embeddings(self):
return self.rag.generator.get_output_embeddings()
# 从 tf_t5 和 tf_bart 的 _shift_right 方法进行适配
# 该方法可能实现了类似于将序列向右移动一个位置的功能
# 但具体实现细节不在此处提供
# 适配自 tf_t5 和 tf_bart 的 _shift_right 方法
# 将输入的 token ids 向右移动一位,并用 start_token_id 进行填充
def shift_tokens_right(self, input_ids, start_token_id=None):
"""Shift input ids one token to the right, and pad with start_token_id"""
if start_token_id is None:
start_token_id = self.generator.config.decoder_start_token_id
assert start_token_id is not None, (
"self.generator.config.decoder_start_token_id has to be defined. In Rag we commonly use Bart as"
" generator, see Bart docs for more information"
)
pad_token_id = self.generator.config.pad_token_id
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# 创建一个形状为 (batch_size, 1) 的张量,用 start_token_id 填充
start_tokens = tf.fill((shape_list(input_ids)[0], 1), tf.cast(start_token_id, input_ids.dtype))
# 将 start_tokens 与 input_ids 的前 n-1 列拼接起来,实现向右移动一位的效果
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
# 将 labels 中可能存在的 -100 值替换为 pad_token_id
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.fill(shape_list(shifted_input_ids), tf.cast(pad_token_id, input_ids.dtype)),
shifted_input_ids,
)
# 使用断言确保 `labels` 中只有正值和 -100
assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, shifted_input_ids.dtype))
# 确保断言操作被调用,通过在结果中包裹一个 identity 操作
with tf.control_dependencies([assert_gte0]):
shifted_input_ids = tf.identity(shifted_input_ids)
return shifted_input_ids
# nll 代表 'negative log likelihood'
def get_nll(self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, n_docs=None):
n_docs = n_docs if n_docs is not None else self.config.n_docs
# 将 tokens 向左移动(来自原始的 PyTorch 版本)
# 将 target 的每一行向左移动一个 token,并用 self.config.generator.pad_token_id 进行填充
target = tf.concat(
[target[:, 1:], tf.fill([target.shape[0], 1], tf.cast(self.config.generator.pad_token_id, target.dtype))],
axis=1,
)
# 对 seq_logits 和 doc_scores 进行边缘化,得到 rag_logprobs
rag_logprobs = self.marginalize(seq_logits, doc_scores, n_docs)
# 计算损失,匹配 logits 版本,reduce_loss 参数决定是否减少损失
loss = self.hf_compute_loss(target, rag_logprobs, from_logits=True, reduce_loss=reduce_loss)
return loss
# 采用 modeling_tf_bart,并添加 smooth_loss 以匹配 PyTorch 版本
def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, reduce_loss=False):
"""计算损失函数,忽略填充标记的交叉熵损失"""
# Matt: 该损失函数目前无法与XLA兼容,但它执行了一些非常奇怪的操作,
# 我不太确定如何转换它。
# 这里执行了一些非常奇怪的操作,我不太确定如何转换它。
# 定义损失函数为稀疏分类交叉熵损失,用于处理输出为 logits 的情况
loss_fn = keras.losses.SparseCategoricalCrossentropy(
from_logits=True, # 输出是否为 logits
reduction=keras.losses.Reduction.SUM, # 损失函数如何进行汇总
)
if from_logits is False: # 如果输出不是 logits,则将其转换为 logits
eps = 1e-9
y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps)
y_pred = tf.math.log(y_pred)
logits = y_pred
melted_labels = tf.reshape(labels, (-1,))
# 找出非填充标记的位置
active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id)
# 根据非填充标记的位置,筛选出有效的 logits 和 labels
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss)
labels = tf.boolean_mask(melted_labels, active_loss)
# 计算交叉熵损失
nll_loss = loss_fn(labels, reduced_logits)
# 计算平滑损失
smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1)
smooth_loss = tf.reduce_sum(smooth_loss) # 类似于 torch 的 sum 和 squeeze 操作
eps_i = smooth_epsilon / reduced_logits.shape[-1]
# 计算最终损失函数,结合交叉熵损失和平滑损失
loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
return loss
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
self.built = True
# 如果存在 rag 属性,则在 rag 的命名空间下构建模型
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
# 使用装饰器为模型的 call 方法添加文档字符串,描述其功能为执行RAG-sequence模型的前向传播过程
@add_start_docstrings_to_model_forward(
"""
A TF RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss):
# 加载权重的前缀
load_weight_prefix = "tf_rag_sequence_for_generation_1/rag"
# 初始化方法
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[TFPreTrainedModel] = None,
generator: Optional[TFPreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
# 断言确保提供了配置或者问题编码器与生成器
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
# 如果未提供配置,则从问题编码器和生成器配置中创建 RagConfig 对象
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
# 调用父类初始化方法
super().__init__(config)
# 实例化模型
self.rag = TFRagModel(
config=config,
question_encoder=question_encoder,
generator=generator,
retriever=retriever,
load_weight_prefix=self.load_weight_prefix,
name="rag",
)
# 设置检索器的方法
def set_retriever(self, retriever: RagRetriever):
self.rag.retriever = retriever
# 检索器属性的 getter 方法
@property
def retriever(self):
return self.rag.retriever
# 生成器属性的 getter 方法
@property
def generator(self):
return self.rag.generator
# 问题编码器属性的 getter 方法
@property
def question_encoder(self):
return self.rag.question_encoder
# 装饰器为 call 方法添加文档字符串,描述其输入输出及功能
@unpack_inputs
@add_start_docstrings_to_model_forward(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFRetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
doc_scores: np.ndarray | tf.Tensor | None = None,
context_input_ids: np.ndarray | tf.Tensor | None = None,
context_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_retrieved: Optional[bool] = None,
n_docs: Optional[int] = None,
exclude_bos_score: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
reduce_loss: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs, # needs kwargs for generation
):
# 实现模型的前向传播
pass # The actual implementation details would follow here, but are not provided in the snippet
# 定义一个方法 get_nll,接受一些参数:seq_logits 是序列的逻辑回归输出,doc_scores 是文档得分,target 是目标值
# reduce_loss 控制是否减少损失,默认为 False;epsilon 是一个小数,排除 BOS 得分,默认为 False
# n_docs 是文档数量,默认为 None
def get_nll(
self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, exclude_bos_score=False, n_docs=None
):
pass # 这里只是定义方法的结构,没有具体实现
# 定义一个生成方法 generate,接受多个输入参数:input_ids 是模型输入的 token IDs
# attention_mask 是注意力掩码,context_input_ids 和 context_attention_mask 是上下文相关的输入
# doc_scores 是文档得分,do_deduplication 控制是否去重,默认为 True;num_return_sequences 控制返回的序列数量,默认为 1
# num_beams 控制束搜索的数量,默认为 1,n_docs 是文档数量,默认为 None
def generate(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
context_input_ids=None,
context_attention_mask=None,
doc_scores=None,
do_deduplication=None, # 默认为 True
num_return_sequences=None, # 默认为 1
num_beams=None, # 默认为 1
n_docs=None,
**model_kwargs,
):
pass # 这里只是定义方法的结构,没有具体实现
# 静态方法 _cat_and_pad 用于生成方法 generate 中的输入张量列表的拼接和填充
@staticmethod
def _cat_and_pad(tensors, pad_token_id):
# used by generate(): tensors is a (batched) list of (candidates, len); len is varied across batch
# 方法 generate 的辅助方法,tensors 是一个 (批量化的) 列表,每个元素是 (候选项,长度);长度在批次中可能不同
# 初始化一个填充后的张量,形状为 (所有候选项总数,最大候选项长度)
new_shape = sum([t.shape[0] for t in tensors]), max([t.shape[1] for t in tensors])
output = tf.fill(new_shape, pad_token_id)
# 使用 tf.Variable 创建可变张量,因为普通张量不支持切片赋值
output = tf.Variable(output)
# 逐个赋值每个输入张量的内容到 output 中相应位置
ind = 0
for t in tensors:
output[ind : ind + t.shape[0], : t.shape[1]].assign(t)
ind += t.shape[0]
# 转换回普通张量并返回,确保类型与第一个张量的元素类型一致
output = tf.convert_to_tensor(output)
return tf.cast(output, tensors[0][0].dtype)
# 方法 build 用于构建对象,初始化对象的属性和状态
def build(self, input_shape=None):
if self.built:
return # 如果已经构建过,直接返回
self.built = True # 标记对象已经构建
# 如果对象具有属性 rag,则在 rag 的命名空间下构建对象
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
.\models\rag\retrieval_rag.py
"""
RAG Retriever model implementation.
"""
import os
import pickle
import time
from typing import Iterable, List, Optional, Tuple
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding
from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends, strtobool
from .configuration_rag import RagConfig
from .tokenization_rag import RagTokenizer
if is_datasets_available():
from datasets import Dataset, load_dataset, load_from_disk
if is_faiss_available():
import faiss
logger = logging.get_logger(__name__)
LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/"
class Index:
"""
A base class for the Indices encapsulated by the [`RagRetriever`].
"""
def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
"""
Returns a list of dictionaries, containing titles and text of the retrieved documents.
Args:
doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
A tensor of document indices.
"""
raise NotImplementedError
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
"""
For each query in the batch, retrieves `n_docs` documents.
Args:
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
An array of query vectors.
n_docs (`int`):
The number of docs retrieved per query.
Returns:
`np.ndarray` of shape `(batch_size, n_docs)`: A tensor of indices of retrieved documents.
`np.ndarray` of shape `(batch_size, vector_size)`: A tensor of vector representations of retrieved documents.
"""
raise NotImplementedError
def is_initialized(self):
"""
Returns `True` if index is already initialized.
"""
raise NotImplementedError
def init_index(self):
"""
A function responsible for loading the index into memory. Should be called only once per training run of a RAG
model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
the index.
"""
raise NotImplementedError
"""
一个可以从使用 https://github.com/facebookresearch/DPR 构建的文件中反序列化的索引。我们使用该仓库中指定的默认 faiss 索引参数。
Args:
vector_size (`int`):
索引向量的维度。
index_path (`str`):
包含与 [`~models.rag.retrieval_rag.LegacyIndex`] 兼容的索引文件的 *目录* 路径。
"""
INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
PASSAGE_FILENAME = "psgs_w100.tsv.pkl"
def __init__(self, vector_size, index_path):
self.index_id_to_db_id = []
self.index_path = index_path
self.passages = self._load_passages()
self.vector_size = vector_size
self.index = None
self._index_initialized = False
def _resolve_path(self, index_path, filename):
is_local = os.path.isdir(index_path)
try:
resolved_archive_file = cached_file(index_path, filename)
except EnvironmentError:
msg = (
f"Can't load '{filename}'. Make sure that:\n\n"
f"- '{index_path}' is a correct remote path to a directory containing a file named {filename}\n\n"
f"- or '{index_path}' is the correct path to a directory containing a file named {filename}.\n\n"
)
raise EnvironmentError(msg)
if is_local:
logger.info(f"loading file {resolved_archive_file}")
else:
logger.info(f"loading file {filename} from cache at {resolved_archive_file}")
return resolved_archive_file
def _load_passages(self):
logger.info(f"Loading passages from {self.index_path}")
passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
"malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
"that could have been tampered with. If you already verified the pickle data and decided to use it, "
"you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
)
with open(passages_path, "rb") as passages_file:
passages = pickle.load(passages_file)
return passages
logger.info(f"Loading index from {self.index_path}")
resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
self.index = faiss.read_index(resolved_index_path)
resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
"malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
"that could have been tampered with. If you already verified the pickle data and decided to use it, "
"you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
)
with open(resolved_meta_path, "rb") as metadata_file:
self.index_id_to_db_id = pickle.load(metadata_file)
assert (
len(self.index_id_to_db_id) == self.index.ntotal
), "Deserialized index_id_to_db_id should match faiss index size"
class HFIndexBase(Index):
def __init__(self, vector_size, dataset, index_initialized=False):
self.vector_size = vector_size
self.dataset = dataset
self._index_initialized = index_initialized
self._check_dataset_format(with_index=index_initialized)
dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True, dtype="float32")
def _check_dataset_format(self, with_index: bool):
if not isinstance(self.dataset, Dataset):
raise ValueError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}")
if len({"title", "text", "embeddings"} - set(self.dataset.column_names)) > 0:
raise ValueError(
"Dataset should be a dataset with the following columns: "
"title (str), text (str) and embeddings (arrays of dimension vector_size), "
f"but got columns {self.dataset.column_names}"
)
if with_index and "embeddings" not in self.dataset.list_indexes():
raise ValueError(
"Missing faiss index in the dataset. Make sure you called `dataset.add_faiss_index` to compute it "
"or `dataset.load_faiss_index` to load one from the disk."
)
def init_index(self):
raise NotImplementedError()
def is_initialized(self):
return self._index_initialized
def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
return [self.dataset[doc_ids[i].tolist()] for i in range(doc_ids.shape[0])]
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
_, ids = self.dataset.search_batch("embeddings", question_hidden_states, n_docs)
docs = [self.dataset[[i for i in indices if i >= 0]] for indices in ids]
vectors = [doc["embeddings"] for doc in docs]
for i in range(len(vectors)):
if len(vectors[i]) < n_docs:
vectors[i] = np.vstack([vectors[i], np.zeros((n_docs - len(vectors[i]), self.vector_size))])
return np.array(ids), np.array(vectors)
class CanonicalHFIndex(HFIndexBase):
"""
A wrapper around an instance of [`~datasets.Datasets`]. If `index_path` is set to `None`, we load the pre-computed
index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from the indicated path
on disk.
"""
"""
Args:
vector_size (`int`): the dimension of the passages embeddings used by the index
dataset_name (`str`, optional, defaults to `wiki_dpr`):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with `datasets.list_datasets()`).
dataset_split (`str`, optional, defaults to `train`):
Which split of the `dataset` to load.
index_name (`str`, optional, defaults to `train`):
The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be saved
under this name.
index_path (`str`, optional, defaults to `None`):
The path to the serialized faiss index on disk.
use_dummy_dataset (`bool`, optional, defaults to `False`):
If True, use the dummy configuration of the dataset for tests.
"""
def __init__(
self,
vector_size: int,
dataset_name: str = "wiki_dpr",
dataset_split: str = "train",
index_name: Optional[str] = None,
index_path: Optional[str] = None,
use_dummy_dataset: bool = False,
dataset_revision=None,
):
if int(index_path is None) + int(index_name is None) != 1:
raise ValueError("Please provide `index_name` or `index_path`.")
self.dataset_name = dataset_name
self.dataset_split = dataset_split
self.index_name = index_name
self.index_path = index_path
self.use_dummy_dataset = use_dummy_dataset
self.dataset_revision = dataset_revision
logger.info(f"Loading passages from {self.dataset_name}")
dataset = load_dataset(
self.dataset_name,
with_index=False,
split=self.dataset_split,
dummy=self.use_dummy_dataset,
revision=self.dataset_revision,
)
super().__init__(vector_size, dataset, index_initialized=False)
def init_index(self):
if self.index_path is not None:
logger.info(f"Loading index from {self.index_path}")
self.dataset.load_faiss_index("embeddings", file=self.index_path)
else:
logger.info(f"Loading index from {self.dataset_name} with index name {self.index_name}")
self.dataset = load_dataset(
self.dataset_name,
with_embeddings=True,
with_index=True,
split=self.dataset_split,
index_name=self.index_name,
dummy=self.use_dummy_dataset,
revision=self.dataset_revision,
)
self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
self._index_initialized = True
class CustomHFIndex(HFIndexBase):
"""
A wrapper around an instance of [`datasets.Datasets`]. The dataset and the index are both loaded from the
indicated paths on disk.
Args:
vector_size (`int`): the dimension of the passages embeddings used by the index
dataset_path (`str`):
The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
embeddings (arrays of dimension vector_size)
index_path (`str`)
The path to the serialized faiss index on disk.
"""
def __init__(self, vector_size: int, dataset, index_path=None):
super().__init__(vector_size, dataset, index_initialized=index_path is None)
self.index_path = index_path
@classmethod
def load_from_disk(cls, vector_size, dataset_path, index_path):
logger.info(f"Loading passages from {dataset_path}")
if dataset_path is None or index_path is None:
raise ValueError(
"Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` "
"and `dataset.get_index('embeddings').save(index_path)`."
)
dataset = load_from_disk(dataset_path)
return cls(vector_size=vector_size, dataset=dataset, index_path=index_path)
def init_index(self):
if not self.is_initialized():
logger.info(f"Loading index from {self.index_path}")
self.dataset.load_faiss_index("embeddings", file=self.index_path)
self._index_initialized = True
class RagRetriever:
def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
self._init_retrieval = init_retrieval
requires_backends(self, ["datasets", "faiss"])
super().__init__()
self.index = index or self._build_index(config)
self.generator_tokenizer = generator_tokenizer
self.question_encoder_tokenizer = question_encoder_tokenizer
self.n_docs = config.n_docs
self.batch_size = config.retrieval_batch_size
self.config = config
if self._init_retrieval:
self.init_retrieval()
self.ctx_encoder_tokenizer = None
self.return_tokenized_docs = False
@staticmethod
def _build_index(config):
if config.index_name == "legacy":
return LegacyIndex(
config.retrieval_vector_size,
config.index_path or LEGACY_INDEX_PATH,
)
elif config.index_name == "custom":
return CustomHFIndex.load_from_disk(
vector_size=config.retrieval_vector_size,
dataset_path=config.passages_path,
index_path=config.index_path,
)
else:
return CanonicalHFIndex(
vector_size=config.retrieval_vector_size,
dataset_name=config.dataset,
dataset_split=config.dataset_split,
index_name=config.index_name,
index_path=config.index_path,
use_dummy_dataset=config.use_dummy_dataset,
dataset_revision=config.dataset_revision,
)
@classmethod
def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs):
requires_backends(cls, ["datasets", "faiss"])
config = kwargs.pop("config", None) or RagConfig.from_pretrained(retriever_name_or_path, **kwargs)
rag_tokenizer = RagTokenizer.from_pretrained(retriever_name_or_path, config=config)
question_encoder_tokenizer = rag_tokenizer.question_encoder
generator_tokenizer = rag_tokenizer.generator
if indexed_dataset is not None:
config.index_name = "custom"
index = CustomHFIndex(config.retrieval_vector_size, indexed_dataset)
else:
index = cls._build_index(config)
return cls(
config,
question_encoder_tokenizer=question_encoder_tokenizer,
generator_tokenizer=generator_tokenizer,
index=index,
)
def save_pretrained(self, save_directory):
if isinstance(self.index, CustomHFIndex):
if self.config.index_path is None:
index_path = os.path.join(save_directory, "hf_dataset_index.faiss")
self.index.dataset.get_index("embeddings").save(index_path)
self.config.index_path = index_path
if self.config.passages_path is None:
passages_path = os.path.join(save_directory, "hf_dataset")
faiss_index = self.index.dataset._indexes.pop("embeddings")
self.index.dataset.save_to_disk(passages_path)
self.index.dataset._indexes["embeddings"] = faiss_index
self.config.passages_path = passages_path
self.config.save_pretrained(save_directory)
rag_tokenizer = RagTokenizer(
question_encoder=self.question_encoder_tokenizer,
generator=self.generator_tokenizer,
)
rag_tokenizer.save_pretrained(save_directory)
def init_retrieval(self):
"""
Retriever initialization function. It loads the index into memory.
"""
logger.info("initializing retrieval")
self.index.init_index()
def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
r"""
Postprocessing retrieved `docs` and combining them with `input_strings`.
Args:
docs (`dict`):
Retrieved documents.
input_strings (`str`):
Input strings decoded by `preprocess_query`.
prefix (`str`):
Prefix added at the beginning of each input, typically used with T5-based models.
Return:
`tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
`attention_mask`.
"""
def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
if doc_title.startswith('"'):
doc_title = doc_title[1:]
if doc_title.endswith('"'):
doc_title = doc_title[:-1]
if prefix is None:
prefix = ""
out = (prefix + doc_title + self.config.title_sep + doc_text + self.config.doc_sep + input_string).replace(
" ", " "
)
return out
rag_input_strings = [
cat_input_and_doc(
docs[i]["title"][j],
docs[i]["text"][j],
input_strings[i],
prefix,
)
for i in range(len(docs))
for j in range(n_docs)
]
contextualized_inputs = self.generator_tokenizer.batch_encode_plus(
rag_input_strings,
max_length=self.config.max_combined_length,
return_tensors=return_tensors,
padding="max_length",
truncation=True,
)
return contextualized_inputs["input_ids"], contextualized_inputs["attention_mask"]
def _chunk_tensor(self, t: Iterable, chunk_size: int) -> List[Iterable]:
return [t[i : i + chunk_size] for i in range(0, len(t), chunk_size)]
def _main_retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, np.ndarray]:
question_hidden_states_batched = self._chunk_tensor(question_hidden_states, self.batch_size)
ids_batched = []
vectors_batched = []
for question_hidden_states in question_hidden_states_batched:
start_time = time.time()
ids, vectors = self.index.get_top_docs(question_hidden_states, n_docs)
logger.debug(
f"index search time: {time.time() - start_time} sec, batch size {question_hidden_states.shape}"
)
ids_batched.extend(ids)
vectors_batched.extend(vectors)
return (
np.array(ids_batched),
np.array(vectors_batched),
)
def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, List[dict]]:
"""
为指定的 `question_hidden_states` 检索文档。
Args:
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
要检索的查询向量的批次。
n_docs (`int`):
每个查询检索的文档数量。
Return:
`Tuple[np.ndarray, np.ndarray, List[dict]]`: 返回包含以下对象的元组:
- **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- 每个查询的检索嵌入的文档。
- **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- 索引中文档的 ids。
- **doc_dicts** (`List[dict]`): 每个查询的 `retrieved_doc_embeds` 示例。
"""
doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
return retrieved_doc_embeds, doc_ids, self.index.get_doc_dicts(doc_ids)
def set_ctx_encoder_tokenizer(self, ctx_encoder_tokenizer: PreTrainedTokenizer):
self.ctx_encoder_tokenizer = ctx_encoder_tokenizer
self.return_tokenized_docs = True
def __call__(
self,
question_input_ids: List[List[int]],
question_hidden_states: np.ndarray,
prefix=None,
n_docs=None,
return_tensors=None,
):