Transformers 源码解析(一百三十四)
.\tf_utils.py
from typing import List, Optional, Union
import numpy as np
import tensorflow as tf
from .feature_extraction_utils import BatchFeature
from .tokenization_utils_base import BatchEncoding
from .utils import logging
logger = logging.get_logger(__name__)
def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
"""
处理 TensorFlow 中的动态形状。
Args:
tensor (`tf.Tensor` or `np.ndarray`): 要获取形状的张量或数组。
Returns:
`List[int]`: 张量的形状列表。
"""
if isinstance(tensor, np.ndarray):
return list(tensor.shape)
dynamic = tf.shape(tensor)
if tensor.shape == tf.TensorShape(None):
return dynamic
static = tensor.shape.as_list()
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor:
"""
稳定的 softmax 函数,用于解决 TensorFlow 在 CPU 上与 XLA 结合时的问题。
Args:
logits (`tf.Tensor`): 输入的对数概率张量。
axis (`int`, *optional*): 执行 softmax 操作的维度,默认为 -1 表示最后一个维度。
name (`str`, *optional*): 操作的名称。
Returns:
`tf.Tensor`: 与 logits 具有相同类型和形状的张量,经过 softmax 处理。
"""
return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)
def functional_layernorm(inputs, weight, bias, epsilon=1e-5, axis=-1):
if weight.shape.rank != 1 or bias.shape.rank != 1 or not isinstance(axis, int):
raise NotImplementedError("Only 1D weight and bias tensors are supported for now, with only a single axis.")
mean, variance = tf.nn.moments(inputs, axes=[axis], keepdims=True)
if axis != -1:
shape = [1] * inputs.shape.rank
shape[axis] = shape_list(inputs)[axis]
weight = tf.reshape(weight, shape)
bias = tf.reshape(bias, shape)
outputs = tf.nn.batch_normalization(
inputs,
mean,
variance,
offset=bias,
scale=weight,
variance_epsilon=epsilon,
)
return outputs
def flatten(input, start_dim=0, end_dim=-1):
if end_dim < 0:
end_dim += input.shape.rank
if start_dim < 0:
start_dim += input.shape.rank
if start_dim == end_dim:
return input
in_shape = tf.shape(input)
flattened_dim = tf.math.reduce_prod(in_shape[start_dim : end_dim + 1])
out_shape = tf.concat([in_shape[:start_dim], [flattened_dim], in_shape[end_dim + 1 :]], axis=0)
return tf.reshape(input, out_shape)
def invert_attention_mask(encoder_attention_mask: tf.Tensor) -> tf.Tensor:
"""
Invert an attention mask (e.g., switches 0. and 1.).
Args:
encoder_attention_mask (`torch.Tensor`): An attention mask.
Returns:
`tf.Tensor`: The inverted attention mask.
"""
if not isinstance(encoder_attention_mask, tf.Tensor):
encoder_attention_mask = tf.convert_to_tensor(encoder_attention_mask)
if encoder_attention_mask.shape.rank == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.shape.rank == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
encoder_extended_attention_mask = (
tf.cast(1, encoder_attention_mask.dtype) - encoder_extended_attention_mask
) * encoder_extended_attention_mask.dtype.min
return encoder_extended_attention_mask
def check_embeddings_within_bounds(tensor: tf.Tensor, embed_dim: int, tensor_name: str = "input_ids") -> None:
"""
`tf.gather`, on which TF embedding layers are based, won't check positive out of bound indices on GPU, returning
zeros instead. This function adds a check against that dangerous silent behavior.
Args:
tensor (`tf.Tensor`): The tensor of indices to check.
embed_dim (`int`): The embedding dimension.
tensor_name (`str`, *optional*): The name of the tensor to use in the error message.
"""
tf.debugging.assert_less(
tensor,
tf.cast(embed_dim, dtype=tensor.dtype),
message=(
f"The maximum value of {tensor_name} ({tf.math.reduce_max(tensor)}) must be smaller than the embedding "
f"layer's input dimension ({embed_dim}). The likely cause is some problem at tokenization time."
),
)
def save_attributes_to_hdf5_group(group, name, data):
"""Saves attributes (data) of the specified name into the HDF5 group.
This function saves attributes (data) with a given name into the specified HDF5 group.
Args:
group: HDF5 group where the attributes will be saved.
name: Name of the attribute to save.
data: Data to be saved as the attribute.
"""
"""
This method deals with an inherent problem of HDF5 file which is not able to store data larger than
HDF5_OBJECT_HEADER_LIMIT bytes.
Args:
group: A pointer to a HDF5 group.
name: A name of the attributes to save.
data: Attributes data to store.
Raises:
RuntimeError: If any single attribute is too large to be saved.
Copied from Keras to Transformers to avoid versioning issues.
"""
HDF5_OBJECT_HEADER_LIMIT = 64512
bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
if bad_attributes:
raise RuntimeError(
"The following attributes cannot be saved to HDF5 file because "
f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} "
f"bytes: {bad_attributes}"
)
data_npy = np.asarray(data)
num_chunks = 1
chunked_data = np.array_split(data_npy, num_chunks)
while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
num_chunks += 1
chunked_data = np.array_split(data_npy, num_chunks)
if num_chunks > 1:
for chunk_id, chunk_data in enumerate(chunked_data):
group.attrs["%s%d" % (name, chunk_id)] = chunk_data
else:
group.attrs[name] = data
def load_attributes_from_hdf5_group(group, name):
"""Loads attributes of the specified name from the HDF5 group.
This method deals with an inherent problem of HDF5 file which is not able to store data larger than
HDF5_OBJECT_HEADER_LIMIT bytes.
Args:
group: A pointer to a HDF5 group.
name: A name of the attributes to load.
Returns:
data: Attributes data.
Copied from Keras to Transformers to avoid versioning issues.
"""
if name in group.attrs:
data = [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs[name]]
else:
data = []
chunk_id = 0
while "%s%d" % (name, chunk_id) in group.attrs:
data.extend(
[n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs["%s%d" % (name, chunk_id)]]
)
chunk_id += 1
return data
def expand_1d(data):
"""Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s.
Copied from Keras to here to avoid versioning issues."""
def _expand_single_1d_tensor(t):
if isinstance(t, tf.Tensor) and t.shape.rank == 1:
return tf.expand_dims(t, axis=-1)
return t
return tf.nest.map_structure(_expand_single_1d_tensor, data)
def convert_batch_encoding(*args, **kwargs):
if args and isinstance(args[0], (BatchEncoding, BatchFeature)):
args = list(args)
args[0] = dict(args[0])
elif "x" in kwargs and isinstance(kwargs["x"], (BatchEncoding, BatchFeature)):
kwargs["x"] = dict(kwargs["x"])
return args, kwargs
.\time_series_utils.py
"""
时间序列分布输出类和实用程序。
"""
from typing import Callable, Dict, Optional, Tuple
import torch
from torch import nn
from torch.distributions import (
AffineTransform,
Distribution,
Independent,
NegativeBinomial,
Normal,
StudentT,
TransformedDistribution,
)
class AffineTransformed(TransformedDistribution):
def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
self.loc = 0.0 if loc is None else loc
self.scale = 1.0 if scale is None else scale
super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
@property
def mean(self):
"""
返回分布的均值。
"""
return self.base_dist.mean * self.scale + self.loc
@property
def variance(self):
"""
返回分布的方差。
"""
return self.base_dist.variance * self.scale**2
@property
def stddev(self):
"""
返回分布的标准差。
"""
return self.variance.sqrt()
class ParameterProjection(nn.Module):
def __init__(
self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
) -> None:
super().__init__(**kwargs)
self.args_dim = args_dim
self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
self.domain_map = domain_map
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
params_unbounded = [proj(x) for proj in self.proj]
return self.domain_map(*params_unbounded)
class LambdaLayer(nn.Module):
def __init__(self, function):
super().__init__()
self.function = function
def forward(self, x, *args):
return self.function(x, *args)
class DistributionOutput:
distribution_class: type
in_features: int
args_dim: Dict[str, int]
def __init__(self, dim: int = 1) -> None:
self.dim = dim
self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
def _base_distribution(self, distr_args):
if self.dim == 1:
return self.distribution_class(*distr_args)
else:
return Independent(self.distribution_class(*distr_args), 1)
def distribution(
self,
distr_args,
loc: Optional[torch.Tensor] = None,
scale: Optional[torch.Tensor] = None,
) -> Distribution:
distr = self._base_distribution(distr_args)
if loc is None and scale is None:
return distr
else:
return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
@property
def event_shape(self) -> Tuple:
r"""
Shape of each individual event contemplated by the distributions that this object constructs.
"""
return () if self.dim == 1 else (self.dim,)
@property
def event_dim(self) -> int:
r"""
Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
constructs.
"""
return len(self.event_shape)
@property
def value_in_support(self) -> float:
r"""
A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
default 0.0. This value will be used when padding data series.
"""
return 0.0
def get_parameter_projection(self, in_features: int) -> nn.Module:
r"""
Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
"""
return ParameterProjection(
in_features=in_features,
args_dim=self.args_dim,
domain_map=LambdaLayer(self.domain_map),
)
def domain_map(self, *args: torch.Tensor):
r"""
Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
distribution of the right event_shape.
"""
raise NotImplementedError()
@staticmethod
def squareplus(x: torch.Tensor) -> torch.Tensor:
r"""
Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
https://twitter.com/jon_barron/status/1387167648669048833
"""
return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
class StudentTOutput(DistributionOutput):
"""
Student-T distribution output class.
"""
args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
distribution_class: type = StudentT
@classmethod
def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps)
df = 2.0 + cls.squareplus(df)
return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
class NormalOutput(DistributionOutput):
"""
Normal distribution output class.
"""
args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
distribution_class: type = Normal
@classmethod
def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps)
return loc.squeeze(-1), scale.squeeze(-1)
class NegativeBinomialOutput(DistributionOutput):
"""
Negative Binomial distribution output class.
"""
args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
distribution_class: type = NegativeBinomial
@classmethod
def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
total_count = cls.squareplus(total_count)
return total_count.squeeze(-1), logits.squeeze(-1)
def _base_distribution(self, distr_args) -> Distribution:
total_count, logits = distr_args
if self.dim == 1:
return self.distribution_class(total_count=total_count, logits=logits)
else:
return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
def distribution(
self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
) -> Distribution:
total_count, logits = distr_args
if scale is not None:
logits += scale.log()
return self._base_distribution((total_count, logits))
.\tokenization_utils.py
"""
用于 Python Tokenizers 的标记化类。对于快速标记化器(由 HuggingFace 的 tokenizers 库提供),请参见 tokenization_utils_fast.py
"""
import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload
from .tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
INIT_TOKENIZER_DOCSTRING,
AddedToken,
BatchEncoding,
EncodedInput,
EncodedInputPair,
PreTokenizedInput,
PreTokenizedInputPair,
PreTrainedTokenizerBase,
TextInput,
TextInputPair,
TruncationStrategy,
)
from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
logger = logging.get_logger(__name__)
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
class Trie:
"""
Trie(字典树)的实现。基于给定的单词列表创建 Trie 结构,用于在一个步骤中分割 `added_tokens`。
参考资料 https://en.wikipedia.org/wiki/Trie
"""
def __init__(self):
self.data = {}
self._tokens = set()
def add(self, word: str):
"""
将给定单词添加到 Trie 中。
通过每个字符(UTF-8 字符)递归地添加到内部 `data` Trie 表示中。
使用特殊键 `""` 表示终止状态。
此函数是幂等的,添加两次相同的单词不会改变 Trie 结构。
示例:
```
>>> trie = Trie()
>>> trie.add("Hello 友達")
>>> trie.data
{"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
>>> trie.add("Hello")
>>> trie.data
{"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
```
"""
if not word:
return
self._tokens.add(word)
ref = self.data
for char in word:
ref[char] = ref.get(char, {})
ref = ref[char]
ref[""] = 1
def cut_text(self, text, offsets):
offsets.append(len(text))
tokens = []
start = 0
for end in offsets:
if start > end:
logger.error(
"There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
" anyway."
)
continue
elif start == end:
continue
tokens.append(text[start:end])
start = end
return tokens
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `char` is a control character."""
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `char` is a punctuation character."""
cp = ord(char)
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def _is_end_of_word(text):
"""Checks whether the last character in text is one of a punctuation, control or whitespace character."""
last_char = text[-1]
return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))
def _is_start_of_word(text):
"""Checks whether the first character in text is one of a punctuation, control or whitespace character."""
first_char = text[0]
return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
"""
Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
"""
insertion_idx = bisect.bisect_left(token_list, new_token)
if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
return
else:
token_list.insert(insertion_idx, new_token)
"""
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
"""
def __init__(self, **kwargs):
self.tokens_trie = Trie()
if not hasattr(self, "_added_tokens_decoder"):
self._added_tokens_decoder: Dict[int, AddedToken] = {}
self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
super().__init__(**kwargs)
self._add_tokens(
[token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
special_tokens=True,
)
self._decode_use_source_tokenizer = False
@property
def is_fast(self) -> bool:
return False
@property
def vocab_size(self) -> int:
"""
`int`: 基础词汇表的大小(不包括添加的特殊标记)。
"""
raise NotImplementedError
@property
def added_tokens_encoder(self) -> Dict[str, int]:
"""
返回从字符串到索引的排序映射。为了性能优化,缓存了慢速分词器中的 `_added_tokens_encoder`。
"""
return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
@property
def added_tokens_decoder(self) -> Dict[int, AddedToken]:
"""
返回词汇表中的添加标记,作为索引到 AddedToken 的字典。
Returns:
`Dict[str, int]`: 添加的标记。
"""
return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))
@added_tokens_decoder.setter
def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
for index, token in value.items():
if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
raise ValueError(
f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}"
)
self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
self._added_tokens_encoder[str(token)] = index
def get_added_vocab(self) -> Dict[str, int]:
"""
返回已添加到词汇表中的词汇作为一个字典,键为词汇,值为索引。结果可能与快速调用不同,因为我们当前总是添加这些词汇,即使它们已经存在于词汇表中。这是我们应该改变的事情。
Returns:
`Dict[str, int]`: 已添加的词汇表。
"""
return self._added_tokens_encoder
def __len__(self):
"""
返回包含已添加词汇的完整词汇表的大小。计数的是 `keys` 而不是 `values`,因为如果词汇表中有空洞,我们会在错误的索引处添加分词器。
"""
return len(set(self.get_vocab().keys()))
def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
"""
更新 Trie 树,将新增的无需分割的词汇加入到词汇表中。
Args:
unique_no_split_tokens (`Optional[str]`, *optional*, defaults to `[]`):
需要添加到 Trie 树中的唯一词汇列表。
"""
for token in self._added_tokens_decoder.values():
if token not in self.tokens_trie._tokens:
self.tokens_trie.add(token.content)
for token in unique_no_split_tokens:
if token not in self.tokens_trie._tokens:
self.tokens_trie.add(token)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
"""
返回在编码序列时添加的特殊标记数量。
<Tip>
这会对一个虚拟输入进行编码,并检查添加的特殊标记数量,因此不是效率高的操作。不要将其放在训练循环内。
</Tip>
Args:
pair (`bool`, *optional*, defaults to `False`):
是否计算序列对或单序列中添加的特殊标记数量。
Returns:
`int`: 添加到序列中的特殊标记数量。
"""
token_ids_0 = []
token_ids_1 = []
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
def _tokenize(self, text, **kwargs):
"""
使用分词器将字符串转换为一系列标记(字符串)。基于词汇表的单词分割或基于子词的分割(BPE/SentencePieces/WordPieces)。
不处理已添加的标记。
Args:
text (str): 要分词的文本。
**kwargs: 其他参数传递给分词器的选项。
Raises:
NotImplementedError: 如果子类没有实现这个方法。
"""
raise NotImplementedError
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""
Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
return ids
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self._added_tokens_encoder:
return self._added_tokens_encoder[token]
return self._convert_token_to_id(token)
def _convert_token_to_id(self, token):
raise NotImplementedError
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
pass
) -> BatchEncoding:
def get_input_ids(text):
if isinstance(text, str):
tokens = self.tokenize(text, **kwargs)
return self.convert_tokens_to_ids(tokens)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
if is_split_into_words:
tokens = list(
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
)
return self.convert_tokens_to_ids(tokens)
else:
return self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
if is_split_into_words:
raise ValueError(
f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
" `is_split_into_words=True`."
)
else:
raise ValueError(
f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
" integers."
)
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None
return self.prepare_for_model(
first_ids,
pair_ids=second_ids,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
def get_input_ids(text):
if isinstance(text, str):
tokens = self.tokenize(text, **kwargs)
return self.convert_tokens_to_ids(tokens)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
if is_split_into_words:
tokens = list(
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
)
return self.convert_tokens_to_ids(tokens)
else:
return self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
raise ValueError(
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
)
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs:
if not isinstance(ids_or_pair_ids, (list, tuple)):
ids, pair_ids = ids_or_pair_ids, None
elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
ids, pair_ids = ids_or_pair_ids, None
else:
ids, pair_ids = ids_or_pair_ids
first_ids = get_input_ids(ids)
second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
input_ids.append((first_ids, second_ids))
batch_outputs = self._batch_prepare_for_model(
input_ids,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
self,
batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
batch_outputs = {}
for first_ids, second_ids in batch_ids_pairs:
outputs = self.prepare_for_model(
first_ids,
second_ids,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
) -> Tuple[str, Dict[str, Any]]:
"""
Performs any necessary transformations before tokenization.
This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
`kwargs` at the end of the encoding process to be sure all the arguments have been used.
Args:
text (`str`):
The text to prepare.
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
which it will tokenize. This is useful for NER or token classification.
kwargs (`Dict[str, Any]`, *optional*):
Keyword arguments to use for the tokenization.
Returns:
`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
"""
return (text, kwargs)
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of ids of the second sequence.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
@overload
def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
...
@overload
def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
...
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
"""
Converts token ids into strings.
Args:
ids (`int` or `List[int]`):
Token ids to convert.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether to skip special tokens during conversion.
Returns:
`str` or `List[str]`: Converted token(s) into string(s).
"""
def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
"""
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
added tokens.
Args:
ids (`int` or `List[int]`):
The token id (or token ids) to convert to tokens.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
Returns:
`str` or `List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
if ids in self._added_tokens_decoder:
return self._added_tokens_decoder[ids].content
else:
return self._convert_id_to_token(ids)
tokens = []
for index in ids:
index = int(index)
if skip_special_tokens and index in self.all_special_ids:
continue
if index in self._added_tokens_decoder:
tokens.append(self._added_tokens_decoder[index].content)
else:
tokens.append(self._convert_id_to_token(index))
return tokens
def _convert_id_to_token(self, index: int) -> str:
raise NotImplementedError
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return " ".join(tokens)
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = True,
**kwargs,
):
pass
) -> str:
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
}
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
if token in legacy_added_tokens:
if current_sub_text:
string = self.convert_tokens_to_string(current_sub_text)
if len(string) > 0:
sub_texts.append(string)
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
.\tokenization_utils_base.py
"""
包含慢速和快速标记化类共有的基础类:
- PreTrainedTokenizerBase:包含所有用户界面的编码方法
- Special token mixing:包含特殊标记逻辑
- BatchEncoding:用于快速标记化器的输出字典包装,带有特殊方法
"""
import copy
import json
import os
import re
import warnings
from collections import UserDict
from collections.abc import Mapping, Sized
from contextlib import contextmanager
from dataclasses import dataclass
from functools import lru_cache
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
NamedTuple,
Optional,
Sequence,
Tuple,
Union,
)
import numpy as np
from packaging import version
from . import __version__
from .dynamic_module_utils import custom_object_save
from .utils import (
ExplicitEnum,
PaddingStrategy,
PushToHubMixin,
TensorType,
add_end_docstrings,
add_model_info_to_auto_map,
cached_file,
copy_func,
download_url,
extract_commit_hash,
is_flax_available,
is_jax_tensor,
is_mlx_available,
is_numpy_array,
is_offline_mode,
is_remote_url,
is_tf_available,
is_tf_tensor,
is_tokenizers_available,
is_torch_available,
is_torch_device,
is_torch_tensor,
logging,
requires_backends,
to_py_obj,
)
if TYPE_CHECKING:
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_flax_available():
import jax.numpy as jnp
from .pipelines.conversational import Conversation
if is_tokenizers_available():
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast
else:
@dataclass(frozen=False, eq=True)
class AddedToken:
"""
AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the
way it should behave.
The `normalized` will default to `not special` if it is not specified, similarly to the definition in
`tokenizers`.
"""
def __init__(
self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None
):
self.content = content
self.single_word = single_word
self.lstrip = lstrip
self.rstrip = rstrip
self.special = special
self.normalized = normalized if normalized is not None else not special
def __getstate__(self):
return self.__dict__
def __str__(self):
return self.content
@dataclass
class EncodingFast:
"""This is dummy class because without the `tokenizers` library we don't have these objects anyway"""
pass
logger = logging.get_logger(__name__)
VERY_LARGE_INTEGER = int(1e30)
LARGE_INTEGER = int(1e20)
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
FULL_TOKENIZER_FILE = "tokenizer.json"
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
class TruncationStrategy(ExplicitEnum):
"""
`PreTrainedTokenizerBase.__call__` 方法中 `truncation` 参数的可能取值。
在IDE中进行选项补全时非常有用。
"""
ONLY_FIRST = "only_first"
ONLY_SECOND = "only_second"
LONGEST_FIRST = "longest_first"
DO_NOT_TRUNCATE = "do_not_truncate"
class CharSpan(NamedTuple):
"""
原始字符串中的字符范围。
Args:
start (`int`): 原始字符串中第一个字符的索引。
end (`int`): 原始字符串中最后一个字符后面的字符的索引。
"""
start: int
end: int
class TokenSpan(NamedTuple):
"""
编码字符串(token列表)中的token范围。
Args:
start (`int`): 范围中第一个token的索引。
end (`int`): 范围中最后一个token后面的token的索引。
"""
start: int
end: int
class BatchEncoding(UserDict):
"""
[`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
[`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] 和
[`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] 方法的输出(tokens, attention_masks等)。
这个类继承自Python字典类,可以像字典一样使用。此外,这个类还提供了从单词/字符空间到token空间的映射工具方法。
"""
"""
Args:
data (`dict`, *optional*):
Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
('input_ids', 'attention_mask', etc.).
encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this
information.
tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
initialization.
prepend_batch_axis (`bool`, *optional*, defaults to `False`):
Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
n_sequences (`Optional[int]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
initialization.
"""
def __init__(
self,
data: Optional[Dict[str, Any]] = None,
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
tensor_type: Union[None, str, TensorType] = None,
prepend_batch_axis: bool = False,
n_sequences: Optional[int] = None,
):
super().__init__(data)
if isinstance(encoding, EncodingFast):
encoding = [encoding]
self._encodings = encoding
if n_sequences is None and encoding is not None and len(encoding):
n_sequences = encoding[0].n_sequences
self._n_sequences = n_sequences
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
@property
def n_sequences(self) -> Optional[int]:
"""
`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
[`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
sentences)
"""
return self._n_sequences
@property
def is_fast(self) -> bool:
"""
`bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
or not.
"""
return self._encodings is not None
def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
"""
If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
etc.).
If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.)
with the constraint of slice.
"""
if isinstance(item, str):
return self.data[item]
elif self._encodings is not None:
return self._encodings[item]
elif isinstance(item, slice):
return {key: self.data[key][item] for key in self.data.keys()}
else:
raise KeyError(
"Invalid key. Only three types of key are available: "
"(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
)
def __getattr__(self, item: str):
try:
return self.data[item]
except KeyError:
raise AttributeError
def __getstate__(self):
return {"data": self.data, "encodings": self._encodings}
def __setstate__(self, state):
if "data" in state:
self.data = state["data"]
if "encodings" in state:
self._encodings = state["encodings"]
def keys(self):
return self.data.keys()
def values(self):
return self.data.values()
def items(self):
return self.data.items()
@property
def encodings(self) -> Optional[List[EncodingFast]]:
"""
`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if
the input was tokenized through Python (i.e., not a fast) tokenizer.
"""
return self._encodings
def tokens(self, batch_index: int = 0) -> List[str]:
"""
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
integer indices) at a given batch index (only works for the output of a fast tokenizer).
Args:
batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
Returns:
`List[str]`: The list of tokens at that index.
"""
if not self._encodings:
raise ValueError(
"tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
" class)."
)
return self._encodings[batch_index].tokens
def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to the id of their original sentences:
- `None` for special tokens added around or between sequences,
- `0` for tokens corresponding to words in the first sequence,
- `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
encoded.
Args:
batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
Returns:
`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
sequence.
"""
if not self._encodings:
raise ValueError(
"sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
" class)."
)
return self._encodings[batch_index].sequence_ids
def words(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
Args:
batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
Returns:
`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
(several tokens will be mapped to the same word index if they are parts of that word).
"""
if not self._encodings:
raise ValueError(
"words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
" class)."
)
warnings.warn(
"`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
"but more self-explanatory `BatchEncoding.word_ids()` property.",
FutureWarning,
)
return self.word_ids(batch_index)
def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
Args:
batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
Returns:
`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
(several tokens will be mapped to the same word index if they are parts of that word).
"""
if not self._encodings:
raise ValueError(
"word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
" class)."
)
return self._encodings[batch_index].word_ids
def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair
Can be called as:
- `self.token_to_sequence(token_index)` if batch size is 1
- `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
tokenized words.
Args:
batch_or_token_index (`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the token in the sequence.
token_index (`int`, *optional*):
If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
sequence.
Returns:
`int`: Index of the word in the input sequence.
"""
if not self._encodings:
raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
if token_index is not None:
batch_index = batch_or_token_index
else:
batch_index = 0
token_index = batch_or_token_index
if batch_index < 0:
batch_index = self._batch_size + batch_index
if token_index < 0:
token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_sequence(token_index)
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.
Can be called as:
- `self.token_to_word(token_index)` if batch size is 1
- `self.token_to_word(batch_index, token_index)` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
tokenized words.
Args:
batch_or_token_index (`int`):
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the token in the sequence.
token_index (`int`, *optional*):
If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
sequence.
Returns:
`int`: Index of the word in the input sequence.
"""
if not self._encodings:
raise ValueError("token_to_word() is not available when using Python based tokenizers")
if token_index is not None:
batch_index = batch_or_token_index
else:
batch_index = 0
token_index = batch_or_token_index
if batch_index < 0:
batch_index = self._batch_size + batch_index
if token_index < 0:
token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_word(token_index)
def word_to_tokens(
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
) -> Optional[TokenSpan]:
"""
Get the encoded token span corresponding to a word in a sequence of the batch.
Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:
- **start** -- Index of the first token.
- **end** -- Index of the token following the last token.
Can be called as:
- `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
- `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
words.
Args:
batch_or_word_index (`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the word in the sequence.
word_index (`int`, *optional*):
If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
sequence.
sequence_index (`int`, *optional*, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided word index belongs to.
Returns:
([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns
`None` if no tokens correspond to the word. This can happen especially when the token is a special token
that has been used to format the tokenization. For example when we add a class token at the very beginning
of the tokenization.
"""
if not self._encodings:
raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
if word_index is not None:
batch_index = batch_or_word_index
else:
batch_index = 0
word_index = batch_or_word_index
if batch_index < 0:
batch_index = self._batch_size + batch_index
if word_index < 0:
word_index = self._seq_len + word_index
span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
return TokenSpan(*span) if span is not None else None
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
"""
Get the character span corresponding to an encoded token in a sequence of the batch.
Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:
- **start** -- Index of the first character in the original string associated to the token.
- **end** -- Index of the character following the last character in the original string associated to the
token.
Can be called as:
- `self.token_to_chars(token_index)` if batch size is 1
- `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1
Args:
batch_or_token_index (`int`):
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the token in the sequence.
token_index (`int`, *optional*):
If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
the sequence.
Returns:
[`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token
(e.g. <s>, </s>) doesn't correspond to any chars in the origin string.
"""
if not self._encodings:
raise ValueError("token_to_chars() is not available when using Python based tokenizers")
if token_index is not None:
batch_index = batch_or_token_index
else:
batch_index = 0
token_index = batch_or_token_index
span_indices = self._encodings[batch_index].token_to_chars(token_index)
return CharSpan(*span_indices) if span_indices is not None else None
) -> int:
"""
Get the index of the token in the encoded output comprising a character in the original string for a sequence
of the batch.
Can be called as:
- `self.char_to_token(char_index)` if batch size is 1
- `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
words.
Args:
batch_or_char_index (`int`):
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the word in the sequence
char_index (`int`, *optional*):
If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
sequence.
sequence_index (`int`, *optional*, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided character index belongs to.
Returns:
`int`: Index of the token.
"""
if not self._encodings:
raise ValueError("char_to_token() is not available when using Python based tokenizers")
if char_index is not None:
batch_index = batch_or_char_index
else:
batch_index = 0
char_index = batch_or_char_index
return self._encodings[batch_index].char_to_token(char_index, sequence_index)
def word_to_chars(batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0) -> CharSpan:
"""
获取给定批次中的序列中指定单词在原始字符串中的字符跨度。
字符跨度以 CharSpan 命名元组的形式返回,具有以下字段:
- start: 原始字符串中第一个字符的索引
- end: 原始字符串中最后一个字符之后的索引
可以按以下方式调用:
- `self.word_to_chars(word_index)` 如果批次大小为 1
- `self.word_to_chars(batch_index, word_index)` 如果批次大小大于等于 1
参数:
batch_or_word_index (`int`):
批次中序列的索引。如果批次只包含一个序列,则可以是序列中单词的索引。
word_index (`int`, *optional*):
如果在 `batch_or_word_index` 中提供了批次索引,则可以是序列中单词的索引。
sequence_index (`int`, *optional*, 默认为 0):
如果批次编码了一对序列,则可以用来指定所提供单词索引属于哪一个序列 (0 或 1)。
返回:
`CharSpan` 或 `List[CharSpan]`: 字符串中相关字符或字符组的跨度。CharSpan 是一个命名元组,具有以下字段:
- start: 原始字符串中与令牌关联的第一个字符的索引
- end: 原始字符串中与令牌关联的最后一个字符之后的索引
"""
if not self._encodings:
raise ValueError("word_to_chars() 在使用基于 Python 的分词器时不可用")
if word_index is not None:
batch_index = batch_or_word_index
else:
batch_index = 0
word_index = batch_or_word_index
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
"""
将所有值发送到指定设备,通过调用 `v.to(device)` (仅适用于 PyTorch)。
Args:
device (`str` or `torch.device`): 要放置张量的设备。
Returns:
[`BatchEncoding`]: 修改后的相同实例。
"""
requires_backends(self, ["torch"])
if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
self.data = {k: v.to(device=device) for k, v in self.data.items()}
else:
logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
return self
class SpecialTokensMixin:
"""
A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
special tokens. In particular, this class hold the attributes which can be used to directly access these special
tokens in a model-independent manner and allow to set and update the special tokens.
Args:
bos_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the beginning of a sentence.
eos_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the end of a sentence.
unk_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing an out-of-vocabulary token.
sep_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token separating two different sentences in the same input (used by BERT for instance).
pad_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
attention mechanisms or loss computation.
cls_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the class of the input (used by BERT for instance).
mask_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing a masked token (used by masked-language modeling pretraining objectives, like
BERT).
additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
skipped when decoding if `skip_special_tokens` is set to `True`.
"""
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
"eos_token",
"unk_token",
"sep_token",
"pad_token",
"cls_token",
"mask_token",
"additional_special_tokens",
]
def __init__(self, verbose=False, **kwargs):
self._bos_token = None
self._eos_token = None
self._unk_token = None
self._sep_token = None
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._pad_token_type_id = 0
self._additional_special_tokens = []
self.verbose = verbose
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
assert all(
isinstance(t, (str, AddedToken)) for t in value
), "One of the tokens is not a string or an AddedToken"
setattr(self, key, value)
elif isinstance(value, (str, AddedToken)):
setattr(self, key, value)
else:
raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
it with indices starting from length of the current vocabulary and will be isolated before the tokenization
algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
not treated in the same way.
Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
of the model so that its embedding matrix matches the tokenizer.
In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
Args:
new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
token to let you personalize its behavior: whether this token should only match against a single word,
whether this token should strip all potential whitespaces on the left side, whether this token should
strip all potential whitespaces on the right side, etc.
special_tokens (`bool`, *optional*, defaults to `False`):
Can be used to specify if the token is a special token. This mostly changes the normalization behavior
(special tokens like CLS or [MASK] are usually not lower-cased for instance).
See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.
Returns:
`int`: Number of tokens added to the vocabulary.
Examples:
```
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
if not new_tokens:
return 0
if not isinstance(new_tokens, (list, tuple)):
new_tokens = [new_tokens]
return self._add_tokens(new_tokens, special_tokens=special_tokens)
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
raise NotImplementedError
@property
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
it with indices starting from length of the current vocabulary and will be isolated before the tokenization
algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
not treated in the same way.
Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
of the model so that its embedding matrix matches the tokenizer.
In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
Args:
new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
token to let you personalize its behavior: whether this token should only match against a single word,
whether this token should strip all potential whitespaces on the left side, whether this token should
strip all potential whitespaces on the right side, etc.
special_tokens (`bool`, *optional*, defaults to `False`):
Can be used to specify if the token is a special token. This mostly changes the normalization behavior
(special tokens like CLS or [MASK] are usually not lower-cased for instance).
See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.
Returns:
`int`: Number of tokens added to the vocabulary.
Examples:
```
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
if not new_tokens:
return 0
if not isinstance(new_tokens, (list, tuple)):
new_tokens = [new_tokens]
return self._add_tokens(new_tokens, special_tokens=special_tokens)
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
raise NotImplementedError
@property
def bos_token(self) -> str:
"""
`str`: Beginning of sentence token. Log an error if used while not having been set.
"""
if self._bos_token is None:
if self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None
return str(self._bos_token)
@property
def eos_token(self) -> str:
"""
`str`: End of sentence token. Log an error if used while not having been set.
"""
if self._eos_token is None:
if self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None
return str(self._eos_token)
@property
def unk_token(self) -> str:
"""
`str`: Unknown token. Log an error if used while not having been set.
"""
if self._unk_token is None:
if self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None
return str(self._unk_token)
@property
def sep_token(self) -> str:
"""
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
having been set.
"""
if self._sep_token is None:
if self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None
return str(self._sep_token)
@property
def pad_token(self) -> str:
"""
`str`: Padding token. Log an error if used while not having been set.
"""
if self._pad_token is None:
if self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None
return str(self._pad_token)
@property
def cls_token(self) -> str:
"""
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
depth of the model. Log an error if used while not having been set.
"""
if self._cls_token is None:
if self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None
return str(self._cls_token)
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
"""
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
def additional_special_tokens(self) -> List[str]:
"""
`List[str]`: Returns a list of additional special tokens. Raises an error if accessed before being set.
"""
if self._additional_special_tokens is None:
if self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None
return [str(tok) for tok in self._additional_special_tokens]
@bos_token.setter
def bos_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the BOS token")
self._bos_token = value
@eos_token.setter
def eos_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the EOS token")
self._eos_token = value
@unk_token.setter
def unk_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the UNK token")
self._unk_token = value
@sep_token.setter
def sep_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the SEP token")
self._sep_token = value
@pad_token.setter
def pad_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the PAD token")
self._pad_token = value
@cls_token.setter
def cls_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the CLS token")
self._cls_token = value
@mask_token.setter
def mask_token(self, value):
if not isinstance(value, (str, AddedToken)) and value is not None:
raise ValueError("Cannot set a non-string value as the MASK token")
self._mask_token = value
@additional_special_tokens.setter
def additional_special_tokens(self, value):
self._additional_special_tokens = value if value is not None else None
@property
def bos_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Returns the ID of the beginning of sentence token in the vocabulary. Returns `None` if the
token has not been set.
"""
if self._bos_token is None:
return None
return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Returns the ID of the end of sentence token in the vocabulary. Returns `None` if the token has
not been set.
"""
if self._eos_token is None:
return None
return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
"""
if self._unk_token is None:
return None
return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
sequence. Returns `None` if the token has not been set.
"""
if self._sep_token is None:
return None
return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
"""
if self._pad_token is None:
return None
return self.convert_tokens_to_ids(self.pad_token)
@property
def pad_token_type_id(self) -> int:
"""
`int`: Id of the padding token type in the vocabulary.
"""
return self._pad_token_type_id
@property
def cls_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
leveraging self-attention along the full depth of the model.
Returns `None` if the token has not been set.
"""
if self._cls_token is None:
return None
return self.convert_tokens_to_ids(self.cls_token)
@property
def mask_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
modeling. Returns `None` if the token has not been set.
"""
if self._mask_token is None:
return None
return self.convert_tokens_to_ids(self.mask_token)
@property
def additional_special_tokens_ids(self) -> List[int]:
"""
`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
been set.
"""
return self.convert_tokens_to_ids(self.additional_special_tokens)
@bos_token_id.setter
def bos_token_id(self, value):
self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None
@eos_token_id.setter
def eos_token_id(self, value):
self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None
@unk_token_id.setter
def unk_token_id(self, value):
self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None
@sep_token_id.setter
def sep_token_id(self, value):
self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None
@pad_token_id.setter
def pad_token_id(self, value):
self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None
@cls_token_id.setter
def cls_token_id(self, value):
self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None
@mask_token_id.setter
def mask_token_id(self, value):
self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None
@additional_special_tokens_ids.setter
def additional_special_tokens_ids(self, values):
self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]
@property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
`Dict[str, Union[str, List[str]]]`: 将特殊标记类属性(如 `cls_token`、`unk_token` 等)映射到它们的值(如 `'<unk>'`、`'<cls>'` 等)的字典。
将 `tokenizers.AddedToken` 类型的潜在标记转换为字符串。
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, attr)
if attr_value:
set_attr[attr] = attr_value
return set_attr
@property
def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
"""
`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: 将特殊标记类属性(如 `cls_token`、`unk_token` 等)映射到它们的值(如 `'<unk>'`、`'<cls>'` 等)的字典。
不将 `tokenizers.AddedToken` 类型的标记转换为字符串,以便更精细地控制特殊标记的分词。
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, "_" + attr)
if attr_value:
set_attr[attr] = attr_value
return set_attr
def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
"""
`List[Union[str, tokenizers.AddedToken]]`: 返回所有特殊标记(如 `<unk>`、`<cls>` 等)的列表,
其顺序与每个标记的索引无关。如果需要正确的索引,请查看 `self.added_tokens_encoder`。
无法按顺序创建了,因为键是 `AddedToken` 而不是 `String`。
不要将 `tokenizers.AddedToken` 类型的标记转换为字符串,以便更精细地控制特殊标记的分词过程。
"""
all_tokens = []
seen = set()
for value in self.special_tokens_map_extended.values():
if isinstance(value, (list, tuple)):
tokens_to_add = [token for token in value if str(token) not in seen]
else:
tokens_to_add = [value] if str(value) not in seen else []
seen.update(map(str, tokens_to_add))
all_tokens.extend(tokens_to_add)
return all_tokens
@property
def all_special_tokens(self) -> List[str]:
"""
`List[str]`: 返回唯一特殊标记(`'<unk>'`、`'<cls>'` 等)的列表。
将 `tokenizers.AddedToken` 类型的标记转换为字符串。
"""
all_toks = [str(s) for s in self.all_special_tokens_extended]
return all_toks
@property
def all_special_ids(self) -> List[int]:
"""
`List[int]`: 返回特殊标记(`'<unk>'`、`'<cls>'` 等)映射到类属性的 id 列表。
"""
all_toks = self.all_special_tokens
all_ids = self.convert_tokens_to_ids(all_toks)
return all_ids
"""
"""
INIT_TOKENIZER_DOCSTRING = r"""
Class attributes (overridden by derived classes)
- **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
vocabulary file required by the model, and as associated values, the filename for saving the associated file
(string).
- **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
associated pretrained vocabulary file.
- **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
or `None` if the model has no maximum input size.
- **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] method.
- **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
- **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
Should be `'right'` or `'left'`.
- **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
applied. Should be `'right'` or `'left'`.
"""
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"""
Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].
Handles shared (mostly boiler plate) methods for those two classes.
"""
vocab_files_names: Dict[str, str] = {}
pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
max_model_input_sizes: Dict[str, Optional[int]] = {}
pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
padding_side: str = "right"
truncation_side: str = "right"
slow_tokenizer_class = None
def __init__(self, **kwargs):
self.init_inputs = ()
self.init_kwargs = copy.deepcopy(kwargs)
self.name_or_path = kwargs.pop("name_or_path", "")
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
self.padding_side = kwargs.pop("padding_side", self.padding_side)
if self.padding_side not in ["right", "left"]:
raise ValueError(
f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
)
self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
if self.truncation_side not in ["right", "left"]:
raise ValueError(
f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
)
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
self.deprecation_warnings = {}
self._in_target_context_manager = False
self.chat_template = kwargs.pop("chat_template", None)
if isinstance(self.chat_template, (list, tuple)):
self.chat_template = {template["name"]: template["template"] for template in self.chat_template}
super().__init__(**kwargs)
@staticmethod
def max_len_single_sentence() -> int:
"""
`int`: 单句可以输入到模型的最大长度。
"""
return self.model_max_length - self.num_special_tokens_to_add(pair=False)
@staticmethod
def max_len_sentences_pair() -> int:
"""
`int`: 一对句子可以输入到模型的最大结合长度。
"""
return self.model_max_length - self.num_special_tokens_to_add(pair=True)
def max_len_single_sentence(self, value) -> int:
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
if not self.deprecation_warnings.get("max_len_single_sentence", False):
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
)
self.deprecation_warnings["max_len_single_sentence"] = True
else:
raise ValueError(
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
)
def max_len_sentences_pair(self, value) -> int:
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
)
self.deprecation_warnings["max_len_sentences_pair"] = True
else:
raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class
@property
def added_tokens_decoder(self) -> Dict[int, AddedToken]:
raise NotImplementedError()
def __repr__(self) -> str:
added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()])
return (
f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
" added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
)
def __len__(self) -> int:
raise NotImplementedError()
def get_vocab(self) -> Dict[str, int]:
"""
Returns the vocabulary as a dictionary of token to index.
`tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
vocab.
Returns:
`Dict[str, int]`: The vocabulary.
"""
raise NotImplementedError()
def apply_chat_template(
self,
conversation: Union[List[Dict[str, str]], "Conversation"],
chat_template: Optional[str] = None,
add_generation_prompt: bool = False,
tokenize: bool = True,
padding: bool = False,
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_dict: bool = False,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
):
"""
Applies a chat template to format a conversation.
Args:
conversation (Union[List[Dict[str, str]], "Conversation"]): The conversation data to format.
chat_template (Optional[str]): Optional template string to format messages.
add_generation_prompt (bool): Whether to add a generation prompt at the end.
tokenize (bool): Whether to tokenize the formatted output.
padding (bool): Whether to apply padding to the tokens.
truncation (bool): Whether to truncate tokens if exceeding max_length.
max_length (Optional[int]): Maximum length of the formatted output.
return_tensors (Optional[Union[str, TensorType]]): Return type for tokenized outputs.
return_dict (bool): Whether to return the output as a dictionary.
tokenizer_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments for the tokenizer.
**kwargs: Additional keyword arguments.
Returns:
Formatted conversation based on the provided chat template.
"""
try:
import jinja2
from jinja2.exceptions import TemplateError
from jinja2.sandbox import ImmutableSandboxedEnvironment
except ImportError:
raise ImportError("apply_chat_template requires jinja2 to be installed.")
if version.parse(jinja2.__version__) < version.parse("3.0.0"):
raise ImportError(
"apply_chat_template requires jinja2>=3.0.0 to be installed. Your version is " f"{jinja2.__version__}."
)
def raise_exception(message):
"""
Helper function to raise a TemplateError with a specified message.
Args:
message (str): Error message to raise.
Raises:
TemplateError: Exception with the provided message.
"""
raise TemplateError(message)
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
jinja_env.globals["raise_exception"] = raise_exception
return jinja_env.from_string(chat_template)
@lru_cache
def _compile_jinja_template(self, chat_template):
"""
Compiles a Jinja template using a sandboxed environment.
Args:
chat_template (str): The Jinja template string to compile.
Returns:
Jinja Template: Compiled Jinja template object.
"""
try:
import jinja2
from jinja2.exceptions import TemplateError
from jinja2.sandbox import ImmutableSandboxedEnvironment
except ImportError:
raise ImportError("_compile_jinja_template requires jinja2 to be installed.")
if version.parse(jinja2.__version__) < version.parse("3.0.0"):
raise ImportError(
"_compile_jinja_template requires jinja2>=3.0.0 to be installed. Your version is " f"{jinja2.__version__}."
)
def raise_exception(message):
"""
Helper function to raise a TemplateError with a specified message.
Args:
message (str): Error message to raise.
Raises:
TemplateError: Exception with the provided message.
"""
raise TemplateError(message)
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
jinja_env.globals["raise_exception"] = raise_exception
return jinja_env.from_string(chat_template)
@property
def default_chat_template(self):
"""
Property representing the default chat template in ChatML format.
Returns:
str: Default chat template.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using a default chat template "
"that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ '<|im_start|>assistant\n' }}"
"{% endif %}"
)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Union[str, os.PathLike],
*init_inputs,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
trust_remote_code=False,
**kwargs,
):
"""
Creates an instance of the class from a pretrained model or path.
Args:
pretrained_model_name_or_path (Union[str, os.PathLike]): Name or path of the pretrained model.
*init_inputs: Additional positional arguments for initialization.
cache_dir (Optional[Union[str, os.PathLike]]): Optional directory to cache downloaded files.
force_download (bool): Whether to force download the model files.
local_files_only (bool): Whether to use only local files without downloading.
token (Optional[Union[str, bool]]): Token to authenticate access to the pretrained model.
revision (str): Revision of the pretrained model to use.
trust_remote_code (bool): Whether to trust remote code for model initialization.
**kwargs: Additional keyword arguments.
Returns:
Instance of the class initialized with the pretrained model.
"""
@staticmethod
def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
return max_model_length
@classmethod
def convert_added_tokens(cls, obj: Union[AddedToken, Any], save=False, add_type_field=True):
if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
obj.pop("__type")
return AddedToken(**obj)
if isinstance(obj, AddedToken) and save:
obj = obj.__getstate__()
if add_type_field:
obj["__type"] = "AddedToken"
else:
obj.pop("special")
return obj
elif isinstance(obj, (list, tuple)):
return [cls.convert_added_tokens(o, save=save, add_type_field=add_type_field) for o in obj]
elif isinstance(obj, dict):
return {k: cls.convert_added_tokens(v, save=save, add_type_field=add_type_field) for k, v in obj.items()}
return obj
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
push_to_hub: bool = False,
**kwargs,
):
pass
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
):
pass
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
"""
if legacy_format is False:
raise ValueError(
"Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
)
save_directory = str(save_directory)
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)
added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
logger.info(f"added tokens file saved in {added_tokens_file}")
vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
return file_names + vocab_files + (added_tokens_file,)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save only the vocabulary of the tokenizer (vocabulary + added tokens).
This method won't save the configuration and special token mappings of the tokenizer. Use
[`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
An optional prefix to add to the named of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
raise NotImplementedError
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,
"""
**kwargs: Passed along to the `.tokenize()` method.
""",
"""
Returns:
`List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the text.
""",
)
def encode(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
encoded_inputs = self.encode_plus(
text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
return_tensors=return_tensors,
**kwargs,
)
return encoded_inputs["input_ids"]
def num_special_tokens_to_add(self, pair: bool = False) -> int:
raise NotImplementedError
def _get_padding_truncation_strategies(
self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
def _call_one(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
text=text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
raise NotImplementedError
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
"""
Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded. This can be a list of
string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
details in `encode_plus`).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
raise NotImplementedError
def pad(
self,
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
):
raise NotImplementedError
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
raise NotImplementedError
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens.
This implementation does not add special tokens and this method should be overridden in a subclass.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The model input with special tokens.
"""
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
"""
Placeholder method that should be overridden in a subclass to prepare model inputs with special tokens.
This method includes various options for processing inputs such as padding, truncation, and returning
tensors in specific formats.
Args:
ids (`List[int]`): List of input token IDs.
pair_ids (`Optional[List[int]]`, *optional*): List of token IDs for the second sequence in pair inputs.
add_special_tokens (`bool`, *optional*): Whether to add special tokens to the input sequences.
padding (`Union[bool, str, PaddingStrategy]`, *optional*): Padding strategy or boolean for padding sequences.
truncation (`Union[bool, str, TruncationStrategy]`, *optional*): Truncation strategy or boolean for truncating sequences.
max_length (`Optional[int]`, *optional*): Maximum length of the sequences after processing.
stride (`int`, *optional*): Stride to use when overflowing tokens.
pad_to_multiple_of (`Optional[int]`, *optional*): Pad to a multiple of this value.
return_tensors (`Optional[Union[str, TensorType]]`, *optional*): Return type of tensors (e.g., 'tf', 'pt').
return_token_type_ids (`Optional[bool]`, *optional*): Whether to return token type IDs.
return_attention_mask (`Optional[bool]`, *optional*): Whether to return attention mask.
return_overflowing_tokens (`bool`, *optional*): Whether to return overflowing tokens.
return_special_tokens_mask (`bool`, *optional*): Whether to return special tokens mask.
return_offsets_mapping (`bool`, *optional*): Whether to return offsets mapping.
return_length (`bool`, *optional*): Whether to return sequence lengths.
verbose (`bool`, *optional*): Whether to print verbose logs.
prepend_batch_axis (`bool`, *optional*): Whether to prepend batch axis to the returned tensors.
**kwargs: Additional keyword arguments for specific implementations.
Returns:
`Dict[str, Union[torch.Tensor, tf.Tensor, np.ndarray]]`: Dictionary with model inputs prepared according to the specified arguments.
"""
raise NotImplementedError
def truncate_sequences(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
):
"""
Truncate sequences of token IDs to a specified maximum length.
Args:
ids (`List[int]`): List of input token IDs.
pair_ids (`Optional[List[int]]`, *optional*): List of token IDs for the second sequence in pair inputs.
num_tokens_to_remove (`int`, *optional*): Number of tokens to remove from the sequences.
truncation_strategy (`Union[str, TruncationStrategy]`, *optional*): Strategy for truncation ('longest_first' or 'only_first').
stride (`int`, *optional*): Stride to use when overflowing tokens.
"""
raise NotImplementedError
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
"""
Internal method to pad encoded inputs to a specified length.
Args:
encoded_inputs (`Union[Dict[str, EncodedInput], BatchEncoding]`): Dictionary or BatchEncoding object containing encoded inputs.
max_length (`Optional[int]`, *optional*): Maximum length to pad sequences to.
padding_strategy (`PaddingStrategy`, *optional*): Strategy for padding sequences.
pad_to_multiple_of (`Optional[int]`, *optional*): Pad to a multiple of this value.
return_attention_mask (`Optional[bool]`, *optional*): Whether to return attention mask.
Returns:
`Dict[str, torch.Tensor]`: Dictionary containing padded inputs.
"""
raise NotImplementedError
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Converts a sequence of tokens into a single string representation.
Args:
tokens (`List[str]`): List of tokens to join.
Returns:
`str`: Joined string of tokens.
"""
raise NotImplementedError
def batch_decode(
self,
sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
):
"""
Batch decode sequences of token IDs into a list of strings.
Args:
sequences (`Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"]`): List or tensor of token IDs.
skip_special_tokens (`bool`, *optional*): Whether to skip special tokens during decoding.
clean_up_tokenization_spaces (`bool`, *optional*): Whether to clean up tokenization spaces in the decoded text.
**kwargs: Additional keyword arguments for specific implementations.
"""
raise NotImplementedError
) -> List[str]:
"""
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`List[str]`: The list of decoded sentences.
"""
return [
self.decode(
seq,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
for seq in sequences
]
def decode(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
"""
Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`str`: The decoded sentence.
"""
token_ids = to_py_obj(token_ids)
return self._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
def _decode(
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
"""
Internal method to convert token ids into a string, with options to remove special tokens and clean up
tokenization spaces.
Args:
token_ids (`Union[int, List[int]]`):
List of tokenized input ids.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`str`: The decoded sentence.
"""
return self.convert_tokens_to_string(
self.convert_ids_to_tokens(token_ids),
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
) -> str:
raise NotImplementedError
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of ids of the second sequence.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
assert already_has_special_tokens and token_ids_1 is None, (
"You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
"Please use a slow (full python) tokenizer to activate this argument. "
"Or set `return_special_tokens_mask=True` when calling the encoding method "
"to get the special tokens mask in any tokenizer. "
)
all_special_ids = self.all_special_ids
special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
return special_tokens_mask
@staticmethod
def clean_up_tokenization(out_string: str) -> str:
"""
Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
Args:
out_string (`str`): The text to clean up.
Returns:
`str`: The cleaned-up string.
"""
out_string = (
out_string.replace(" .", ".")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ,", ",")
.replace(" ' ", "'")
.replace(" n't", "n't")
.replace(" 'm", "'m")
.replace(" 's", "'s")
.replace(" 've", "'ve")
.replace(" 're", "'re")
)
return out_string
def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
"""
根据输入和内部状态,可能会触发关于序列过长的警告,超过了模型指定的最大长度。
Args:
ids (`List[str]`): 标记化产生的 id 列表
max_length (`int`, *optional*): 所需的最大长度(如果设置则不会触发警告)
verbose (`bool`): 是否打印更多信息和警告。
"""
if max_length is None and len(ids) > self.model_max_length and verbose:
if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
"will result in indexing errors"
)
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
def _switch_to_input_mode(self):
"""
将分词器切换到输入模式的私有方法(当分词器具有不同的输入/输出模式时)
"""
pass
def _switch_to_target_mode(self):
"""
将分词器切换到目标模式的私有方法(当分词器具有不同的输入/输出模式时)
"""
pass
@contextmanager
def as_target_tokenizer(self):
"""
临时将分词器设置为用于编码目标的模式。对于需要为标签处理稍微不同的序列到序列模型的分词器非常有用。
"""
warnings.warn(
"`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
"labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
"your input texts if you use the same keyword arguments, or in a separate call."
)
self._switch_to_target_mode()
self._in_target_context_manager = True
yield
self._in_target_context_manager = False
self._switch_to_input_mode()
@classmethod
def register_for_auto_class(cls, auto_class="AutoTokenizer"):
"""
Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
library are already mapped with `AutoTokenizer`.
<Tip warning={true}>
This API is experimental and may have some slight breaking changes in the next releases.
</Tip>
Args:
auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
The auto class to register this new tokenizer with.
"""
if not isinstance(auto_class, str):
auto_class = auto_class.__name__
import transformers.models.auto as auto_module
if not hasattr(auto_module, auto_class):
raise ValueError(f"{auto_class} is not a valid auto class.")
cls._auto_class = auto_class
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = None,
truncation: bool = True,
**kwargs,
"""
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.
Here is a short example:
model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)
If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:
model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]
See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
"""
warnings.warn(formatted_warning, FutureWarning)
kwargs.pop("src_lang", None)
kwargs.pop("tgt_lang", None)
if max_length is None:
max_length = self.model_max_length
model_inputs = self(
src_texts,
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
padding=padding,
truncation=truncation,
**kwargs,
)
if tgt_texts is None:
return model_inputs
if max_target_length is None:
max_target_length = max_length
with self.as_target_tokenizer():
labels = self(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
max_length=max_target_length,
truncation=truncation,
**kwargs,
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
def get_fast_tokenizer_file(tokenization_files: List[str]) -> str:
"""
Get the tokenization file to use for this version of transformers.
Args:
tokenization_files (`List[str]`): The list of available configuration files.
Returns:
`str`: The tokenization file to use.
"""
tokenizer_files_map = {}
for file_name in tokenization_files:
search = _re_tokenizer_file.search(file_name)
if search is not None:
v = search.groups()[0]
tokenizer_files_map[v] = file_name
available_versions = sorted(tokenizer_files_map.keys())
tokenizer_file = FULL_TOKENIZER_FILE
transformers_version = version.parse(__version__)
for v in available_versions:
if version.parse(v) <= transformers_version:
tokenizer_file = tokenizer_files_map[v]
else:
break
return tokenizer_file
PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None:
PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
)
.\tokenization_utils_fast.py
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""
import copy
import json
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
from .convert_slow_tokenizer import convert_slow_tokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
INIT_TOKENIZER_DOCSTRING,
AddedToken,
BatchEncoding,
PreTokenizedInput,
PreTokenizedInputPair,
PreTrainedTokenizerBase,
SpecialTokensMixin,
TextInput,
TextInputPair,
TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging
logger = logging.get_logger(__name__)
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
ADDED_TOKENS_FILE = "added_tokens.json"
INIT_TOKENIZER_DOCSTRING += """
tokenizer_object ([`tokenizers.Tokenizer`]):
A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
tokenizers](../fast_tokenizers) for more information.
tokenizer_file ([`str`]):
A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
tokenizers.
"""
MODEL_TO_TRAINER_MAPPING = {
"BPE": BpeTrainer,
"Unigram": UnigramTrainer,
"WordLevel": WordLevelTrainer,
"WordPiece": WordPieceTrainer,
}
VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
"""
Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
"""
"""
Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class: PreTrainedTokenizer = None
@property
def is_fast(self) -> bool:
return True
@property
def can_save_slow_tokenizer(self) -> bool:
"""
`bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
can only be `True` if the original `"sentencepiece.model"` was not deleted.
"""
return True
@property
def vocab_size(self) -> int:
"""
`int`: Size of the base vocabulary (without the added tokens).
"""
return self._tokenizer.get_vocab_size(with_added_tokens=False)
def get_vocab(self) -> Dict[str, int]:
return self._tokenizer.get_vocab(with_added_tokens=True)
@property
def vocab(self) -> Dict[str, int]:
return self.get_vocab()
@property
def added_tokens_encoder(self) -> Dict[str, int]:
"""
Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
optimisation in `self._added_tokens_encoder` for the slow tokenizers.
"""
return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
@property
def added_tokens_decoder(self) -> Dict[int, AddedToken]:
"""
Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
Returns:
`Dict[str, int]`: The added tokens.
"""
return self._tokenizer.get_added_tokens_decoder()
def get_added_vocab(self) -> Dict[str, int]:
"""
Returns the added tokens in the vocabulary as a dictionary of token to index.
Returns:
`Dict[str, int]`: The added tokens.
"""
return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
def __len__(self) -> int:
"""
Size of the full vocabulary with the added tokens.
"""
return self._tokenizer.get_vocab_size(with_added_tokens=True)
@property
def backend_tokenizer(self) -> TokenizerFast:
"""
`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
"""
return self._tokenizer
@property
def decoder(self) -> DecoderFast:
"""
`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
"""
return self._tokenizer.decoder
def _convert_encoding(
self,
encoding: EncodingFast,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> Tuple[Dict[str, Any], List[EncodingFast]]:
"""
Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
of encodings, take care of building a batch from overflowing tokens.
Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
lists (overflows) of lists (tokens).
Output shape: (overflows, sequence length)
"""
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_overflowing_tokens and encoding.overflowing is not None:
encodings = [encoding] + encoding.overflowing
else:
encodings = [encoding]
encoding_dict = defaultdict(list)
for e in encodings:
encoding_dict["input_ids"].append(e.ids)
if return_token_type_ids:
encoding_dict["token_type_ids"].append(e.type_ids)
if return_attention_mask:
encoding_dict["attention_mask"].append(e.attention_mask)
if return_special_tokens_mask:
encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
if return_offsets_mapping:
encoding_dict["offset_mapping"].append(e.offsets)
if return_length:
encoding_dict["length"].append(len(e.ids))
return encoding_dict, encodings
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""
Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
def _convert_token_to_id_with_added_voc(self, token: str) -> int:
index = self._tokenizer.token_to_id(token)
if index is None:
return self.unk_token_id
return index
def _convert_id_to_token(self, index: int) -> Optional[str]:
return self._tokenizer.id_to_token(int(index))
def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
if special_tokens:
return self._tokenizer.add_special_tokens(new_tokens)
else:
return self._tokenizer.add_tokens(new_tokens)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
"""
返回在编码序列时添加的特殊标记数量。
<Tip>
这会对虚拟输入进行编码并检查添加的标记数量,因此效率较低。不要将此函数放在训练循环中。
</Tip>
Args:
pair (`bool`, *optional*, 默认为 `False`):
是否在序列对(sequence pair)情况下计算添加的特殊标记数量,或单独序列的情况。
Returns:
`int`: 添加到序列中的特殊标记数量。
"""
return self._tokenizer.num_special_tokens_to_add(pair)
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
"""
使用词汇表和已添加的标记,将单个索引或索引序列转换为标记或标记序列。
Args:
ids (`int` 或 `List[int]`):
要转换为标记或标记序列的标记 ID(或标记 IDs)。
skip_special_tokens (`bool`, *optional*, 默认为 `False`):
是否在解码时跳过特殊标记。
Returns:
`str` 或 `List[str]`: 解码后的标记(或标记列表)。
"""
if isinstance(ids, int):
return self._tokenizer.id_to_token(ids)
tokens = []
for index in ids:
index = int(index)
if skip_special_tokens and index in self.all_special_ids:
continue
tokens.append(self._tokenizer.id_to_token(index))
return tokens
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
def set_truncation_and_padding(
self,
padding_strategy: PaddingStrategy,
truncation_strategy: TruncationStrategy,
max_length: int,
stride: int,
pad_to_multiple_of: Optional[int],
"""
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
library) and restore the tokenizer settings afterwards.
The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
section.
Args:
padding_strategy ([`~utils.PaddingStrategy`]):
The kind of padding that will be applied to the input
truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
The kind of truncation that will be applied to the input
max_length (`int`):
The maximum size of a sequence.
stride (`int`):
The stride to use when handling overflow.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
"""
_truncation = self._tokenizer.truncation
_padding = self._tokenizer.padding
if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
if _truncation is not None:
self._tokenizer.no_truncation()
else:
target = {
"max_length": max_length,
"stride": stride,
"strategy": truncation_strategy.value,
"direction": self.truncation_side,
}
if _truncation is None:
current = None
else:
current = {k: _truncation.get(k, None) for k in target}
if current != target:
self._tokenizer.enable_truncation(**target)
if padding_strategy == PaddingStrategy.DO_NOT_PAD:
if _padding is not None:
self._tokenizer.no_padding()
else:
length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
target = {
"length": length,
"direction": self.padding_side,
"pad_id": self.pad_token_id,
"pad_token": self.pad_token,
"pad_type_id": self.pad_token_type_id,
"pad_to_multiple_of": pad_to_multiple_of,
}
if _padding != target:
self._tokenizer.enable_padding(**target)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
):
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
batched_input = [(text, text_pair)] if text_pair else [text]
batched_output = self._batch_encode_plus(
batched_input,
is_split_into_words=is_split_into_words,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.backend_tokenizer.decoder.decode(tokens)
def _decode(
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
if isinstance(token_ids, int):
token_ids = [token_ids]
text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
file containing {config + vocab + added-tokens}.
"""
save_directory = str(save_directory)
if self.slow_tokenizer_class is None and legacy_format is True:
raise ValueError(
"Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
" might consider leaving the legacy_format at `None` or setting it to `False`."
)
save_slow = (
(legacy_format is None or legacy_format is True)
and self.slow_tokenizer_class is not None
and self.can_save_slow_tokenizer
)
save_fast = legacy_format is None or legacy_format is False
if save_slow:
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)
added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
file_names = file_names + vocab_files + (added_tokens_file,)
if save_fast:
tokenizer_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
)
self.backend_tokenizer.save(tokenizer_file)
file_names = file_names + (tokenizer_file,)
return file_names
.\tools\agents.py
import importlib.util
import json
import os
import time
from dataclasses import dataclass
from typing import Dict
import requests
from huggingface_hub import HfFolder, hf_hub_download, list_spaces
from ..models.auto import AutoTokenizer
from ..utils import is_offline_mode, is_openai_available, is_torch_available, logging
from .base import TASK_MAPPING, TOOL_CONFIG_FILE, Tool, load_tool, supports_remote
from .prompts import CHAT_MESSAGE_PROMPT, download_prompt
from .python_interpreter import evaluate
logger = logging.get_logger(__name__)
if is_openai_available():
import openai
if is_torch_available():
from ..generation import StoppingCriteria, StoppingCriteriaList
from ..models.auto import AutoModelForCausalLM
else:
StoppingCriteria = object
_tools_are_initialized = False
BASE_PYTHON_TOOLS = {
"print": print,
"range": range,
"float": float,
"int": int,
"bool": bool,
"str": str,
}
@dataclass
class PreTool:
task: str
description: str
repo_id: str
HUGGINGFACE_DEFAULT_TOOLS = {}
HUGGINGFACE_DEFAULT_TOOLS_FROM_HUB = [
"image-transformation",
"text-download",
"text-to-image",
"text-to-video",
]
def get_remote_tools(organization="huggingface-tools"):
if is_offline_mode():
logger.info("You are in offline mode, so remote tools are not available.")
return {}
spaces = list_spaces(author=organization)
tools = {}
for space_info in spaces:
repo_id = space_info.id
resolved_config_file = hf_hub_download(repo_id, TOOL_CONFIG_FILE, repo_type="space")
with open(resolved_config_file, encoding="utf-8") as reader:
config = json.load(reader)
task = repo_id.split("/")[-1]
tools[config["name"]] = PreTool(task=task, description=config["description"], repo_id=repo_id)
return tools
def _setup_default_tools():
global HUGGINGFACE_DEFAULT_TOOLS
global _tools_are_initialized
if _tools_are_initialized:
return
main_module = importlib.import_module("transformers")
tools_module = main_module.tools
remote_tools = get_remote_tools()
for task_name, tool_class_name in TASK_MAPPING.items():
tool_class = getattr(tools_module, tool_class_name)
description = tool_class.description
HUGGINGFACE_DEFAULT_TOOLS[tool_class.name] = PreTool(task=task_name, description=description, repo_id=None)
if not is_offline_mode():
for task_name in HUGGINGFACE_DEFAULT_TOOLS_FROM_HUB:
found = False
for tool_name, tool in remote_tools.items():
if tool.task == task_name:
HUGGINGFACE_DEFAULT_TOOLS[tool_name] = tool
found = True
break
if not found:
raise ValueError(f"{task_name} is not implemented on the Hub.")
_tools_are_initialized = True
def resolve_tools(code, toolbox, remote=False, cached_tools=None):
if cached_tools is None:
resolved_tools = BASE_PYTHON_TOOLS.copy()
else:
resolved_tools = cached_tools
for name, tool in toolbox.items():
if name not in code or name in resolved_tools:
continue
if isinstance(tool, Tool):
resolved_tools[name] = tool
else:
task_or_repo_id = tool.task if tool.repo_id is None else tool.repo_id
_remote = remote and supports_remote(task_or_repo_id)
resolved_tools[name] = load_tool(task_or_repo_id, remote=_remote)
return resolved_tools
def get_tool_creation_code(code, toolbox, remote=False):
code_lines = ["from transformers import load_tool", ""]
for name, tool in toolbox.items():
if name not in code or isinstance(tool, Tool):
continue
task_or_repo_id = tool.task if tool.repo_id is None else tool.repo_id
line = f'{name} = load_tool("{task_or_repo_id}"'
if remote:
line += ", remote=True"
line += ")"
code_lines.append(line)
return "\n".join(code_lines) + "\n"
def clean_code_for_chat(result):
lines = result.split("\n")
idx = 0
while idx < len(lines) and not lines[idx].lstrip().startswith("```"):
idx += 1
explanation = "\n".join(lines[:idx]).strip()
if idx == len(lines):
return explanation, None
idx += 1
start_idx = idx
while not lines[idx].lstrip().startswith("```"):
idx += 1
code = "\n".join(lines[start_idx:idx]).strip()
return explanation, code
def clean_code_for_run(result):
result = f"I will use the following {result}"
explanation, code = result.split("Answer:")
explanation = explanation.strip()
code = code.strip()
code_lines = code.split("\n")
if code_lines[0] in ["```", "```", "```"]:
code_lines = code_lines[1:]
if code_lines[-1] == "```":
code_lines = code_lines[:-1]
code = "\n".join(code_lines)
return explanation, code
Args:
chat_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`chat_prompt_template.txt` in this repo in this case.
run_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `run` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`run_prompt_template.txt` in this repo in this case.
additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
one of the default tools, that default tool will be overridden.
"""
# 定义一个类,用于处理对话生成模型的配置和工具
def __init__(self, chat_prompt_template=None, run_prompt_template=None, additional_tools=None):
# 设置默认工具集
_setup_default_tools()
# 获取当前类的名称作为代理名称
agent_name = self.__class__.__name__
# 下载指定的对话生成模板,用于聊天模式
self.chat_prompt_template = download_prompt(chat_prompt_template, agent_name, mode="chat")
# 下载指定的对话生成模板,用于运行模式
self.run_prompt_template = download_prompt(run_prompt_template, agent_name, mode="run")
# 复制默认的 Hugging Face 工具集到代理的工具箱
self._toolbox = HUGGINGFACE_DEFAULT_TOOLS.copy()
# 设置日志功能为打印输出
self.log = print
# 如果提供了额外的工具,则更新代理的工具箱
if additional_tools is not None:
# 如果 additional_tools 是列表或元组,则将其转换为字典,以工具名称作为键
if isinstance(additional_tools, (list, tuple)):
additional_tools = {t.name: t for t in additional_tools}
# 如果 additional_tools 不是字典,则将其转换为包含单个工具的字典
elif not isinstance(additional_tools, dict):
additional_tools = {additional_tools.name: additional_tools}
# 找出在 additional_tools 中已经存在于默认工具集中的工具,并将其用新工具替换
replacements = {name: tool for name, tool in additional_tools.items() if name in HUGGINGFACE_DEFAULT_TOOLS}
# 更新代理的工具箱
self._toolbox.update(additional_tools)
# 如果有工具被替换了,则记录警告信息
if len(replacements) > 1:
names = "\n".join([f"- {n}: {t}" for n, t in replacements.items()])
logger.warning(
f"The following tools have been replaced by the ones provided in `additional_tools`:\n{names}."
)
elif len(replacements) == 1:
name = list(replacements.keys())[0]
logger.warning(f"{name} has been replaced by {replacements[name]} as provided in `additional_tools`.")
# 准备进行新的聊天会话的初始化工作
self.prepare_for_new_chat()
@property
def toolbox(self) -> Dict[str, Tool]:
"""Get all tool currently available to the agent"""
# 返回当前代理可用的所有工具集合
return self._toolbox
def format_prompt(self, task, chat_mode=False):
# 构建描述工具的字符串,每行包含工具名称和描述
description = "\n".join([f"- {name}: {tool.description}" for name, tool in self.toolbox.items()])
# 根据聊天模式选择不同的提示模板
if chat_mode:
# 如果历史记录为空,使用聊天提示模板并替换工具描述部分
if self.chat_history is None:
prompt = self.chat_prompt_template.replace("<<all_tools>>", description)
else:
# 否则,使用已有的聊天历史记录
prompt = self.chat_history
# 添加当前任务到聊天提示中
prompt += CHAT_MESSAGE_PROMPT.replace("<<task>>", task)
else:
# 使用运行提示模板并替换工具描述和任务部分
prompt = self.run_prompt_template.replace("<<all_tools>>", description)
prompt = prompt.replace("<<prompt>>", task)
# 返回生成的提示
return prompt
def set_stream(self, streamer):
"""
Set the function use to stream results (which is `print` by default).
Args:
streamer (`callable`): The function to call when streaming results from the LLM.
"""
# 设置日志输出函数,用于从语言模型生成的结果流式输出
self.log = streamer
def chat(self, task, *, return_code=False, remote=False, **kwargs):
"""
Sends a new request to the agent in a chat. Will use the previous ones in its history.
Args:
task (`str`): The task to perform
return_code (`bool`, *optional*, defaults to `False`):
Whether to just return code and not evaluate it.
remote (`bool`, *optional*, defaults to `False`):
Whether or not to use remote tools (inference endpoints) instead of local ones.
kwargs (additional keyword arguments, *optional*):
Any keyword argument to send to the agent when evaluating the code.
"""
# 根据任务构建格式化后的提示字符串,用于发送给语言模型
prompt = self.format_prompt(task, chat_mode=True)
# 生成一个对话回复,停止条件为指定字符串
result = self.generate_one(prompt, stop=["Human:", "====="])
# 更新聊天历史记录,包含之前的提示和生成的结果
self.chat_history = prompt + result.strip() + "\n"
# 清理生成的代码,获取解释和代码
explanation, code = clean_code_for_chat(result)
# 输出语言模型生成的解释信息
self.log(f"==Explanation from the agent==\n{explanation}")
# 如果生成了代码,则输出生成的代码,并根据需求返回或评估代码结果
if code is not None:
self.log(f"\n\n==Code generated by the agent==\n{code}")
if not return_code:
# 解析工具并返回评估结果
self.cached_tools = resolve_tools(code, self.toolbox, remote=remote, cached_tools=self.cached_tools)
self.chat_state.update(kwargs)
return evaluate(code, self.cached_tools, self.chat_state, chat_mode=True)
else:
# 获取生成代码的工具创建代码并返回完整的生成代码
tool_code = get_tool_creation_code(code, self.toolbox, remote=remote)
return f"{tool_code}\n{code}"
def prepare_for_new_chat(self):
"""
Clears the history of prior calls to [`~Agent.chat`].
"""
# 清空聊天历史记录
self.chat_history = None
# 重置聊天状态
self.chat_state = {}
# 清空缓存的工具
self.cached_tools = None
def clean_code_for_run(self, result):
"""
Override this method if you want to change the way the code is
cleaned for the `run` method.
"""
# 调用特定函数来清理运行代码
return clean_code_for_run(result)
def run(self, task, *, return_code=False, remote=False, **kwargs):
"""
Sends a request to the agent.
Args:
task (`str`): The task to perform
return_code (`bool`, *optional*, defaults to `False`):
Whether to just return code and not evaluate it.
remote (`bool`, *optional*, defaults to `False`):
Whether or not to use remote tools (inference endpoints) instead of local ones.
kwargs (additional keyword arguments, *optional*):
Any keyword argument to send to the agent when evaluating the code.
Example:
```
from transformers import HfAgent
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
agent.run("Draw me a picture of rivers and lakes")
```
"""
# 格式化任务提示信息
prompt = self.format_prompt(task)
# 生成一次结果,返回解释和生成的代码
result = self.generate_one(prompt, stop=["Task:"])
explanation, code = self.clean_code_for_run(result)
# 记录代理生成的解释
self.log(f"==Explanation from the agent==\n{explanation}")
# 记录代理生成的代码
self.log(f"\n\n==Code generated by the agent==\n{code}")
if not return_code:
# 如果不仅返回代码而不评估,则解析工具并评估生成的代码
self.log("\n\n==Result==")
self.cached_tools = resolve_tools(code, self.toolbox, remote=remote, cached_tools=self.cached_tools)
return evaluate(code, self.cached_tools, state=kwargs.copy())
else:
# 获取工具创建代码并返回与生成代码的组合结果
tool_code = get_tool_creation_code(code, self.toolbox, remote=remote)
return f"{tool_code}\n{code}"
def generate_one(self, prompt, stop):
# This is the method to implement in your custom agent.
# 这是需要在自定义代理中实现的方法,抛出未实现错误
raise NotImplementedError
def generate_many(self, prompts, stop):
# Override if you have a way to do batch generation faster than one by one
# 如果有批量生成的更快方式,则重写此方法
return [self.generate_one(prompt, stop) for prompt in prompts]
"""
Agent that uses the openai API to generate code.
<Tip warning={true}>
The openAI models are used in generation mode, so even for the `chat()` API, it's better to use models like
`"text-davinci-003"` over the chat-GPT variant. Proper support for chat-GPT models will come in a next version.
</Tip>
Args:
model (`str`, *optional*, defaults to `"text-davinci-003"`):
The name of the OpenAI model to use.
api_key (`str`, *optional*):
The API key to use. If unset, will look for the environment variable `"OPENAI_API_KEY"`.
chat_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`chat_prompt_template.txt` in this repo in this case.
run_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `run` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`run_prompt_template.txt` in this repo in this case.
additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
one of the default tools, that default tool will be overridden.
Example:
```
from transformers import OpenAiAgent
agent = OpenAiAgent(model="text-davinci-003", api_key=xxx)
agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
```
"""
def __init__(
self,
model="text-davinci-003",
api_key=None,
chat_prompt_template=None,
run_prompt_template=None,
additional_tools=None,
):
# 检查是否安装了 openai 库,如果没有则抛出 ImportError 异常
if not is_openai_available():
raise ImportError("Using `OpenAiAgent` requires `openai`: `pip install openai`.")
# 如果未提供 API 密钥,则尝试从环境变量 "OPENAI_API_KEY" 中获取
if api_key is None:
api_key = os.environ.get("OPENAI_API_KEY", None)
# 如果仍未设置 API 密钥,则抛出 ValueError 异常
if api_key is None:
raise ValueError(
"You need an openai key to use `OpenAIAgent`. You can get one here: Get one here "
"https://openai.com/api/`. If you have one, set it in your env with `os.environ['OPENAI_API_KEY'] = "
"xxx."
)
else:
# 设置 openai 的 API 密钥
openai.api_key = api_key
# 初始化父类 Agent,传入可能的自定义聊天和运行模板以及额外工具
super().__init__(
chat_prompt_template=chat_prompt_template,
run_prompt_template=run_prompt_template,
additional_tools=additional_tools,
)
# 根据给定的 prompts 列表和 stop 标志,生成多个聊天或完成文本
def generate_many(self, prompts, stop):
# 如果模型名称中包含 "gpt",则使用 _chat_generate 方法生成每个 prompt 的结果
if "gpt" in self.model:
return [self._chat_generate(prompt, stop) for prompt in prompts]
else:
# 否则,使用 _completion_generate 方法生成所有 prompts 的结果
return self._completion_generate(prompts, stop)
# 根据给定的 prompt 和 stop 标志,生成一个聊天或完成文本
def generate_one(self, prompt, stop):
# 如果模型名称中包含 "gpt",则使用 _chat_generate 方法生成结果
if "gpt" in self.model:
return self._chat_generate(prompt, stop)
else:
# 否则,使用 _completion_generate 方法生成结果,并返回第一个元素
return self._completion_generate([prompt], stop)[0]
# 使用 OpenAI 的聊天 API 生成文本
def _chat_generate(self, prompt, stop):
# 调用 OpenAI 的 chat.completions.create 方法,传入模型、消息内容、温度和停止条件
result = openai.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0,
stop=stop,
)
# 返回生成的文本内容
return result.choices[0].message.content
# 使用 OpenAI 的 Completion API 生成文本
def _completion_generate(self, prompts, stop):
# 调用 OpenAI 的 Completion.create 方法,传入模型、prompt、温度、停止条件和最大 token 数
result = openai.Completion.create(
model=self.model,
prompt=prompts,
temperature=0,
stop=stop,
max_tokens=200,
)
# 返回生成的文本内容列表
return [answer["text"] for answer in result["choices"]]
"""
AzureOpenAiAgent 是一个继承自 Agent 的代理类,用于利用 Azure OpenAI 生成代码。参考官方文档来了解如何在 Azure 上部署 OpenAI 模型。
Args:
deployment_id (`str`):
要使用的已部署 Azure OpenAI 模型的名称。
api_key (`str`, *optional*):
要使用的 API 密钥。如果未设置,将查找环境变量 `"AZURE_OPENAI_API_KEY"`。
resource_name (`str`, *optional*):
Azure OpenAI 资源的名称。如果未设置,将查找环境变量 `"AZURE_OPENAI_RESOURCE_NAME"`。
api_version (`str`, *optional*, default to `"2022-12-01"`):
该代理使用的 API 版本。
is_chat_mode (`bool`, *optional*):
是否使用聊天模型而非完成模型(参见上述注释,聊天模型的效率较低)。默认根据 `deployment_id` 是否包含 `'gpt'` 来判断。
chat_prompt_template (`str`, *optional*):
如果要覆盖 `chat` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `chat_prompt_template.txt`。
run_prompt_template (`str`, *optional*):
如果要覆盖 `run` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `run_prompt_template.txt`。
additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
除默认工具外要包含的任何附加工具。如果传递与默认工具同名的工具,将覆盖默认工具。
Example:
```
from transformers import AzureOpenAiAgent
agent = AzureAiAgent(deployment_id="Davinci-003", api_key=xxx, resource_name=yyy)
agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
```
"""
def __init__(
self,
deployment_id,
api_key=None,
resource_name=None,
api_version="2022-12-01",
is_chat_model=None,
chat_prompt_template=None,
run_prompt_template=None,
additional_tools=None,
):
"""
初始化 AzureOpenAiAgent 实例。
Args:
deployment_id (`str`):
要使用的已部署 Azure OpenAI 模型的名称。
api_key (`str`, *optional*):
要使用的 API 密钥。如果未设置,将查找环境变量 `"AZURE_OPENAI_API_KEY"`。
resource_name (`str`, *optional*):
Azure OpenAI 资源的名称。如果未设置,将查找环境变量 `"AZURE_OPENAI_RESOURCE_NAME"`。
api_version (`str`, *optional*, default to `"2022-12-01"`):
该代理使用的 API 版本。
is_chat_mode (`bool`, *optional*):
是否使用聊天模型而非完成模型(参见上述注释,聊天模型的效率较低)。默认根据 `deployment_id` 是否包含 `'gpt'` 来判断。
chat_prompt_template (`str`, *optional*):
如果要覆盖 `chat` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `chat_prompt_template.txt`。
run_prompt_template (`str`, *optional*):
如果要覆盖 `run` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `run_prompt_template.txt`。
additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
除默认工具外要包含的任何附加工具。如果传递与默认工具同名的工具,将覆盖默认工具。
"""
super().__init__() # 调用父类 Agent 的构造函数
self.deployment_id = deployment_id
self.api_key = api_key if api_key else os.getenv("AZURE_OPENAI_API_KEY") # 设置 API 密钥,如果未提供则从环境变量获取
self.resource_name = resource_name if resource_name else os.getenv("AZURE_OPENAI_RESOURCE_NAME") # 设置 Azure OpenAI 资源名称,如果未提供则从环境变量获取
self.api_version = api_version # 设置 API 版本
self.is_chat_mode = is_chat_mode if is_chat_mode is not None else 'gpt' in deployment_id.lower() # 设置是否为聊天模式,默认根据 deployment_id 是否包含 'gpt' 来判断
self.chat_prompt_template = chat_prompt_template # 设置聊天模式的提示模板
self.run_prompt_template = run_prompt_template # 设置运行模式的提示模板
self.additional_tools = additional_tools # 设置额外的工具列表或字典
):
# 检查是否安装了 openai 库,如果未安装则抛出 ImportError 异常
if not is_openai_available():
raise ImportError("Using `OpenAiAgent` requires `openai`: `pip install openai`.")
# 设置部署 ID
self.deployment_id = deployment_id
# 设置 OpenAI API 类型为 "azure"
openai.api_type = "azure"
# 如果 API 密钥未提供,则尝试从环境变量中获取 AZURE_OPENAI_API_KEY
if api_key is None:
api_key = os.environ.get("AZURE_OPENAI_API_KEY", None)
# 如果仍然没有 API 密钥,则抛出 ValueError 异常
if api_key is None:
raise ValueError(
"You need an Azure openAI key to use `AzureOpenAIAgent`. If you have one, set it in your env with "
"`os.environ['AZURE_OPENAI_API_KEY'] = xxx."
)
else:
# 设置 OpenAI API 密钥
openai.api_key = api_key
# 如果资源名称未提供,则尝试从环境变量中获取 AZURE_OPENAI_RESOURCE_NAME
if resource_name is None:
resource_name = os.environ.get("AZURE_OPENAI_RESOURCE_NAME", None)
# 如果仍然没有资源名称,则抛出 ValueError 异常
if resource_name is None:
raise ValueError(
"You need a resource_name to use `AzureOpenAIAgent`. If you have one, set it in your env with "
"`os.environ['AZURE_OPENAI_RESOURCE_NAME'] = xxx."
)
else:
# 设置 OpenAI API 基础 URL
openai.api_base = f"https://{resource_name}.openai.azure.com"
# 设置 OpenAI API 版本
openai.api_version = api_version
# 如果 is_chat_model 未提供,则根据 deployment_id 决定是否是聊天模型
if is_chat_model is None:
is_chat_model = "gpt" in deployment_id.lower()
# 设置实例的 is_chat_model 属性
self.is_chat_model = is_chat_model
# 调用父类的构造函数,初始化实例
super().__init__(
chat_prompt_template=chat_prompt_template,
run_prompt_template=run_prompt_template,
additional_tools=additional_tools,
)
# 生成多个结果的方法
def generate_many(self, prompts, stop):
# 如果是聊天模型,则使用 _chat_generate 方法生成多个结果
if self.is_chat_model:
return [self._chat_generate(prompt, stop) for prompt in prompts]
else:
# 否则使用 _completion_generate 方法生成多个结果
return self._completion_generate(prompts, stop)
# 生成单个结果的方法
def generate_one(self, prompt, stop):
# 如果是聊天模型,则使用 _chat_generate 方法生成单个结果
if self.is_chat_model:
return self._chat_generate(prompt, stop)
else:
# 否则使用 _completion_generate 方法生成单个结果并返回第一个结果
return self._completion_generate([prompt], stop)[0]
# 聊天生成方法,使用 OpenAI ChatCompletion API
def _chat_generate(self, prompt, stop):
result = openai.ChatCompletion.create(
engine=self.deployment_id,
messages=[{"role": "user", "content": prompt}],
temperature=0,
stop=stop,
)
# 返回生成的聊天消息内容
return result["choices"][0]["message"]["content"]
# 完整生成方法,使用 OpenAI Completion API
def _completion_generate(self, prompts, stop):
result = openai.Completion.create(
engine=self.deployment_id,
prompt=prompts,
temperature=0,
stop=stop,
max_tokens=200,
)
# 返回生成的每个答案的文本内容列表
return [answer["text"] for answer in result["choices"]]
class HfAgent(Agent):
"""
Agent that uses an inference endpoint to generate code.
Args:
url_endpoint (`str`):
The name of the url endpoint to use.
token (`str`, *optional*):
The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated when
running `huggingface-cli login` (stored in `~/.huggingface`).
chat_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`chat_prompt_template.txt` in this repo in this case.
run_prompt_template (`str`, *optional*):
Pass along your own prompt if you want to override the default template for the `run` method. Can be the
actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
`run_prompt_template.txt` in this repo in this case.
additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
one of the default tools, that default tool will be overridden.
Example:
```
from transformers import HfAgent
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
```
"""
def __init__(
self, url_endpoint, token=None, chat_prompt_template=None, run_prompt_template=None, additional_tools=None
):
# 设置推理端点的URL
self.url_endpoint = url_endpoint
# 根据传入的token参数或者从本地获取的token来设置HTTP授权token
if token is None:
self.token = f"Bearer {HfFolder().get_token()}"
elif token.startswith("Bearer") or token.startswith("Basic"):
self.token = token
else:
self.token = f"Bearer {token}"
# 调用父类构造函数初始化Agent基类
super().__init__(
chat_prompt_template=chat_prompt_template,
run_prompt_template=run_prompt_template,
additional_tools=additional_tools,
)
# 生成一段文本,使用给定的提示和停止条件
def generate_one(self, prompt, stop):
# 设置请求头,包含授权信息
headers = {"Authorization": self.token}
# 构造请求体,包含输入提示、生成参数和停止条件
inputs = {
"inputs": prompt,
"parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},
}
# 发送 POST 请求到推理 API
response = requests.post(self.url_endpoint, json=inputs, headers=headers)
# 处理请求返回的状态码
if response.status_code == 429:
# 如果返回状态码为 429 表示请求过多,记录日志并等待一秒后重试
logger.info("Getting rate-limited, waiting a tiny bit before trying again.")
time.sleep(1)
return self._generate_one(prompt)
elif response.status_code != 200:
# 如果返回状态码不是 200,则抛出异常并附带错误信息
raise ValueError(f"Error {response.status_code}: {response.json()}")
# 解析响应内容,获取生成的文本结果
result = response.json()[0]["generated_text"]
# 检查生成的文本是否以任一停止序列结尾
for stop_seq in stop:
if result.endswith(stop_seq):
# 如果是,则返回去掉停止序列部分的文本
return result[: -len(stop_seq)]
# 如果没有匹配到停止序列,则直接返回生成的文本结果
return result
@classmethod
# 类方法装饰器,用于定义一个类的类方法,即可以不用实例化类就可以调用的方法
def from_local(cls, model_name_or_path, chat_prompt_template=None, run_prompt_template=None, additional_tools=None):
# 从本地加载模型和分词器
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# 返回一个新的 LocalAgent 实例,初始化时会传入加载的模型和分词器
return cls(model, tokenizer, chat_prompt_template, run_prompt_template, additional_tools)
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
"""
Convenience method to build a `LocalAgent` from a pretrained checkpoint.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
The name of a repo on the Hub or a local path to a folder containing both model and tokenizer.
kwargs (`Dict[str, Any]`, *optional*):
Keyword arguments passed along to [`~PreTrainedModel.from_pretrained`].
Example:
```
import torch
from transformers import LocalAgent
agent = LocalAgent.from_pretrained("bigcode/starcoder", device_map="auto", torch_dtype=torch.bfloat16)
agent.run("Draw me a picture of rivers and lakes.")
```
"""
# 使用预训练模型名称或路径创建 AutoModelForCausalLM 对象
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
# 使用预训练模型名称或路径创建 AutoTokenizer 对象
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
# 返回使用创建的模型和分词器构建的 LocalAgent 对象
return cls(model, tokenizer)
@property
def _model_device(self):
# 检查模型是否有 hf_device_map 属性,返回第一个设备映射的值
if hasattr(self.model, "hf_device_map"):
return list(self.model.hf_device_map.values())[0]
# 如果模型没有 hf_device_map 属性,则返回第一个参数的设备
for param in self.model.parameters():
return param.device
def generate_one(self, prompt, stop):
# 使用分词器对提示进行编码,返回张量,并移动到模型所在设备
encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self._model_device)
# 计算输入序列的长度
src_len = encoded_inputs["input_ids"].shape[1]
# 创建停止条件列表
stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop, self.tokenizer)])
# 使用模型生成文本序列,限制最大生成的新令牌数为200,并应用停止条件
outputs = self.model.generate(
encoded_inputs["input_ids"], max_new_tokens=200, stopping_criteria=stopping_criteria
)
# 解码生成的输出,去除原始输入部分并返回结果
result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
# 如果结果以停止序列之一结尾,则去除停止序列并更新结果
for stop_seq in stop:
if result.endswith(stop_seq):
result = result[: -len(stop_seq)]
# 返回生成的结果文本
return result
class StopSequenceCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever a sequence of tokens is encountered.
Args:
stop_sequences (`str` or `List[str]`):
The sequence (or list of sequences) on which to stop execution.
tokenizer:
The tokenizer used to decode the model outputs.
"""
def __init__(self, stop_sequences, tokenizer):
# 如果stop_sequences是单个字符串,则转换为列表
if isinstance(stop_sequences, str):
stop_sequences = [stop_sequences]
# 初始化停止序列和分词器
self.stop_sequences = stop_sequences
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# 解码输入的模型输出,转换为字符串
decoded_output = self.tokenizer.decode(input_ids.tolist()[0])
# 检查解码后的输出是否以任何停止序列结尾
return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences)