Transformers 源码解析(三)
.\convert_slow_tokenizers_checkpoints_to_fast.py
""" 转换慢速分词器检查点为快速分词器的序列化格式(tokenizers 库的序列化格式)"""
import argparse
import os
import transformers
from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
from .utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
TOKENIZER_CLASSES = {name: getattr(transformers, name + "Fast") for name in SLOW_TO_FAST_CONVERTERS}
def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download):
if tokenizer_name is not None and tokenizer_name not in TOKENIZER_CLASSES:
raise ValueError(f"Unrecognized tokenizer name, should be one of {list(TOKENIZER_CLASSES.keys())}.")
if tokenizer_name is None:
tokenizer_names = TOKENIZER_CLASSES
else:
tokenizer_names = {tokenizer_name: getattr(transformers, tokenizer_name + "Fast")}
logger.info(f"Loading tokenizer classes: {tokenizer_names}")
for tokenizer_name in tokenizer_names:
tokenizer_class = TOKENIZER_CLASSES[tokenizer_name]
add_prefix = True
if checkpoint_name is None:
checkpoint_names = list(tokenizer_class.max_model_input_sizes.keys())
else:
checkpoint_names = [checkpoint_name]
logger.info(f"For tokenizer {tokenizer_class.__class__.__name__} loading checkpoints: {checkpoint_names}")
for checkpoint in checkpoint_names:
logger.info(f"Loading {tokenizer_class.__class__.__name__} {checkpoint}")
tokenizer = tokenizer_class.from_pretrained(checkpoint, force_download=force_download)
logger.info(f"Save fast tokenizer to {dump_path} with prefix {checkpoint} add_prefix {add_prefix}")
if "/" in checkpoint:
checkpoint_directory, checkpoint_prefix_name = checkpoint.split("/")
dump_path_full = os.path.join(dump_path, checkpoint_directory)
elif add_prefix:
checkpoint_prefix_name = checkpoint
dump_path_full = dump_path
else:
checkpoint_prefix_name = None
dump_path_full = dump_path
logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")
if checkpoint in list(tokenizer.pretrained_vocab_files_map.values())[0]:
file_path = list(tokenizer.pretrained_vocab_files_map.values())[0][checkpoint]
next_char = file_path.split(checkpoint)[-1][0]
if next_char == "/":
dump_path_full = os.path.join(dump_path_full, checkpoint_prefix_name)
checkpoint_prefix_name = None
logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")
file_names = tokenizer.save_pretrained(
dump_path_full, legacy_format=False, filename_prefix=checkpoint_prefix_name
)
logger.info(f"=> File names {file_names}")
for file_name in file_names:
if not file_name.endswith("tokenizer.json"):
os.remove(file_name)
logger.info(f"=> removing {file_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--dump_path", default=None, type=str, required=True, help="Path to output generated fast tokenizer files."
)
parser.add_argument(
"--tokenizer_name",
default=None,
type=str,
help=(
f"Optional tokenizer type selected in the list of {list(TOKENIZER_CLASSES.keys())}. If not given, will "
"download and convert all the checkpoints from AWS."
),
)
parser.add_argument(
"--checkpoint_name",
default=None,
type=str,
help="Optional checkpoint name. If not given, will download and convert the canonical checkpoints from AWS.",
)
parser.add_argument(
"--force_download",
action="store_true",
help="Re-download checkpoints.",
)
args = parser.parse_args()
convert_slow_checkpoint_to_fast(args.tokenizer_name, args.checkpoint_name, args.dump_path, args.force_download)
.\convert_tf_hub_seq_to_seq_bert_to_pytorch.py
"""Convert Seq2Seq TF Hub checkpoint."""
import argparse
from . import (
BertConfig,
BertGenerationConfig,
BertGenerationDecoder,
BertGenerationEncoder,
load_tf_weights_in_bert_generation,
logging,
)
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
bert_config = BertConfig.from_pretrained(
"google-bert/bert-large-cased",
vocab_size=vocab_size,
max_position_embeddings=512,
is_decoder=True,
add_cross_attention=True,
)
bert_config_dict = bert_config.to_dict()
del bert_config_dict["type_vocab_size"]
config = BertGenerationConfig(**bert_config_dict)
if is_encoder:
model = BertGenerationEncoder(config)
else:
model = BertGenerationDecoder(config)
print(f"Building PyTorch model from configuration: {config}")
load_tf_weights_in_bert_generation(
model,
tf_hub_path,
model_class="bert",
is_encoder_named_decoder=is_encoder_named_decoder,
is_encoder=is_encoder,
)
print(f"Save PyTorch model and config to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--is_encoder_named_decoder",
action="store_true",
help="If decoder has to be renamed to encoder in PyTorch model.",
)
parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")
parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.tf_hub_path,
args.pytorch_dump_path,
args.is_encoder_named_decoder,
args.vocab_size,
is_encoder=args.is_encoder,
)
.\data\datasets\glue.py
import os
import time
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Union
import torch
from filelock import FileLock
from torch.utils.data import Dataset
from ...utils import logging
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures
logger = logging.get_logger(__name__)
@dataclass
class GlueDataTrainingArguments:
"""
用于指定模型训练和评估所需数据的参数。
使用 `HfArgumentParser` 可以将此类转换为 argparse 参数,以便能够在命令行上指定它们。
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
data_dir: str = field(
metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
)
max_seq_length: int = field(
default=128,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
def __post_init__(self):
self.task_name = self.task_name.lower()
class Split(Enum):
train = "train"
dev = "dev"
test = "test"
class GlueDataset(Dataset):
"""
此类将很快被一个与框架无关的方法替代。
"""
args: GlueDataTrainingArguments
output_mode: str
features: List[InputFeatures]
def __init__(
self,
args: GlueDataTrainingArguments,
tokenizer: PreTrainedTokenizerBase,
limit_length: Optional[int] = None,
mode: Union[str, Split] = Split.train,
cache_dir: Optional[str] = None,
):
"""
初始化 GLUE 数据集。
Args:
args: GlueDataTrainingArguments 类的实例,包含数据集相关参数。
tokenizer: 预训练的分词器。
limit_length: 可选参数,限制数据长度。
mode: 数据集模式,可以是字符串或 Split 枚举类型。
cache_dir: 可选参数,缓存目录。
"""
self.args = args
self.output_mode = glue_output_modes[args.task_name]
self.features = glue_convert_examples_to_features(
tokenizer=tokenizer,
examples=None,
max_length=args.max_seq_length,
task=args.task_name,
label_list=None,
output_mode=self.output_mode,
)
def __len__(self):
"""
返回数据集中的样本数量。
"""
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
"""
获取指定索引处的输入特征。
Args:
i: 索引值。
Returns:
输入特征的实例。
"""
return self.features[i]
def get_labels(self):
"""
返回数据集的标签列表。
"""
return self.label_list
.\data\datasets\language_modeling.py
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional
import torch
from filelock import FileLock
from torch.utils.data import Dataset
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
DEPRECATION_WARNING = (
"This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
"library. You can have a look at this example script for pointers: {0}"
)
class TextDataset(Dataset):
"""
这个类将很快被一个与框架无关的方法所取代。
"""
def __init__(
self,
tokenizer: PreTrainedTokenizer,
file_path: str,
block_size: int,
overwrite_cache=False,
cache_dir: Optional[str] = None,
):
warnings.warn(
DEPRECATION_WARNING.format(
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
),
FutureWarning,
)
if os.path.isfile(file_path) is False:
raise ValueError(f"Input file path {file_path} not found")
block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else directory,
f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
)
lock_path = cached_features_file + ".lock"
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not overwrite_cache:
start = time.time()
with open(cached_features_file, "rb") as handle:
self.examples = pickle.load(handle)
logger.info(
f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
)
else:
logger.info(f"Creating features from dataset file at {directory}")
self.examples = []
with open(file_path, encoding="utf-8") as f:
text = f.read()
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
for i in range(0, len(tokenized_text) - block_size + 1, block_size):
self.examples.append(
tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
)
start = time.time()
with open(cached_features_file, "wb") as handle:
pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(
f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
)
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> torch.Tensor:
return torch.tensor(self.examples[i], dtype=torch.long)
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
warnings.warn(
DEPRECATION_WARNING.format(
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
),
FutureWarning,
)
if os.path.isfile(file_path) is False:
raise ValueError(f"Input file path {file_path} not found")
logger.info(f"Creating features from dataset file at {file_path}")
with open(file_path, encoding="utf-8") as f:
lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
self.examples = batch_encoding["input_ids"]
self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> Dict[str, torch.tensor]:
return self.examples[i]
class LineByLineWithRefDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
warnings.warn(
DEPRECATION_WARNING.format(
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.py"
),
FutureWarning,
)
if os.path.isfile(file_path) is False:
raise ValueError(f"Input file path {file_path} not found")
if os.path.isfile(ref_path) is False:
raise ValueError(f"Ref file path {file_path} not found")
logger.info(f"Creating features from dataset file at {file_path}")
logger.info(f"Use ref segment results at {ref_path}")
with open(file_path, encoding="utf-8") as f:
data = f.readlines()
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]
with open(ref_path, encoding="utf-8") as f:
ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
if len(data) != len(ref):
raise ValueError(
f"Length of Input file should be equal to Ref file. But the length of {file_path} is {len(data)} "
f"while length of {ref_path} is {len(ref)}"
)
batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
self.examples = batch_encoding["input_ids"]
self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
n = len(self.examples)
for i in range(n):
self.examples[i]["chinese_ref"] = torch.tensor(ref[i], dtype=torch.long)
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> Dict[str, torch.tensor]:
return self.examples[i]
class LineByLineWithSOPTextDataset(Dataset):
"""
Dataset for sentence order prediction task, prepare sentence pairs for SOP task
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_dir: str, block_size: int):
warnings.warn(
DEPRECATION_WARNING.format(
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
),
FutureWarning,
)
if os.path.isdir(file_dir) is False:
raise ValueError(f"{file_dir} is not a directory")
logger.info(f"Creating features from dataset file folder at {file_dir}")
self.examples = []
for file_name in os.listdir(file_dir):
file_path = os.path.join(file_dir, file_name)
if os.path.isfile(file_path) is False:
raise ValueError(f"{file_path} is not a file")
article_open = False
with open(file_path, encoding="utf-8") as f:
original_lines = f.readlines()
article_lines = []
for line in original_lines:
if "<doc id=" in line:
article_open = True
elif "</doc>" in line:
article_open = False
document = [
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line))
for line in article_lines[1:]
if (len(line) > 0 and not line.isspace())
]
examples = self.create_examples_from_document(document, block_size, tokenizer)
self.examples.extend(examples)
article_lines = []
else:
if article_open:
article_lines.append(line)
logger.info("Dataset parse finished.")
def __len__(self):
return len(self.examples)
def __getitem__(self, i) -> Dict[str, torch.tensor]:
return self.examples[i]
class TextDatasetForNextSentencePrediction(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
self,
tokenizer: PreTrainedTokenizer,
file_path: str,
block_size: int,
overwrite_cache=False,
short_seq_probability=0.1,
nsp_probability=0.5,
):
self.examples = []
def __len__(self):
return len(self.examples)
def __getitem__(self, i):
return self.examples[i]
.\data\datasets\squad.py
import os
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Union
import torch
from filelock import FileLock
from torch.utils.data import Dataset
from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
logger = logging.get_logger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@dataclass
class SquadDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
model_type: str = field(
default=None, metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_TYPES)}
)
data_dir: str = field(
default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
)
max_seq_length: int = field(
default=128,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
doc_stride: int = field(
default=128,
metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
)
max_query_length: int = field(
default=64,
metadata={
"help": (
"The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length."
)
},
)
max_answer_length: int = field(
default=30,
metadata={
"help": (
"The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another."
)
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
version_2_with_negative: bool = field(
default=False, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
)
null_score_diff_threshold: float = field(
default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
)
n_best_size: int = field(
default=20, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
)
lang_id: int = field(
default=0,
metadata={
"help": (
"language id of input for language-specific xlm models (see"
" tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
)
},
)
threads: int = field(default=1, metadata={"help": "multiple threads for converting example to features"})
class Split(Enum):
train = "train"
dev = "dev"
class SquadDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
args: SquadDataTrainingArguments
features: List[SquadFeatures]
mode: Split
is_language_sensitive: bool
def __init__(
self,
args: SquadDataTrainingArguments,
tokenizer: PreTrainedTokenizer,
limit_length: Optional[int] = None,
mode: Union[str, Split] = Split.train,
is_language_sensitive: Optional[bool] = False,
cache_dir: Optional[str] = None,
dataset_format: Optional[str] = "pt",
):
def __len__(self):
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
feature = self.features[i]
input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
attention_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
cls_index = torch.tensor(feature.cls_index, dtype=torch.long)
p_mask = torch.tensor(feature.p_mask, dtype=torch.float)
is_impossible = torch.tensor(feature.is_impossible, dtype=torch.float)
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
if self.args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
del inputs["token_type_ids"]
if self.args.model_type in ["xlnet", "xlm"]:
inputs.update({"cls_index": cls_index, "p_mask": p_mask})
if self.args.version_2_with_negative:
inputs.update({"is_impossible": is_impossible})
if self.is_language_sensitive:
inputs.update({"langs": (torch.ones(input_ids.shape, dtype=torch.int64) * self.args.lang_id)})
if self.mode == Split.train:
start_positions = torch.tensor(feature.start_position, dtype=torch.long)
end_positions = torch.tensor(feature.end_position, dtype=torch.long)
inputs.update({"start_positions": start_positions, "end_positions": end_positions})
return inputs
.\data\datasets\__init__.py
from .glue import GlueDataset, GlueDataTrainingArguments
from .language_modeling import (
LineByLineTextDataset,
LineByLineWithRefDataset,
LineByLineWithSOPTextDataset,
TextDataset,
TextDatasetForNextSentencePrediction,
)
from .squad import SquadDataset, SquadDataTrainingArguments
.\data\data_collator.py
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import numpy as np
from ..models.bert import BertTokenizer, BertTokenizerFast
from ..tokenization_utils_base import PreTrainedTokenizerBase
from ..utils import PaddingStrategy
InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])
class DataCollatorMixin:
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
if return_tensors == "tf":
return self.tf_call(features)
elif return_tensors == "pt":
return self.torch_call(features)
elif return_tensors == "np":
return self.numpy_call(features)
else:
raise ValueError(f"Framework '{return_tensors}' not recognized!")
def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
"""
Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer.
"""
if not hasattr(tokenizer, "deprecation_warnings"):
return tokenizer.pad(*pad_args, **pad_kwargs)
warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
try:
padded = tokenizer.pad(*pad_args, **pad_kwargs)
finally:
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state
return padded
def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
"""
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
"""
"""
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for example of how it's useful.
"""
if return_tensors == "pt":
return torch_default_data_collator(features)
elif return_tensors == "tf":
return tf_default_data_collator(features)
elif return_tensors == "np":
return numpy_default_data_collator(features)
@dataclass
class DefaultDataCollator(DataCollatorMixin):
"""
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for example of how it's useful.
This is an object (like other data collators) rather than a pure function like default_data_collator. This can be
helpful if you need to set a return_tensors value at initialization.
Args:
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]:
if return_tensors is None:
return_tensors = self.return_tensors
return default_data_collator(features, return_tensors)
def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
import torch
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
first = features[0]
batch = {}
if "label" in first and first["label"] is not None:
label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
dtype = torch.long if isinstance(label, int) else torch.float
batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
elif "label_ids" in first and first["label_ids"] is not None:
if isinstance(first["label_ids"], torch.Tensor):
batch["labels"] = torch.stack([f["label_ids"] for f in features])
else:
dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float
batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)
for k, v in first.items():
if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
if isinstance(v, torch.Tensor):
batch[k] = torch.stack([f[k] for f in features])
elif isinstance(v, np.ndarray):
batch[k] = torch.tensor(np.stack([f[k] for f in features]))
else:
batch[k] = torch.tensor([f[k] for f in features])
return batch
def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
import tensorflow as tf
pass
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
first = features[0]
batch = {}
if "label" in first and first["label"] is not None:
label_col_name = "label"
elif "label_ids" in first and first["label_ids"] is not None:
label_col_name = "label_ids"
elif "labels" in first and first["labels"] is not None:
label_col_name = "labels"
else:
label_col_name = None
if label_col_name is not None:
if isinstance(first[label_col_name], tf.Tensor):
dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
elif isinstance(first[label_col_name], np.ndarray) or isinstance(first[label_col_name], np.generic):
dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
elif isinstance(first[label_col_name], (tuple, list)):
dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
else:
dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
for k, v in first.items():
if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
if isinstance(v, (tf.Tensor, np.ndarray)):
batch[k] = tf.stack([f[k] for f in features])
else:
batch[k] = tf.convert_to_tensor([f[k] for f in features])
return batch
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
first = features[0]
batch = {}
if "label" in first and first["label"] is not None:
label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"]
dtype = np.int64 if isinstance(label, int) else np.float32
batch["labels"] = np.array([f["label"] for f in features], dtype=dtype)
elif "label_ids" in first and first["label_ids"] is not None:
if isinstance(first["label_ids"], np.ndarray):
batch["labels"] = np.stack([f["label_ids"] for f in features])
else:
dtype = np.int64 if isinstance(first["label_ids"][0], int) else np.float32
batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)
for k, v in first.items():
if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
if isinstance(v, np.ndarray):
batch[k] = np.stack([f[k] for f in features])
else:
batch[k] = np.array([f[k] for f in features])
return batch
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
用于编码数据的分词器。
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
选择一种策略来对返回的序列进行填充(根据模型的填充位置和填充索引),可选值包括:
- `True` 或 `'longest'`(默认):对批次中最长的序列进行填充(如果只提供一个序列,则不进行填充)。
- `'max_length'`:按照参数 `max_length` 指定的最大长度进行填充,或者如果未提供该参数,则按照模型可接受的最大输入长度进行填充。
- `False` 或 `'do_not_pad'`:不进行填充(即可以输出长度不同的序列批次)。
max_length (`int`, *optional*):
返回列表的最大长度,也可选用于填充长度(见上文)。
pad_to_multiple_of (`int`, *optional*):
如果设置,将序列填充到提供的值的倍数。
这对于在具有计算能力 >= 7.5(Volta)的 NVIDIA 硬件上启用 Tensor Core 特别有用。
return_tensors (`str`, *optional*, defaults to `"pt"`):
要返回的张量类型。允许的值有 "np"、"pt" 和 "tf"。
"""
tokenizer: PreTrainedTokenizerBase
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
# 调用 pad_without_fast_tokenizer_warning 函数进行批量填充
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors=self.return_tensors,
)
# 如果 batch 中有 "label" 键,将其重命名为 "labels",并删除 "label"
if "label" in batch:
batch["labels"] = batch["label"]
del batch["label"]
# 如果 batch 中有 "label_ids" 键,将其重命名为 "labels",并删除 "label_ids"
if "label_ids" in batch:
batch["labels"] = batch["label_ids"]
del batch["label_ids"]
# 返回处理后的 batch 字典
return batch
@dataclass
class DataCollatorForTokenClassification(DataCollatorMixin):
"""
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
tokenizer: PreTrainedTokenizerBase # Tokenizer对象,用于数据编码
padding: Union[bool, str, PaddingStrategy] = True # 序列填充策略:可以是布尔值、字符串或填充策略对象,默认为True
max_length: Optional[int] = None # 返回列表的最大长度及填充长度(可选)
pad_to_multiple_of: Optional[int] = None # 如果设置,将序列填充到提供的值的倍数(可选)
label_pad_token_id: int = -100 # 标签填充时使用的ID,默认为-100,PyTorch损失函数会自动忽略这些ID
return_tensors: str = "pt" # 返回的Tensor类型,默认为"pt",可选值有"np"、"pt"和"tf"
# 定义一个使用 Torch 处理特征的方法
def torch_call(self, features):
# 导入 PyTorch 库
import torch
# 确定标签名称是 "label" 还是 "labels"
label_name = "label" if "label" in features[0].keys() else "labels"
# 如果特征中包含标签,提取所有标签值到列表;否则设置标签为 None
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
# 剔除特征中的标签,生成没有标签的特征字典列表
no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
# 使用自定义函数进行填充(此处函数的具体实现未显示),生成批处理数据
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
no_labels_features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt", # 返回 PyTorch 张量
)
# 如果没有标签,直接返回批处理数据
if labels is None:
return batch
# 获取输入序列的长度
sequence_length = batch["input_ids"].shape[1]
# 获取填充的位置(左或右)
padding_side = self.tokenizer.padding_side
# 定义一个函数,将张量或可迭代对象转换为列表
def to_list(tensor_or_iterable):
if isinstance(tensor_or_iterable, torch.Tensor):
return tensor_or_iterable.tolist()
return list(tensor_or_iterable)
# 根据填充的位置对标签进行填充
if padding_side == "right":
batch[label_name] = [
to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
]
else:
batch[label_name] = [
[self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels
]
# 将填充后的标签转换为 PyTorch 的 int64 类型张量
batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
return batch
# 定义一个使用 TensorFlow 处理特征的方法
def tf_call(self, features):
# 导入 TensorFlow 库
import tensorflow as tf
# 确定标签名称是 "label" 还是 "labels"
label_name = "label" if "label" in features[0].keys() else "labels"
# 如果特征中包含标签,提取所有标签值到列表;否则设置标签为 None
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
# 使用自定义函数进行填充(此处函数的具体实现未显示),生成批处理数据
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# 如果标签为 None,则返回 TensorFlow 张量;否则不进行转换
return_tensors="tf" if labels is None else None,
)
# 如果没有标签,直接返回批处理数据
if labels is None:
return batch
# 获取输入序列的长度
sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
# 获取填充的位置(左或右)
padding_side = self.tokenizer.padding_side
# 根据填充的位置对标签进行填充
if padding_side == "right":
batch["labels"] = [
list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
]
else:
batch["labels"] = [
[self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
]
# 将填充后的标签转换为 TensorFlow 的 int64 类型张量,并将批处理字典中的所有值转换为 TensorFlow 张量
batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
return batch
# 定义一个方法,用于处理特征数据并返回一个批次的 numpy 数组
def numpy_call(self, features):
# 确定标签名称是 "label" 还是 "labels"
label_name = "label" if "label" in features[0].keys() else "labels"
# 如果特征中包含标签信息,则从每个特征中提取标签,否则设为 None
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
# 使用自定义的 pad_without_fast_tokenizer_warning 函数对特征进行填充,转换成批次
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# 如果没有标签信息,则返回的批次为 numpy 数组
return_tensors="np" if labels is None else None,
)
# 如果特征中没有标签信息,则直接返回批次
if labels is None:
return batch
# 计算输入序列长度
sequence_length = np.array(batch["input_ids"]).shape[1]
# 获取填充位置(左侧或右侧)
padding_side = self.tokenizer.padding_side
# 根据填充位置,为每个标签添加填充标记,使它们长度相同
if padding_side == "right":
batch["labels"] = [
list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
]
else:
batch["labels"] = [
[self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
]
# 将批次中的每个键值对的值转换为 numpy 数组,类型为 int64
batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()}
# 返回处理后的批次
return batch
def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
import torch
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple, np.ndarray)):
examples = [torch.tensor(e, dtype=torch.long) for e in examples]
length_of_first = examples[0].size(0)
# Check if padding is necessary.
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return torch.stack(examples, dim=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(x.size(0) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
for i, example in enumerate(examples):
if tokenizer.padding_side == "right":
result[i, : example.shape[0]] = example
else:
result[i, -example.shape[0] :] = example
return result
def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
import tensorflow as tf
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
# Check if padding is necessary.
length_of_first = len(examples[0])
are_tensors_same_length = all(len(x) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return tf.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(len(x) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
# Prepare paddings based on tensor rank.
result = []
rank = tf.rank(examples[0])
paddings = np.zeros((rank, 2), dtype=np.int32)
# 遍历给定的示例列表
for example in examples:
# 检查分词器的填充位置是否在右侧
if tokenizer.padding_side == "right":
# 如果填充在右侧,计算需要填充的长度并更新填充数组的第一行第二列
paddings[0, 1] = max_length - len(example)
else:
# 如果填充在左侧,计算需要填充的长度并更新填充数组的第一行第一列
paddings[0, 0] = max_length - len(example)
# 使用 TensorFlow 的填充函数对示例进行填充,使用填充数组和分词器的填充标记值
result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
# 将填充后的示例堆叠成一个张量,沿着第一个维度(批次维度)
return tf.stack(result, axis=0)
# 定义一个数据收集器,用于序列到序列模型的数据处理
@dataclass
class DataCollatorForSeq2Seq:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.
"""
# 定义函数参数和类型注解,说明函数的输入参数和可选参数
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
用于对数据进行编码的分词器。
model ([`PreTrainedModel`], *optional*):
正在训练的模型。如果设置并且具有 *prepare_decoder_input_ids_from_labels* 方法,
则使用它来准备 *decoder_input_ids*。
当使用 *label_smoothing* 时,这很有用,可以避免重复计算损失。
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
选择一种策略来填充返回的序列(根据模型的填充方向和填充索引),可选值包括:
- `True` 或 `'longest'`(默认):填充到批次中最长的序列(如果只提供单个序列,则不填充)。
- `'max_length'`:填充到指定的最大长度(通过参数 `max_length` 提供),或者如果未提供该参数,则填充到模型的最大可接受输入长度。
- `False` 或 `'do_not_pad'`:不填充(即可以输出具有不同长度的序列批次)。
max_length (`int`, *optional*):
返回列表的最大长度,也可以作为填充长度(参见上文)。
pad_to_multiple_of (`int`, *optional*):
如果设置,将序列填充到提供的值的倍数。
这对于在具有计算能力 >= 7.5(Volta)的 NVIDIA 硬件上启用张量核心特别有用。
label_pad_token_id (`int`, *optional*, defaults to -100):
用于填充标签时的标识符(-100 将被 PyTorch 损失函数自动忽略)。
return_tensors (`str`, *optional*, defaults to `"pt"`):
要返回的张量类型。允许的值有 "np"、"pt" 和 "tf"。
def __call__(self, features, return_tensors=None):
# 如果没有指定返回张量类型,则使用预设的 return_tensors
if return_tensors is None:
return_tensors = self.return_tensors
# 从 features 中提取标签列表,如果 features 的第一个元素包含 "labels" 键
labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
# 在调用 `tokenizer.pad` 之前,需要对标签进行填充,因为该方法不会进行填充,并且需要所有标签长度相同以返回张量
if labels is not None:
# 计算最长的标签长度
max_label_length = max(len(l) for l in labels)
# 如果指定了 pad_to_multiple_of,调整最大标签长度使其成为该值的倍数
if self.pad_to_multiple_of is not None:
max_label_length = (
(max_label_length + self.pad_to_multiple_of - 1)
// self.pad_to_multiple_of
* self.pad_to_multiple_of
)
# 获取填充的位置(左侧或右侧)
padding_side = self.tokenizer.padding_side
# 对每个 feature 进行标签填充
for feature in features:
# 计算需要填充的空位,用 label_pad_token_id 填充
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
# 如果标签是列表
if isinstance(feature["labels"], list):
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
# 如果填充在右侧
elif padding_side == "right":
feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
# 如果填充在左侧
else:
feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
# 使用 pad_without_fast_tokenizer_warning 函数对 features 进行填充,避免使用快速分词器警告
features = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors=return_tensors,
)
# 准备 decoder_input_ids
if (
labels is not None
and self.model is not None
and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
):
# 根据标签准备 decoder_input_ids
decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
features["decoder_input_ids"] = decoder_input_ids
# 返回处理后的 features
return features
"""
Language modeling数据收集器。如果输入的长度不同,输入会被动态填充到一个batch的最大长度。
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
用于编码数据的分词器。
mlm (`bool`, *optional*, defaults to `True`):
是否使用masked language modeling。如果设置为`False`,则标签与输入相同,忽略填充的标记(通过将它们设置为-100)。否则,非masked的标记为-100,要预测的masked标记为其他值。
mlm_probability (`float`, *optional*, defaults to 0.15):
当`mlm`设置为`True`时,随机mask输入中的token的概率。
pad_to_multiple_of (`int`, *optional*):
如果设置,则将序列填充到提供的值的倍数。
return_tensors (`str`):
要返回的Tensor类型。允许的值为"np"、"pt"和"tf"。
<Tip>
为了最佳性能,此数据收集器应与具有项为字典或BatchEncoding的数据集一起使用,这些数据集具有"special_tokens_mask"键,该键由[`PreTrainedTokenizer`]或[`PreTrainedTokenizerFast`]返回,参数`return_special_tokens_mask=True`。
</Tip>"""
tokenizer: PreTrainedTokenizerBase
mlm: bool = True
mlm_probability: float = 0.15
pad_to_multiple_of: Optional[int] = None
tf_experimental_compile: bool = False
return_tensors: str = "pt"
def __post_init__(self):
if self.mlm and self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
if self.tf_experimental_compile:
import tensorflow as tf
self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
@staticmethod
def tf_bernoulli(shape, probability):
import tensorflow as tf
prob_matrix = tf.fill(shape, probability)
return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
def tf_mask_tokens(
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
):
"""
用于在TensorFlow中执行token masking的函数。
Args:
inputs (Any): 输入的数据(例如token IDs)。
vocab_size (int): 词汇表的大小。
mask_token_id (int): 要用作masked token的token ID。
special_tokens_mask (Optional[Any]): 特殊token的mask,与词汇表外的token相关。
"""
) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
import tensorflow as tf
# 将 mask_token_id 转换为与 inputs 相同的数据类型
mask_token_id = tf.cast(mask_token_id, inputs.dtype)
# 获取输入张量的形状
input_shape = tf.shape(inputs)
# 为了进行 MLM 训练,以概率 self.mlm_probability 对每个序列中的部分 token 进行掩码操作
masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask
# 用 -100 替换 labels 中未被掩码的位置,因为损失只计算掩码 token 的损失
labels = tf.where(masked_indices, inputs, -100)
# 80% 的概率用 mask_token_id 替换掩码的输入 token
indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
inputs = tf.where(indices_replaced, mask_token_id, inputs)
# 10% 的概率用随机单词替换掩码的输入 token
indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
inputs = tf.where(indices_random, random_words, inputs)
# 剩余 10% 的概率保持掩码的输入 token 不变
return inputs, labels
# 定义一个 TensorFlow 模型的调用函数,处理输入的样本列表并返回预测结果字典
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
import tensorflow as tf
# 根据输入样本类型的不同进行处理:如果是字典,则使用自定义函数进行填充和转换为张量
if isinstance(examples[0], Mapping):
batch = pad_without_fast_tokenizer_warning(
self.tokenizer, examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of
)
else:
# 否则,将样本列表转换为包含 "input_ids" 键的字典,使用内置函数进行填充
batch = {
"input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# 如果预处理了特殊 token 掩码,则从字典中移除该项
special_tokens_mask = batch.pop("special_tokens_mask", None)
# 如果采用 MLM(Masked Language Modeling),则进行相应处理
if self.mlm:
if special_tokens_mask is None:
# 如果没有预处理特殊 token 掩码,根据输入的 input_ids 创建掩码
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in batch["input_ids"].numpy().tolist()
]
# 将掩码转换为 TensorFlow 中的布尔类型张量
special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
else:
# 否则,直接将已有的特殊 token 掩码转换为 TensorFlow 的布尔类型张量
special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
# 使用 TensorFlow 函数 tf_mask_tokens 处理 input_ids 和 labels,并更新 batch 字典
batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
tf.cast(batch["input_ids"], tf.int64),
special_tokens_mask=special_tokens_mask,
mask_token_id=self.tokenizer.mask_token_id,
vocab_size=len(self.tokenizer),
)
else:
# 如果不是 MLM 模式,则直接将 input_ids 作为 labels,同时处理 padding 的情况
labels = batch["input_ids"]
if self.tokenizer.pad_token_id is not None:
# 将 padding 的位置替换为 -100
labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
else:
# 如果没有定义 pad_token_id,创建 labels 的深拷贝以防万一
labels = tf.identity(labels)
batch["labels"] = labels
# 返回处理后的 batch 字典,其中包含处理过的 input_ids 和相应的 labels
return batch
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# 处理输入数据 examples,可以是字典或列表,根据不同类型进行填充和转换为张量。
if isinstance(examples[0], Mapping):
# 对字典类型的 examples 进行填充,使用自定义的 pad_without_fast_tokenizer_warning 函数进行填充,并返回 PyTorch 张量。
batch = pad_without_fast_tokenizer_warning(
self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
)
else:
# 对列表类型的 examples 进行处理,仅填充 "input_ids" 键,调用 _torch_collate_batch 函数进行填充。
batch = {
"input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# 如果特殊标记掩码已经预处理,则从字典中移除。
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
# 如果是 MLM(Masked Language Modeling)任务,调用 torch_mask_tokens 函数处理 input_ids 和 labels。
batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
else:
# 如果不是 MLM 任务,将 input_ids 复制到 labels,并根据 tokenizer 的 pad_token_id 设置 labels 中相应位置的值为 -100。
labels = batch["input_ids"].clone()
if self.tokenizer.pad_token_id is not None:
labels[labels == self.tokenizer.pad_token_id] = -100
batch["labels"] = labels
return batch
def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
"""
准备用于掩码语言建模的输入/标签:80% MASK,10% 随机词,10% 原始词。
"""
import torch
labels = inputs.clone()
# 对每个序列进行 MLM 训练时,以概率 self.mlm_probability 对输入进行掩码。
probability_matrix = torch.full(labels.shape, self.mlm_probability)
if special_tokens_mask is None:
# 如果特殊标记掩码为空,则使用 tokenizer 获取每个序列的特殊标记掩码。
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
else:
special_tokens_mask = special_tokens_mask.bool()
# 根据特殊标记掩码,将概率矩阵中的特定位置置为 0.0。
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
# 使用伯努利分布生成掩码索引。
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # 只计算掩码位置的损失
# 80% 的时间,用 tokenizer.mask_token ([MASK]) 替换掩码位置的输入标记。
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
# 10% 的时间,用随机词替换掩码位置的输入标记。
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]
# 剩余 10% 的时间,保持掩码位置的输入标记不变。
return inputs, labels
# 定义一个方法,用于处理包含各种数据结构的示例列表,并返回处理后的结果字典
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# 如果第一个示例是字典类型,则使用适当的填充方式和转换为张量
if isinstance(examples[0], Mapping):
# 使用适当的填充方法(避免快速分词器警告),将示例列表转换为 NumPy 张量
batch = pad_without_fast_tokenizer_warning(
self.tokenizer, examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of
)
else:
# 如果示例不是字典类型,则只包含输入 ID 的批次
batch = {
"input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# 如果已预处理特殊标记掩码,则从字典中弹出
special_tokens_mask = batch.pop("special_tokens_mask", None)
# 如果是 MLM 任务,调用方法对输入 ID 进行掩码处理,并将结果存入 batch 中
if self.mlm:
batch["input_ids"], batch["labels"] = self.numpy_mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
else:
# 如果不是 MLM 任务,则创建 labels 副本,并将填充标记转换为 -100
labels = np.copy(batch["input_ids"])
if self.tokenizer.pad_token_id is not None:
labels[labels == self.tokenizer.pad_token_id] = -100
batch["labels"] = labels
# 返回处理后的批次数据字典
return batch
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
# 创建输入的副本作为标签
labels = np.copy(inputs)
# 创建一个与输入形状相同的概率矩阵,每个位置的值为 self.mlm_probability
probability_matrix = np.full(labels.shape, self.mlm_probability)
# 如果特殊标记掩码为空,则根据每个序列的值获取特殊标记掩码
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
special_tokens_mask = np.array(special_tokens_mask, dtype=bool)
else:
special_tokens_mask = special_tokens_mask.astype(bool)
# 将特殊标记掩码位置的概率设为 0,这些位置不会被选为被屏蔽的位置
probability_matrix[special_tokens_mask] = 0
# 使用二项分布随机生成屏蔽的索引
masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool)
# 将未屏蔽的标签设为 -100,用于损失计算
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% 的概率,将屏蔽的输入标记替换为 tokenizer.mask_token_id
indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
inputs[indices_replaced] = self.tokenizer.mask_token_id
# 10% 的概率,将屏蔽的输入标记替换为随机词
indices_random = (
np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
)
random_words = np.random.randint(
low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
)
inputs[indices_random] = random_words
# 剩余的 10% 的概率,保持屏蔽的输入标记不变
# 返回处理后的输入和标签
return inputs, labels
@DataCollatorForWholeWordMask
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
"""
Data collator used for language modeling that masks entire words.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
<Tip>
This collator relies on details of the implementation of subword tokenization by [`BertTokenizer`], specifically
that subword tokens are prefixed with *
produce an output that is roughly equivalent to [`.DataCollatorForLanguageModeling`].
</Tip>
"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# Determine if examples are provided as a list of mappings or as a list of input_ids
if isinstance(examples[0], Mapping):
input_ids = [e["input_ids"] for e in examples] # Extract input_ids from each example mapping
else:
input_ids = examples # Examples are directly input_ids, wrap each in a mapping
# Collate input_ids into a batch tensor respecting tokenizer's padding rules
batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
mask_labels = []
for e in examples:
ref_tokens = []
for id in tolist(e["input_ids"]): # Convert input_ids to tokens using tokenizer
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# For Chinese tokens, mark sub-words with "##", e.g., [喜,欢]->[喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"]) # Positions in input_ids that are sub-words
len_seq = len(e["input_ids"]) # Length of the input sequence
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i] # Prefix sub-word tokens with "##"
mask_labels.append(self._whole_word_mask(ref_tokens)) # Apply whole word masking to tokens
# Collate mask_labels into a batch tensor respecting tokenizer's padding rules
batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
# Mask input_ids and create labels for masked language modeling
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}
# 定义 TensorFlow 版本的调用方法,接受一个例子列表,并返回处理后的输入和标签字典
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# 导入 TensorFlow 库
import tensorflow as tf
# 检查第一个例子的类型,若为映射类型(字典),则提取其中的 "input_ids" 列表
if isinstance(examples[0], Mapping):
input_ids = [e["input_ids"] for e in examples]
else:
# 否则,假设每个例子本身就是一个 input_ids 列表,将其赋值给 input_ids,并用例子包装成带 "input_ids" 键的字典列表
input_ids = examples
examples = [{"input_ids": e} for e in examples]
# 调用内部函数 _tf_collate_batch,将 input_ids 列表和 tokenizer 进行批处理
batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
# 初始化一个空列表,用于存储每个例子的掩码标签
mask_labels = []
for e in examples:
ref_tokens = []
# 遍历每个例子中的 input_ids,将每个 id 转换为对应的 token
for id in tolist(e["input_ids"]):
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# 对于中文 token,如果指定了 "chinese_ref" 键,需添加额外的标记 "##" 标识子词,例如 [喜,欢]-> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
# 将处理后的 token 列表传入 _whole_word_mask 方法,得到该例子的掩码标签,添加到 mask_labels 列表中
mask_labels.append(self._whole_word_mask(ref_tokens))
# 再次调用 _tf_collate_batch,将 mask_labels 列表和 tokenizer 进行批处理,得到批量化的掩码标签
batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
# 调用对象自身的 tf_mask_tokens 方法,传入批量化的输入和掩码标签,得到 inputs 和 labels,返回作为字典的 "input_ids" 和 "labels"
inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
return {"input_ids": inputs, "labels": labels}
# 定义 NumPy 版本的调用方法,接受一个例子列表,并返回处理后的输入和标签字典
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# 检查第一个例子的类型,若为映射类型(字典),则提取其中的 "input_ids" 列表
if isinstance(examples[0], Mapping):
input_ids = [e["input_ids"] for e in examples]
else:
# 否则,假设每个例子本身就是一个 input_ids 列表,将其赋值给 input_ids,并用例子包装成带 "input_ids" 键的字典列表
input_ids = examples
examples = [{"input_ids": e} for e in examples]
# 调用内部函数 _numpy_collate_batch,将 input_ids 列表和 tokenizer 进行批处理
batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
# 初始化一个空列表,用于存储每个例子的掩码标签
mask_labels = []
for e in examples:
ref_tokens = []
# 遍历每个例子中的 input_ids,将每个 id 转换为对应的 token
for id in tolist(e["input_ids"]):
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# 对于中文 token,如果指定了 "chinese_ref" 键,需添加额外的标记 "##" 标识子词,例如 [喜,欢]-> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
# 将处理后的 token 列表传入 _whole_word_mask 方法,得到该例子的掩码标签,添加到 mask_labels 列表中
mask_labels.append(self._whole_word_mask(ref_tokens))
# 再次调用 _numpy_collate_batch,将 mask_labels 列表和 tokenizer 进行批处理,得到批量化的掩码标签
batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
# 调用对象自身的 numpy_mask_tokens 方法,传入批量化的输入和掩码标签,得到 inputs 和 labels,返回作为字典的 "input_ids" 和 "labels"
inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}
def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
"""
Get 0/1 labels for masked tokens with whole word mask proxy
"""
# 如果当前的分词器不是BertTokenizer或BertTokenizerFast,则发出警告
if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)):
warnings.warn(
"DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. "
"Please refer to the documentation for more information."
)
# 初始化候选索引列表
cand_indexes = []
# 遍历输入的token列表
for i, token in enumerate(input_tokens):
# 跳过特殊token,如"[CLS]"和"[SEP]"
if token == "[CLS]" or token == "[SEP]":
continue
# 如果当前候选索引列表不为空且当前token是一个以"##"开头的部分token,则将当前token加入最后一个候选索引的列表中
if len(cand_indexes) >= 1 and token.startswith("##"):
cand_indexes[-1].append(i)
else:
# 否则,创建一个新的候选索引列表并加入当前token的索引
cand_indexes.append([i])
# 随机打乱候选索引列表
random.shuffle(cand_indexes)
# 计算应该预测的masked token数量,取最小值为max_predictions和输入token数量乘以mlm_probability的整数部分
num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
# 初始化masked tokens列表
masked_lms = []
# 初始化已覆盖索引的集合
covered_indexes = set()
# 遍历候选索引列表
for index_set in cand_indexes:
# 如果已经预测的masked token数量达到了num_to_predict,则退出循环
if len(masked_lms) >= num_to_predict:
break
# 如果当前候选索引集合加上已预测的masked token数量超过了num_to_predict,则跳过该候选集合
if len(masked_lms) + len(index_set) > num_to_predict:
continue
# 检查当前候选索引集合中是否有已覆盖的索引
is_any_index_covered = False
for index in index_set:
if index in covered_indexes:
is_any_index_covered = True
break
# 如果有任何已覆盖的索引,则跳过该候选索引集合
if is_any_index_covered:
continue
# 否则,将候选索引集合中的每个索引加入已覆盖索引集合,并将其加入masked tokens列表
for index in index_set:
covered_indexes.add(index)
masked_lms.append(index)
# 如果已覆盖索引的数量不等于masked tokens列表的长度,则抛出异常
if len(covered_indexes) != len(masked_lms):
raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
# 根据已覆盖的索引集合生成mask标签列表,即标记哪些token是masked的
mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
# 返回mask标签列表作为结果
return mask_labels
def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import torch
# 检查当前的分词器是否有掩码标记,这是进行掩码语言建模所必需的
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
" --mlm flag if you want to use this tokenizer."
)
# 复制输入以保留原始标签
labels = inputs.clone()
# 我们在每个序列中随机抽样几个标记,用于掩码语言建模训练(概率默认为0.15,适用于Bert/RoBERTa)
probability_matrix = mask_labels
# 获取特殊标记的掩码,用于排除掉特殊标记的影响
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
# 如果存在填充标记,将其添加到掩码中
if self.tokenizer._pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
# 确定要掩码的索引
masked_indices = probability_matrix.bool()
labels[~masked_indices] = -100 # 只计算掩码标记上的损失
# 80%的时间,将掩码输入标记替换为tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
# 10%的时间,将掩码输入标记替换为随机单词
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]
# 剩余的时间(10%),保持掩码输入标记不变
return inputs, labels
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import tensorflow as tf # 导入 TensorFlow 库
input_shape = tf.shape(inputs) # 获取输入张量的形状
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
" --mlm flag if you want to use this tokenizer."
)
labels = tf.identity(inputs) # 创建输入张量的副本作为标签
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
masked_indices = tf.cast(mask_labels, tf.bool) # 将掩码标签转换为布尔类型张量
# Exclude special tokens from masking
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
] # 获取特殊标记的掩码,排除已有的特殊标记
masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool) # 更新掩码索引,排除特殊标记
if self.tokenizer._pad_token is not None:
padding_mask = inputs == self.tokenizer.pad_token_id # 获取填充标记的掩码
masked_indices = masked_indices & ~padding_mask # 更新掩码索引,排除填充标记
# Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
labels = tf.where(masked_indices, inputs, -100) # 根据掩码索引,将未掩码的位置在标签中替换为-100,仅计算掩码位置的损失
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices # 使用伯努利采样确定掩码位置,80%的时间用[MASK]标记替换掩码输入
inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
# 10% of the time, we replace masked input tokens with random word
indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced # 使用伯努利采样确定掩码位置,10%的时间用随机词替换掩码输入
random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64) # 生成随机词
inputs = tf.where(indices_random, random_words, inputs)
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels # 返回处理后的输入和标签
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
" --mlm flag if you want to use this tokenizer."
)
labels = np.copy(inputs)
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
# Convert mask_labels to boolean array indicating which tokens to mask
masked_indices = mask_labels.astype(bool)
# Mask special tokens so they are not selected for masking
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0
# If there is a padding token, mask it so it's not selected for masking
if self.tokenizer._pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices[padding_mask] = 0
labels[~masked_indices] = -100
indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
indices_random = (
np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
)
random_words = np.random.randint(low=0, high=len(self.tokenizer), size=labels.shape, dtype=np.int64)
inputs[indices_random] = random_words[indices_random]
return inputs, labels
@Dataclass
class DataCollatorForSOP(DataCollatorForLanguageModeling):
"""
Data collator used for sentence order prediction task.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for both masked language modeling and sentence order prediction
"""
def __init__(self, *args, **kwargs):
warnings.warn(
"DataCollatorForSOP is deprecated and will be removed in a future version, you can now use "
"DataCollatorForLanguageModeling instead.",
FutureWarning,
)
def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
import torch
from torch.nn.utils.rnn import pad_sequence
input_ids = [example["input_ids"] for example in examples]
input_ids = _torch_collate_batch(input_ids, self.tokenizer)
input_ids, labels, attention_mask = self.mask_tokens(input_ids)
token_type_ids = [example["token_type_ids"] for example in examples]
token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
sop_label_list = [example["sentence_order_label"] for example in examples]
sentence_order_label = torch.stack(sop_label_list)
return {
"input_ids": input_ids,
"labels": labels,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
"sentence_order_label": sentence_order_label,
}
def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
"""
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet.
"""
import torch
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
" --mlm flag if you want to use this tokenizer."
)
labels = inputs.clone()
probability_matrix = torch.full(labels.shape, self.mlm_probability)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
if self.tokenizer._pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
attention_mask = (~masked_indices).float()
if self.tokenizer._pad_token is not None:
attention_padding_mask = labels.eq(self.tokenizer.pad_token_id)
attention_mask.masked_fill_(attention_padding_mask, value=1.0)
labels[~masked_indices] = -100
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]
return inputs, labels, attention_mask
@dataclass
class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
Data collator used for permutation language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for permutation language modeling with procedures specific to XLNet
"""
tokenizer: PreTrainedTokenizerBase
plm_probability: float = 1 / 6
max_span_length: int = 5
return_tensors: str = "pt"
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples]
batch = _torch_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples]
batch = _tf_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples]
batch = _numpy_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
.\data\metrics\squad_metrics.py
import collections
import json
import math
import re
import string
from ...models.bert import BasicTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s:
return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def get_raw_scores(examples, preds):
"""
Computes the exact and f1 scores from the examples and the model predictions
"""
exact_scores = {}
f1_scores = {}
qas_id = example.qas_id
gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]
if not gold_answers:
gold_answers = [""]
if qas_id not in preds:
print(f"Missing prediction for {qas_id}")
continue
prediction = preds[qas_id]
exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
return exact_scores, f1_scores
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
new_scores = {}
for qid, s in scores.items():
pred_na = na_probs[qid] > na_prob_thresh
if pred_na:
new_scores[qid] = float(not qid_to_has_ans[qid])
else:
new_scores[qid] = s
return new_scores
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
if not qid_list:
total = len(exact_scores)
return collections.OrderedDict(
[
("exact", 100.0 * sum(exact_scores.values()) / total),
("f1", 100.0 * sum(f1_scores.values()) / total),
("total", total),
]
)
else:
total = len(qid_list)
return collections.OrderedDict(
[
("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
("total", total),
]
)
def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval[f"{prefix}_{k}"] = new_eval[k]
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores:
continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
has_ans_score, has_ans_cnt = 0, 0
for qid in qid_list:
if not qid_to_has_ans[qid]:
continue
has_ans_cnt += 1
if qid not in scores:
continue
has_ans_score += scores[qid]
return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval["best_exact"] = best_exact
main_eval["best_exact_thresh"] = exact_thresh
main_eval["best_f1"] = best_f1
main_eval["best_f1_thresh"] = f1_thresh
main_eval["has_ans_exact"] = has_ans_exact
main_eval["has_ans_f1"] = has_ans_f1
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for _, qid in enumerate(qid_list):
if qid not in scores:
continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval["best_exact"] = best_exact
main_eval["best_exact_thresh"] = exact_thresh
main_eval["best_f1"] = best_f1
main_eval["best_f1_thresh"] = f1_thresh
def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
if no_answer_probs is None:
no_answer_probs = {k: 0.0 for k in preds}
exact, f1 = get_raw_scores(examples, preds)
exact_threshold = apply_no_ans_threshold(
exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
)
f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
evaluation = make_eval_dict(exact_threshold, f1_threshold)
if has_answer_qids:
has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
merge_eval(evaluation, has_ans_eval, "HasAns")
if no_answer_qids:
no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
merge_eval(evaluation, no_ans_eval, "NoAns")
if no_answer_probs:
find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_ans)
return evaluation
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for i, c in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'")
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'")
return orig_text
tok_s_to_ns_map = {}
for i, tok_index in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position : (orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
def compute_predictions_logits(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
verbose_logging,
version_2_with_negative,
null_score_diff_threshold,
tokenizer,
):
if output_prediction_file:
logger.info(f"Writing predictions to: {output_prediction_file}")
if output_nbest_file:
logger.info(f"Writing nbest to: {output_nbest_file}")
if output_null_log_odds_file and version_2_with_negative:
logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}")
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
)
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
if output_prediction_file:
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
if output_nbest_file:
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if output_null_log_odds_file and version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
return all_predictions
def compute_predictions_log_probs(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
start_n_top,
):
end_n_top,
version_2_with_negative,
tokenizer,
verbose_logging,
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
)
_NbestPrediction = collections.namedtuple(
"NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
)
logger.info(f"Writing predictions to: {output_prediction_file}")
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
return all_predictions
.\data\metrics\__init__.py
import warnings
from ...utils import is_sklearn_available, requires_backends
if is_sklearn_available():
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
DEPRECATION_WARNING = (
"This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
"library. You can have a look at this example script for pointers: "
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)
def simple_accuracy(preds, labels):
warnings.warn(DEPRECATION_WARNING, FutureWarning)
requires_backends(simple_accuracy, "sklearn")
return (preds == labels).mean()
def acc_and_f1(preds, labels):
warnings.warn(DEPRECATION_WARNING, FutureWarning)
requires_backends(acc_and_f1, "sklearn")
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def pearson_and_spearman(preds, labels):
warnings.warn(DEPRECATION_WARNING, FutureWarning)
requires_backends(pearson_and_spearman, "sklearn")
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
def glue_compute_metrics(task_name, preds, labels):
warnings.warn(DEPRECATION_WARNING, FutureWarning)
requires_backends(glue_compute_metrics, "sklearn")
assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
if task_name == "cola":
return {"mcc": matthews_corrcoef(labels, preds)}
elif task_name == "sst-2":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mrpc":
return acc_and_f1(preds, labels)
elif task_name == "sts-b":
return pearson_and_spearman(preds, labels)
elif task_name == "qqp":
return acc_and_f1(preds, labels)
elif task_name == "mnli":
return {"mnli/acc": simple_accuracy(preds, labels)}
elif task_name == "mnli-mm":
return {"mnli-mm/acc": simple_accuracy(preds, labels)}
elif task_name == "qnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "rte":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "wnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "hans":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
def xnli_compute_metrics(task_name, preds, labels):
warnings.warn(DEPRECATION_WARNING, FutureWarning)
requires_backends(xnli_compute_metrics, "sklearn")
if len(preds) != len(labels):
raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}")
if task_name == "xnli":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
.\data\processors\glue.py
""" GLUE processors and helpers"""
import os
import warnings
from dataclasses import asdict
from enum import Enum
from typing import List, Optional, Union
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_tf_available, logging
from .utils import DataProcessor, InputExample, InputFeatures
if is_tf_available():
import tensorflow as tf
logger = logging.get_logger(__name__)
DEPRECATION_WARNING = (
"This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
"library. You can have a look at this example script for pointers: "
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)
def glue_convert_examples_to_features(
examples: Union[List[InputExample], "tf.data.Dataset"],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
label_list=None,
output_mode=None,
):
"""
Loads a data file into a list of `InputFeatures`
Args:
examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
output_mode: String indicating the output mode. Either `regression` or `classification`
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
can be fed to the model.
"""
warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
if is_tf_available() and isinstance(examples, tf.data.Dataset):
if task is None:
raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
return _glue_convert_examples_to_features(
examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
)
if is_tf_available():
def _tf_glue_convert_examples_to_features(
examples: tf.data.Dataset,
tokenizer: PreTrainedTokenizer,
task=str,
max_length: Optional[int] = None,
) -> tf.data.Dataset:
"""
将示例转换为特征集合的 TensorFlow 数据集。
Returns:
包含特定任务特征的 `tf.data.Dataset` 对象。
"""
processor = glue_processors[task]()
examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
label_type = tf.float32 if task == "sts-b" else tf.int64
def gen():
for ex in features:
d = {k: v for k, v in asdict(ex).items() if v is not None}
label = d.pop("label")
yield (d, label)
input_names = tokenizer.model_input_names
return tf.data.Dataset.from_generator(
gen,
({k: tf.int32 for k in input_names}, label_type),
({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
)
def _glue_convert_examples_to_features(
examples: List[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
label_list=None,
output_mode=None,
):
if max_length is None:
max_length = tokenizer.model_max_length
if task is not None:
processor = glue_processors[task]()
if label_list is None:
label_list = processor.get_labels()
logger.info(f"Using label list {label_list} for task {task}")
if output_mode is None:
output_mode = glue_output_modes[task]
logger.info(f"Using output mode {output_mode} for task {task}")
label_map = {label: i for i, label in enumerate(label_list)}
def label_from_example(example: InputExample) -> Union[int, float, None]:
if example.label is None:
return None
if output_mode == "classification":
return label_map[example.label]
elif output_mode == "regression":
return float(example.label)
raise KeyError(output_mode)
labels = [label_from_example(example) for example in examples]
batch_encoding = tokenizer(
[(example.text_a, example.text_b) for example in examples],
max_length=max_length,
padding="max_length",
truncation=True,
)
features = []
for i in range(len(examples)):
inputs = {k: batch_encoding[k][i] for k in batch_encoding}
feature = InputFeatures(**inputs, label=labels[i])
features.append(feature)
for i, example in enumerate(examples[:5]):
logger.info("*** Example ***")
logger.info(f"guid: {example.guid}")
logger.info(f"features: {features[i]}")
return features
class OutputMode(Enum):
classification = "classification"
regression = "regression"
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}")
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = line[3]
text_b = line[4]
label = None if set_type == "test" else line[0]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["premise"].numpy().decode("utf-8"),
tensor_dict["hypothesis"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
text_a = line[8]
text_b = line[9]
label = None if set_type.startswith("test") else line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MnliMismatchedProcessor(MnliProcessor):
"""Processor for the MultiNLI Mismatched data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence"].numpy().decode("utf-8"),
None,
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
test_mode = set_type == "test"
if test_mode:
lines = lines[1:]
text_index = 1 if test_mode else 3
examples = []
for i, line in enumerate(lines):
guid = f"{set_type}-{i}"
text_a = line[text_index]
label = None if test_mode else line[1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
class Sst2Processor(DataProcessor):
"""Processor for the SST-2 data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence"].numpy().decode("utf-8"),
None,
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
text_index = 1 if set_type == "test" else 0
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = line[text_index]
label = None if set_type == "test" else line[1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
class StsbProcessor(DataProcessor):
"""Processor for the STS-B data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return [None]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
text_a = line[7]
text_b = line[8]
label = None if set_type == "test" else line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class QqpProcessor(DataProcessor):
"""Processor for the QQP data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["question1"].numpy().decode("utf-8"),
tensor_dict["question2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
test_mode = set_type == "test"
q1_index = 1 if test_mode else 3
q2_index = 2 if test_mode else 4
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
try:
text_a = line[q1_index]
text_b = line[q2_index]
label = None if test_mode else line[5]
except IndexError:
continue
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class QnliProcessor(DataProcessor):
"""Processor for the QNLI data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["question"].numpy().decode("utf-8"),
tensor_dict["sentence"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
text_a = line[1]
text_b = line[2]
label = None if set_type == "test" else line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class RteProcessor(DataProcessor):
"""Processor for the RTE data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
text_a = line[1]
text_b = line[2]
label = None if set_type == "test" else line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class WnliProcessor(DataProcessor):
"""Processor for the WNLI data set (GLUE version)."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{line[0]}"
text_a = line[1]
text_b = line[2]
label = None if set_type == "test" else line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
glue_tasks_num_labels = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
glue_processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mnli-mm": MnliMismatchedProcessor,
"mrpc": MrpcProcessor,
"sst-2": Sst2Processor,
"sts-b": StsbProcessor,
"qqp": QqpProcessor,
"qnli": QnliProcessor,
"rte": RteProcessor,
"wnli": WnliProcessor,
}
glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}
.\data\processors\squad.py
import json
import os
from functools import partial
from multiprocessing import Pool, cpu_count
import numpy as np
from tqdm import tqdm
from ...models.bert.tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import is_tf_available, is_torch_available, logging
from .utils import DataProcessor
MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}
if is_torch_available():
import torch
from torch.utils.data import TensorDataset
if is_tf_available():
import tensorflow as tf
logger = logging.get_logger(__name__)
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
best_score = None
best_span_index = None
for span_index, doc_span in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _new_check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
best_score = None
best_span_index = None
for span_index, doc_span in enumerate(doc_spans):
end = doc_span["start"] + doc_span["length"] - 1
if position < doc_span["start"]:
continue
if position > end:
continue
num_left_context = position - doc_span["start"]
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
def squad_convert_example_to_features(
example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
):
features = []
if is_training and not example.is_impossible:
start_position = example.start_position
end_position = example.end_position
actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
return []
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for i, token in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
if tokenizer.__class__.__name__ in [
"RobertaTokenizer",
"LongformerTokenizer",
"BartTokenizer",
"RobertaTokenizerFast",
"LongformerTokenizerFast",
"BartTokenizerFast",
]:
sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
else:
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
)
spans = []
truncated_query = tokenizer.encode(
example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
)
tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
sequence_added_tokens = (
tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
else tokenizer.model_max_length - tokenizer.max_len_single_sentence
)
sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
span_doc_tokens = all_doc_tokens
while len(spans) * doc_stride < len(all_doc_tokens):
if tokenizer.padding_side == "right":
texts = truncated_query
pairs = span_doc_tokens
truncation = TruncationStrategy.ONLY_SECOND.value
else:
texts = span_doc_tokens
pairs = truncated_query
truncation = TruncationStrategy.ONLY_FIRST.value
encoded_dict = tokenizer.encode_plus(
texts,
pairs,
truncation=truncation,
padding=padding_strategy,
max_length=max_seq_length,
return_overflowing_tokens=True,
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
return_token_type_ids=True,
)
paragraph_len = min(
len(all_doc_tokens) - len(spans) * doc_stride,
max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
)
if tokenizer.pad_token_id in encoded_dict["input_ids"]:
if tokenizer.padding_side == "right":
non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
else:
last_padding_id_position = (
len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
)
non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
else:
non_padded_ids = encoded_dict["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
token_to_orig_map = {}
for i in range(paragraph_len):
index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
encoded_dict["paragraph_len"] = paragraph_len
encoded_dict["tokens"] = tokens
encoded_dict["token_to_orig_map"] = token_to_orig_map
encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
encoded_dict["token_is_max_context"] = {}
encoded_dict["start"] = len(spans) * doc_stride
encoded_dict["length"] = paragraph_len
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict or (
"overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
):
break
span_doc_tokens = encoded_dict["overflowing_tokens"]
for doc_span_index in range(len(spans)):
for j in range(spans[doc_span_index]["paragraph_len"]):
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
index = (
j
if tokenizer.padding_side == "left"
else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
)
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
for span in spans:
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
p_mask = np.ones_like(span["token_type_ids"])
if tokenizer.padding_side == "right":
p_mask[len(truncated_query) + sequence_added_tokens :] = 0
else:
p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0
pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
special_token_indices = np.asarray(
tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
).nonzero()
p_mask[pad_token_indices] = 1
p_mask[special_token_indices] = 1
p_mask[cls_index] = 0
span_is_impossible = example.is_impossible
start_position = 0
end_position = 0
if is_training and not span_is_impossible:
doc_start = span["start"]
doc_end = span["start"] + span["length"] - 1
out_of_span = False
if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = cls_index
end_position = cls_index
span_is_impossible = True
else:
if tokenizer.padding_side == "left":
doc_offset = 0
else:
doc_offset = len(truncated_query) + sequence_added_tokens
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
features.append(
SquadFeatures(
span["input_ids"],
span["attention_mask"],
span["token_type_ids"],
cls_index,
p_mask.tolist(),
example_index=0,
unique_id=0,
paragraph_len=span["paragraph_len"],
token_is_max_context=span["token_is_max_context"],
tokens=span["tokens"],
token_to_orig_map=span["token_to_orig_map"],
start_position=start_position,
end_position=end_position,
is_impossible=span_is_impossible,
qas_id=example.qas_id,
)
)
return features
global tokenizer
tokenizer = tokenizer_for_convert
):
if not example_features:
continue
for example_feature in example_features:
example_feature.example_index = example_index
example_feature.unique_id = unique_id
new_features.append(example_feature)
unique_id += 1
example_index += 1
features = new_features
del new_features
if return_dataset == "pt":
if not is_torch_available():
raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
if not is_training:
all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(
all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
)
else:
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_start_positions,
all_end_positions,
all_cls_index,
all_p_mask,
all_is_impossible,
)
return features, dataset
else:
return features
"""
Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
version 2.0 of SQuAD, respectively.
"""
train_file = None
dev_file = None
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
if not evaluate:
answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
answers = []
else:
answers = [
{"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
]
answer = None
answer_start = None
return SquadExample(
qas_id=tensor_dict["id"].numpy().decode("utf-8"),
question_text=tensor_dict["question"].numpy().decode("utf-8"),
context_text=tensor_dict["context"].numpy().decode("utf-8"),
answer_text=answer,
start_position_character=answer_start,
title=tensor_dict["title"].numpy().decode("utf-8"),
answers=answers,
)
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
Args:
dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
evaluate: Boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
Examples:
```
>>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad")
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
```"""
if evaluate:
dataset = dataset["validation"]
else:
dataset = dataset["train"]
examples = []
for tensor_dict in tqdm(dataset):
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
return examples
def get_train_examples(self, data_dir, filename=None):
"""
Returns the training examples from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the training file has a different name than the original one
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if data_dir is None:
data_dir = ""
if self.train_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(
os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
) as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "train")
def get_dev_examples(self, data_dir, filename=None):
"""
Returns the evaluation example from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the evaluation file has a different name than the original one
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if data_dir is None:
data_dir = ""
if self.dev_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(
os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
) as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "dev")
def _create_examples(self, input_data, set_type):
is_training = set_type == "train"
examples = []
for entry in tqdm(input_data):
title = entry["title"]
for paragraph in entry["paragraphs"]:
context_text = paragraph["context"]
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position_character = None
answer_text = None
answers = []
is_impossible = qa.get("is_impossible", False)
if not is_impossible:
if is_training:
answer = qa["answers"][0]
answer_text = answer["text"]
start_position_character = answer["answer_start"]
else:
answers = qa["answers"]
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
context_text=context_text,
answer_text=answer_text,
start_position_character=start_position_character,
title=title,
is_impossible=is_impossible,
answers=answers,
)
examples.append(example)
return examples
class SquadV1Processor(SquadProcessor):
train_file = "train-v1.1.json"
dev_file = "dev-v1.1.json"
class SquadV1Processor(SquadProcessor):
train_file = "train-v1.1.json"
dev_file = "dev-v1.1.json"
class SquadV2Processor(SquadProcessor):
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"
class SquadV2Processor(SquadProcessor):
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"
class SquadExample:
"""
A single training/test example for the Squad dataset, as loaded from disk.
Args:
qas_id: The example's unique identifier
question_text: The question string
context_text: The context string
answer_text: The answer string
start_position_character: The character position of the start of the answer
title: The title of the example
answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
is_impossible: False by default, set to True if the example has no possible answer.
"""
def __init__(
self,
qas_id,
question_text,
context_text,
answer_text,
start_position_character,
title,
answers=[],
is_impossible=False,
):
self.qas_id = qas_id
self.question_text = question_text
self.context_text = context_text
self.answer_text = answer_text
self.title = title
self.is_impossible = is_impossible
self.answers = answers
self.start_position, self.end_position = 0, 0
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in self.context_text:
if _is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
self.doc_tokens = doc_tokens
self.char_to_word_offset = char_to_word_offset
if start_position_character is not None and not is_impossible:
self.start_position = char_to_word_offset[start_position_character]
self.end_position = char_to_word_offset[
min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
]
class SquadFeatures:
"""
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
[`~data.processors.squad.SquadExample`] using the
:method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method.
# 单个SQuAD示例的特征,用于模型输入。这些特征是特定于模型的,
# 可以从[`~data.processors.squad.SquadExample`]使用
# :method:*~transformers.data.processors.squad.squad_convert_examples_to_features*方法进行构建。
class SquadFeatures:
pass # 这里仅占位,没有额外的实现
# 初始化函数,用于创建一个新的对象来存储输入特征和答案相关信息
def __init__(
self,
input_ids, # 输入序列的token索引列表
attention_mask, # 避免在填充token索引上执行注意力计算的掩码
token_type_ids, # 指示输入中第一部分和第二部分的段落token索引
cls_index, # CLS(分类)token的索引位置
p_mask, # 用于标识可以作为答案和不可以作为答案的token的掩码
# 为不可作为答案的token设置为1,可以作为答案的设置为0
example_index, # 示例的索引
unique_id, # 特征的唯一标识符
paragraph_len, # 上下文段落的长度
token_is_max_context, # 布尔值列表,标识哪些token在此特征对象中具有最大上下文。
# 如果一个token没有在此特征对象中具有最大上下文,则意味着另一个特征对象对该token有更多相关信息,应优先考虑那个特征对象。
tokens, # 输入ids对应的token列表
token_to_orig_map, # token到原始文本的映射,用于识别答案
start_position, # 答案起始token索引
end_position, # 答案结束token索引
is_impossible, # 标识答案是否不可行
qas_id: str = None, # 问题-答案对的唯一标识符(可选)
encoding: BatchEncoding = None, # 可选,存储使用快速分词器对齐方法的BatchEncoding
):
self.input_ids = input_ids # 初始化对象属性:输入ids
self.attention_mask = attention_mask # 初始化对象属性:注意力掩码
self.token_type_ids = token_type_ids # 初始化对象属性:段落token类型ids
self.cls_index = cls_index # 初始化对象属性:CLS token索引
self.p_mask = p_mask # 初始化对象属性:答案标记掩码
self.example_index = example_index # 初始化对象属性:示例索引
self.unique_id = unique_id # 初始化对象属性:唯一标识符
self.paragraph_len = paragraph_len # 初始化对象属性:段落长度
self.token_is_max_context = token_is_max_context # 初始化对象属性:最大上下文标记
self.tokens = tokens # 初始化对象属性:tokens
self.token_to_orig_map = token_to_orig_map # 初始化对象属性:token到原始文本的映射
self.start_position = start_position # 初始化对象属性:答案起始位置
self.end_position = end_position # 初始化对象属性:答案结束位置
self.is_impossible = is_impossible # 初始化对象属性:是否不可行的标记
self.qas_id = qas_id # 初始化对象属性:问题-答案对的唯一标识符
self.encoding = encoding # 初始化对象属性:BatchEncoding对象
class SquadResult:
"""
Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
Args:
unique_id: The unique identifier corresponding to that example.
start_logits: The logits corresponding to the start of the answer
end_logits: The logits corresponding to the end of the answer
"""
# 定义 SquadResult 类,用于存储 SQuAD 数据集上模型输出的结果
def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
# 初始化实例属性
self.start_logits = start_logits # 存储回答开始位置的 logits
self.end_logits = end_logits # 存储回答结束位置的 logits
self.unique_id = unique_id # 存储唯一标识符
# 如果提供了 start_top_index 参数,则初始化以下属性
if start_top_index:
self.start_top_index = start_top_index # 存储开始位置的 top index
self.end_top_index = end_top_index # 存储结束位置的 top index
self.cls_logits = cls_logits # 存储对应的 cls logits
.\data\processors\utils.py
import csv
import dataclasses
import json
from dataclasses import dataclass
from typing import List, Optional, Union
from ...utils import is_tf_available, is_torch_available, logging
logger = logging.get_logger(__name__)
@dataclass
class InputExample:
"""
用于简单序列分类的单个训练/测试示例。
Args:
guid: 示例的唯一标识符。
text_a: 字符串。第一个序列的未分词文本。对于单序列任务,只需指定此序列。
text_b: (可选) 字符串。第二个序列的未分词文本。仅对序列对任务必须指定。
label: (可选) 示例的标签。对于训练和开发示例应指定,但对于测试示例不应指定。
"""
guid: str
text_a: str
text_b: Optional[str] = None
label: Optional[str] = None
def to_json_string(self):
"""将该实例序列化为JSON字符串。"""
return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
@dataclass(frozen=True)
class InputFeatures:
"""
单个数据特征集合。属性名称与模型输入的相应名称相同。
Args:
input_ids: 序列标记在词汇表中的索引。
attention_mask: 避免对填充标记索引执行注意力的掩码。
掩码值选择在 `[0, 1]` 范围内:通常为 `1` 表示非MASKED的标记, `0` 表示MASKED的标记(填充)。
token_type_ids: (可选) 段标记索引,指示输入的第一和第二部分。只有某些模型会使用。
label: (可选) 输入对应的标签。对于分类问题为整数,对于回归问题为浮点数。
"""
input_ids: List[int]
attention_mask: Optional[List[int]] = None
token_type_ids: Optional[List[int]] = None
label: Optional[Union[int, float]] = None
def to_json_string(self):
"""将该实例序列化为JSON字符串。"""
return json.dumps(dataclasses.asdict(self)) + "\n"
class DataProcessor:
"""用于序列分类数据集的数据转换器的基类。"""
def get_example_from_tensor_dict(self, tensor_dict):
"""
Gets an example from a dict with tensorflow tensors.
Args:
tensor_dict: Keys and values should match the corresponding Glue
tensorflow_dataset examples.
"""
raise NotImplementedError()
def get_train_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the test set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def tfds_map(self, example):
"""
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
examples to the correct format.
"""
if len(self.get_labels()) > 1:
example.label = self.get_labels()[int(example.label)]
return example
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8-sig") as f:
return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
class SingleSentenceClassificationProcessor(DataProcessor):
"""Generic processor for a single sentence classification data set."""
def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
self.labels = [] if labels is None else labels
self.examples = [] if examples is None else examples
self.mode = mode
self.verbose = verbose
def __len__(self):
return len(self.examples)
def __getitem__(self, idx):
if isinstance(idx, slice):
return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
return self.examples[idx]
@classmethod
def create_from_csv(
cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
):
processor = cls(**kwargs)
processor.add_examples_from_csv(
file_name,
split_name=split_name,
column_label=column_label,
column_text=column_text,
column_id=column_id,
skip_first_row=skip_first_row,
overwrite_labels=True,
overwrite_examples=True,
)
return processor
@classmethod
def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
processor = cls(**kwargs)
processor.add_examples(texts_or_text_and_labels, labels=labels)
return processor
def add_examples_from_csv(
self,
file_name,
split_name="",
column_label=0,
column_text=1,
column_id=None,
skip_first_row=False,
overwrite_labels=False,
overwrite_examples=False,
):
lines = self._read_tsv(file_name)
if skip_first_row:
lines = lines[1:]
texts = []
labels = []
ids = []
for i, line in enumerate(lines):
texts.append(line[column_text])
labels.append(line[column_label])
if column_id is not None:
ids.append(line[column_id])
else:
guid = f"{split_name}-{i}" if split_name else str(i)
ids.append(guid)
return self.add_examples(
texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
)
def add_examples(
self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
):
):
if labels is not None and len(texts_or_text_and_labels) != len(labels):
raise ValueError(
f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
)
if ids is not None and len(texts_or_text_and_labels) != len(ids):
raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}")
if ids is None:
ids = [None] * len(texts_or_text_and_labels)
if labels is None:
labels = [None] * len(texts_or_text_and_labels)
examples = []
added_labels = set()
for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids):
if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
text, label = text_or_text_and_label
else:
text = text_or_text_and_label
added_labels.add(label)
examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
if overwrite_examples:
self.examples = examples
else:
self.examples.extend(examples)
if overwrite_labels:
self.labels = list(added_labels)
else:
self.labels = list(set(self.labels).union(added_labels))
return self.examples
def get_features(
self,
tokenizer,
max_length=None,
pad_on_left=False,
pad_token=0,
mask_padding_with_zero=True,
return_tensors=None,
.\data\processors\xnli.py
""" XNLI utils (dataset loading and evaluation)"""
import os
from ...utils import logging
from .utils import DataProcessor, InputExample
logger = logging.get_logger(__name__)
class XnliProcessor(DataProcessor):
"""
Processor for the XNLI dataset. Adapted from
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
"""
def __init__(self, language, train_language=None):
self.language = language
self.train_language = train_language
def get_train_examples(self, data_dir):
"""See base class."""
lg = self.language if self.train_language is None else self.train_language
lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv"))
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = f"train-{i}"
text_a = line[0]
text_b = line[1]
label = "contradiction" if line[2] == "contradictory" else line[2]
if not isinstance(text_a, str):
raise ValueError(f"Training input {text_a} is not a string")
if not isinstance(text_b, str):
raise ValueError(f"Training input {text_b} is not a string")
if not isinstance(label, str):
raise ValueError(f"Training label {label} is not a string")
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
language = line[0]
if language != self.language:
continue
guid = f"test-{i}"
text_a = line[6]
text_b = line[7]
label = line[1]
if not isinstance(text_a, str):
raise ValueError(f"Training input {text_a} is not a string")
if not isinstance(text_b, str):
raise ValueError(f"Training input {text_b} is not a string")
if not isinstance(label, str):
raise ValueError(f"Training label {label} is not a string")
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
xnli_processors = {
"xnli": XnliProcessor,
}
xnli_output_modes = {
"xnli": "classification",
}
xnli_tasks_num_labels = {
"xnli": 3,
}
.\data\processors\__init__.py
from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
.\data\__init__.py
from .data_collator import (
DataCollatorForLanguageModeling,
DataCollatorForPermutationLanguageModeling,
DataCollatorForSeq2Seq,
DataCollatorForSOP,
DataCollatorForTokenClassification,
DataCollatorForWholeWordMask,
DataCollatorWithPadding,
DefaultDataCollator,
default_data_collator,
)
from .metrics import glue_compute_metrics, xnli_compute_metrics
from .processors import (
DataProcessor,
InputExample,
InputFeatures,
SingleSentenceClassificationProcessor,
SquadExample,
SquadFeatures,
SquadV1Processor,
SquadV2Processor,
glue_convert_examples_to_features,
glue_output_modes,
glue_processors,
glue_tasks_num_labels,
squad_convert_examples_to_features,
xnli_output_modes,
xnli_processors,
xnli_tasks_num_labels,
)
.\debug_utils.py
import collections
from .utils import ExplicitEnum, is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class DebugUnderflowOverflow:
"""
This debug class helps detect and understand where the model starts getting very large or very small, and more
importantly `nan` or `inf` weight and activation elements.
There are 2 working modes:
1. Underflow/overflow detection (default)
2. Specific batch absolute min/max tracing without detection
Mode 1: Underflow/overflow detection
To activate the underflow/overflow detection, initialize the object with the model :
```
debug_overflow = DebugUnderflowOverflow(model)
```
then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or output
elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this event,
each frame reporting
1. the fully qualified module name plus the class name whose `forward` was run
2. the absolute min and max value of all elements for each module weights, and the inputs and output
For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16
mixed precision :
```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min abs max metadata
[...]
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
```
You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
```
"""
pass
"""
As this module measures absolute `min`/``max` of each weight of the model on every forward it'll slow the training
down. Therefore remember to turn it off once the debugging needs have been met.
Args:
model (`nn.Module`):
The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
abort_after_batch_num (`int``, *optional*):
Whether to abort after a certain batch number has finished
"""
def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
self.model = model
self.trace_batch_nums = trace_batch_nums
self.abort_after_batch_num = abort_after_batch_num
self.frames = collections.deque([], max_frames_to_save)
self.frame = []
self.batch_number = 0
self.total_calls = 0
self.detected_overflow = False
self.prefix = " "
self.analyse_model()
self.register_forward_hook()
def save_frame(self, frame=None):
if frame is not None:
self.expand_frame(frame)
self.frames.append("\n".join(self.frame))
self.frame = []
def expand_frame(self, line):
self.frame.append(line)
def trace_frames(self):
print("\n".join(self.frames))
self.frames = []
def reset_saved_frames(self):
self.frames = []
def dump_saved_frames(self):
print(f"\nDetected inf/nan during batch_number={self.batch_number}")
print(f"Last {len(self.frames)} forward frames:")
print(f"{'abs min':8} {'abs max':8} metadata")
print("\n".join(self.frames))
print("\n\n")
self.frames = []
def analyse_model(self):
self.module_names = {m: name for name, m in self.model.named_modules()}
def analyse_variable(self, var, ctx):
if torch.is_tensor(var):
self.expand_frame(get_abs_min_max(var, ctx))
if detect_overflow(var, ctx):
self.detected_overflow = True
elif var is None:
self.expand_frame(f"{'None':>17} {ctx}")
else:
self.expand_frame(f"{'not a tensor':>17} {ctx}")
def batch_start_frame(self):
self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")
def batch_end_frame(self):
self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number-1} ***\n\n")
def create_frame(self, module, input, output):
self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")
for name, p in module.named_parameters(recurse=False):
self.analyse_variable(p, name)
if isinstance(input, tuple):
for i, x in enumerate(input):
self.analyse_variable(x, f"input[{i}]")
else:
self.analyse_variable(input, "input")
if isinstance(output, tuple):
for i, x in enumerate(output):
if isinstance(x, tuple):
for j, y in enumerate(x):
self.analyse_variable(y, f"output[{i}][{j}]")
else:
self.analyse_variable(x, f"output[{i}]")
else:
self.analyse_variable(output, "output")
self.save_frame()
def register_forward_hook(self):
self.model.apply(self._register_forward_hook)
def _register_forward_hook(self, module):
module.register_forward_hook(self.forward_hook)
def forward_hook(self, module, input, output):
last_frame_of_batch = False
trace_mode = True if self.batch_number in self.trace_batch_nums else False
if trace_mode:
self.reset_saved_frames()
if self.total_calls == 0:
self.batch_start_frame()
self.total_calls += 1
if module == self.model:
self.batch_number += 1
last_frame_of_batch = True
self.create_frame(module, input, output)
if last_frame_of_batch:
self.batch_start_frame()
if trace_mode:
self.trace_frames()
if self.detected_overflow and not trace_mode:
self.dump_saved_frames()
raise ValueError(
"DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
"Please scroll up above this traceback to see the activation values prior to this event."
)
if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
raise ValueError(
f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to"
f" `abort_after_batch_num={self.abort_after_batch_num}` arg"
)
def get_abs_min_max(var, ctx):
abs_var = var.abs()
return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"
def detect_overflow(var, ctx):
"""
Report whether the tensor contains any `nan` or `inf` entries.
This is useful for detecting overflows/underflows and best to call right after the function that did some math that
modified the tensor in question.
This function contains a few other helper features that you can enable and tweak directly if you want to track
various other things.
Args:
var: the tensor variable to check
ctx: the message to print as a context
Return:
`True` if `inf` or `nan` was detected, `False` otherwise
"""
detected = False
if torch.isnan(var).any().item():
detected = True
print(f"{ctx} has nans")
if torch.isinf(var).any().item():
detected = True
print(f"{ctx} has infs")
if 0:
n100 = var[torch.ge(var.abs(), 100)]
if n100.numel() > 0:
print(f"{ctx}: n100={n100.numel()}")
n1000 = var[torch.ge(var.abs(), 1000)]
if n1000.numel() > 0:
print(f"{ctx}: n1000={n1000.numel()}")
n10000 = var[torch.ge(var.abs(), 10000)]
if n10000.numel() > 0:
print(f"{ctx}: n10000={n10000.numel()}")
if 0:
print(f"min={var.min():9.2e} max={var.max():9.2e}")
if 0:
print(f"min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})")
return detected
class DebugOption(ExplicitEnum):
UNDERFLOW_OVERFLOW = "underflow_overflow"
TPU_METRICS_DEBUG = "tpu_metrics_debug"