Transformers Source Code Walkthrough (131)
.\pipelines\audio_classification.py
import subprocess
from typing import Union
import numpy as np
import requests
from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging
from .base import Pipeline, build_pipeline_init_args
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
logger = logging.get_logger(__name__)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
"""
Helper function to read an audio file through ffmpeg.
"""
ar = f"{sampling_rate}"
ac = "1"
format_for_conversion = "f32le"
ffmpeg_command = [
"ffmpeg",
"-i",
"pipe:0",
"-ac",
ac,
"-ar",
ar,
"-f",
format_for_conversion,
"-hide_banner",
"-loglevel",
"quiet",
"pipe:1",
]
try:
ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
except FileNotFoundError:
raise ValueError("ffmpeg was not found but is required to load audio files from filename")
output_stream = ffmpeg_process.communicate(bpayload)
out_bytes = output_stream[0]
audio = np.frombuffer(out_bytes, np.float32)
if audio.shape[0] == 0:
raise ValueError("Malformed soundfile")
return audio
@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True))
class AudioClassificationPipeline(Pipeline):
"""
Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio formats.
Example:
```
>>> from transformers import pipeline
>>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
>>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
[{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).
This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"audio-classification"`.
See the list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
"""
def __init__(self, *args, **kwargs):
kwargs["top_k"] = 5
super().__init__(*args, **kwargs)
if self.framework != "pt":
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)
def __call__(
self,
inputs: Union[np.ndarray, bytes, str],
**kwargs,
):
"""
Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
information.
Args:
inputs (`np.ndarray` or `bytes` or `str` or `dict`):
The inputs is either :
- `str` that is the filename of the audio file, the file will be read at the correct sampling rate
to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
- `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
same way.
- (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
Raw audio at the correct sampling rate (no further check will be done)
- `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int,
"raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
`"array"` is used to denote the raw audio waveform.
top_k (`int`, *optional*, defaults to None):
The number of top labels that will be returned by the pipeline. If the provided number is `None` or
higher than the number of labels available in the model configuration, it will default to the number of
labels.
Return:
A list of `dict` with the following keys:
- **label** (`str`) -- The label predicted.
- **score** (`float`) -- The corresponding probability.
"""
return super().__call__(inputs, **kwargs)
def _sanitize_parameters(self, top_k=None, **kwargs):
postprocess_params = {}
if top_k is not None:
if top_k > self.model.config.num_labels:
top_k = self.model.config.num_labels
postprocess_params["top_k"] = top_k
return {}, {}, postprocess_params
def preprocess(self, inputs):
if isinstance(inputs, str):
if inputs.startswith("http://") or inputs.startswith("https://"):
inputs = requests.get(inputs).content
else:
with open(inputs, "rb") as f:
inputs = f.read()
if isinstance(inputs, bytes):
inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
if isinstance(inputs, dict):
if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
raise ValueError(
"When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "
'"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
"containing the sampling_rate associated with that array"
)
_inputs = inputs.pop("raw", None)
if _inputs is None:
inputs.pop("path", None)
_inputs = inputs.pop("array", None)
in_sampling_rate = inputs.pop("sampling_rate")
inputs = _inputs
if in_sampling_rate != self.feature_extractor.sampling_rate:
import torch
if is_torchaudio_available():
from torchaudio import functional as F
else:
raise ImportError(
"torchaudio is required to resample audio samples in AudioClassificationPipeline. "
"The torchaudio package can be installed through: `pip install torchaudio`."
)
inputs = F.resample(
torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
).numpy()
if not isinstance(inputs, np.ndarray):
raise ValueError("We expect a numpy ndarray as input")
if len(inputs.shape) != 1:
raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")
processed = self.feature_extractor(
inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
)
return processed
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, top_k=5):
probs = model_outputs.logits[0].softmax(-1)
scores, ids = probs.topk(top_k)
scores = scores.tolist()
ids = ids.tolist()
labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
return labels
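To make the dict input described in `__call__` concrete, here is a minimal sketch (assuming `torch` and `torchaudio` are installed and reusing the `superb/wav2vec2-base-superb-ks` checkpoint from the docstring example; the silent waveform and its 8 kHz rate are purely illustrative):

```
import numpy as np
from transformers import pipeline

classifier = pipeline(task="audio-classification", model="superb/wav2vec2-base-superb-ks")

# One second of silence sampled at 8 kHz; preprocess() will resample it to the
# feature extractor's sampling rate via torchaudio before running the model.
waveform = np.zeros(8000, dtype=np.float32)
predictions = classifier({"sampling_rate": 8000, "raw": waveform}, top_k=2)
```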
.\pipelines\audio_utils.py
import datetime
import platform
import subprocess
from typing import Optional, Tuple, Union
import numpy as np
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
"""
Helper function to read an audio file through ffmpeg.
"""
ar = f"{sampling_rate}"
ac = "1"
format_for_conversion = "f32le"
ffmpeg_command = [
"ffmpeg",
"-i", "pipe:0",
"-ac", ac,
"-ar", ar,
"-f", format_for_conversion,
"-hide_banner",
"-loglevel", "quiet",
"pipe:1",
]
try:
with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
output_stream = ffmpeg_process.communicate(bpayload)
except FileNotFoundError as error:
raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
out_bytes = output_stream[0]
audio = np.frombuffer(out_bytes, np.float32)
if audio.shape[0] == 0:
raise ValueError(
"Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
"a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
"URL, ensure that the URL is the full address to **download** the audio file."
)
return audio
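A small usage sketch of `ffmpeg_read` (assuming ffmpeg is on the PATH; the file name `sample.wav` is a placeholder):

```
with open("sample.wav", "rb") as f:
    payload = f.read()

# Decode the file bytes into a mono float32 waveform at 16 kHz
waveform = ffmpeg_read(payload, sampling_rate=16000)
print(waveform.dtype, waveform.shape)
```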
def ffmpeg_microphone(
sampling_rate: int,
chunk_length_s: float,
format_for_conversion: str = "f32le",
):
"""
Helper function to read raw microphone data.
"""
ar = f"{sampling_rate}"
ac = "1"
if format_for_conversion == "s16le":
size_of_sample = 2
elif format_for_conversion == "f32le":
size_of_sample = 4
else:
raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
system = platform.system()
if system == "Linux":
format_ = "alsa"
input_ = "default"
elif system == "Darwin":
format_ = "avfoundation"
input_ = ":0"
elif system == "Windows":
format_ = "dshow"
input_ = _get_microphone_name()
ffmpeg_command = [
"ffmpeg",
"-f", format_,
"-i", input_,
"-ac", ac,
"-ar", ar,
"-f", format_for_conversion,
"-fflags", "nobuffer",
"-hide_banner",
"-loglevel", "quiet",
"pipe:1",
]
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
for item in iterator:
yield item
def ffmpeg_microphone_live(
sampling_rate: int,
chunk_length_s: float,
stream_chunk_s: Optional[int] = None,
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
format_for_conversion: str = "f32le",
):
"""
Helper function to read microphone audio in real time.
"""
if stream_chunk_s is not None:
chunk_s = stream_chunk_s
else:
chunk_s = chunk_length_s
microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion)
if format_for_conversion == "s16le":
dtype = np.int16
size_of_sample = 2
elif format_for_conversion == "f32le":
dtype = np.float32
size_of_sample = 4
else:
raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
if stride_length_s is None:
stride_length_s = chunk_length_s / 6
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
if isinstance(stride_length_s, (int, float)):
stride_length_s = [stride_length_s, stride_length_s]
stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
audio_time = datetime.datetime.now()
delta = datetime.timedelta(seconds=chunk_s)
for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
item["stride"] = (
item["stride"][0] // size_of_sample,
item["stride"][1] // size_of_sample,
)
item["sampling_rate"] = sampling_rate
audio_time += delta
if datetime.datetime.now() > audio_time + 10 * delta:
continue
yield item
def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
"""
Reads raw bytes from an iterator and yields chunks of length `chunk_len`. Optionally adds `stride` to each chunk to
create overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
"""
acc = b""
stride_left, stride_right = stride
if stride_left + stride_right >= chunk_len:
raise ValueError(
f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
)
_stride_left = 0
for raw in iterator:
acc += raw
if stream and len(acc) < chunk_len:
stride = (_stride_left, 0)
yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
else:
while len(acc) >= chunk_len:
stride = (_stride_left, stride_right)
item = {"raw": acc[:chunk_len], "stride": stride}
if stream:
item["partial"] = False
yield item
_stride_left = stride_left
acc = acc[chunk_len - stride_left - stride_right :]
if len(acc) > stride_left:
item = {"raw": acc, "stride": (_stride_left, 0)}
if stream:
item["partial"] = False
yield item
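To see how the striding works, here is a self-contained sketch that runs `chunk_bytes_iter` over synthetic byte data (the byte string and chunk sizes are illustrative only):

```
data = [b"0123456789"]  # pretend these bytes came from the microphone
for chunk in chunk_bytes_iter(iter(data), chunk_len=6, stride=(1, 1)):
    print(chunk)
# {'raw': b'012345', 'stride': (0, 1)}
# {'raw': b'456789', 'stride': (1, 1)}   <- overlaps the previous chunk by 2 bytes
# {'raw': b'89', 'stride': (1, 0)}       <- leftover tail
```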
def _ffmpeg_stream(ffmpeg_command, buflen: int):
"""
Internal function to create the generator of data through ffmpeg
"""
bufsize = 2**24
try:
with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
while True:
raw = ffmpeg_process.stdout.read(buflen)
if raw == b"":
break
yield raw
except FileNotFoundError as error:
raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
def _get_microphone_name():
"""
Retrieve the microphone name on Windows.
"""
command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]
try:
ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]
if microphone_lines:
microphone_name = microphone_lines[0].split('"')[1]
print(f"Using microphone: {microphone_name}")
return f"audio={microphone_name}"
except FileNotFoundError:
print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")
return "default"
.\pipelines\automatic_speech_recognition.py
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, Optional, Union
import numpy as np
import requests
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import is_torch_available, is_torchaudio_available, logging
from .audio_utils import ffmpeg_read
from .base import ChunkPipeline
if TYPE_CHECKING:
from pyctcdecode import BeamSearchDecoderCTC
from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
from ..modeling_utils import PreTrainedModel
logger = logging.get_logger(__name__)
if is_torch_available():
import torch
from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
def rescale_stride(stride, ratio):
"""
Rescales the stride values from audio space to tokens/logits space.
(160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
"""
new_strides = []
for input_n, left, right in stride:
token_n = int(round(input_n * ratio))
left = int(round(left / input_n * token_n))
right = int(round(right / input_n * token_n))
new_stride = (token_n, left, right)
new_strides.append(new_stride)
return new_strides
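The example in the docstring can be checked directly; a short sketch:

```
strides = [(160_000, 16_000, 16_000)]  # (chunk, left, right) in audio samples
print(rescale_stride(strides, ratio=2000 / 160_000))
# [(2000, 200, 200)]  -> the same stride expressed in tokens/logits space
```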
def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
"""
Iterates over chunks of input data, yielding processed chunks.
inputs: numpy array, the input data to be chunked
feature_extractor: SequenceFeatureExtractor, object for extracting features from chunks
chunk_len: int, length of each chunk
stride_left: int, left stride length
stride_right: int, right stride length
dtype: optional, data type to convert processed chunks
Yields dictionaries containing processed chunk data and metadata.
"""
inputs_len = inputs.shape[0]
step = chunk_len - stride_left - stride_right
for chunk_start_idx in range(0, inputs_len, step):
chunk_end_idx = chunk_start_idx + chunk_len
chunk = inputs[chunk_start_idx:chunk_end_idx]
processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
if dtype is not None:
processed = processed.to(dtype=dtype)
_stride_left = 0 if chunk_start_idx == 0 else stride_left
is_last = chunk_end_idx > inputs_len if stride_right > 0 else chunk_end_idx >= inputs_len
_stride_right = 0 if is_last else stride_right
chunk_len = chunk.shape[0]
stride = (chunk_len, _stride_left, _stride_right)
if chunk.shape[0] > _stride_left:
yield {"is_last": is_last, "stride": stride, **processed}
if is_last:
break
def _fast_find_longest_common_sequence(sequence_left, sequence_right):
seq_len_left = len(sequence_left)
seq_len_right = len(sequence_right)
counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
longest = 0
for i in range(seq_len_left):
for j in range(seq_len_right):
if sequence_left[i] == sequence_right[j]:
previous_counter = counter[i][j] + 1
counter[i + 1][j + 1] = previous_counter
if previous_counter > longest:
longest = previous_counter
counter = np.array(counter)
index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
return index_left, index_right, longest
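A tiny worked example of the longest-common-run helper above (the token values are illustrative):

```
left = [1, 2, 3, 4]
right = [3, 4, 5]
index_left, index_right, length = _fast_find_longest_common_sequence(left, right)
# The shared run [3, 4] starts at index 2 in `left`, index 0 in `right`, with length 2
print(index_left, index_right, length)  # 2 0 2
```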
def _find_longest_common_sequence(sequences, tokenizer):
sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
for new_seq in sequences[1:]:
new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]
index = 0
max_ = 0.0
for i in range(1, len(new_sequence) + 1):
eps = i / 10000.0
matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
matching = matches / i + eps
if matches > 1 and matching > max_:
index = i
max_ = matching
sequence.extend(new_sequence[index:])
return np.array(sequence)
class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
"""
Pipeline that aims at extracting spoken text contained within some audio.
The input can be either a raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to
support multiple audio formats.
Example:
```
>>> from transformers import pipeline
>>> transcriber = pipeline(model="openai/whisper-base")
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
"""
Arguments:
model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
模型将用于通过管道进行预测。必须是继承自[`PreTrainedModel`](PyTorch)或[`TFPreTrainedModel`](TensorFlow)的模型。
feature_extractor ([`SequenceFeatureExtractor`]):
特征提取器将用于对波形进行编码,以供模型使用。
tokenizer ([`PreTrainedTokenizer`]):
分词器将用于对数据进行编码,以供模型使用。此对象继承自[`PreTrainedTokenizer`]。
decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
可选参数,用于语言模型增强解码的PyCTCDecode的BeamSearchDecoderCTC。详见[`Wav2Vec2ProcessorWithLM`]获取更多信息。
chunk_length_s (`float`, *optional*, defaults to 0):
每个分块的输入长度(秒)。如果`chunk_length_s = 0`,则禁用分块(默认)。
<Tip>
有关如何有效使用`chunk_length_s`的更多信息,请查看ASR分块博文。
</Tip>
stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
每个分块左右的步幅长度。仅在`chunk_length_s > 0`时使用。这使得模型能够查看更多的上下文,并更好地推断字母,但管道会丢弃最后的步幅位,以尽可能完美地重构最终结果。
<Tip>
有关如何有效使用`stride_length_s`的更多信息,请查看ASR分块博文。
</Tip>
framework (`str`, *optional*):
要使用的框架,可以是`"pt"`表示PyTorch或`"tf"`表示TensorFlow。必须安装指定的框架。如果未指定框架,默认使用当前安装的框架。如果未指定框架且两个框架都安装,则默认使用模型的框架,或者如果没有提供模型,则默认使用PyTorch。
device (Union[`int`, `torch.device`], *optional*):
CPU/GPU设备编号。将其设置为`None`将使用CPU,设置为正整数将在关联的CUDA设备ID上运行模型。
torch_dtype (Union[`int`, `torch.dtype`], *optional*):
计算的数据类型(dtype)。将其设置为`None`将使用float32精度。设置为`torch.float16`或`torch.bfloat16`将使用相应的半精度dtype。
def __init__(
self,
model: "PreTrainedModel",
feature_extractor: Union["SequenceFeatureExtractor", str] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None,
device: Union[int, "torch.device"] = None,
torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
**kwargs,
):
if model.config.model_type == "whisper":
self.type = "seq2seq_whisper"
elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
self.type = "seq2seq"
elif (
feature_extractor._processor_class
and feature_extractor._processor_class.endswith("WithLM")
and decoder is not None
):
self.decoder = decoder
self.type = "ctc_with_lm"
else:
self.type = "ctc"
super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs)
def __call__(
self,
inputs: Union[np.ndarray, bytes, str],
**kwargs,
):
def _sanitize_parameters(
self,
chunk_length_s=None,
stride_length_s=None,
ignore_warning=None,
decoder_kwargs=None,
return_timestamps=None,
return_language=None,
generate_kwargs=None,
max_new_tokens=None,
):
def postprocess(
self, model_outputs,
decoder_kwargs: Optional[Dict] = None,
return_timestamps=None,
return_language=None,
def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
"""
Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
`WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
properly compute the final `offset`.
"""
timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
items = []
time_precision = feature_extractor.chunk_length / max_source_positions
time = 0
result = []
for i in range(len(items)):
result += items[i].tolist()
return result
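Before moving on, a hedged usage sketch of the chunking arguments documented above (reusing the `openai/whisper-base` checkpoint and audio URL from the docstring; the output is omitted and the parameter values are illustrative):

```
from transformers import pipeline

transcriber = pipeline(model="openai/whisper-base")
transcriber(
    "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
    chunk_length_s=30,      # split long audio into 30s chunks
    stride_length_s=5,      # overlap chunks on both sides
    return_timestamps=True,
)
```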
.\pipelines\base.py
import collections
import csv
import importlib
import json
import os
import pickle
import sys
import traceback
import types
import warnings
from abc import ABC, abstractmethod
from collections import UserDict
from contextlib import contextmanager
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from ..dynamic_module_utils import custom_object_save
from ..feature_extraction_utils import PreTrainedFeatureExtractor
from ..image_processing_utils import BaseImageProcessor
from ..modelcard import ModelCard
from ..models.auto.configuration_auto import AutoConfig
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
ModelOutput,
add_end_docstrings,
infer_framework,
is_tf_available,
is_torch_available,
is_torch_cuda_available,
is_torch_npu_available,
is_torch_xpu_available,
logging,
)
GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
if is_tf_available():
import tensorflow as tf
from ..models.auto.modeling_tf_auto import TFAutoModel
if is_torch_available():
import torch
from torch.utils.data import DataLoader, Dataset
from ..models.auto.modeling_auto import AutoModel
from .pt_utils import KeyDataset
else:
Dataset = None
KeyDataset = None
if TYPE_CHECKING:
from ..modeling_tf_utils import TFPreTrainedModel
from ..modeling_utils import PreTrainedModel
logger = logging.get_logger(__name__)
def no_collate_fn(items):
if len(items) != 1:
raise ValueError("This collate_fn is meant to be used with batch_size=1")
return items[0]
def _pad(items, key, padding_value, padding_side):
batch_size = len(items)
if isinstance(items[0][key], torch.Tensor):
shape = items[0][key].shape
dim = len(shape)
if key in ["pixel_values", "image"]:
return torch.cat([item[key] for item in items], dim=0)
elif dim == 4 and key == "input_features":
return torch.cat([item[key] for item in items], dim=0)
max_length = max(item[key].shape[1] for item in items)
min_length = min(item[key].shape[1] for item in items)
dtype = items[0][key].dtype
if dim == 2:
if max_length == min_length:
return torch.cat([item[key] for item in items], dim=0)
tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
elif dim == 3:
tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
elif dim == 4:
tensor = torch.zeros((batch_size, max_length, shape[-2], shape[-1]), dtype=dtype) + padding_value
for i, item in enumerate(items):
if dim == 2:
if padding_side == "left":
tensor[i, -len(item[key][0]) :] = item[key][0].clone()
else:
tensor[i, : len(item[key][0])] = item[key][0].clone()
elif dim == 3:
if padding_side == "left":
tensor[i, -len(item[key][0]) :, :] = item[key][0].clone()
else:
tensor[i, : len(item[key][0]), :] = item[key][0].clone()
elif dim == 4:
if padding_side == "left":
tensor[i, -len(item[key][0]) :, :, :] = item[key][0].clone()
else:
tensor[i, : len(item[key][0]), :, :] = item[key][0].clone()
return tensor
else:
return [item[key] for item in items]
def pad_collate_fn(tokenizer, feature_extractor):
t_padding_side = None
f_padding_side = None
if tokenizer is None and feature_extractor is None:
raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")
if tokenizer is not None:
if tokenizer.pad_token_id is None:
raise ValueError(
"Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
"`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
)
else:
t_padding_value = tokenizer.pad_token_id
t_padding_side = tokenizer.padding_side
if feature_extractor is not None:
f_padding_value = getattr(feature_extractor, "padding_value", None)
f_padding_side = getattr(feature_extractor, "padding_side", None)
if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side:
raise ValueError(
f"The feature extractor, and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}"
)
padding_side = "right"
if t_padding_side is not None:
padding_side = t_padding_side
if f_padding_side is not None:
padding_side = f_padding_side
def inner(items):
keys = set(items[0].keys())
for item in items:
if set(item.keys()) != keys:
raise ValueError(
f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} !="
f" {keys})"
)
padded = {}
for key in keys:
if key in {"input_ids"}:
if tokenizer is None and feature_extractor is not None:
_padding_value = f_padding_value
else:
_padding_value = t_padding_value
elif key in {"input_values", "pixel_values", "input_features"}:
_padding_value = f_padding_value
elif key in {"p_mask", "special_tokens_mask"}:
_padding_value = 1
elif key in {"attention_mask", "token_type_ids"}:
_padding_value = 0
else:
_padding_value = 0
padded[key] = _pad(items, key, _padding_value, padding_side)
return padded
return inner
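A tiny, self-contained sketch of what the `_pad` helper above does for 2-D `input_ids` (assuming `torch` is installed; the token ids and pad value are made up):

```
import torch

items = [
    {"input_ids": torch.tensor([[101, 7592, 102]])},
    {"input_ids": torch.tensor([[101, 7592, 2088, 999, 102]])},
]
# Right-pad the shorter sequence with the padding value (0 here, purely illustrative)
batch = _pad(items, key="input_ids", padding_value=0, padding_side="right")
print(batch.shape)  # torch.Size([2, 5])
```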
def infer_framework_load_model(
model,
config: AutoConfig,
model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None,
framework: Optional[str] = None,
**model_kwargs,
):
"""
Model-loading helper: selects and loads the model according to the framework and task.
Parameters:
- model: the model instance or checkpoint name to load
- config: the AutoConfig object
- model_classes: optional dictionary of model classes per framework
- task: optional task name
- framework: optional framework name
- **model_kwargs: additional keyword arguments passed to the model
"""
if not is_tf_available() and not is_torch_available():
raise RuntimeError(
"At least one of TensorFlow 2.0 or PyTorch should be installed. "
"To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
"To install PyTorch, read the instructions at https://pytorch.org/."
)
if isinstance(model, str):
model_kwargs["_from_pipeline"] = task
class_tuple = ()
look_pt = is_torch_available() and framework in {"pt", None}
look_tf = is_tf_available() and framework in {"tf", None}
if model_classes:
if look_pt:
class_tuple = class_tuple + model_classes.get("pt", (AutoModel,))
if look_tf:
class_tuple = class_tuple + model_classes.get("tf", (TFAutoModel,))
if config.architectures:
classes = []
for architecture in config.architectures:
transformers_module = importlib.import_module("transformers")
if look_pt:
_class = getattr(transformers_module, architecture, None)
if _class is not None:
classes.append(_class)
if look_tf:
_class = getattr(transformers_module, f"TF{architecture}", None)
if _class is not None:
classes.append(_class)
class_tuple = class_tuple + tuple(classes)
if len(class_tuple) == 0:
raise ValueError(f"Pipeline cannot infer suitable model classes from {model}")
all_traceback = {}
for model_class in class_tuple:
kwargs = model_kwargs.copy()
if framework == "pt" and model.endswith(".h5"):
kwargs["from_tf"] = True
logger.warning(
"Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
"Trying to load the model with PyTorch."
)
elif framework == "tf" and model.endswith(".bin"):
kwargs["from_pt"] = True
logger.warning(
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
"Trying to load the model with Tensorflow."
)
try:
model = model_class.from_pretrained(model, **kwargs)
if hasattr(model, "eval"):
model = model.eval()
break
except (OSError, ValueError):
all_traceback[model_class.__name__] = traceback.format_exc()
if isinstance(model, str):
error = ""
for class_name, trace in all_traceback.items():
error += f"while loading with {class_name}, an error is thrown:\n{trace}\n"
raise ValueError(
f"Could not load model {model} with any of the following classes: {class_tuple}. See the original errors:\n\n{error}\n"
)
if framework is None:
framework = infer_framework(model.__class__)
return framework, model
def infer_framework_from_model(
model,
model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None,
framework: Optional[str] = None,
**model_kwargs,
):
"""
Infers the framework to use (TensorFlow or PyTorch) from the `model` passed in. Returns a tuple (framework, model).
If `model` is already instantiated, this function infers the framework from the model class. Otherwise, if `model`
is a checkpoint name, this method tries to instantiate it using `model_classes`. Since we don't want to instantiate
the model twice, the instantiated model is returned for use by the pipeline.
If both frameworks are installed and available for `model`, PyTorch is selected.
Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
The model to infer the framework from. If `str`, a checkpoint name.
model_classes (dictionary `str` to `type`, *optional*):
A mapping from framework to class.
task (`str`):
The task defining which pipeline will be returned.
model_kwargs:
Additional keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function.
Returns:
`Tuple`: A tuple (framework, model).
"""
if isinstance(model, str):
config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
else:
config = model.config
return infer_framework_load_model(
model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs
)
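A hedged sketch of calling the helper above with a checkpoint name (the checkpoint `distilbert-base-uncased` is only an example and downloading it requires network access):

```
framework, model = infer_framework_from_model(
    "distilbert-base-uncased",
    task="feature-extraction",
)
print(framework)  # "pt" when only PyTorch is installed
```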
def get_framework(model, revision: Optional[str] = None):
"""
Select the framework (TensorFlow or PyTorch) to use.
Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
If both frameworks are installed, picks the one corresponding to the model passed (either a model class or the
model name). If no specific model is provided, defaults to PyTorch.
"""
warnings.warn(
"`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
FutureWarning,
)
if not is_tf_available() and not is_torch_available():
raise RuntimeError(
"At least one of TensorFlow 2.0 or PyTorch should be installed. "
"To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
"To install PyTorch, read the instructions at https://pytorch.org/."
)
if isinstance(model, str):
if is_torch_available() and not is_tf_available():
model = AutoModel.from_pretrained(model, revision=revision)
elif is_tf_available() and not is_torch_available():
model = TFAutoModel.from_pretrained(model, revision=revision)
else:
try:
model = AutoModel.from_pretrained(model, revision=revision)
except OSError:
model = TFAutoModel.from_pretrained(model, revision=revision)
framework = infer_framework(model.__class__)
return framework
def get_default_model_and_revision(
targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]
) -> Union[str, Tuple[str, str]]:
"""
Select a default model to use for a given task. Defaults to pytorch if ambiguous.
Args:
targeted_task (`Dict` ):
Dictionary representing the given task, that should contain default models
framework (`str`, None)
"pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
task_options (`Any`, None)
Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
translation task.
Returns
`str` The model string representing the default model for this pipeline
"""
if is_torch_available() and not is_tf_available():
framework = "pt"
elif is_tf_available() and not is_torch_available():
framework = "tf"
defaults = targeted_task["default"]
if task_options:
if task_options not in defaults:
raise ValueError(f"The task does not provide any default models for options {task_options}")
default_models = defaults[task_options]["model"]
elif "model" in defaults:
default_models = targeted_task["default"]["model"]
else:
raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')
if framework is None:
framework = "pt"
return default_models[framework]
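To make the lookup concrete, here is a small sketch with a hand-built `targeted_task` dictionary (the checkpoint name and revision are made up for illustration):

```
targeted_task = {
    "default": {
        "model": {
            "pt": ("some-org/some-audio-model", "main"),  # hypothetical checkpoint and revision
        }
    }
}
print(get_default_model_and_revision(targeted_task, framework=None, task_options=None))
# ('some-org/some-audio-model', 'main') when PyTorch is available
```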
class PipelineException(Exception):
"""
Raised by a [`Pipeline`] when handling __call__.
Args:
task (`str`): The task of the pipeline.
model (`str`): The model used by the pipeline.
reason (`str`): The error message to display.
"""
def __init__(self, task: str, model: str, reason: str):
super().__init__(reason)
self.task = task
self.model = model
class ArgumentHandler(ABC):
"""
Base interface for handling arguments for each [`~pipelines.Pipeline`].
"""
@abstractmethod
def __call__(self, *args, **kwargs):
raise NotImplementedError()
class PipelineDataFormat:
"""
Base class for all the pipeline supported data format both for reading and writing. Supported data formats
currently includes:
- JSON
- CSV
- stdin/stdout (pipe)
`PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets columns to
pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
"""
Args:
output_path (`str`): Where to save the outgoing data.
input_path (`str`): Where to look for the input data.
column (`str`): The column to read.
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the `output_path`.
"""
# Supported data formats
SUPPORTED_FORMATS = ["json", "csv", "pipe"]
def __init__(
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite: bool = False,
):
# Output path
self.output_path = output_path
# Input path
self.input_path = input_path
# Column(s) to read; defaults to a single empty string when not specified
self.column = column.split(",") if column is not None else [""]
# Whether multiple columns are read
self.is_multi_columns = len(self.column) > 1
# For multi-column reads, parse each column as a key=value pair
if self.is_multi_columns:
self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]
# If an output path is given and overwriting is disabled, make sure it doesn't exist yet
if output_path is not None and not overwrite:
if exists(abspath(self.output_path)):
raise OSError(f"{self.output_path} already exists on disk")
# If an input path is given, make sure it exists
if input_path is not None:
if not exists(abspath(self.input_path)):
raise OSError(f"{self.input_path} doesnt exist on disk")
@abstractmethod
def __iter__(self):
raise NotImplementedError()
@abstractmethod
def save(self, data: Union[dict, List[dict]]):
"""
Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
Args:
data (`dict` or list of `dict`): The data to store.
"""
raise NotImplementedError()
def save_binary(self, data: Union[dict, List[dict]]) -> str:
"""
Save the provided data object as a pickle-formatted binary data on the disk.
Args:
data (`dict` or list of `dict`): The data to store.
Returns:
`str`: Path where the data has been saved.
"""
# Strip the extension from the output path
path, _ = os.path.splitext(self.output_path)
# Build the path of the pickle-formatted binary file
binary_path = os.path.extsep.join((path, "pickle"))
# Write the data to the file in binary form
with open(binary_path, "wb+") as f_output:
pickle.dump(data, f_output)
# Return the path where the data was saved
return binary_path
@staticmethod
def from_str(
format: str,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
) -> "PipelineDataFormat":
"""
Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.
Args:
format (`str`):
The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
output_path (`str`, *optional*):
Where to save the outgoing data.
input_path (`str`, *optional*):
Where to look for the input data.
column (`str`, *optional*):
The column to read.
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the `output_path`.
Returns:
[`~pipelines.PipelineDataFormat`]: The proper data format.
"""
if format == "json":
return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
elif format == "csv":
return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
elif format == "pipe":
return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
else:
raise KeyError(f"未知的数据格式 {format} (可用的格式为 json/csv/pipe)")
class CsvPipelineDataFormat(PipelineDataFormat):
"""
Support for pipelines using CSV data format.
Args:
output_path (`str`): Where to save the outgoing data.
input_path (`str`): Where to look for the input data.
column (`str`): The column to read.
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the `output_path`.
"""
def __init__(
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
# Call the parent constructor
super().__init__(output_path, input_path, column, overwrite=overwrite)
def __iter__(self):
# Open the input file and create a CSV dict reader
with open(self.input_path, "r") as f:
reader = csv.DictReader(f)
for row in reader:
if self.is_multi_columns:
yield {k: row[c] for k, c in self.column}  # Multiple columns: yield a dict
else:
yield row[self.column[0]]  # Otherwise yield the single requested column
def save(self, data: List[dict]):
"""
Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
Args:
data (`List[dict]`): The data to store.
"""
with open(self.output_path, "w") as f:
if len(data) > 0:
writer = csv.DictWriter(f, list(data[0].keys()))
writer.writeheader()  # Write the header
writer.writerows(data)  # Write the rows
class JsonPipelineDataFormat(PipelineDataFormat):
"""
Support for pipelines using JSON file format.
Args:
output_path (`str`): Where to save the outgoing data.
input_path (`str`): Where to look for the input data.
column (`str`): The column to read.
overwrite (`bool`, *optional*, defaults to `False`):
Whether or not to overwrite the `output_path`.
"""
def __init__(
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
super().__init__(output_path, input_path, column, overwrite=overwrite)  # Call the parent constructor
with open(input_path, "r") as f:
self._entries = json.load(f)  # Load the JSON data and keep it in _entries
def __iter__(self):
for entry in self._entries:
if self.is_multi_columns:
yield {k: entry[c] for k, c in self.column}  # Multiple columns: yield a dict
else:
yield entry[self.column[0]]  # Otherwise yield the single requested column
def save(self, data: dict):
"""
Save the provided data object in a json file.
Args:
data (`dict`): The data to store.
"""
with open(self.output_path, "w") as f:
json.dump(data, f)  # Store the data as JSON
class PipedPipelineDataFormat(PipelineDataFormat):
"""
Read data from piped input to the python process. For multi-column data, columns should be separated by \t
If columns are provided, then the output will be a dictionary with {column_x: value_x}
"""
def __iter__(self):
"""
Iterate over input lines from stdin.
Yields:
- If the line contains tabs (`\t`):
- If `self.column` is defined, yield a dictionary mapping column names to line values.
- Otherwise, yield a tuple of line values.
- If no tabs are present, yield the entire line.
"""
for line in sys.stdin:
# Split for multi-columns
if "\t" in line:
line = line.split("\t")
if self.column:
# Dictionary to map arguments
yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
else:
yield tuple(line)
# No dictionary to map arguments
else:
yield line
def save(self, data: dict):
"""
Print the provided data.
Args:
data (`dict`): The data to be printed.
"""
print(data)
def save_binary(self, data: Union[dict, List[dict]]) -> str:
"""
Save binary data to the specified output path.
Args:
data (Union[dict, List[dict]]): The binary data to be saved.
Returns:
str: The output path where the data was saved.
Raises:
KeyError: If `self.output_path` is `None`, indicating a missing output path.
"""
if self.output_path is None:
raise KeyError(
"When using piped input on pipeline outputting large object requires an output file path. "
"Please provide such output path through --output argument."
)
return super().save_binary(data)
class _ScikitCompat(ABC):
"""
Interface layer for the Scikit and Keras compatibility.
"""
@abstractmethod
def transform(self, X):
# Abstract method: subclasses implement the data transformation logic
raise NotImplementedError()
@abstractmethod
def predict(self, X):
# Abstract method: subclasses implement the prediction logic
raise NotImplementedError()
def build_pipeline_init_args(
has_tokenizer: bool = False,
has_feature_extractor: bool = False,
has_image_processor: bool = False,
supports_binary_output: bool = True,
) -> str:
docstring = r"""
Arguments:
model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
[`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow."""
if has_tokenizer:
docstring += r"""
tokenizer ([`PreTrainedTokenizer`]):
The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
[`PreTrainedTokenizer`]."""
if has_feature_extractor:
docstring += r"""
feature_extractor ([`SequenceFeatureExtractor`]):
The feature extractor that will be used by the pipeline to encode data for the model. This object inherits from
[`SequenceFeatureExtractor`]."""
if has_image_processor:
docstring += r"""
image_processor ([`BaseImageProcessor`]):
The image processor that will be used by the pipeline to encode data for the model. This object inherits from
[`BaseImageProcessor`]."""
docstring += r"""
modelcard (`str` or [`ModelCard`], *optional*):
Model card attributed to the model for this pipeline.
framework (`str`, *optional*):
The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified and
both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
provided.
task (`str`, defaults to `""`):
A task-identifier for the pipeline.
num_workers (`int`, *optional*, defaults to 8):
When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of
workers to be used.
batch_size (`int`, *optional*, defaults to 1):
When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of
the batch to use, for inference this is not always beneficial, please read [Batching with
pipelines](https://huggingface.co/transformers/main_classes/pipelines.html
args_parser ([`~pipelines.ArgumentHandler`], *optional*):
Reference to the object in charge of parsing supplied pipeline parameters.
device (`int`, *optional*, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
the associated CUDA device id. You can pass native `torch.device` or a `str` too
torch_dtype (`str` or `torch.dtype`, *optional*):
Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
(`torch.float16`, `torch.bfloat16`, ... or `"auto"`)
"""
# If binary output is supported, append its description
if supports_binary_output:
docstring += r"""
binary_output (`bool`, *optional*, defaults to `False`):
Flag indicating if the output the pipeline should happen in a serialized format (i.e., pickle) or as
the raw output data e.g. text."""
# Return the assembled docstring
return docstring
# Pre-built init-args docstring with every optional component enabled
PIPELINE_INIT_ARGS = build_pipeline_init_args(
has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True
)
# If torch is available, import the PyTorch pipeline utilities
if is_torch_available():
from transformers.pipelines.pt_utils import (
PipelineChunkIterator,  # chunked iteration over pipeline inputs
PipelineDataset,  # dataset wrapper used by pipelines
PipelineIterator,  # generic pipeline iterator
PipelinePackIterator,  # re-packs chunked outputs back into per-item results
)
# Define the Pipeline class, appending the matching init-args docstring
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_feature_extractor=True, has_image_processor=True))
class Pipeline(_ScikitCompat):
"""
The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
different pipelines.
Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
operations:
Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
Pipeline supports running on CPU or GPU through the device argument (see below).
Some pipeline, like for instance [`FeatureExtractionPipeline`] (`'feature-extraction'`) output large tensor object
as nested-lists. In order to avoid dumping such large structure as textual data we provide the `binary_output`
constructor argument. If set to `True`, the output will be stored in the pickle format.
"""
default_input_names = None
def __init__(
self,
model: Union["PreTrainedModel", "TFPreTrainedModel"],
tokenizer: Optional[PreTrainedTokenizer] = None,
feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
image_processor: Optional[BaseImageProcessor] = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
task: str = "",
args_parser: ArgumentHandler = None,
device: Union[int, "torch.device"] = None,
torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
binary_output: bool = False,
**kwargs,
):
"""
Initialize a pipeline object.
Parameters:
- model (Union["PreTrainedModel", "TFPreTrainedModel"]): The pretrained model to use.
- tokenizer (Optional[PreTrainedTokenizer]): Tokenizer used to preprocess the inputs.
- feature_extractor (Optional[PreTrainedFeatureExtractor]): Feature extractor for inputs.
- image_processor (Optional[BaseImageProcessor]): Image processor for image inputs.
- modelcard (Optional[ModelCard]): ModelCard describing the model's attributes.
- framework (Optional[str]): The framework where the model is implemented (e.g., 'pt' for PyTorch).
- task (str): The task associated with the pipeline.
- args_parser (ArgumentHandler): Custom argument handler for parsing pipeline arguments.
- device (Union[int, "torch.device"]): Device where the model will be run (CPU/GPU).
- torch_dtype (Optional[Union[str, "torch.dtype"]]): Data type used in PyTorch models.
- binary_output (bool): Whether to output results in binary (pickle) format.
- **kwargs: Additional keyword arguments passed to the pipeline.
Notes:
- This constructor initializes a pipeline object with the specified parameters.
- It supports various preprocessing and postprocessing operations for different tasks.
- The 'binary_output' flag controls whether outputs are stored in binary format.
"""
super().__init__()
# Initialize the pipeline object with the provided parameters
self.model = model
self.tokenizer = tokenizer
self.feature_extractor = feature_extractor
self.image_processor = image_processor
self.modelcard = modelcard
self.framework = framework
self.task = task
self.args_parser = args_parser
self.device = device
self.torch_dtype = torch_dtype
self.binary_output = binary_output
self.kwargs = kwargs
def save_pretrained(self, save_directory: str, safe_serialization: bool = True):
"""
Save the pipeline's model and tokenizer.
Args:
save_directory (`str`):
A path to the directory to save to. It will be created if it doesn't exist.
safe_serialization (`bool`):
Whether to save the model using `safetensors` or the traditional way for PyTorch or Tensorflow.
"""
# If the provided path is a file, log an error and return
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
# Create the directory; do nothing if it already exists
os.makedirs(save_directory, exist_ok=True)
# If the object has a `_registered_impl` attribute, save the custom pipeline info and code
if hasattr(self, "_registered_impl"):
# Copy the registered pipeline info
pipeline_info = self._registered_impl.copy()
custom_pipelines = {}
# Iterate over the registered pipelines
for task, info in pipeline_info.items():
# Only keep the pipelines implemented by this class
if info["impl"] != self.__class__:
continue
info = info.copy()
module_name = info["impl"].__module__
last_module = module_name.split(".")[-1]
# Turn the class into a full "module.ClassName" reference
info["impl"] = f"{last_module}.{info['impl'].__name__}"
# Store the names of the classes supported for each framework as tuples
info["pt"] = tuple(c.__name__ for c in info["pt"])
info["tf"] = tuple(c.__name__ for c in info["tf"])
custom_pipelines[task] = info
# Attach the custom pipeline info to the model config
self.model.config.custom_pipelines = custom_pipelines
# Save the custom pipeline code and objects
custom_object_save(self, save_directory)
# Save the model to the target directory
self.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
# If a tokenizer exists, save it as well
if self.tokenizer is not None:
self.tokenizer.save_pretrained(save_directory)
# If a feature extractor exists, save it as well
if self.feature_extractor is not None:
self.feature_extractor.save_pretrained(save_directory)
# If an image processor exists, save it as well
if self.image_processor is not None:
self.image_processor.save_pretrained(save_directory)
# If a model card exists, save it as well
if self.modelcard is not None:
self.modelcard.save_pretrained(save_directory)
# Context manager that allows tensor allocation on the user-specified device in a framework-agnostic way
@contextmanager
def device_placement(self):
"""
Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
Returns:
Context manager
Examples:
```
# Explicitly ask for tensor allocation on CUDA device :0
pipe = pipeline(..., device=0)
with pipe.device_placement():
# Every framework specific tensor allocation will be done on the request device
output = pipe(...)
```"""
# TensorFlow backend
if self.framework == "tf":
# Use tf.device to pin tensor allocation to the requested CPU or GPU device
with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
yield
else:
# PyTorch backend
if self.device.type == "cuda":
# Use torch.cuda.device to pin allocation to the requested CUDA device
with torch.cuda.device(self.device):
yield
else:
# Any other device type: nothing special to set up
yield
# Ensure PyTorch tensors are on the specified device
def ensure_tensor_on_device(self, **inputs):
"""
Ensure PyTorch tensors are on the specified device.
Args:
inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored):
The tensors to place on `self.device`.
Recursive on lists **only**.
Return:
`Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
"""
# Delegate to _ensure_tensor_on_device to move the inputs to the target device
return self._ensure_tensor_on_device(inputs, self.device)
# Internal helper that recursively moves tensors to the given device
def _ensure_tensor_on_device(self, inputs, device):
# ModelOutput: recurse over every item and keep the ModelOutput type
if isinstance(inputs, ModelOutput):
return ModelOutput(
{name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
)
# Plain dict: recurse over the values
elif isinstance(inputs, dict):
return {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
# UserDict: recurse over the values
elif isinstance(inputs, UserDict):
return UserDict({name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()})
# List: recurse over every element
elif isinstance(inputs, list):
return [self._ensure_tensor_on_device(item, device) for item in inputs]
# Tuple: recurse over every element
elif isinstance(inputs, tuple):
return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
# torch.Tensor: move it to the target device
elif isinstance(inputs, torch.Tensor):
# Half-precision tensors are upcast to float when moving to CPU
if device == torch.device("cpu") and inputs.dtype in {torch.float16, torch.bfloat16}:
inputs = inputs.float()
return inputs.to(device)
else:
# Anything else is returned unchanged
return inputs
def check_model_type(self, supported_models: Union[List[str], dict]):
"""
Check if the model class is supported by the pipeline.
Args:
supported_models (`List[str]` or `dict`):
A list of supported model names, or a dict whose values are model classes.
"""
if not isinstance(supported_models, list):  # Not a list: build one from the model mapping
supported_models_names = []
for _, model_name in supported_models.items():
# The mapping can now contain tuples of models for the same configuration.
if isinstance(model_name, tuple):
supported_models_names.extend(list(model_name))
else:
supported_models_names.append(model_name)
if hasattr(supported_models, "_model_mapping"):
for _, model in supported_models._model_mapping._extra_content.items():
if isinstance(model_name, tuple):
supported_models_names.extend([m.__name__ for m in model])
else:
supported_models_names.append(model.__name__)
supported_models = supported_models_names
if self.model.__class__.__name__ not in supported_models:
logger.error(
f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. Supported models are"
f" {supported_models}."
)
@abstractmethod
def _sanitize_parameters(self, **pipeline_parameters):
"""
_sanitize_parameters will be called with any excessive named arguments from either `__init__` or `__call__`.
It should return 3 dictionaries of the resolved parameters used by the various `preprocess`, `forward` and `postprocess` methods.
Do not fill the dictionaries if the caller didn't specify the kwargs; this lets default values stay in the function signatures, which is more natural.
It is not meant to be called directly; it will be called automatically from `__init__` and `__call__`, which resolve the final parameters.
"""
raise NotImplementedError("_sanitize_parameters not implemented")
@abstractmethod
def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
"""
preprocess will take the pipeline-specific `input_` and return a dictionary of everything necessary for `_forward` to run properly.
It should contain at least one tensor, but may contain arbitrary other items.
"""
raise NotImplementedError("preprocess not implemented")
# Abstract method: run the model forward pass on the dictionary prepared by `preprocess` and return the model output
def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
"""
_forward will receive the prepared dictionary from `preprocess` and run it on the model. This method might
involve the GPU or the CPU and should be agnostic to it. Isolating this function is the reason for `preprocess`
and `postprocess` to exist, so that the hot path, this method generally can run as fast as possible.
It is not meant to be called directly, `forward` is preferred. It is basically the same but contains additional
code surrounding `_forward` making sure tensors and models are on the same device, disabling the training part
of the code (leading to faster inference).
"""
raise NotImplementedError("_forward not implemented")
# Abstract method: post-process the raw `_forward` outputs into a more user-friendly format
@abstractmethod
def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
"""
Postprocess will receive the raw outputs of the `_forward` method, generally tensors, and reformat them into
something more friendly. Generally it will output a list or a dict or results (containing just strings and
numbers).
"""
raise NotImplementedError("postprocess not implemented")
# Return a context manager that disables gradient computation during inference
def get_inference_context(self):
return torch.no_grad
# `forward` wraps `_forward`: it picks the framework-specific path, keeps model and tensors on the same device, and disables training behavior for faster inference
def forward(self, model_inputs, **forward_params):
with self.device_placement():
if self.framework == "tf":
model_inputs["training"] = False
model_outputs = self._forward(model_inputs, **forward_params)
elif self.framework == "pt":
inference_context = self.get_inference_context()
with inference_context():
model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
model_outputs = self._forward(model_inputs, **forward_params)
model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
else:
raise ValueError(f"Framework {self.framework} is not supported")
return model_outputs
# Build the data iterator used for loading and preprocessing data during inference
def get_iterator(
self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
):
# Sized inputs (with a known length) are wrapped in a PipelineDataset
if isinstance(inputs, collections.abc.Sized):
dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
else:
# Unsized (purely iterable) inputs cannot be split across multiple workers
if num_workers > 1:
logger.warning(
"For iterable dataset using num_workers>1 is likely to result"
" in errors since everything is iterable, setting `num_workers=1`"
" to guarantee correctness."
)
# Force num_workers to 1 to guarantee correctness
num_workers = 1
dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
# Disable tokenizer parallelism unless TOKENIZERS_PARALLELISM is set explicitly,
# since the DataLoader already uses multiple threads
if "TOKENIZERS_PARALLELISM" not in os.environ:
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# TODO hack by collating feature_extractor and image_processor
feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
# batch_size == 1 needs no padding; otherwise pad with the tokenizer/feature extractor values
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
# Preprocess -> DataLoader -> forward -> postprocess, each wrapped in its own iterator
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
# Return the final iterator
return final_iterator
# __call__ makes the pipeline object callable like a function
def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
# Extra positional arguments are ignored with a warning
if args:
logger.warning(f"Ignoring args : {args}")
# Resolve num_workers: fall back to the instance default, then to 0
if num_workers is None:
if self._num_workers is None:
num_workers = 0
else:
num_workers = self._num_workers
# Resolve batch_size: fall back to the instance default, then to 1
if batch_size is None:
if self._batch_size is None:
batch_size = 1
else:
batch_size = self._batch_size
# Split the remaining kwargs into preprocess / forward / postprocess parameters
preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
# Merge the __init__-time parameters with the __call__-time ones, without mutating the former
preprocess_params = {**self._preprocess_params, **preprocess_params}
forward_params = {**self._forward_params, **forward_params}
postprocess_params = {**self._postprocess_params, **postprocess_params}
# Bump the call counter
self.call_count += 1
# After 10 sequential GPU calls with PyTorch, suggest using a dataset instead
if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
logger.warning_once(
"You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
" dataset",
UserWarning,
)
# 判断输入是否为 Dataset 类型并且 Dataset 类型存在,或者是生成器类型,或者是列表类型
is_dataset = Dataset is not None and isinstance(inputs, Dataset)
is_generator = isinstance(inputs, types.GeneratorType)
is_list = isinstance(inputs, list)
# 判断输入是否为可迭代对象,包括 Dataset、生成器和列表
is_iterable = is_dataset or is_generator or is_list
# 判断是否可以使用迭代器处理输入数据:当前框架为 "pt" 并且输入是 Dataset、生成器或列表
can_use_iterator = self.framework == "pt" and (is_dataset or is_generator or is_list)
# Lists are materialized: build the iterator when possible and return the collected outputs
if is_list:
if can_use_iterator:
final_iterator = self.get_iterator(
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
)
outputs = list(final_iterator)
return outputs
else:
# Otherwise fall back to run_multi, which processes the list item by item
return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
# For other iterator-friendly inputs (datasets, generators), return the lazy iterator directly
elif can_use_iterator:
return self.get_iterator(
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
)
# Iterable inputs that cannot use the DataLoader path are handled lazily by iterate()
elif is_iterable:
return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
# A single input to a ChunkPipeline on the PyTorch backend still goes through the iterator,
# since one example may expand into several chunks
elif self.framework == "pt" and isinstance(self, ChunkPipeline):
return next(
iter(
self.get_iterator(
[inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
)
)
)
# Everything else is a single input handled by run_single
else:
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
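As an aside, a minimal usage sketch of this dispatch (the checkpoint name is only an illustrative assumption): a list input on the PyTorch backend takes the DataLoader-backed get_iterator path above, while a single string falls through to run_single.

from transformers import pipeline

clf = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
# List input -> get_iterator -> the collected list of results
print(clf(["great movie", "terrible plot", "just okay"], batch_size=2))
# Single input -> run_single -> the result for that one example
print(clf("great movie"))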
# Run the model over a list of inputs by calling run_single on each item
def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
return [self.run_single(item, preprocess_params, forward_params, postprocess_params) for item in inputs]
# Preprocess, forward and postprocess a single input, then return the result
def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
# Convert the raw input into the format expected by the model
model_inputs = self.preprocess(inputs, **preprocess_params)
# Run inference on the preprocessed inputs
model_outputs = self.forward(model_inputs, **forward_params)
# Turn the model outputs into the final, user-facing result
outputs = self.postprocess(model_outputs, **postprocess_params)
return outputs
# Lazily iterate over the inputs, yielding the result of run_single for each one
def iterate(self, inputs, preprocess_params, forward_params, postprocess_params):
# This function should ideally be folded into `get_iterator`; it is a temporary, simple solution.
for input_ in inputs:
yield self.run_single(input_, preprocess_params, forward_params, postprocess_params)
# ChunkPipeline, a Pipeline subclass for tasks whose preprocessing yields several chunks per example
class ChunkPipeline(Pipeline):
# Override run_single to handle one input that expands into multiple model inputs
def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
# Collect the model outputs of every chunk
all_outputs = []
# preprocess() is a generator here: iterate over the chunks it yields
for model_inputs in self.preprocess(inputs, **preprocess_params):
# Run the forward pass on each chunk
model_outputs = self.forward(model_inputs, **forward_params)
# Accumulate the per-chunk outputs
all_outputs.append(model_outputs)
# Postprocess all chunk outputs together to produce the final result
outputs = self.postprocess(all_outputs, **postprocess_params)
return outputs
# Build the chunk-aware data iterator
def get_iterator(
self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
):
# Disable tokenizer parallelism if it has not been configured explicitly
if "TOKENIZERS_PARALLELISM" not in os.environ:
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Multiple workers are not safe over an iterable of chunks: warn and force a single worker
if num_workers > 1:
logger.warning(
"For ChunkPipeline using num_workers>0 is likely to result in errors since everything is iterable,"
" setting `num_workers=1` to guarantee correctness."
)
num_workers = 1
# PipelineChunkIterator flattens the chunks produced by preprocess into a single stream
dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params)
# Choose the collate function depending on the batch size
feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
# Build the DataLoader over the chunk stream
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
# PipelinePackIterator regroups the per-chunk forward outputs by original example
model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
# Finally, postprocess each regrouped example
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
return final_iterator
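To make the chunking contract concrete, here is a hedged, library-independent sketch (all names below are illustrative, not transformers APIs): a ChunkPipeline-style preprocess yields several model inputs per example, and the per-chunk outputs are folded back into a single result, mirroring run_single above.

def chunked_preprocess(long_text, chunk_size=100):
    # Yield one model input per fixed-size slice of the example (illustrative only)
    for start in range(0, len(long_text), chunk_size):
        yield {"text": long_text[start:start + chunk_size]}

def run_single_sketch(long_text, forward, postprocess):
    # Mirror of ChunkPipeline.run_single: run forward on every chunk, then postprocess them together
    all_outputs = [forward(inputs) for inputs in chunked_preprocess(long_text)]
    return postprocess(all_outputs)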
# PipelineRegistry keeps track of the supported tasks and their aliases
class PipelineRegistry:
# Takes the dictionary of supported tasks and the dictionary of task aliases
def __init__(self, supported_tasks: Dict[str, Any], task_aliases: Dict[str, str]) -> None:
self.supported_tasks = supported_tasks  # mapping from task name to its implementation details
self.task_aliases = task_aliases  # mapping from alias to canonical task name
# Return every supported task name, aliases included
def get_supported_tasks(self) -> List[str]:
supported_task = list(self.supported_tasks.keys()) + list(self.task_aliases.keys())
supported_task.sort()  # return the names in sorted order
return supported_task
# Resolve aliases, then validate the task and return its configuration
def check_task(self, task: str) -> Tuple[str, Dict, Any]:
if task in self.task_aliases:
task = self.task_aliases[task]
# Plain supported tasks are returned with no extra parameters
if task in self.supported_tasks:
targeted_task = self.supported_tasks[task]
return task, targeted_task, None
# "translation_XX_to_YY" tasks are parsed into the generic "translation" task plus the language pair
if task.startswith("translation"):
tokens = task.split("_")
if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
targeted_task = self.supported_tasks["translation"]
task = "translation"
return task, targeted_task, (tokens[1], tokens[3])
# Malformed translation identifiers must follow the 'translation_XX_to_YY' format
raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
# Anything else is unknown: report the available tasks
raise KeyError(
f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}"
)
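A quick worked example of the translation parsing above (purely illustrative values):

task = "translation_en_to_fr"
tokens = task.split("_")  # ["translation", "en", "to", "fr"]
assert len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to"
src_lang, tgt_lang = tokens[1], tokens[3]  # check_task returns ("en", "fr") as the extra parameters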
# Register a new task pipeline, along with its model classes, default model and modality type
def register_pipeline(
self,
task: str,
pipeline_class: type,
pt_model: Optional[Union[type, Tuple[type]]] = None,
tf_model: Optional[Union[type, Tuple[type]]] = None,
default: Optional[Dict] = None,
type: Optional[str] = None,
) -> None:
# If the task already exists, warn and overwrite its configuration
if task in self.supported_tasks:
logger.warning(f"{task} is already registered. Overwriting pipeline for task {task}...")
# Normalize the PyTorch model classes to a tuple (empty if not provided)
if pt_model is None:
pt_model = ()
elif not isinstance(pt_model, tuple):
pt_model = (pt_model,)
# Normalize the TensorFlow model classes to a tuple (empty if not provided)
if tf_model is None:
tf_model = ()
elif not isinstance(tf_model, tuple):
tf_model = (tf_model,)
# Build the task implementation entry: pipeline class plus PyTorch and TensorFlow model classes
task_impl = {"impl": pipeline_class, "pt": pt_model, "tf": tf_model}
# If a default is given as a bare {"pt": ...}/{"tf": ...} mapping, wrap it under the "model" key
if default is not None:
if "model" not in default and ("pt" in default or "tf" in default):
default = {"model": default}
task_impl["default"] = default
# Record the modality type if provided
if type is not None:
task_impl["type"] = type
# Register the task and attach the implementation to the pipeline class
self.supported_tasks[task] = task_impl
pipeline_class._registered_impl = {task: task_impl}
# Return the dictionary of supported tasks and their configurations
def to_dict(self):
return self.supported_tasks
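For context, a hedged sketch of how register_pipeline is typically exercised to add a custom task (the task name and the pipeline body are illustrative; PIPELINE_REGISTRY is the module-level registry instance exposed by transformers.pipelines):

from transformers import AutoModelForSequenceClassification, Pipeline
from transformers.pipelines import PIPELINE_REGISTRY

class MyCustomPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def preprocess(self, inputs):
        return self.tokenizer(inputs, return_tensors=self.framework)

    def _forward(self, model_inputs):
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        return model_outputs.logits.argmax(-1).item()

PIPELINE_REGISTRY.register_pipeline(
    "my-custom-task",
    pipeline_class=MyCustomPipeline,
    pt_model=AutoModelForSequenceClassification,
    type="text",
)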
.\pipelines\conversational.py
import uuid
import warnings
from typing import Any, Dict, List, Union
from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
from .base import Pipeline, build_pipeline_init_args
if is_tf_available():
import tensorflow as tf
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class Conversation:
"""
Utility class containing a conversation and its history. This class is meant to be used as an input to the
[`ConversationalPipeline`]. The conversation contains several utility functions to manage the addition of new user
inputs and generated model responses.
Arguments:
messages (Union[str, List[Dict[str, str]]], *optional*):
The initial messages to start the conversation, either a string, or a list of dicts containing "role" and
"content" keys. If a string is passed, it is interpreted as a single message with the "user" role.
conversation_id (`uuid.UUID`, *optional*):
Unique identifier for the conversation. If not provided, a random UUID4 id will be assigned to the
conversation.
Usage:
```
conversation = Conversation("Going to the movies tonight - any suggestions?")
conversation.add_message({"role": "assistant", "content": "The Big lebowski."})
conversation.add_message({"role": "user", "content": "Is it good?"})
```
"""
def __init__(
self, messages: Union[str, List[Dict[str, str]]] = None, conversation_id: uuid.UUID = None, **deprecated_kwargs
):
if not conversation_id:
conversation_id = uuid.uuid4()
if messages is None:
text = deprecated_kwargs.pop("text", None)
if text is not None:
messages = [{"role": "user", "content": text}]
else:
messages = []
elif isinstance(messages, str):
messages = [{"role": "user", "content": messages}]
self._num_processed_user_inputs = 0
generated_responses = deprecated_kwargs.pop("generated_responses", None)
past_user_inputs = deprecated_kwargs.pop("past_user_inputs", None)
if generated_responses is not None and past_user_inputs is None:
raise ValueError("generated_responses cannot be passed without past_user_inputs!")
if past_user_inputs is not None:
legacy_messages = []
if generated_responses is None:
generated_responses = []
for i in range(max([len(past_user_inputs), len(generated_responses)])):
if i < len(past_user_inputs):
legacy_messages.append({"role": "user", "content": past_user_inputs[i]})
if i < len(generated_responses):
legacy_messages.append({"role": "assistant", "content": generated_responses[i]})
messages = legacy_messages + messages
self.uuid = conversation_id
self.messages = messages
def __eq__(self, other):
if not isinstance(other, Conversation):
return False
return self.uuid == other.uuid or self.messages == other.messages
def add_message(self, message: Dict[str, str]):
if not set(message.keys()) == {"role", "content"}:
raise ValueError("Message should contain only 'role' and 'content' keys!")
if message["role"] not in ("user", "assistant", "system"):
raise ValueError("Only 'user', 'assistant' and 'system' roles are supported for now!")
self.messages.append(message)
def add_user_input(self, text: str, overwrite: bool = False):
"""
Add a user input to the conversation for the next round. This is a legacy method that assumes that inputs must
alternate user/assistant/user/assistant, and so will not add multiple user messages in succession. We recommend
just using `add_message` with role "user" instead.
"""
if len(self) > 0 and self[-1]["role"] == "user":
if overwrite:
logger.warning(
f'User input added while unprocessed input was existing: "{self[-1]["content"]}" was overwritten '
f'with: "{text}".'
)
self[-1]["content"] = text
else:
logger.warning(
f'User input added while unprocessed input was existing: "{self[-1]["content"]}" new input '
f'ignored: "{text}". Set `overwrite` to True to overwrite unprocessed user input'
)
else:
self.messages.append({"role": "user", "content": text})
def append_response(self, response: str):
"""
This is a legacy method. We recommend just using `add_message` with an appropriate role instead.
"""
self.messages.append({"role": "assistant", "content": response})
def mark_processed(self):
"""
This is a legacy method, as the Conversation no longer distinguishes between processed and unprocessed user
input. We set a counter here to keep behaviour mostly backward-compatible, but in general you should just read
the messages directly when writing new code.
"""
self._num_processed_user_inputs = len(self._user_messages)
def __iter__(self):
for message in self.messages:
yield message
def __getitem__(self, item):
return self.messages[item]
def __setitem__(self, key, value):
self.messages[key] = value
def __len__(self):
return len(self.messages)
def __repr__(self):
"""
Generates a string representation of the conversation.
Returns:
`str`:
Example:
Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user: Going to the movies tonight - any suggestions?
bot: The Big Lebowski
"""
output = f"Conversation id: {self.uuid}\n"
for message in self.messages:
output += f"{message['role']}: {message['content']}\n"
return output
def iter_texts(self):
for message in self.messages:
yield message["role"] == "user", message["content"]
@property
def _user_messages(self):
return [message["content"] for message in self.messages if message["role"] == "user"]
@property
def past_user_inputs(self):
if not self._user_messages:
return []
if self.messages[-1]["role"] != "user" or self._num_processed_user_inputs == len(self._user_messages):
return self._user_messages[:-1]
return self._user_messages
@property
def generated_responses(self):
return [message["content"] for message in self.messages if message["role"] == "assistant"]
@property
def new_user_input(self):
return self._user_messages[-1]
@add_end_docstrings(
build_pipeline_init_args(has_tokenizer=True),
r"""
min_length_for_response (`int`, *optional*, defaults to 32):
The minimum length (in number of tokens) for a response.""",
)
class ConversationalPipeline(Pipeline):
"""
Multi-turn conversational pipeline.
Example:
```
>>> from transformers import pipeline, Conversation
# Any model with a chat template can be used in a ConversationalPipeline.
>>> chatbot = pipeline(model="facebook/blenderbot-400M-distill")
>>> # Conversation objects initialized with a string will treat it as a user message
>>> conversation = Conversation("I'm looking for a movie - what's your favourite one?")
>>> conversation = chatbot(conversation)
>>> conversation.messages[-1]["content"]
"I don't really have a favorite movie, but I do like action movies. What about you?"
>>> conversation.add_message({"role": "user", "content": "That's interesting, why do you like action movies?"})
>>> conversation = chatbot(conversation)
>>> conversation.messages[-1]["content"]
" I think it's just because they're so fast-paced and action-fantastic."
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This conversational pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"conversational"`.
This pipeline can be used with any model that has a [chat
template](https://huggingface.co/docs/transformers/chat_templating) set.
"""
def __init__(self, *args, **kwargs):
warnings.warn(
"`ConversationalPipeline` is now deprecated, and the functionality has been moved to the standard `text-generation` pipeline, which now accepts lists of message dicts as well as strings. This class will be removed in v4.42.",
DeprecationWarning,
)
super().__init__(*args, **kwargs)
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
def _sanitize_parameters(self, min_length_for_response=None, clean_up_tokenization_spaces=None, **generate_kwargs):
"""
Prepares and sanitizes generation parameters for text generation.
Args:
min_length_for_response (int, optional): Minimum length of response in tokens.
clean_up_tokenization_spaces (bool, optional): Whether to clean up tokenization spaces.
**generate_kwargs: Additional keyword arguments for text generation.
Returns:
tuple: Three dictionaries containing pre-process, forward, and post-process parameters.
"""
preprocess_params = {}
forward_params = {}
postprocess_params = {}
if min_length_for_response is not None:
preprocess_params["min_length_for_response"] = min_length_for_response
if "max_length" in generate_kwargs:
forward_params["max_length"] = generate_kwargs["max_length"]
if clean_up_tokenization_spaces is not None:
postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
if generate_kwargs:
forward_params.update(generate_kwargs)
return preprocess_params, forward_params, postprocess_params
def __call__(self, conversations: Union[List[Dict], Conversation, List[Conversation]], num_workers=0, **kwargs):
"""
Generate responses for the conversation(s) given as inputs.
Args:
conversations (a [`Conversation`] or a list of [`Conversation`]):
Conversation to generate responses for. Inputs can also be passed as a list of dictionaries with `role`
and `content` keys - in this case, they will be converted to `Conversation` objects automatically.
Multiple conversations in either format may be passed as a list.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework [here](./model#generative-models)).
Returns:
[`Conversation`] or a list of [`Conversation`]: Conversation(s) with updated generated responses for those
containing a new user input.
"""
if isinstance(conversations, list) and isinstance(conversations[0], dict):
conversations = Conversation(conversations)
elif isinstance(conversations, list) and isinstance(conversations[0], list):
conversations = [Conversation(conv) for conv in conversations]
outputs = super().__call__(conversations, num_workers=num_workers, **kwargs)
if isinstance(outputs, list) and len(outputs) == 1:
return outputs[0]
return outputs
def preprocess(self, conversation: Conversation, min_length_for_response=32) -> Dict[str, Any]:
"""
Preprocesses the conversation to generate model inputs.
Args:
conversation (`Conversation`): Conversation object containing role and content information.
min_length_for_response (`int`, *optional*, defaults to `32`):
Minimum length required for the model to generate a response.
Returns:
Dict[str, Any]: Dictionary containing input_ids (tokenized input) and the original conversation object.
"""
input_ids = self.tokenizer.apply_chat_template(conversation, add_generation_prompt=True)
if self.framework == "pt":
input_ids = torch.LongTensor([input_ids])
elif self.framework == "tf":
input_ids = tf.constant([input_ids])
return {"input_ids": input_ids, "conversation": conversation}
def _forward(self, model_inputs, **generate_kwargs):
"""
Perform forward pass through the model to generate output IDs.
Args:
model_inputs (Dict[str, Any]): Dictionary containing input_ids (token IDs) and conversation object.
generate_kwargs: Additional keyword arguments passed to the generate method of the model.
Returns:
Dict[str, Any]: Dictionary containing output_ids (generated token IDs) and conversation object.
"""
n = model_inputs["input_ids"].shape[1]
conversation = model_inputs.pop("conversation")
if "max_length" not in generate_kwargs and "max_new_tokens" not in generate_kwargs:
generate_kwargs["max_new_tokens"] = 256
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
if self.model.config.is_encoder_decoder:
start_position = 1
else:
start_position = n
return {"output_ids": output_ids[:, start_position:], "conversation": conversation}
def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
output_ids = model_outputs["output_ids"]
answer = self.tokenizer.decode(
output_ids[0],
skip_special_tokens=True,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)
conversation = model_outputs["conversation"]
conversation.add_message({"role": "assistant", "content": answer})
return conversation
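To illustrate what preprocess does before _forward, a hedged sketch (the checkpoint is the one used in the docstring example above; the exact token ids depend on that tokenizer's chat template):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
messages = [{"role": "user", "content": "I'm looking for a movie - what's your favourite one?"}]
# apply_chat_template(..., add_generation_prompt=True) returns the token ids that the
# pipeline then wraps in a tensor and passes to model.generate
input_ids = tok.apply_chat_template(messages, add_generation_prompt=True)
print(len(input_ids))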
.\pipelines\depth_estimation.py
from typing import List, Union
import numpy as np
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import Pipeline, build_pipeline_init_args
if is_torch_available():
import torch
from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES
logger = logging.get_logger(__name__)
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class DepthEstimationPipeline(Pipeline):
"""
Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image.
Example:
```
>>> from transformers import pipeline
>>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
>>> output = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
>>> # This is a tensor with the values being the depth expressed in meters for each pixel
>>> output["predicted_depth"].shape
torch.Size([1, 384, 384])
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This depth estimation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"depth-estimation"`.
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=depth-estimation).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
requires_backends(self, "vision")
self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)
def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
"""
Predict the depth(s) of the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing a http link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
images.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
the call may block forever.
Return:
A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
the images.
The dictionaries contain the following keys:
- **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`.
- **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`.
"""
return super().__call__(images, **kwargs)
def _sanitize_parameters(self, timeout=None, **kwargs):
preprocess_params = {}
if timeout is not None:
preprocess_params["timeout"] = timeout
return preprocess_params, {}, {}
def preprocess(self, image, timeout=None):
image = load_image(image, timeout)
self.image_size = image.size
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs):
predicted_depth = model_outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1), size=self.image_size[::-1], mode="bicubic", align_corners=False
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
depth = Image.fromarray(formatted)
output_dict = {}
output_dict["predicted_depth"] = predicted_depth
output_dict["depth"] = depth
return output_dict
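A small, self-contained sketch of the normalization step in postprocess above (synthetic data, purely illustrative):

import numpy as np
from PIL import Image

depth = np.random.rand(384, 384).astype(np.float32)        # stand-in for the interpolated depth map
formatted = (depth * 255 / np.max(depth)).astype("uint8")  # rescale to 0-255 as postprocess does
img = Image.fromarray(formatted)
print(img.size)  # (384, 384)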
.\pipelines\document_question_answering.py
import re
from typing import List, Optional, Tuple, Union
import numpy as np
from ..utils import (
ExplicitEnum,
add_end_docstrings,
is_pytesseract_available,
is_torch_available,
is_vision_available,
logging,
)
from .base import ChunkPipeline, build_pipeline_init_args
from .question_answering import select_starts_ends
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_torch_available():
import torch
from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
TESSERACT_LOADED = False
if is_pytesseract_available():
TESSERACT_LOADED = True
import pytesseract
logger = logging.get_logger(__name__)
def normalize_box(box, width, height):
"""根据图像宽度和高度,归一化边界框的坐标值,并返回归一化后的边界框列表。"""
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
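A quick worked example of normalize_box (illustrative numbers): a 50x20 pixel box at (100, 40) in a 1000x500 image maps onto the 0-1000 grid expected by LayoutLM-style models.

print(normalize_box([100, 40, 150, 60], width=1000, height=500))  # [100, 80, 150, 120]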
def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]):
"""对文档图像应用 Tesseract OCR,返回识别的单词及其归一化的边界框。"""
data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
image_width, image_height = image.size
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
if len(words) != len(normalized_boxes):
raise ValueError("Not as many words as there are bounding boxes")
return words, normalized_boxes
class ModelType(ExplicitEnum):
LayoutLM = "layoutlm"
LayoutLMv2andv3 = "layoutlmv2andv3"
VisionEncoderDecoder = "vision_encoder_decoder"
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True, has_tokenizer=True))
class DocumentQuestionAnsweringPipeline(ChunkPipeline):
"""
Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd
words/boxes) as input instead of text context.
Example:
```
>>> from transformers import pipeline
>>> document_qa = pipeline(model="impira/layoutlm-document-qa")
>>> document_qa(
... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
... question="What is the invoice number?",
... )
[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This document question answering pipeline can currently be loaded from [`pipeline`] using the following task
identifier: `"document-question-answering"`.
The models that this pipeline can use are models that have been fine-tuned on a document question answering task.
See the up-to-date list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=document-question-answering).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
raise ValueError(
"`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
f"(`{self.tokenizer.__class__.__name__}`) is provided."
)
if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
self.model_type = ModelType.VisionEncoderDecoder
if self.model.config.encoder.model_type != "donut-swin":
raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut")
else:
self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES)
if self.model.config.__class__.__name__ == "LayoutLMConfig":
self.model_type = ModelType.LayoutLM
else:
self.model_type = ModelType.LayoutLMv2andv3
def _sanitize_parameters(
self,
padding=None,
doc_stride=None,
max_question_len=None,
lang: Optional[str] = None,
tesseract_config: Optional[str] = None,
max_answer_len=None,
max_seq_len=None,
top_k=None,
handle_impossible_answer=None,
timeout=None,
**kwargs,
):
preprocess_params, postprocess_params = {}, {}
if padding is not None:
preprocess_params["padding"] = padding
if doc_stride is not None:
preprocess_params["doc_stride"] = doc_stride
if max_question_len is not None:
preprocess_params["max_question_len"] = max_question_len
if max_seq_len is not None:
preprocess_params["max_seq_len"] = max_seq_len
if lang is not None:
preprocess_params["lang"] = lang
if tesseract_config is not None:
preprocess_params["tesseract_config"] = tesseract_config
if timeout is not None:
preprocess_params["timeout"] = timeout
if top_k is not None:
if top_k < 1:
raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
postprocess_params["top_k"] = top_k
if max_answer_len is not None:
if max_answer_len < 1:
raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
postprocess_params["max_answer_len"] = max_answer_len
if handle_impossible_answer is not None:
postprocess_params["handle_impossible_answer"] = handle_impossible_answer
return preprocess_params, {}, postprocess_params
def __call__(
self,
image: Union["Image.Image", str],
question: Optional[str] = None,
word_boxes: Tuple[str, List[float]] = None,
**kwargs,
):
def preprocess(
self,
input,
padding="do_not_pad",
doc_stride=None,
max_seq_len=None,
word_boxes: Tuple[str, List[float]] = None,
lang=None,
tesseract_config="",
timeout=None,
):
def _forward(self, model_inputs, **generate_kwargs):
p_mask = model_inputs.pop("p_mask", None)
word_ids = model_inputs.pop("word_ids", None)
words = model_inputs.pop("words", None)
is_last = model_inputs.pop("is_last", False)
if self.model_type == ModelType.VisionEncoderDecoder:
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
model_outputs = dict(model_outputs.items())
model_outputs["p_mask"] = p_mask
model_outputs["word_ids"] = word_ids
model_outputs["words"] = words
model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
model_outputs["is_last"] = is_last
return model_outputs
def postprocess(self, model_outputs, top_k=1, **kwargs):
if self.model_type == ModelType.VisionEncoderDecoder:
answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
else:
answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)
answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
return answers
def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0]
sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
ret = {
"answer": None,
}
answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
if answer is not None:
ret["answer"] = answer.group(1).strip()
return ret
def postprocess_extractive_qa(
self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
):
min_null_score = 1000000
answers = []
for output in model_outputs:
words = output["words"]
starts, ends, scores, min_null_score = select_starts_ends(
start=output["start_logits"],
end=output["end_logits"],
p_mask=output["p_mask"],
attention_mask=output["attention_mask"].numpy()
if output.get("attention_mask", None) is not None
else None,
min_null_score=min_null_score,
top_k=top_k,
handle_impossible_answer=handle_impossible_answer,
max_answer_len=max_answer_len,
)
word_ids = output["word_ids"]
for start, end, score in zip(starts, ends, scores):
word_start, word_end = word_ids[start], word_ids[end]
if word_start is not None and word_end is not None:
answers.append(
{
"score": float(score),
"answer": " ".join(words[word_start : word_end + 1]),
"start": word_start,
"end": word_end,
}
)
if handle_impossible_answer:
answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0})
return answers
.\pipelines\feature_extraction.py
from typing import Dict
from ..utils import add_end_docstrings
from .base import GenericTensor, Pipeline, build_pipeline_init_args
@add_end_docstrings(
build_pipeline_init_args(has_tokenizer=True, supports_binary_output=False),
r"""
tokenize_kwargs (`dict`, *optional*):
Additional dictionary of keyword arguments passed along to the tokenizer.
return_tensors (`bool`, *optional*):
If `True`, returns a tensor according to the specified framework, otherwise returns a list.""",
)
class FeatureExtractionPipeline(Pipeline):
"""
Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
transformer, which can be used as features in downstream tasks.
Example:
```
>>> from transformers import pipeline
>>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction")
>>> result = extractor("This is a simple test.", return_tensors=True)
>>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string.
torch.Size([1, 8, 768])
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
`"feature-extraction"`.
All models may be used for this pipeline. See a list of all models, including community-contributed models on
[huggingface.co/models](https://huggingface.co/models).
"""
def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs):
if tokenize_kwargs is None:
tokenize_kwargs = {}
if truncation is not None:
if "truncation" in tokenize_kwargs:
raise ValueError(
"truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)"
)
tokenize_kwargs["truncation"] = truncation
preprocess_params = tokenize_kwargs
postprocess_params = {}
if return_tensors is not None:
postprocess_params["return_tensors"] = return_tensors
return preprocess_params, {}, postprocess_params
def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]:
model_inputs = self.tokenizer(inputs, return_tensors=self.framework, **tokenize_kwargs)
return model_inputs
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, return_tensors=False):
if return_tensors:
return model_outputs[0]
if self.framework == "pt":
return model_outputs[0].tolist()
elif self.framework == "tf":
return model_outputs[0].numpy().tolist()
def __call__(self, *args, **kwargs):
"""
Extract the features of the input text(s).
Args:
args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.
Return:
A nested list of `float`: The features computed by the model.
"""
return super().__call__(*args, **kwargs)
.\pipelines\fill_mask.py
from typing import Dict
import numpy as np
from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args
if is_tf_available():
import tensorflow as tf
from ..tf_utils import stable_softmax
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
@add_end_docstrings(
build_pipeline_init_args(has_tokenizer=True),
r"""
top_k (`int`, defaults to 5):
The number of predictions to return.
targets (`str` or `List[str]`, *optional*):
When passed, the model will limit the scores to the passed targets instead of looking up in the whole
vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
token will be used (with a warning, and that might be slower).
tokenizer_kwargs (`dict`, *optional*):
Additional dictionary of keyword arguments passed along to the tokenizer."""
)
class FillMaskPipeline(Pipeline):
"""
Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
examples](../task_summary#masked-language-modeling) for more information.
Example:
```
>>> from transformers import pipeline
>>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
>>> fill_masker("This is a simple [MASK].")
[{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"fill-mask"`.
The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library. See the up-to-date list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=fill-mask).
<Tip>
This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect
joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).
</Tip>
<Tip>
This pipeline now supports tokenizer_kwargs. For example try:
```
>>> from transformers import pipeline
>>> fill_masker = pipeline(model="google-bert/bert-base-uncased", tokenizer_kwargs={"do_lower_case": False})
>>> fill_masker("This is a simple [MASK].")
```
This will make the tokenizer treat "This" and "this" as distinct words.
</Tip>
"""
>>> from transformers import pipeline
Import the pipeline factory from transformers, used to build an NLP pipeline on top of a pretrained model.
>>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
Create a fill-mask pipeline backed by the google-bert/bert-base-uncased checkpoint.
>>> tokenizer_kwargs = {"truncation": True}
Define a tokenizer_kwargs dictionary; here it only turns truncation on.
>>> fill_masker(
... "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
... tokenizer_kwargs=tokenizer_kwargs,
... )
Call the fill-mask pipeline on a masked sentence padded with a large amount of repeated text; tokenizer_kwargs is forwarded to the tokenizer so the overly long input is truncated.
def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
if self.framework == "tf":
masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
elif self.framework == "pt":
masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
else:
raise ValueError("Unsupported framework")
return masked_index
get_masked_index returns the positions of the mask token in the input tensor. Depending on self.framework, it uses tf.where (converted to a NumPy array) for TensorFlow or torch.nonzero for PyTorch, and raises a ValueError for any other framework.
def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
masked_index = self.get_masked_index(input_ids)
numel = np.prod(masked_index.shape)
if numel < 1:
raise PipelineException(
"fill-mask",
self.model.base_model_prefix,
f"No mask_token ({self.tokenizer.mask_token}) found on the input",
)
_ensure_exactly_one_mask_token validates a single example: it fetches the mask indices via get_masked_index, counts them, and raises a PipelineException if no mask token is found. Despite the name, it only checks that at least one mask token is present.
def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
if isinstance(model_inputs, list):
for model_input in model_inputs:
self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
else:
for input_ids in model_inputs["input_ids"]:
self._ensure_exactly_one_mask_token(input_ids)
ensure_exactly_one_mask_token applies the check above to every example in the model inputs, handling both a list of encodings and a single batched encoding.
def preprocess(
self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters
) -> Dict[str, GenericTensor]:
if return_tensors is None:
return_tensors = self.framework
if tokenizer_kwargs is None:
tokenizer_kwargs = {}
model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
self.ensure_exactly_one_mask_token(model_inputs)
return model_inputs
preprocess turns the raw input into model-ready tensors: it tokenizes the text with the requested tensor format and any extra tokenizer_kwargs, then verifies that every example contains a mask token.
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
model_outputs["input_ids"] = model_inputs["input_ids"]
return model_outputs
_forward runs the model on the prepared inputs and keeps the original input_ids in the returned outputs, since postprocess needs them to rebuild the full sequences.
# Postprocess the model outputs, given top_k and optional target token ids
def postprocess(self, model_outputs, top_k=5, target_ids=None):
# If target ids were provided and there are fewer of them than top_k, cap top_k
if target_ids is not None and target_ids.shape[0] < top_k:
top_k = target_ids.shape[0]
# Recover the input token ids stored by _forward
input_ids = model_outputs["input_ids"][0]
# The prediction logits
outputs = model_outputs["logits"]
# TensorFlow branch
if self.framework == "tf":
# Positions of the mask token in the input ids
masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()[:, 0]
# Move the logits to NumPy
outputs = outputs.numpy()
# Keep only the logits at the masked positions
logits = outputs[0, masked_index, :]
# Numerically stable softmax over the vocabulary
probs = stable_softmax(logits, axis=-1)
# If targets were requested, keep only the probabilities of the target token ids
if target_ids is not None:
probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1))
probs = tf.expand_dims(probs, 0)
# Take the top_k probabilities and their indices
topk = tf.math.top_k(probs, k=top_k)
values, predictions = topk.values.numpy(), topk.indices.numpy()
else:
# PyTorch branch: positions of the mask token in the input ids
masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
# Fill mask pipeline supports only one ${mask_token} per sample
# Keep only the logits at the masked positions
logits = outputs[0, masked_index, :]
# Softmax over the vocabulary
probs = logits.softmax(dim=-1)
# If targets were requested, keep only the probabilities of the target token ids
if target_ids is not None:
probs = probs[..., target_ids]
# Take the top_k probabilities and their indices
values, predictions = probs.topk(top_k)
# Build the result list
result = []
# Whether there is a single mask in the input
single_mask = values.shape[0] == 1
# Iterate over the per-mask probabilities and predicted token ids
for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
row = []
for v, p in zip(_values, _predictions):
# Copy the input ids, since the array is modified below
tokens = input_ids.numpy().copy()
# When targets are used, p indexes into target_ids, so map it back to a vocabulary id
if target_ids is not None:
p = target_ids[p].tolist()
# Replace the mask position with the predicted token id
tokens[masked_index[i]] = p
# Drop padding tokens
tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
# Decode the tokens back into text; skip special tokens only in the single-mask case
sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
# Build the proposition with its score, token id, token string and full sequence
proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
row.append(proposition)
result.append(row)
# With a single mask, return its list of propositions directly
if single_mask:
return result[0]
# Otherwise return one list of propositions per mask
return result
# Map the requested target tokens to their vocabulary ids
def get_target_ids(self, targets, top_k=None):
# Accept a single string target as well as a list
if isinstance(targets, str):
targets = [targets]
try:
# The tokenizer vocabulary, used for direct lookups
vocab = self.tokenizer.get_vocab()
except Exception:
# Fall back to an empty vocabulary if it cannot be retrieved
vocab = {}
# Collected target ids
target_ids = []
# Process each requested target token
for target in targets:
# Look the target up in the vocabulary; None if it is not a known token
id_ = vocab.get(target, None)
# If the target is not in the vocabulary
if id_ is None:
# Tokenize the target and keep only the first resulting token id
input_ids = self.tokenizer(
target,
add_special_tokens=False,
return_attention_mask=False,
return_token_type_ids=False,
max_length=1,
truncation=True,
)["input_ids"]
# An empty encoding means the target cannot be represented at all
if len(input_ids) == 0:
# Warn that the target does not exist in the model vocabulary and skip it
logger.warning(
f"The specified target token `{target}` does not exist in the model vocabulary. "
"We cannot replace it with anything meaningful, ignoring it"
)
# Move on to the next target
continue
# Use the first token id as a stand-in for the target
id_ = input_ids[0]
# Warn about the substitution that was made
logger.warning(
f"The specified target token `{target}` does not exist in the model vocabulary. "
f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
)
# Record the resolved target id
target_ids.append(id_)
# Deduplicate the collected ids
target_ids = list(set(target_ids))
# At least one usable target is required
if len(target_ids) == 0:
raise ValueError("At least one target must be provided when passed.")
# Convert the target id list to a NumPy array
target_ids = np.array(target_ids)
# Return the target id array
return target_ids
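A hedged usage sketch of the `targets` argument resolved by get_target_ids (the checkpoint is the one from the class docstring; both targets exist in its lowercased vocabulary):

from transformers import pipeline

fill_masker = pipeline(model="google-bert/bert-base-uncased")
# Scores are restricted to the provided target tokens instead of the whole vocabulary
print(fill_masker("The capital of France is [MASK].", targets=["paris", "london"], top_k=2))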
# Split the call-time arguments into preprocess and postprocess parameters
def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None):
preprocess_params = {}
# tokenizer_kwargs are forwarded to the tokenizer during preprocessing
if tokenizer_kwargs is not None:
preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
postprocess_params = {}
# If targets were provided, resolve them to vocabulary ids for postprocessing
if targets is not None:
target_ids = self.get_target_ids(targets, top_k)
postprocess_params["target_ids"] = target_ids
# Forward top_k to postprocessing
if top_k is not None:
postprocess_params["top_k"] = top_k
# The pipeline cannot work with a tokenizer that defines no mask token
if self.tokenizer.mask_token_id is None:
raise PipelineException(
"fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
)
# Return preprocess params, empty forward params and postprocess params
return preprocess_params, {}, postprocess_params
# Override the parent __call__ to fill the masked token(s) in the input text
def __call__(self, inputs, **kwargs):
# Delegate to the parent __call__ with the inputs and any extra keyword arguments
outputs = super().__call__(inputs, **kwargs)
# A single-element list input returns its single result directly
if isinstance(inputs, list) and len(inputs) == 1:
return outputs[0]
# Otherwise return all results
return outputs
.\pipelines\image_classification.py
from typing import List, Union
import numpy as np
from ..utils import (
ExplicitEnum,
add_end_docstrings,
is_tf_available,
is_torch_available,
is_vision_available,
logging,
requires_backends,
)
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_tf_available():
from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
logger = logging.get_logger(__name__)
def sigmoid(_outputs):
return 1.0 / (1.0 + np.exp(-_outputs))
def softmax(_outputs):
maxes = np.max(_outputs, axis=-1, keepdims=True)
shifted_exp = np.exp(_outputs - maxes)
return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
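A quick numeric check of the two helpers above (illustrative values):

import numpy as np

logits = np.array([2.0, 1.0, 0.1])
print(softmax(logits))   # ~[0.659, 0.242, 0.099]; the scores sum to 1
print(sigmoid(logits))   # element-wise values in (0, 1), used for multi-label models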
class ClassificationFunction(ExplicitEnum):
SIGMOID = "sigmoid"
SOFTMAX = "softmax"
NONE = "none"
@add_end_docstrings(
build_pipeline_init_args(has_image_processor=True),
r"""
function_to_apply (`str`, *optional*, defaults to `"default"`):
The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
- `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
has several labels, will apply the softmax function on the output.
- `"sigmoid"`: Applies the sigmoid function on the output.
- `"softmax"`: Applies the softmax function on the output.
- `"none"`: Does not apply any function on the output.""",
)
class ImageClassificationPipeline(Pipeline):
"""
Image classification pipeline using any `AutoModelForImageClassification`. This pipeline predicts the class of an
image.
Example:
```
>>> from transformers import pipeline
>>> classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k")
>>> classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
[{'score': 0.442, 'label': 'macaw'}, {'score': 0.088, 'label': 'popinjay'}, {'score': 0.075, 'label': 'parrot'}, {'score': 0.073, 'label': 'parodist, lampooner'}, {'score': 0.046, 'label': 'poll, poll_parrot'}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"image-classification"`.
See the list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=image-classification).
"""
function_to_apply: ClassificationFunction = ClassificationFunction.NONE
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
requires_backends(self, "vision")
self.check_model_type(
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
if self.framework == "tf"
else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
)
def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None):
preprocess_params = {}
if timeout is not None:
preprocess_params["timeout"] = timeout
postprocess_params = {}
if top_k is not None:
postprocess_params["top_k"] = top_k
if isinstance(function_to_apply, str):
function_to_apply = ClassificationFunction(function_to_apply.lower())
if function_to_apply is not None:
postprocess_params["function_to_apply"] = function_to_apply
return preprocess_params, {}, postprocess_params
def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
"""
Assign labels to the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing a http link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
images.
function_to_apply (`str`, *optional*, defaults to `"default"`):
The function to apply to the model outputs in order to retrieve the scores. Accepts four different
values:
If this argument is not specified, then it will apply the following functions according to the number
of labels:
- If the model has a single label, will apply the sigmoid function on the output.
- If the model has several labels, will apply the softmax function on the output.
Possible values are:
- `"sigmoid"`: Applies the sigmoid function on the output.
- `"softmax"`: Applies the softmax function on the output.
- `"none"`: Does not apply any function on the output.
top_k (`int`, *optional*, defaults to 5):
The number of top labels that will be returned by the pipeline. If the provided number is higher than
the number of labels available in the model configuration, it will default to the number of labels.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
the call may block forever.
Return:
A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
the images.
The dictionaries contain the following keys:
- **label** (`str`) -- The label identified by the model.
- **score** (`int`) -- The score attributed by the model for that label.
"""
return super().__call__(images, **kwargs)
def preprocess(self, image, timeout=None):
image = load_image(image, timeout=timeout)
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, function_to_apply=None, top_k=5):
if function_to_apply is None:
if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
function_to_apply = ClassificationFunction.SIGMOID
elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
function_to_apply = ClassificationFunction.SOFTMAX
elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
function_to_apply = self.model.config.function_to_apply
else:
function_to_apply = ClassificationFunction.NONE
if top_k > self.model.config.num_labels:
top_k = self.model.config.num_labels
outputs = model_outputs["logits"][0]
outputs = outputs.numpy()
if function_to_apply == ClassificationFunction.SIGMOID:
scores = sigmoid(outputs)
elif function_to_apply == ClassificationFunction.SOFTMAX:
scores = softmax(outputs)
elif function_to_apply == ClassificationFunction.NONE:
scores = outputs
else:
raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
dict_scores = [
{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
]
dict_scores.sort(key=lambda x: x["score"], reverse=True)
if top_k is not None:
dict_scores = dict_scores[:top_k]
return dict_scores
.\pipelines\image_feature_extraction.py
from typing import Dict
from ..utils import add_end_docstrings, is_vision_available
from .base import GenericTensor, Pipeline, build_pipeline_init_args
if is_vision_available():
from ..image_utils import load_image
@add_end_docstrings(
build_pipeline_init_args(has_image_processor=True),
"""
image_processor_kwargs (`dict`, *optional*):
Additional dictionary of keyword arguments passed along to the image processor e.g.
{"size": {"height": 100, "width": 100}}
pool (`bool`, *optional*, defaults to `False`):
Whether or not to return the pooled output. If `False`, the model will return the raw hidden states.
""",
)
class ImageFeatureExtractionPipeline(Pipeline):
"""
Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
transformer, which can be used as features in downstream tasks.
Example:
```
>>> from transformers import pipeline
>>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
>>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
>>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
torch.Size([1, 197, 768])
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
`"image-feature-extraction"`.
All vision models may be used for this pipeline. See a list of all models, including community-contributed models on
[huggingface.co/models](https://huggingface.co/models).
"""
def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, pool=None, **kwargs):
preprocess_params = {} if image_processor_kwargs is None else image_processor_kwargs
postprocess_params = {}
if pool is not None:
postprocess_params["pool"] = pool
if return_tensors is not None:
postprocess_params["return_tensors"] = return_tensors
if "timeout" in kwargs:
preprocess_params["timeout"] = kwargs["timeout"]
return preprocess_params, {}, postprocess_params
def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
image = load_image(image, timeout=timeout)
model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
return model_inputs
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, pool=None, return_tensors=False):
pool = pool if pool is not None else False
if pool:
if "pooler_output" not in model_outputs:
raise ValueError(
"No pooled output was returned. Make sure the model has a `pooler` layer when using the `pool` option."
)
outputs = model_outputs["pooler_output"]
else:
outputs = model_outputs[0]
if return_tensors:
return outputs
if self.framework == "pt":
return outputs.tolist()
elif self.framework == "tf":
return outputs.numpy().tolist()
def __call__(self, *args, **kwargs):
"""
Extract the features of the input(s).
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing a http link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
images.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
the call may block forever.
Return:
A nested list of `float`: The features computed by the model.
"""
return super().__call__(*args, **kwargs)
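A hedged usage sketch of the `pool` option handled in postprocess above (checkpoint as in the docstring example; the pooled shape assumes the loaded model exposes a pooler layer):

from transformers import pipeline

extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
pooled = extractor(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    return_tensors=True,
    pool=True,
)
print(pooled.shape)  # [1, hidden_dimension] when the model has a pooler layer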
.\pipelines\image_segmentation.py
from typing import Any, Dict, List, Union
import numpy as np
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_torch_available():
from ..models.auto.modeling_auto import (
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES,
)
logger = logging.get_logger(__name__)
Prediction = Dict[str, Any]
Predictions = List[Prediction]
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ImageSegmentationPipeline(Pipeline):
"""
Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and
their classes.
Example:
```
>>> from transformers import pipeline
>>> segmenter = pipeline(model="facebook/detr-resnet-50-panoptic")
>>> segments = segmenter("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
>>> len(segments)
2
>>> segments[0]["label"]
'bird'
>>> segments[1]["label"]
'bird'
>>> type(segments[0]["mask"]) # This is a black and white mask showing where is the bird on the original image.
<class 'PIL.Image.Image'>
>>> segments[0]["mask"].size
(768, 512)
```
This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"image-segmentation"`.
See the list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=image-segmentation).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.framework == "tf":
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
requires_backends(self, "vision")
mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES.copy()
mapping.update(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES)
mapping.update(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES)
mapping.update(MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES)
self.check_model_type(mapping)
def _sanitize_parameters(self, **kwargs):
preprocess_kwargs = {}
postprocess_kwargs = {}
if "subtask" in kwargs:
postprocess_kwargs["subtask"] = kwargs["subtask"]
preprocess_kwargs["subtask"] = kwargs["subtask"]
if "threshold" in kwargs:
postprocess_kwargs["threshold"] = kwargs["threshold"]
if "mask_threshold" in kwargs:
postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
if "overlap_mask_area_threshold" in kwargs:
postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
if "timeout" in kwargs:
preprocess_kwargs["timeout"] = kwargs["timeout"]
return preprocess_kwargs, {}, postprocess_kwargs
def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
"""
Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing an HTTP(S) link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the same format: all as HTTP(S) links, all as local paths, or all as PIL images.
subtask (`str`, *optional*):
Segmentation task to be performed, chosen among [`semantic`, `instance` and `panoptic`] depending on model capabilities. If not set, the pipeline will attempt to resolve in the following order:
`panoptic`, `instance`, `semantic`.
threshold (`float`, *optional*, defaults to 0.9):
Probability threshold used to filter out predicted masks.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
Mask overlap threshold used to eliminate small, disconnected segments.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and the call may block forever.
Return:
A dictionary or a list of dictionaries containing the result. If the input is a single image, a list of dictionaries is returned; if the input is a list of several images, a list of lists of dictionaries is returned, one per image.
The dictionaries contain the mask, label and score (where applicable) of each detected object and have the following keys:
- **label** (`str`) -- The class label identified by the model.
- **mask** (`PIL.Image`) -- A binary mask of the detected object, as a PIL Image of shape (width, height) of the original image. Returns a mask filled with zeros if no object is found.
- **score** (*optional* `float`) -- Optionally included when the model is capable of estimating a confidence of the "object" described by the label and the mask.
"""
return super().__call__(images, **kwargs)
def preprocess(self, image, subtask=None, timeout=None):
image = load_image(image, timeout=timeout)
target_size = [(image.height, image.width)]
if self.model.config.__class__.__name__ == "OneFormerConfig":
if subtask is None:
kwargs = {}
else:
kwargs = {"task_inputs": [subtask]}
inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs)
inputs["task_inputs"] = self.tokenizer(
inputs["task_inputs"],
padding="max_length",
max_length=self.model.config.task_seq_len,
return_tensors=self.framework,
)["input_ids"]
else:
inputs = self.image_processor(images=[image], return_tensors="pt")
inputs["target_size"] = target_size
return inputs
def _forward(self, model_inputs):
target_size = model_inputs.pop("target_size")
model_outputs = self.model(**model_inputs)
model_outputs["target_size"] = target_size
return model_outputs
def postprocess(
self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5
):
fn = None
if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"):
fn = self.image_processor.post_process_panoptic_segmentation
elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"):
fn = self.image_processor.post_process_instance_segmentation
if fn is not None:
outputs = fn(
model_outputs,
threshold=threshold,
mask_threshold=mask_threshold,
overlap_mask_area_threshold=overlap_mask_area_threshold,
target_sizes=model_outputs["target_size"],
)[0]
annotation = []
segmentation = outputs["segmentation"]
for segment in outputs["segments_info"]:
mask = (segmentation == segment["id"]) * 255
mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
label = self.model.config.id2label[segment["label_id"]]
score = segment["score"]
annotation.append({"score": score, "label": label, "mask": mask})
elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"):
outputs = self.image_processor.post_process_semantic_segmentation(
model_outputs, target_sizes=model_outputs["target_size"]
)[0]
annotation = []
segmentation = outputs.numpy()
labels = np.unique(segmentation)
for label in labels:
mask = (segmentation == label) * 255
mask = Image.fromarray(mask.astype(np.uint8), mode="L")
label = self.model.config.id2label[label]
annotation.append({"score": None, "label": label, "mask": mask})
else:
raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}")
return annotation
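As a recap of the `postprocess` branches above, a short usage sketch follows. The checkpoint is the one from the class docstring; the `subtask` and threshold values are illustrative, and which subtasks are actually supported depends on the checkpoint's image processor.
```
>>> from transformers import pipeline

>>> segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
>>> segments = segmenter(
...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
...     subtask="panoptic",               # force the panoptic branch of postprocess
...     threshold=0.95,                   # stricter score filter than the 0.9 default
...     overlap_mask_area_threshold=0.5,
... )
>>> [s["label"] for s in segments]        # each entry also carries a "score" and a PIL "mask"
```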
.\pipelines\image_to_image.py
from typing import List, Union
import numpy as np
from ..utils import (
add_end_docstrings,
is_torch_available,
is_vision_available,
logging,
requires_backends,
)
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES
logger = logging.get_logger(__name__)
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ImageToImagePipeline(Pipeline):
"""
Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous
image input.
Example:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import pipeline
>>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")
>>> img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
>>> img = img.resize((64, 64))
>>> upscaled_img = upscaler(img)
>>> img.size
(64, 64)
>>> upscaled_img.size
(144, 144)
```
This image to image pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"image-to-image"`.
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-to-image).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
requires_backends(self, "vision")
self.check_model_type(MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)
def _sanitize_parameters(self, **kwargs):
preprocess_params = {}
postprocess_params = {}
forward_params = {}
if "timeout" in kwargs:
preprocess_params["timeout"] = kwargs["timeout"]
if "head_mask" in kwargs:
forward_params["head_mask"] = kwargs["head_mask"]
return preprocess_params, forward_params, postprocess_params
def __call__(
self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs
) -> Union["Image.Image", List["Image.Image"]]:
"""
Transform the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing a http link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
same format: all as http links, all as local paths, or all as PIL images.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
the call may block forever.
Return:
An image (Image.Image) or a list of images (List["Image.Image"]) containing result(s). If the input is a
single image, the return will be also a single image, if the input is a list of several images, it will
return a list of transformed images.
"""
return super().__call__(images, **kwargs)
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def preprocess(self, image, timeout=None):
image = load_image(image, timeout=timeout)
inputs = self.image_processor(images=[image], return_tensors="pt")
return inputs
def postprocess(self, model_outputs):
images = []
if "reconstruction" in model_outputs.keys():
outputs = model_outputs.reconstruction
for output in outputs:
output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
output = np.moveaxis(output, source=0, destination=-1)
output = (output * 255.0).round().astype(np.uint8)
images.append(Image.fromarray(output))
return images if len(images) > 1 else images[0]
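Because `postprocess` returns a single `PIL.Image` for one input and a list for several, a batched call looks like the sketch below. The checkpoint is the Swin2SR upscaler from the docstring; the file names are placeholders.
```
>>> from PIL import Image
>>> from transformers import pipeline

>>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")
>>> imgs = [Image.open("low_res_1.png"), Image.open("low_res_2.png")]  # placeholder local files
>>> upscaled = upscaler(imgs)             # a list in, a list of upscaled PIL images out
>>> upscaled[0].save("high_res_1.png")
```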
.\pipelines\image_to_text.py
from typing import List, Union
from ..utils import (
add_end_docstrings,
is_tf_available,
is_torch_available,
is_vision_available,
logging,
requires_backends,
)
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_tf_available():
from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
if is_torch_available():
import torch
from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
logger = logging.get_logger(__name__)
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
"""
Image-to-text pipeline using an `AutoModelForVision2Seq` model. This pipeline predicts a caption for a given image.
Example:
```
>>> from transformers import pipeline
>>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
>>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
[{'generated_text': 'two birds are standing next to each other '}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).
This image-to-text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
"image-to-text".
See the list of available models on
[huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
requires_backends(self, "vision")
self.check_model_type(
TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
)
def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
forward_params = {}
preprocess_params = {}
if prompt is not None:
preprocess_params["prompt"] = prompt
if timeout is not None:
preprocess_params["timeout"] = timeout
if max_new_tokens is not None:
forward_params["max_new_tokens"] = max_new_tokens
if generate_kwargs is not None:
if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
raise ValueError(
"`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
" only 1 version"
)
forward_params.update(generate_kwargs)
return preprocess_params, forward_params, {}
def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
"""
Assign labels to the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing an HTTP(S) link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images.
max_new_tokens (`int`, *optional*):
The maximum number of tokens to generate. By default it will use the `generate` default.
generate_kwargs (`Dict`, *optional*):
Pass these arguments directly to `generate`, allowing full control of that function.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and the call may block forever.
Return:
A `list` or a `list` of `list`s: each result comes as a dictionary with the following key:
- **generated_text** (`str`) -- The generated text.
"""
return super().__call__(images, **kwargs)
def preprocess(self, image, prompt=None, timeout=None):
image = load_image(image, timeout=timeout)
if prompt is not None:
if not isinstance(prompt, str):
raise ValueError(
f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
"Note also that one single text can be provided for conditional image to text generation."
)
model_type = self.model.config.model_type
if model_type == "git":
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
input_ids = [self.tokenizer.cls_token_id] + input_ids
input_ids = torch.tensor(input_ids).unsqueeze(0)
model_inputs.update({"input_ids": input_ids})
elif model_type == "pix2struct":
model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
elif model_type != "vision-encoder-decoder":
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
model_inputs.update(text_inputs)
else:
raise ValueError(f"Model type {model_type} does not support conditional text generation")
else:
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
if self.model.config.model_type == "git" and prompt is None:
model_inputs["input_ids"] = None
return model_inputs
def _forward(self, model_inputs, **generate_kwargs):
if (
"input_ids" in model_inputs
and isinstance(model_inputs["input_ids"], list)
and all(x is None for x in model_inputs["input_ids"])
):
model_inputs["input_ids"] = None
inputs = model_inputs.pop(self.model.main_input_name)
model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
return model_outputs
def postprocess(self, model_outputs):
records = []
for output_ids in model_outputs:
record = {
"generated_text": self.tokenizer.decode(
output_ids,
skip_special_tokens=True,
)
}
records.append(record)
return records
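The `prompt` handling in `preprocess` (the `git` / `pix2struct` / `vision-encoder-decoder` branches) is easiest to see from the caller's side. A minimal sketch, assuming a GIT-style checkpoint such as `microsoft/git-base-coco` (not used in this file's docstring); the prompt text and token budget are arbitrary.
```
>>> from transformers import pipeline

>>> captioner = pipeline("image-to-text", model="microsoft/git-base-coco")  # assumed example checkpoint
>>> captioner(
...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
...     prompt="a photography of",   # prepended as input_ids for conditional captioning (git branch above)
...     max_new_tokens=20,           # forwarded to `generate`
... )
```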
.\pipelines\mask_generation.py
from collections import defaultdict
from typing import Optional
from ..image_utils import load_image
from ..utils import (
add_end_docstrings,
is_torch_available,
logging,
requires_backends,
)
from .base import ChunkPipeline, build_pipeline_init_args
if is_torch_available():
import torch
from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
logger = logging.get_logger(__name__)
@add_end_docstrings(
build_pipeline_init_args(has_image_processor=True),
r"""
points_per_batch (*optional*, int, default to 64):
Sets the number of points run simultaneously by the model. Higher numbers may be faster but use more GPU memory.
output_bboxes_mask (`bool`, *optional*, default to `False`):
Whether or not to output the bounding box predictions.
output_rle_masks (`bool`, *optional*, default to `False`):
Whether or not to output the masks in `RLE` format.""",
)
class MaskGenerationPipeline(ChunkPipeline):
"""
Automatic mask generation for images using a `SamForMaskGeneration` model. This pipeline predicts binary masks for a given image. It is a `ChunkPipeline` because the points can be processed in small batches to avoid out-of-memory issues; use the `points_per_batch` argument to control how many points are processed at once (defaults to `64`).
The pipeline works in three steps:
1. `preprocess`: a grid of 1024 evenly spaced points is generated, along with bounding boxes and point labels.
For more details on how the points and bounding boxes are created, check the `_generate_crop_boxes` function. The image is also preprocessed with the `image_processor`.
This function yields minibatches of `points_per_batch` points.
2. `forward`: feeds the output of `preprocess` to the model. The image embedding is computed only once.
It calls `self.model.get_image_embeddings`, making sure that gradients are not computed and that the tensors and the model are on the same device.
3. `postprocess`: the most important part of automatic mask generation happens here. It involves three steps:
- image_processor.postprocess_masks (run in each minibatch loop): takes the raw output masks, resizes them according to the image size, and turns them into binary masks.
- image_processor.filter_masks (run in each minibatch loop): removes bad masks using `pred_iou_thresh` and `stability_scores`, plus several filters based on non-maximum suppression.
- image_processor.postprocess_masks_for_amg: applies NMS to the masks so that only the relevant ones are kept.
Example:
```
>>> from transformers import pipeline
>>> generator = pipeline(model="facebook/sam-vit-base", task="mask-generation")
# Create a generator with the pipeline factory, specifying the model and the "mask-generation" task.
>>> outputs = generator(
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
... )
# Run inference on the given image URL; this returns the prediction results.
>>> outputs = generator(
...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", points_per_batch=128
... )
# Run inference on another image URL, additionally setting points_per_batch=128.
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"mask-generation"`.
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=mask-generation).
"""
# The docstring above covers basic usage and notes that this segmentation pipeline is loaded from [`pipeline`] with the task identifier "mask-generation".
def __init__(self, **kwargs):
super().__init__(**kwargs)
requires_backends(self, "vision")
requires_backends(self, "torch")
if self.framework != "pt":
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
self.check_model_type(MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
# Constructor: basic setup and checks, making sure the required backends are present and the framework is PyTorch.
def _sanitize_parameters(self, **kwargs):
preprocess_kwargs = {}
postprocess_kwargs = {}
forward_params = {}
# preprocessing parameters
if "points_per_batch" in kwargs:
preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"]
if "points_per_crop" in kwargs:
preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"]
if "crops_n_layers" in kwargs:
preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"]
if "crop_overlap_ratio" in kwargs:
preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
if "crop_n_points_downscale_factor" in kwargs:
preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
if "timeout" in kwargs:
preprocess_kwargs["timeout"] = kwargs["timeout"]
# forward and postprocessing parameters
if "pred_iou_thresh" in kwargs:
forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
if "stability_score_offset" in kwargs:
forward_params["stability_score_offset"] = kwargs["stability_score_offset"]
if "mask_threshold" in kwargs:
forward_params["mask_threshold"] = kwargs["mask_threshold"]
if "stability_score_thresh" in kwargs:
forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"]
if "crops_nms_thresh" in kwargs:
postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"]
if "output_rle_mask" in kwargs:
postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"]
if "output_bboxes_mask" in kwargs:
postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]
return preprocess_kwargs, forward_params, postprocess_kwargs
# Sanitize the incoming kwargs, splitting them into the preprocess, forward and postprocess parameter dicts.
def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs):
"""
Generates binary segmentation masks for the image(s) passed as inputs.
Args:
inputs (`np.ndarray` or `bytes` or `str` or `dict`):
Image or list of images.
mask_threshold (`float`, *optional*, defaults to 0.0):
Threshold to use when turning the predicted masks into binary values.
pred_iou_thresh (`float`, *optional*, defaults to 0.88):
A filtering threshold in `[0,1]` applied on the model's predicted mask quality.
stability_score_thresh (`float`, *optional*, defaults to 0.95):
A filtering threshold in `[0,1]` applied on the stability of the model's mask predictions.
stability_score_offset (`int`, *optional*, defaults to 1):
The amount to shift the cutoff when calculating the stability score.
crops_nms_thresh (`float`, *optional*, defaults to 0.7):
The box IoU cutoff used by non-maximum suppression to filter duplicate masks.
crops_n_layers (`int`, *optional*, defaults to 0):
If `crops_n_layers>0`, mask prediction will be run again on crops of the image. Sets the number of layers to run, where each layer has 2**i_layer image crops.
crop_overlap_ratio (`float`, *optional*, defaults to `512 / 1500`):
Sets the degree to which crops overlap. In the first crop layer, crops overlap by this fraction of the image length; later layers with more crops scale this overlap down.
crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`):
The number of points sampled per side in layer n is scaled down by crop_n_points_downscale_factor**n.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and the call may block forever.
Return:
`Dict`: A dictionary with the following keys:
- **mask** (`PIL.Image`) -- A binary mask of the detected object, as a PIL Image of shape `(width, height)` of the original image. Returns a mask filled with zeros if no object is found.
- **score** (*optional* `float`) -- Optionally included when the model is capable of estimating a confidence of the "object" described by the label and the mask.
"""
return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs)
def preprocess(
self,
image,
points_per_batch=64,  # number of points processed per batch, defaults to 64
crops_n_layers: int = 0,  # number of crop layers, defaults to 0
crop_overlap_ratio: float = 512 / 1500,  # crop overlap ratio, defaults to 512/1500
points_per_crop: Optional[int] = 32,  # number of points per crop, defaults to 32
crop_n_points_downscale_factor: Optional[int] = 1,  # downscale factor for the number of points, defaults to 1
timeout: Optional[float] = None,  # timeout in seconds, defaults to None
):
image = load_image(image, timeout=timeout)  # load the image, optionally with a timeout
target_size = self.image_processor.size["longest_edge"]  # use the image processor's longest edge as the target size
crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
)  # generate the crop boxes, grid points, cropped images and input labels
model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")  # preprocess the cropped images into PyTorch tensors
with self.device_placement():  # device placement context manager
if self.framework == "pt":  # PyTorch framework
inference_context = self.get_inference_context()  # get the inference context
with inference_context():  # run under the inference context manager
model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)  # make sure the input tensors are on the right device
image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values"))  # compute the image embeddings once
model_inputs["image_embeddings"] = image_embeddings  # add the image embeddings to the model inputs
n_points = grid_points.shape[1]  # number of grid points
points_per_batch = points_per_batch if points_per_batch is not None else n_points  # use the requested batch size, or all points at once
if points_per_batch <= 0:  # points_per_batch must be strictly positive
raise ValueError(
"Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. "
"To return all points at once, set points_per_batch to None"
)  # raise a ValueError: points_per_batch must be >= 1, or None to return all points at once
for i in range(0, n_points, points_per_batch):  # iterate over the grid points, points_per_batch at a time
batched_points = grid_points[:, i : i + points_per_batch, :, :]  # slice a batch of grid points
labels = input_labels[:, i : i + points_per_batch]  # matching input labels
is_last = i == n_points - points_per_batch  # whether this is the last batch
yield {
"input_points": batched_points,  # the batched input points
"input_labels": labels,  # the matching input labels
"input_boxes": crop_boxes,  # the crop boxes
"is_last": is_last,  # whether this is the last batch
**model_inputs,  # the remaining model inputs
}
def _forward(
self,
model_inputs,
pred_iou_thresh=0.88,  # predicted IoU threshold, defaults to 0.88
stability_score_thresh=0.95,  # stability score threshold, defaults to 0.95
mask_threshold=0,  # mask threshold, defaults to 0
stability_score_offset=1,  # stability score offset, defaults to 1
):
# pop "input_boxes" from the model inputs
input_boxes = model_inputs.pop("input_boxes")
# pop "is_last" from the model inputs
is_last = model_inputs.pop("is_last")
# pop "original_sizes" from the model inputs and convert it to a list
original_sizes = model_inputs.pop("original_sizes").tolist()
# pop "reshaped_input_sizes" from the model inputs and convert it to a list
reshaped_input_sizes = model_inputs.pop("reshaped_input_sizes").tolist()
# run the model on the remaining inputs
model_outputs = self.model(**model_inputs)
# post-process here to avoid copying all the masks between CPU and GPU
# get "pred_masks", i.e. the low-resolution masks, from the model outputs
low_resolution_masks = model_outputs["pred_masks"]
# post-process the masks with the image processor to resize them to the original image size
masks = self.image_processor.post_process_masks(
low_resolution_masks, original_sizes, reshaped_input_sizes, mask_threshold, binarize=False
)
# get "iou_scores", i.e. the IoU scores, from the model outputs
iou_scores = model_outputs["iou_scores"]
# filter the masks with the image processor to get the final masks, IoU scores and boxes
masks, iou_scores, boxes = self.image_processor.filter_masks(
masks[0],
iou_scores[0],
original_sizes[0],
input_boxes[0],
pred_iou_thresh,
stability_score_thresh,
mask_threshold,
stability_score_offset,
)
# return the processed results: masks, the is_last flag, boxes and IoU scores
return {
"masks": masks,
"is_last": is_last,
"boxes": boxes,
"iou_scores": iou_scores,
}
# postprocess: merges the outputs of all mini-batches and produces the final masks and scores
def postprocess(
self,
model_outputs,
output_rle_mask=False,
output_bboxes_mask=False,
crops_nms_thresh=0.7,
):
# collect the IoU scores, masks and boxes from every model output
all_scores = []
all_masks = []
all_boxes = []
for model_output in model_outputs:
# pop "iou_scores" from the model output and append it to all_scores
all_scores.append(model_output.pop("iou_scores"))
# extend all_masks with the masks of this model output
all_masks.extend(model_output.pop("masks"))
# pop "boxes" from the model output and append them to all_boxes
all_boxes.append(model_output.pop("boxes"))
# concatenate all the IoU scores and boxes with torch
all_scores = torch.cat(all_scores)
all_boxes = torch.cat(all_boxes)
# run the mask-generation post-processing with the image processor, returning the output masks, IoU scores, RLE masks and bounding boxes
output_masks, iou_scores, rle_mask, bounding_boxes = self.image_processor.post_process_for_mask_generation(
all_masks, all_scores, all_boxes, crops_nms_thresh
)
# collect any extra outputs in a defaultdict
extra = defaultdict(list)
for output in model_outputs:
for k, v in output.items():
extra[k].append(v)
# optional outputs: add the RLE masks or bounding boxes when requested
optional = {}
if output_rle_mask:
optional["rle_mask"] = rle_mask
if output_bboxes_mask:
optional["bounding_boxes"] = bounding_boxes
# return the final result: output masks, IoU scores, plus any optional and extra outputs
return {"masks": output_masks, "scores": iou_scores, **optional, **extra}
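To tie `preprocess`, `_forward` and `postprocess` together, here is a hedged end-to-end sketch. The threshold values simply repeat the defaults documented above, and the output keys follow the dict returned by `postprocess`.
```
>>> from transformers import pipeline

>>> generator = pipeline("mask-generation", model="facebook/sam-vit-base")
>>> outputs = generator(
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
...     points_per_batch=64,           # size of the point mini-batches yielded by preprocess
...     pred_iou_thresh=0.88,
...     stability_score_thresh=0.95,
...     crops_nms_thresh=0.7,
... )
>>> len(outputs["masks"])              # number of masks kept after filtering and NMS
>>> outputs["scores"]                  # matching IoU scores
```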
.\pipelines\object_detection.py
from typing import Any, Dict, List, Union
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from ..image_utils import load_image
if is_torch_available():
import torch
from ..models.auto.modeling_auto import (
MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
)
logger = logging.get_logger(__name__)
Prediction = Dict[str, Any]
Predictions = List[Prediction]
@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ObjectDetectionPipeline(Pipeline):
"""
Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of objects
and their classes.
Example:
```
>>> from transformers import pipeline
>>> detector = pipeline(model="facebook/detr-resnet-50")
>>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
[{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}]
>>> # x, y are expressed relative to the top left hand corner.
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"object-detection"`.
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.framework == "tf":
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
requires_backends(self, "vision")
mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy()
mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES)
self.check_model_type(mapping)
def _sanitize_parameters(self, **kwargs):
preprocess_params = {}
if "timeout" in kwargs:
preprocess_params["timeout"] = kwargs["timeout"]
postprocess_kwargs = {}
if "threshold" in kwargs:
postprocess_kwargs["threshold"] = kwargs["threshold"]
return preprocess_params, {}, postprocess_kwargs
def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
"""
Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
Args:
images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing an HTTP(S) link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
same format: all as HTTP(S) links, all as local paths, or all as PIL images.
threshold (`float`, *optional*, defaults to 0.9):
The probability necessary to make a prediction.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
the call may block forever.
Return:
A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single
image, will return a list of dictionaries, if the input is a list of several images, will return a list of
list of dictionaries corresponding to each image.
The dictionaries contain the following keys:
- **label** (`str`) -- The class label identified by the model.
- **score** (`float`) -- The score attributed by the model for that label.
- **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
"""
return super().__call__(*args, **kwargs)
def preprocess(self, image, timeout=None):
image = load_image(image, timeout=timeout)
target_size = torch.IntTensor([[image.height, image.width]])
inputs = self.image_processor(images=[image], return_tensors="pt")
if self.tokenizer is not None:
inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
inputs["target_size"] = target_size
return inputs
def _forward(self, model_inputs):
target_size = model_inputs.pop("target_size")
outputs = self.model(**model_inputs)
model_outputs = outputs.__class__({"target_size": target_size, **outputs})
if self.tokenizer is not None:
model_outputs["bbox"] = model_inputs["bbox"]
return model_outputs
def postprocess(self, model_outputs, threshold=0.9):
target_size = model_outputs["target_size"]
if self.tokenizer is not None:
height, width = target_size[0].tolist()
def unnormalize(bbox):
return self._get_bounding_box(
torch.Tensor(
[
(width * bbox[0] / 1000),
(height * bbox[1] / 1000),
(width * bbox[2] / 1000),
(height * bbox[3] / 1000),
]
)
)
scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1)
labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()]
boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)]
keys = ["score", "label", "box"]
annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
else:
raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
raw_annotation = raw_annotations[0]
scores = raw_annotation["scores"]
labels = raw_annotation["labels"]
boxes = raw_annotation["boxes"]
raw_annotation["scores"] = scores.tolist()
raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]
keys = ["score", "label", "box"]
annotation = [
dict(zip(keys, vals))
for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"])
]
return annotation
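A quick worked example of the `unnormalize` helper in the branch above (plain arithmetic, not part of the source): LayoutLM-style boxes come back on a 0-1000 scale, so on a hypothetical 800x600 image a normalized box [250, 100, 500, 300] maps to pixel coordinates as follows.
```
>>> width, height = 800, 600
>>> bbox = [250, 100, 500, 300]   # normalized to the 0-1000 range
>>> [width * bbox[0] / 1000, height * bbox[1] / 1000, width * bbox[2] / 1000, height * bbox[3] / 1000]
[200.0, 60.0, 400.0, 180.0]
```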
def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
"""
Turns a list [xmin, ymin, xmax, ymax] into a dict { "xmin": xmin, ... }
Args:
box (`torch.Tensor`): Tensor containing the coordinates in corners format.
Returns:
bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
"""
if self.framework != "pt":
raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
xmin, ymin, xmax, ymax = box.int().tolist()
bbox = {
"xmin": xmin,
"ymin": ymin,
"xmax": xmax,
"ymax": ymax,
}
return bbox
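Finally, a short usage sketch for the detection pipeline above. The checkpoint is the DETR model from the class docstring; the threshold value simply repeats the default.
```
>>> from transformers import pipeline

>>> detector = pipeline("object-detection", model="facebook/detr-resnet-50")
>>> results = detector(
...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
...     threshold=0.9,
... )
>>> results[0]["box"]   # a dict with "xmin", "ymin", "xmax", "ymax" produced by _get_bounding_box
```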