Transformers Source Code Analysis (121)
.\models\wav2vec2\modeling_wav2vec2.py
""" PyTorch Wav2Vec2 model."""
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
MaskedLMOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
Wav2Vec2BaseModelOutput,
XVectorOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
cached_file,
is_peft_available,
is_safetensors_available,
logging,
replace_return_docstrings,
)
from .configuration_wav2vec2 import Wav2Vec2Config
WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin"
WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors"
if is_safetensors_available():
from safetensors.torch import load_file as safe_load_file
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 2
_CONFIG_FOR_DOC = "Wav2Vec2Config"
_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 53.48
_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 6.54
_FRAME_CLASS_CHECKPOINT = "anton-l/wav2vec2-base-superb-sd"
_FRAME_EXPECTED_OUTPUT = [0, 0]
_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv"
_XVECTOR_EXPECTED_OUTPUT = 0.98
WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/wav2vec2-base-960h",
"facebook/wav2vec2-large-960h",
"facebook/wav2vec2-large-960h-lv60",
"facebook/wav2vec2-large-960h-lv60-self",
]
@dataclass
class Wav2Vec2ForPreTrainingOutput(ModelOutput):
"""
输出类的数据类,用于Wav2Vec2的预训练模型。
"""
Args:
loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
总损失,包括对比损失 (L_m) 和多样性损失 (L_d),详见[官方论文](https://arxiv.org/pdf/2006.11477.pdf)。
projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
模型隐藏状态,投影到 *config.proj_codevector_dim* 维度,可用于预测掩码投影量化状态。
projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
量化的提取特征向量,投影到 *config.proj_codevector_dim* 维度,代表对比损失的正目标向量。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
自注意力机制 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
对比损失 (L_m),详见[官方论文](https://arxiv.org/pdf/2006.11477.pdf)。
diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
多样性损失 (L_d),详见[官方论文](https://arxiv.org/pdf/2006.11477.pdf)。
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
batch_size, sequence_length = shape
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
num_masked_span = max(num_masked_span, min_masks)
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths:
num_masked_span = compute_num_masked_span(input_length)
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
if len(spec_aug_mask_idx) == 0:
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
return spec_aug_mask
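A minimal usage sketch (illustrative, not part of the library file): calling `_compute_mask_indices` on a dummy shape and checking roughly what fraction of the time axis ends up masked; the shape and probabilities below are arbitrary.

    import numpy as np

    mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.5, mask_length=10, min_masks=0)
    print(mask.shape)         # (2, 100), boolean SpecAugment time mask
    print(mask.sum(axis=-1))  # masked positions per row, roughly mask_prob * 100 or fewer due to overlaps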
def _sample_negative_indices(
features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
):
"""
Sample `num_negatives` vectors from feature vectors.
"""
batch_size, sequence_length = features_shape
sequence_length_range = np.arange(sequence_length)
sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
mask_time_indices = (
mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
)
for batch_idx in range(batch_size):
high = mask_time_indices[batch_idx].sum() - 1
mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
sampled_indices[sampled_indices >= feature_indices] += 1
sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
sampled_negative_indices[batch_idx] += batch_idx * sequence_length
return sampled_negative_indices
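A small sketch (illustrative) of `_sample_negative_indices`: every time step gets `num_negatives` indices pointing at *other* time steps of the same utterance, offset by `batch_idx * sequence_length` so they can index a flattened `(batch * seq, dim)` tensor.

    import numpy as np

    neg_idx = _sample_negative_indices(features_shape=(2, 6), num_negatives=3)
    print(neg_idx.shape)                        # (2, 6, 3)
    own_idx = np.arange(2 * 6).reshape(2, 6, 1)
    assert not np.any(neg_idx == own_idx)       # negatives never point at their own time step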
class Wav2Vec2NoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class Wav2Vec2LayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.activation(hidden_states)
return hidden_states
class Wav2Vec2GroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class Wav2Vec2PositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
self.conv = weight_norm(self.conv, name="weight", dim=2)
self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class Wav2Vec2SamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
class Wav2Vec2FeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
super().__init__()
if config.feat_extract_norm == "group":
conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [
Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
conv_layers = [
Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
]
else:
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
hidden_states = input_values[:, None]
if self._requires_grad and self.training:
hidden_states.requires_grad = True
for conv_layer in self.conv_layers:
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
hidden_states = conv_layer(hidden_states)
return hidden_states
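A quick sketch (illustrative, runs without pretrained weights) of the feature encoder's downsampling: with the default 7-layer conv stack, raw 16 kHz audio is reduced to roughly one 512-dimensional frame every 20 ms.

    import torch
    from transformers import Wav2Vec2Config
    from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureEncoder

    encoder = Wav2Vec2FeatureEncoder(Wav2Vec2Config()).eval()  # randomly initialized default stack
    waveform = torch.randn(1, 16000)                           # 1 second of fake 16 kHz audio
    with torch.no_grad():
        features = encoder(waveform)
    print(features.shape)                                      # torch.Size([1, 512, 49])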
class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder):
def __init__(self, config):
super().__init__(config)
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class Wav2Vec2FeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
norm_hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(norm_hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states, norm_hidden_states
class Wav2Vec2Attention(nn.Module):
"""基于 'Attention Is All You Need' 论文的多头注意力机制"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Wav2Vec2Config] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        ...  # attention forward body not shown in this excerpt


class Wav2Vec2FeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
def forward(self, hidden_states):
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
class Wav2Vec2EncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = Wav2Vec2Attention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = Wav2Vec2FeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
attn_residual = hidden_states
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = self.dropout(hidden_states)
hidden_states = attn_residual + hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class Wav2Vec2EncoderLayerStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = Wav2Vec2Attention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = Wav2Vec2FeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
if getattr(config, "adapter_attn_dim", None) is not None:
self.adapter_layer = Wav2Vec2AttnAdapterLayer(config)
else:
self.adapter_layer = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
    ):
attn_residual = hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = self.dropout(hidden_states)
hidden_states = attn_residual + hidden_states
hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
if self.adapter_layer is not None:
hidden_states = hidden_states + self.adapter_layer(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class Wav2Vec2Encoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
    ):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.dropout(hidden_states)
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
for layer in self.layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
dropout_probability = torch.rand([])
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
layer_outputs = (None, None)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
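The encoder turns the 2D padding mask into an additive 4D mask: padded positions receive the most negative representable value so they vanish after the attention softmax. A small sketch (illustrative) of that transformation in isolation:

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])  # one sequence, last position is padding
    dtype = torch.float32
    inverted = 1.0 - attention_mask[:, None, None, :].to(dtype)
    additive = inverted * torch.finfo(dtype).min
    additive = additive.expand(additive.shape[0], 1, additive.shape[-1], additive.shape[-1])
    print(additive.shape)  # torch.Size([1, 1, 4, 4]); last column ~ -3.4e38, everything else 0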
class Wav2Vec2EncoderStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layers = nn.ModuleList(
[Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
    ):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.dropout(hidden_states)
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
for layer in self.layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
dropout_probability = torch.rand([])
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
layer_outputs = (None, None)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class Wav2Vec2GumbelVectorQuantizer(nn.Module):
"""
Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
"""
def __init__(self, config):
super().__init__()
self.num_groups = config.num_codevector_groups
self.num_vars = config.num_codevectors_per_group
if config.codevector_dim % self.num_groups != 0:
raise ValueError(
f"`config.codevector_dim {config.codevector_dim} must be divisible "
f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
)
self.codevectors = nn.Parameter(
torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
)
self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
self.temperature = 2
@staticmethod
def _compute_perplexity(probs, mask=None):
if mask is not None:
mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
marginal_probs = probs.sum(dim=0) / mask.sum()
else:
marginal_probs = probs.mean(dim=0)
perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
return perplexity
def forward(self, hidden_states, mask_time_indices=None):
batch_size, sequence_length, hidden_size = hidden_states.shape
hidden_states = self.weight_proj(hidden_states)
hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
if self.training:
codevector_probs = nn.functional.gumbel_softmax(
hidden_states.float(), tau=self.temperature, hard=True
).type_as(hidden_states)
codevector_soft_dist = torch.softmax(
hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
)
perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
else:
codevector_idx = hidden_states.argmax(dim=-1)
codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
-1, codevector_idx.view(-1, 1), 1.0
)
codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
return codevectors, perplexity
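A shape-level sketch (illustrative; the codevector weights are uninitialized here, so only the shapes are meaningful) of running the Gumbel quantizer standalone with the default config: features of size `config.conv_dim[-1]` are mapped to `num_codevector_groups * num_codevectors_per_group` logits, one codevector is picked per group, and the picks are concatenated into a `config.codevector_dim`-sized vector per time step.

    import torch
    from transformers import Wav2Vec2Config
    from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2GumbelVectorQuantizer

    config = Wav2Vec2Config()  # defaults: 2 groups x 320 codevectors, codevector_dim=256, conv_dim[-1]=512
    quantizer = Wav2Vec2GumbelVectorQuantizer(config).eval()
    extract_features = torch.randn(1, 49, config.conv_dim[-1])
    codevectors, perplexity = quantizer(extract_features)
    print(codevectors.shape)   # torch.Size([1, 49, 256])
    print(float(perplexity))   # summed per-group perplexity, between 2 and 640 with the defaults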
class Wav2Vec2Adapter(nn.Module):
def __init__(self, config):
super().__init__()
if config.output_hidden_size != config.hidden_size:
self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
else:
self.proj = self.proj_layer_norm = None
self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers))
self.layerdrop = config.layerdrop
def forward(self, hidden_states):
if self.proj is not None and self.proj_layer_norm is not None:
hidden_states = self.proj(hidden_states)
hidden_states = self.proj_layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
for layer in self.layers:
layerdrop_prob = np.random.random()
if not self.training or (layerdrop_prob > self.layerdrop):
hidden_states = layer(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class Wav2Vec2AdapterLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.output_hidden_size,
2 * config.output_hidden_size,
config.adapter_kernel_size,
stride=config.adapter_stride,
padding=1,
)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=1)
return hidden_states
class Wav2Vec2AttnAdapterLayer(nn.Module):
def __init__(self, config):
"""
Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
up training throughput.
"""
super().__init__()
self.input_dim = config.adapter_attn_dim
self.hidden_dim = config.hidden_size
self.norm = nn.LayerNorm(self.hidden_dim)
self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
self.act_fn = nn.ReLU()
self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)
def forward(self, hidden_states: torch.FloatTensor):
hidden_states = self.norm(hidden_states)
hidden_states = self.linear_1(hidden_states)
hidden_states = self.act_fn(hidden_states)
hidden_states = self.linear_2(hidden_states)
return hidden_states
class Wav2Vec2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Wav2Vec2Config
base_model_prefix = "wav2vec2"
main_input_name = "input_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, Wav2Vec2ForPreTraining):
module.project_hid.reset_parameters()
module.project_q.reset_parameters()
module.project_hid._is_hf_initialized = True
module.project_q._is_hf_initialized = True
elif isinstance(module, Wav2Vec2GumbelVectorQuantizer):
module.weight_proj.weight.data.normal_(mean=0.0, std=1)
module.weight_proj.bias.data.zero_()
nn.init.uniform_(module.codevectors)
elif isinstance(module, Wav2Vec2PositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, Wav2Vec2FeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
def _get_feat_extract_output_lengths(
self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers
"""
add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
def _conv_out_length(input_length, kernel_size, stride):
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
if add_adapter:
for _ in range(self.config.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
return input_lengths
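With the default stack (`conv_kernel = (10, 3, 3, 3, 3, 2, 2)`, `conv_stride = (5, 2, 2, 2, 2, 2, 2)`), every layer shrinks the length as `floor((L - kernel) / stride) + 1`. A quick plain-Python sanity check (illustrative) of what `_get_feat_extract_output_lengths` computes:

    def conv_out_length(length, kernels=(10, 3, 3, 3, 3, 2, 2), strides=(5, 2, 2, 2, 2, 2, 2)):
        for k, s in zip(kernels, strides):
            length = (length - k) // s + 1
        return length

    print(conv_out_length(16000))  # 49 -> one second of 16 kHz audio becomes 49 frames (~20 ms hop)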
def _get_feature_vector_attention_mask(
self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
):
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
output_lengths = output_lengths.to(torch.long)
batch_size = attention_mask.shape[0]
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
def _get_adapters(self):
if self.config.adapter_attn_dim is None:
raise ValueError(f"{self.__class__} has no adapter layers. Make sure to define `config.adapter_attn_dim`.")
adapter_weights = {}
for name, module in self.named_modules():
if isinstance(module, Wav2Vec2AttnAdapterLayer):
for param_name, param in module.named_parameters():
adapter_weights[".".join([name, param_name])] = param
if isinstance(self, Wav2Vec2ForCTC):
for name, param in self.lm_head.named_parameters():
adapter_weights[".".join(["lm_head", name])] = param
return adapter_weights
def init_adapter_layers(self):
"""
        (Re-)initialize the attention adapter layers and the LM head for adapter-only fine-tuning.
"""
for module in self.modules():
if isinstance(module, Wav2Vec2AttnAdapterLayer):
self._init_weights(module)
if isinstance(self, Wav2Vec2ForCTC):
self._init_weights(self.lm_head)
WAV_2_VEC_2_START_DOCSTRING = r"""
Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
Auli.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
    # Input: the raw speech waveform as a `torch.FloatTensor` of shape `(batch_size, sequence_length)`
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
    # Optional attention mask, a `torch.LongTensor` of shape `(batch_size, sequence_length)`
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
<Tip warning={true}>
`attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
True`. For all models whose processor has `config.return_attention_mask == False`, such as
[wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be
passed to avoid degraded performance when doing batched inference. For such models `input_values` should
simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly
different results depending on whether `input_values` is padded or not.
</Tip>
    # Optional flag controlling whether the attention tensors of all attention layers are returned
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
    # Optional flag controlling whether the hidden states of all layers are returned
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
    # Optional flag controlling whether a [`~utils.ModelOutput`] is returned instead of a plain tuple
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Wav2Vec2Model inherits from Wav2Vec2PreTrainedModel and is the bare Wav2Vec2 model that outputs
# raw hidden states without any task-specific head on top. `config` is a `Wav2Vec2Config` holding
# all model parameters; `__init__` builds the individual sub-modules.
@add_start_docstrings(
    "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
    WAV_2_VEC_2_START_DOCSTRING,
)
class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
def __init__(self, config: Wav2Vec2Config):
super().__init__(config)
self.config = config
        self.feature_extractor = Wav2Vec2FeatureEncoder(config)  # audio feature encoder
        self.feature_projection = Wav2Vec2FeatureProjection(config)  # feature projection
        # initialize the masked spec embedding only if the config requires masking
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
        # choose the stable-layer-norm encoder or the regular encoder depending on the config
        if config.do_stable_layer_norm:
            self.encoder = Wav2Vec2EncoderStableLayerNorm(config)
        else:
            self.encoder = Wav2Vec2Encoder(config)
        # optionally add an adapter on top of the encoder
        self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None
        # initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
"""
self.feature_extractor._freeze_parameters()
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# compute mask indices for time axis based on configuration parameters
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
# convert computed indices to a Torch tensor for device compatibility and dtype
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
# apply SpecAugment along time axis using computed mask indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
# convert computed indices to a Torch tensor for device compatibility and dtype
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
# expand feature axis mask indices to match hidden_states dimensions
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
# apply SpecAugment along feature axis using expanded mask indices
hidden_states[mask_feature_indices] = 0
return hidden_states
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Wav2Vec2BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # fall back to the config defaults when the output flags are not specified
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # extract feature vectors from the raw waveform
        extract_features = self.feature_extractor(input_values)
        # swap the time and feature dimensions
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
            # compute the reduced attention mask that corresponds to the extracted feature vectors
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
        # project the extracted features into the hidden-state space
        hidden_states, extract_features = self.feature_projection(extract_features)
        # mask the hidden states according to the time indices and the attention mask
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
        # run the transformer encoder
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # take the last hidden state from the encoder outputs
        hidden_states = encoder_outputs[0]
        # apply the adapter on top of the encoder output if one is configured
        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states)
        # without return_dict, return a tuple of hidden states, extracted features and the remaining outputs
        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]
        # otherwise wrap everything in a Wav2Vec2BaseModelOutput
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
extract_features=extract_features,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
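A usage sketch (illustrative; downloads pretrained weights from the Hub): one forward pass of raw audio through the bare model. A 1-second dummy input yields 49 frames; the documented `[1, 292, 768]` shape corresponds to a longer LibriSpeech sample.

    import torch
    from transformers import AutoFeatureExtractor, Wav2Vec2Model

    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

    waveform = torch.randn(16000).numpy()  # stand-in for 1 s of 16 kHz speech
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    print(outputs.last_hidden_state.shape)  # torch.Size([1, 49, 768])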
# Wav2Vec2 model with a quantizer and a `VQ` head on top, used for pretraining.
@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV_2_VEC_2_START_DOCSTRING)
class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
def __init__(self, config: Wav2Vec2Config):
        # initialize the parent class
        super().__init__(config)
        # the base Wav2Vec2 model
        self.wav2vec2 = Wav2Vec2Model(config)
        # dropout applied to the extracted features before quantization
        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
        # Gumbel-softmax vector quantizer
        self.quantizer = Wav2Vec2GumbelVectorQuantizer(config)
        # project the transformer hidden states to the codevector projection dimension
        self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
        # project the quantized codevectors to the same dimension
        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
        # initialize weights and apply final processing
self.post_init()
def set_gumbel_temperature(self, temperature: int):
"""
        Set the Gumbel softmax temperature to a given value. Only necessary for training.
"""
self.quantizer.temperature = temperature
def freeze_feature_extractor(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
"""
self.wav2vec2.feature_extractor._freeze_parameters()
@staticmethod
def compute_contrastive_logits(
target_features: torch.FloatTensor,
negative_features: torch.FloatTensor,
predicted_features: torch.FloatTensor,
temperature: int = 0.1,
):
"""
        Compute the logits for contrastive loss, using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. A temperature can be applied.
        """
        # concatenate the positive target features with the negative features
        target_features = torch.cat([target_features, negative_features], dim=0)
        # cosine similarity between the predicted features and the (positive + negative) targets
        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
            target_features
        )
        # apply the temperature
logits = logits / temperature
return logits
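A shape-level sketch (illustrative) of `compute_contrastive_logits`: the positive target is stacked with K negatives along a new leading dimension, and cosine similarity against the predicted features yields one logit per candidate and time step.

    import torch

    batch, seq, dim, num_negatives = 2, 5, 256, 10
    quantized = torch.randn(1, batch, seq, dim)              # positive targets
    negatives = torch.randn(num_negatives, batch, seq, dim)  # sampled negatives
    predicted = torch.randn(batch, seq, dim)

    logits = Wav2Vec2ForPreTraining.compute_contrastive_logits(quantized, negatives, predicted, temperature=0.1)
    print(logits.shape)  # torch.Size([11, 2, 5]); candidate 0 is the true quantized target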
    # the forward documentation is built from WAV_2_VEC_2_INPUTS_DOCSTRING
    @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
    # the return docstring is replaced with Wav2Vec2ForPreTrainingOutput, using _CONFIG_FOR_DOC as the config class
    @replace_return_docstrings(output_type=Wav2Vec2ForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.Tensor],  # raw waveform input
        attention_mask: Optional[torch.Tensor] = None,  # optional attention mask
        mask_time_indices: Optional[torch.BoolTensor] = None,  # boolean mask of the masked time steps
        sampled_negative_indices: Optional[torch.BoolTensor] = None,  # indices of the sampled negative vectors
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states
        return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
    ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]:
        ...  # pretraining forward body not shown in this excerpt
# Wav2Vec2 model with a `language modeling` head on top.
@add_start_docstrings("""Wav2Vec2 Model with a `language modeling` head on top.""", WAV_2_VEC_2_START_DOCSTRING)
class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
        # warn that `Wav2Vec2ForMaskedLM` is deprecated in favor of `Wav2Vec2ForCTC`
        warnings.warn(
            "The class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.", FutureWarning
        )
        # build the Wav2Vec2 model and the language modeling head
        self.wav2vec2 = Wav2Vec2Model(config)
        self.dropout = nn.Dropout(config.final_dropout)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # initialize weights and apply final processing
        self.post_init()
    # the forward docstring is built from WAV_2_VEC_2_INPUTS_DOCSTRING
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, MaskedLMOutput]:
        # fall back to the config default when return_dict is not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # run the Wav2Vec2 model and collect its outputs
outputs = self.wav2vec2(
input_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # take the hidden states and apply dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
        # feed the hidden states to the language modeling head to obtain the logits
logits = self.lm_head(hidden_states)
        # without return_dict, return a plain tuple of logits and the remaining outputs
if not return_dict:
output = (logits,) + outputs[2:]
return output
        # otherwise return a MaskedLMOutput with logits, hidden states and attentions
return MaskedLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
# Wav2Vec2 model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
@add_start_docstrings(
"""Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
WAV_2_VEC_2_START_DOCSTRING,
"""
target_lang (`str`, *optional*):
Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by
default.
""",
)
class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
        # initialize the parent class
        super().__init__(config)
        # the base Wav2Vec2 model
        self.wav2vec2 = Wav2Vec2Model(config)
        # dropout applied to the final hidden states
        self.dropout = nn.Dropout(config.final_dropout)
        # language id of the adapter weights to load
        self.target_lang = target_lang
        # the vocabulary size must be defined to build the language modeling head
        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        # use the adapter output size when an adapter is added, otherwise the regular hidden size
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        # linear layer mapping hidden states to vocabulary logits
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
        # initialize weights and apply final processing
self.post_init()
def tie_weights(self):
"""
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.
This method is **not** supposed to be called by the user and is prone to be changed in the future.
"""
        # Note: `tie_weights` is usually used to tie input and output embedding weights. It is redefined here so
        # that adapter weights can be correctly loaded when `target_lang=...` is passed to `from_pretrained(...)`.
        # This is slightly hacky, but since Wav2Vec2 never needs to tie input and output embeddings, it is safe to
        # repurpose the function here.
        target_lang = self.target_lang
        # passing `target_lang` requires `config.adapter_attn_dim` to be defined
        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
        # if `target_lang` is not given but `adapter_attn_dim` is defined, log that 'eng' is used by default
        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
            logger.info("By default `target_lang` is set to 'eng'.")
        # otherwise load the adapter weights for the requested language
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
        # warn that this method is deprecated and will be removed in Transformers v5; use `freeze_feature_encoder` instead
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
        # delegate to `freeze_feature_encoder`
        self.freeze_feature_encoder()
    # freeze the feature encoder parameters so they are not updated during training
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.wav2vec2.feature_extractor._freeze_parameters()
    # freeze the base model parameters so that only the classification head is updated during training
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.wav2vec2.parameters():
param.requires_grad = False
    # forward pass: takes the inputs and returns the model outputs
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Determine if a return_dict is specified; otherwise, use the model's default setting
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass input_values and optional arguments to wav2vec2 model for feature extraction
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from the model outputs and apply dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Generate logits using the language model head
logits = self.lm_head(hidden_states)
# Initialize loss variable
loss = None
if labels is not None:
# Check if any label index exceeds the vocabulary size
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Retrieve input_lengths based on attention_mask
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Mask out labels set to -100 and calculate target_lengths and flattened_targets
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Apply log_softmax to logits and transpose for CTC loss computation
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Disable cuDNN optimization for CTC loss computation
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, format output accordingly
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# Return output as a CausalLMOutput object when return_dict is True
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
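An end-to-end usage sketch (illustrative; downloads the checkpoint and a LibriSpeech demo sample) showing how the CTC head is typically decoded with a greedy argmax:

    import torch
    from datasets import load_dataset
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").eval()

    ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    inputs = processor(ds[0]["audio"]["array"], sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(predicted_ids)[0])  # "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES ..."

If `labels` are also passed (padded with `-100`), the same forward pass returns the CTC loss computed above.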
@add_start_docstrings(
"""
Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
""",
WAV_2_VEC_2_START_DOCSTRING,
)
class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
)
        # the base Wav2Vec2 model
        self.wav2vec2 = Wav2Vec2Model(config)
        # number of hidden-state sets that can be summed: transformer layers + input embeddings
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        # if the config requests a weighted layer sum, initialize uniform layer weights
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        # linear projection from the hidden size to the classifier projection size
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        # classification head mapping the projected features to the number of labels
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
        # initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
        # delegate to `freeze_feature_encoder`
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
        # freeze the feature encoder so its parameters receive no gradients
self.wav2vec2.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
        # freeze all base model parameters; only the classification head remains trainable
for param in self.wav2vec2.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
"""
Runs the forward pass of the Wav2Vec2ForSequenceClassification model.
"""
# 省略了 forward 方法中的具体实现,但是加了装饰器和示例代码文档字符串
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
        # fall back to the config default when return_dict is not specified
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # the weighted layer sum requires all hidden states
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
        # run the Wav2Vec2 model
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # if the config requests a weighted layer sum, combine all hidden states with learned weights
        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # otherwise use the last hidden state returned by the model
            hidden_states = outputs[0]
        # project the hidden states into the classifier space
        hidden_states = self.projector(hidden_states)
        # mean-pool over time, masking out padded positions when an attention mask is provided
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
        # predict the class logits with the classifier head
        logits = self.classifier(pooled_output)
        # compute the cross-entropy loss when labels are provided
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        # without return_dict, return a plain tuple
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output
        # otherwise return a SequenceClassifierOutput with loss, logits, hidden states and attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
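For completeness, a short inference sketch for the forward pass above, assuming the keyword-spotting checkpoint from the docstring example and a hypothetical 16 kHz mono array `waveform`:

```python
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks")

# `waveform` is assumed to be a 1-D float array sampled at 16 kHz
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_id = int(logits.argmax(dim=-1))
print(model.config.id2label[predicted_id])
```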
"""
Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization.
"""
# 继承自Wav2Vec2PreTrainedModel,用于音频帧分类任务,例如说话人辨识
class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 如果配置允许使用适配器,并且添加了适配器,抛出异常
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
)
# 初始化Wav2Vec2模型
self.wav2vec2 = Wav2Vec2Model(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果配置使用加权层求和,则初始化层权重
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 初始化分类器线性层
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.num_labels = config.num_labels
# 初始化权重
self.init_weights()
# 弃用警告:冻结特征提取器方法,建议使用freeze_feature_encoder代替
def freeze_feature_extractor(self):
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
# 冻结特征编码器,禁用特征编码器参数的梯度计算,保持在训练过程中不更新
def freeze_feature_encoder(self):
self.wav2vec2.feature_extractor._freeze_parameters()
# 冻结基础模型,禁用基础模型参数的梯度计算,只更新分类头部
def freeze_base_model(self):
for param in self.wav2vec2.parameters():
param.requires_grad = False
# 前向传播方法,执行模型的前向计算
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_FRAME_CLASS_CHECKPOINT,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_FRAME_EXPECTED_OUTPUT,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 确定是否需要返回字典形式的输出,若未指定则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用 wav2vec2 模型进行前向传播
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 若使用加权层求和策略,则对隐藏状态进行加权求和
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION] # 获取隐藏状态列表
hidden_states = torch.stack(hidden_states, dim=1) # 在新维度上堆叠隐藏状态张量
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) # 对层权重进行 softmax 归一化
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) # 加权求和隐藏状态
else:
hidden_states = outputs[0] # 直接使用第一个输出作为隐藏状态
# 将隐藏状态输入分类器,生成 logits
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
# 如果提供了标签,计算交叉熵损失
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
# 若不需要返回字典形式的输出,则返回 logits 和隐藏状态列表
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return output
# 返回 TokenClassifierOutput 对象,包括损失、logits、隐藏状态和注意力
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
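A sketch of frame-level inference with the model above, assuming the speaker-diarization checkpoint from the docstring example and a hypothetical `waveform` array; following the upstream doc example, a frame is marked active for a speaker when its sigmoid probability exceeds 0.5:

```python
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForAudioFrameClassification

feature_extractor = AutoFeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sd")
model = Wav2Vec2ForAudioFrameClassification.from_pretrained("anton-l/wav2vec2-base-superb-sd")

inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (batch, num_frames, num_labels)

probabilities = torch.sigmoid(logits[0])
speaker_activity = (probabilities > 0.5).long()  # per-frame 0/1 activity for each speaker label
print(speaker_activity.shape)
```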
# 定义 AMSoftmaxLoss 类,继承自 nn.Module
class AMSoftmaxLoss(nn.Module):
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
super(AMSoftmaxLoss, self).__init__()
# 初始化参数 scale 和 margin
self.scale = scale
self.margin = margin
self.num_labels = num_labels
# 使用 nn.Parameter 定义可学习参数 weight
self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
# 使用交叉熵损失函数
self.loss = nn.CrossEntropyLoss()
# 前向传播函数,接收 hidden_states 和 labels 作为输入
def forward(self, hidden_states, labels):
# 将 labels 展平以适应 CrossEntropyLoss 函数的要求
labels = labels.flatten()
# 对 weight 和 hidden_states 进行 L2 归一化
weight = nn.functional.normalize(self.weight, dim=0)
hidden_states = nn.functional.normalize(hidden_states, dim=1)
# 计算余弦相似度 cos_theta
cos_theta = torch.mm(hidden_states, weight)
# 计算 AMSoftmax 中的 psi 值
psi = cos_theta - self.margin
# 根据 labels 生成 one-hot 编码
onehot = nn.functional.one_hot(labels, self.num_labels)
# 根据是否为 one-hot 中的类别,调整 logits 的值
logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
# 计算最终的损失值
loss = self.loss(logits, labels)
return loss
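A tiny numeric sketch of what `AMSoftmaxLoss` computes: cosine similarities between L2-normalized embeddings and class weights, an additive margin subtracted only from the target-class similarity, and a scaled cross-entropy on the result. The dimensions below are made up for illustration:

```python
import torch
from torch import nn

torch.manual_seed(0)
scale, margin = 30.0, 0.4
embeddings = nn.functional.normalize(torch.randn(4, 8), dim=1)  # (batch, input_dim)
weight = nn.functional.normalize(torch.randn(8, 3), dim=0)      # (input_dim, num_labels)
labels = torch.tensor([0, 2, 1, 0])

cos_theta = embeddings @ weight                        # cosine similarity per class
psi = cos_theta - margin                               # margin-penalized similarity
onehot = nn.functional.one_hot(labels, num_classes=3).bool()
logits = scale * torch.where(onehot, psi, cos_theta)   # margin applied only to the true class
loss = nn.functional.cross_entropy(logits, labels)
print(loss.item())
```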
# 定义 TDNNLayer 类,继承自 nn.Module
class TDNNLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 初始化 TDNN 层的参数
self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
self.out_conv_dim = config.tdnn_dim[layer_id]
self.kernel_size = config.tdnn_kernel[layer_id]
self.dilation = config.tdnn_dilation[layer_id]
# 使用 nn.Linear 定义 kernel(权重矩阵)
self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
# 激活函数使用 ReLU
self.activation = nn.ReLU()
# 前向传播函数,接收 hidden_states 作为输入,返回 torch.Tensor
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 检查是否 peft 可用,如果可用,导入相关模块
if is_peft_available():
from peft.tuners.lora import LoraLayer
# 如果 kernel 是 LoraLayer 类型,则发出警告
if isinstance(self.kernel, LoraLayer):
warnings.warn(
"Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
"You should exclude TDNNLayer from LoRA's target modules.",
)
# 转置 hidden_states 的维度,以便与 conv1d 函数的要求匹配
hidden_states = hidden_states.transpose(1, 2)
# 将 self.kernel 的权重矩阵重新视图成 conv1d 函数所需的形状,并转置维度
weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
# 使用 conv1d 函数进行卷积操作
hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
# 再次转置 hidden_states 的维度,使其与输入形状相匹配
hidden_states = hidden_states.transpose(1, 2)
# 应用激活函数 ReLU
hidden_states = self.activation(hidden_states)
# 返回处理后的 hidden_states
return hidden_states
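The trick in `TDNNLayer.forward` is that a time-delay layer over `kernel_size` (possibly dilated) frames is just a linear map over the concatenated frames, so the `nn.Linear` weight can be reshaped into an `nn.functional.conv1d` kernel. A standalone sketch of that equivalence, with made-up sizes rather than the module above:

```python
import torch
from torch import nn

torch.manual_seed(0)
batch, time, in_dim, out_dim, kernel_size, dilation = 2, 20, 4, 6, 3, 2
linear = nn.Linear(in_dim * kernel_size, out_dim)
x = torch.randn(batch, time, in_dim)

# path 1: reshape the linear weight into a conv1d kernel, as TDNNLayer does
weight = linear.weight.view(out_dim, kernel_size, in_dim).transpose(1, 2)
conv_out = nn.functional.conv1d(x.transpose(1, 2), weight, linear.bias, dilation=dilation).transpose(1, 2)

# path 2: explicitly gather each dilated window of frames and apply the linear layer
t_out = time - dilation * (kernel_size - 1)
windows = torch.stack(
    [torch.cat([x[:, t + k * dilation] for k in range(kernel_size)], dim=-1) for t in range(t_out)],
    dim=1,
)
ref_out = linear(windows)

print(torch.allclose(conv_out, ref_out, atol=1e-5))  # expected: True
```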
# 使用 add_start_docstrings 装饰器为 Wav2Vec2ForXVector 类添加文档字符串
@add_start_docstrings(
"""
Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
WAV_2_VEC_2_START_DOCSTRING,
)
# 定义 Wav2Vec2ForXVector 类,继承自 Wav2Vec2PreTrainedModel
class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化 WAV2VEC 2 模型
self.wav2vec2 = Wav2Vec2Model(config)
# 计算总层数,包括 Transformer 层和输入嵌入层
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果配置要使用加权层求和,则初始化层权重为均匀分布的参数
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 初始化投影层,将隐藏状态映射到 TDNN 的第一个维度
self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
# 初始化 TDNN 层列表,根据配置文件中的维度定义
tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
self.tdnn = nn.ModuleList(tdnn_layers)
# 初始化特征提取器,将 TDNN 的最后一层输出映射到 x-vector 的输出维度
self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
# Classifier head: maps the pooled x-vector embedding to another vector of size xvector_output_dim; the mapping to num_labels is handled by the AMSoftmaxLoss objective below
self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
# 初始化损失函数,使用 AMSoftmaxLoss,配置为 x-vector 的输出维度和标签数量
self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
# 初始化模型权重
self.init_weights()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
# 发出警告信息,提示该方法即将被弃用,并建议使用等效的 freeze_feature_encoder 方法
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 freeze_feature_encoder 方法,冻结特征编码器的参数,停止其在训练期间的梯度计算
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
# 冻结特征编码器的参数,停止其在训练期间的梯度计算
self.wav2vec2.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历 WAV2VEC 2 模型的所有参数,并设置其 requires_grad 属性为 False,从而停止其在训练期间的梯度计算
for param in self.wav2vec2.parameters():
param.requires_grad = False
def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the TDNN layers
"""
def _conv_out_length(input_length, kernel_size, stride):
# 从 PyTorch 文档中获取的一维卷积层输出长度计算公式
return (input_length - kernel_size) // stride + 1
# 根据配置文件中的每个 TDNN 层的卷积核大小计算输出长度
for kernel_size in self.config.tdnn_kernel:
input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
return input_lengths
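A quick check of the length bookkeeping above with the default `Wav2Vec2Config` TDNN kernels `(5, 3, 3, 1, 1)` (stride is always 1 here); note that, as written, the helper only uses the kernel sizes and does not include a dilation term:

```python
def conv_out_length(input_length, kernel_size, stride=1):
    return (input_length - kernel_size) // stride + 1

length = 100  # hypothetical number of feature frames entering the TDNN stack
for kernel_size in (5, 3, 3, 1, 1):
    length = conv_out_length(length, kernel_size)
print(length)  # 100 -> 96 -> 94 -> 92 -> 92 -> 92
```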
@add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_XVECTOR_CHECKPOINT,
output_type=XVectorOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_XVECTOR_EXPECTED_OUTPUT,
)
# Forward pass of the x-vector model
def forward(
self,
input_values: Optional[torch.Tensor],  # raw input waveform
attention_mask: Optional[torch.Tensor] = None,  # optional attention mask
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
labels: Optional[torch.Tensor] = None,  # optional labels for the AMSoftmax objective
) -> Union[Tuple, XVectorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 根据需要确定是否返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用wav2vec2模型进行推理
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果使用加权层求和,则进行加权和操作
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
# 将隐藏状态投影到特征空间
hidden_states = self.projector(hidden_states)
# 遍历所有TDNN层进行特征提取
for tdnn_layer in self.tdnn:
hidden_states = tdnn_layer(hidden_states)
# 统计池化
if attention_mask is None:
# 如果没有注意力掩码,则计算平均特征和标准差特征
mean_features = hidden_states.mean(dim=1)
std_features = hidden_states.std(dim=1)
else:
# 根据注意力掩码计算特征提取器的输出长度
feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
mean_features = []
std_features = []
# 根据长度截取隐藏状态并计算平均特征和标准差特征
for i, length in enumerate(tdnn_output_lengths):
mean_features.append(hidden_states[i, :length].mean(dim=0))
std_features.append(hidden_states[i, :length].std(dim=0))
mean_features = torch.stack(mean_features)
std_features = torch.stack(std_features)
statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
# 将统计池化后的特征传入特征提取器和分类器
output_embeddings = self.feature_extractor(statistic_pooling)
logits = self.classifier(output_embeddings)
# 计算损失
loss = None
if labels is not None:
loss = self.objective(logits, labels)
# 根据return_dict决定返回的输出形式
if not return_dict:
# 如果不返回字典,则返回一个元组
output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,则返回XVectorOutput对象
return XVectorOutput(
loss=loss,
logits=logits,
embeddings=output_embeddings,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
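A short speaker-verification sketch for the model above, assuming the checkpoint from the docstring example and two hypothetical 16 kHz arrays `wav1` and `wav2`; the decision threshold is task-dependent and only indicative here:

```python
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForXVector

feature_extractor = AutoFeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sv")
model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv")

inputs = feature_extractor([wav1, wav2], sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
    embeddings = model(**inputs).embeddings

embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
print(float(similarity))  # close to 1.0 when both utterances come from the same speaker
```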
.\models\wav2vec2\processing_wav2vec2.py
"""
Speech processor class for Wav2Vec2
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
class Wav2Vec2Processor(ProcessorMixin):
r"""
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
processor.
[`Wav2Vec2Processor`] offers all the functionalities of [`Wav2Vec2FeatureExtractor`] and [`PreTrainedTokenizer`].
See the docstring of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more information.
Args:
feature_extractor (`Wav2Vec2FeatureExtractor`):
An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "Wav2Vec2FeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
try:
return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
except OSError:
warnings.warn(
f"Loading a tokenizer inside {cls.__name__} from a config that does not"
" include a `tokenizer_class` attribute is deprecated and will be "
"removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
" attribute to either your `config.json` or `tokenizer_config.json` "
"file to suppress this warning: ",
FutureWarning,
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
[`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
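A usage sketch of `__call__`: passing `audio` produces the feature-extractor output (`input_values`, plus `attention_mask` for models that use it), while passing `text` in the same call attaches the tokenized transcription as `labels`. This is the recommended replacement for the deprecated `as_target_processor` context manager further below. `speech` and `transcription` are hypothetical inputs:

```python
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

batch = processor(
    audio=speech,          # 1-D float array (or list of arrays) sampled at 16 kHz
    sampling_rate=16000,
    text=transcription,    # target transcription(s)
    return_tensors="pt",
    padding=True,
)
print(list(batch.keys()))  # input_values (and attention_mask, if returned) plus labels
```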
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
[`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
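The `pad` method is typically used inside a CTC data collator: audio features and tokenized labels are padded separately, and padded label positions are replaced with -100 so they are ignored by the loss. A minimal sketch under those assumptions (the collator class itself is hypothetical, not part of this file):

```python
from dataclasses import dataclass
from typing import Dict, List

import torch
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(input_features, padding=True, return_tensors="pt")
        labels_batch = self.processor.pad(labels=label_features, padding=True, return_tensors="pt")

        # mask padded label positions so the loss ignores them
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
```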
def batch_decode(self, *args, **kwargs):
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
.\models\wav2vec2\tokenization_wav2vec2.py
"""Tokenization class for Wav2Vec2."""
import json
import os
import sys
import warnings
from dataclasses import dataclass
from itertools import groupby
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...utils import (
ModelOutput,
PaddingStrategy,
TensorType,
add_end_docstrings,
is_flax_available,
is_tf_available,
is_torch_available,
logging,
to_py_obj,
)
logger = logging.get_logger(__name__)
if TYPE_CHECKING:
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_flax_available():
import jax.numpy as jnp
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json",
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": (
"https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}
WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
"""
ListOfDict = List[Dict[str, Union[int, str]]]
@dataclass
class Wav2Vec2CTCTokenizerOutput(ModelOutput):
"""
Output type of [`Wav2Vec2CTCTokenizer`], with transcription.
Args:
text (list of `str` or `str`):
Decoded logits in text form. Usually the speech transcription.
char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
Offsets of the decoded characters. In combination with sampling rate and model downsampling rate char
offsets can be used to compute time stamps for each character.
word_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
can be used to compute time stamps for each word.
"""
text: Union[List[str], str]
char_offsets: Union[List[ListOfDict], ListOfDict] = None
word_offsets: Union[List[ListOfDict], ListOfDict] = None
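As the docstring notes, the offsets only become timestamps once combined with the model's frame rate. A sketch of the usual conversion, assuming a model whose feature encoder downsamples 16 kHz audio by a factor of 320 (the default Wav2Vec2 conv stack), so each logit frame covers 20 ms:

```python
# `outputs` is assumed to be a Wav2Vec2CTCTokenizerOutput obtained from
# `tokenizer.decode(predicted_ids, output_word_offsets=True)`
sampling_rate = 16_000
downsample_factor = 320  # product of the conv strides in the default feature encoder
time_per_frame = downsample_factor / sampling_rate  # 0.02 s per logit frame

word_timestamps = [
    {
        "word": offset["word"],
        "start": round(offset["start_offset"] * time_per_frame, 2),
        "end": round(offset["end_offset"] * time_per_frame, 2),
    }
    for offset in outputs.word_offsets
]
```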
class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2CTC tokenizer.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
the superclass for more information regarding such methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
word_delimiter_token (`str`, *optional*, defaults to `"|"`):
The token used for defining the end of a word.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to accept lowercase input and lowercase the output when decoding.
target_lang (`str`, *optional*):
A target language the tokenizer should set by default. `target_lang` has to be defined for multi-lingual,
nested vocabulary such as [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all).
**kwargs
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
word_delimiter_token="|",
replace_word_delimiter_char=" ",
do_lower_case=False,
target_lang=None,
**kwargs,
):
self._word_delimiter_token = word_delimiter_token
self.do_lower_case = do_lower_case
self.replace_word_delimiter_char = replace_word_delimiter_char
self.target_lang = target_lang
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.vocab = json.load(vocab_handle)
if target_lang is not None:
self.encoder = self.vocab[target_lang]
else:
self.encoder = self.vocab
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
word_delimiter_token=word_delimiter_token,
replace_word_delimiter_char=replace_word_delimiter_char,
target_lang=target_lang,
**kwargs,
)
for token in self.encoder.keys():
if len(token) > 1:
self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
def set_target_lang(self, target_lang: str):
"""
Set the target language of a nested multi-lingual dictionary
"""
if self.vocab == self.encoder:
raise ValueError(f"{self.vocab} is not a multi-lingual, nested tokenizer. Cannot set target language.")
if target_lang not in self.vocab:
raise ValueError(f"{target_lang} does not exist. Choose one of {', '.join(self.vocab.keys())}.")
self.target_lang = target_lang
self.init_kwargs["target_lang"] = target_lang
self.encoder = self.vocab[target_lang]
self.decoder = {v: k for k, v in self.encoder.items()}
for token in self.encoder.keys():
if len(token) > 1:
self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
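A usage sketch of `set_target_lang` with the nested multi-lingual vocabulary mentioned in the class docstring (`facebook/mms-1b-all`); switching the target language swaps the active `encoder`/`decoder` mapping and re-registers multi-character tokens:

```python
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/mms-1b-all", target_lang="eng")
print(len(tokenizer.encoder))  # size of the English sub-vocabulary

tokenizer.set_target_lang("fra")  # switch to the French sub-vocabulary
print(len(tokenizer.encoder))
```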
@property
def word_delimiter_token(self) -> str:
"""
`str`: Word delimiter token. Log an error if used while not having been set.
"""
if self._word_delimiter_token is None and self.verbose:
logger.error("Using word_delimiter_token, but it is not set yet.")
return None
return str(self._word_delimiter_token)
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has not been
set.
"""
if self._word_delimiter_token is None:
return None
return self.convert_tokens_to_ids(self.word_delimiter_token)
@word_delimiter_token.setter
def word_delimiter_token(self, value):
self._word_delimiter_token = value
@word_delimiter_token_id.setter
def word_delimiter_token_id(self, value):
self._word_delimiter_token = self.convert_tokens_to_ids(value)
@property
def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
vocab = dict(self.encoder)
vocab.update(self.added_tokens_encoder)
return vocab
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
to_add = []
for token in new_tokens:
if isinstance(token, str):
to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalized=False))
else:
to_add.append(token)
return super()._add_tokens(to_add, special_tokens)
def _tokenize(self, text, **kwargs):
"""
Converts a string into a sequence of tokens (string), using the tokenizer.
"""
if self.do_lower_case:
# the vocabulary of Wav2Vec2 CTC checkpoints is upper-cased, so "accepting lowercase input"
# means upper-casing it here; only the decoded output is lower-cased again when decoding
text = text.upper()
return list(text.replace(" ", self.word_delimiter_token))
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an index (integer) using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
result = self.decoder.get(index, self.unk_token)
return result
def convert_tokens_to_string(
self,
tokens: List[str],
group_tokens: bool = True,
spaces_between_special_tokens: bool = False,
output_char_offsets: bool = False,
output_word_offsets: bool = False,
) -> Dict[str, Union[str, float]]:
"""
Converts connectionist-temporal-classification (CTC) output tokens into a single string.
"""
# if tokens is empty, return an empty string together with empty char/word offset lists
if len(tokens) == 0:
return {"text": "", "char_offsets": [], "word_offsets": []}
# group identical tokens into non-repeating tokens, as in CTC-style decoding
if group_tokens:
# use groupby to group the tokens and count how many times each token repeats
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# otherwise keep the tokens as-is and set every repetition count to 1
chars = tokens
char_repetitions = len(tokens) * [1]
# filter out self.pad_token, which is used as the CTC blank token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# replace the word delimiter token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# initialize char_offsets and word_offsets to None
char_offsets = word_offsets = None
# compute character offsets when character or word offsets are requested
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# check that char_offsets and processed_chars have the same length
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the sames 合并为不重复的 tokens,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but 合并为不重复的 tokens,按照 CTC 风格解码
if group_tokens:
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
合并为不重复的 tokens,按照 CTC 风格解码
if group_tokens:
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len并为不重复的 tokens,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offset tokens,按照 CTC 风格解码
if group_tokens:
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {lenokens,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} andkens,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
ns,按照 CTC 风格解码
if group_tokens:
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {s,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed,按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
按照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
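The block above finishes the CTC-style decoding path: repeated tokens have already been collapsed with `groupby`, the blank (`self.pad_token`) has been filtered out, the word delimiter mapped to `self.replace_word_delimiter_char`, and character/word offsets attached on request. As a minimal, self-contained sketch of that collapse (the token values and attribute stand-ins below are chosen purely for illustration, not taken from the real tokenizer config):

from itertools import groupby

# toy stand-ins for the tokenizer attributes (assumptions, not the real config)
pad_token = "<pad>"               # plays the role of the CTC blank
word_delimiter_token = "|"        # word boundary token
replace_word_delimiter_char = " "

# raw CTC output: repeated tokens and blanks interleaved with characters
tokens = ["<pad>", "H", "H", "E", "<pad>", "L", "L", "<pad>", "L", "O", "|", "|", "W", "O", "R", "L", "D"]

# 1) collapse runs of identical tokens and remember how long each run was
chars, char_repetitions = zip(*((tok, len(list(grp))) for tok, grp in groupby(tokens)))

# 2) drop the blank token and map the word delimiter to a space
processed_chars = [c for c in chars if c != pad_token]
processed_chars = [replace_word_delimiter_char if c == word_delimiter_token else c for c in processed_chars]

print("".join(processed_chars))  # prints: HELLO WORLD

Running it prints `HELLO WORLD`, which also shows why the blank token matters: the two `L` tokens of "HELLO" only survive the collapse because a blank separates them.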
#照 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的 CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字CTC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "TC 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char"C 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应 风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的风格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
格解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i,解码
if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars if group_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsetsoup_tokens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["charens:
# 使用 groupby 函数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
# 否则,直接将 chars 设置为 tokens,并将 char_repetitions 设置为每个 token 重复次数为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = chars:
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word数将 tokens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = Noneens 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
s 分组,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
,并计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word计算每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self每个 token 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(charken 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delen 的重复次数
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
ifars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
charrepetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
char_offsets = Noneepetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
char_offsets = None
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
char_offsets = None
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
char_offsets = None
else:
chars = tokens
char_repetitions = len(tokens) * [1]
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
char_offsets = word_offsets = None
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
if not output_char_offsets:
char_offsets = None
# Join the processed character list into a single string; special tokens are separated by a space if requested
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# Lowercase the result if the tokenizer was configured with do_lower_case
if self.do_lower_case:
string = string.lower()
# Return a dict containing the decoded text, the character offsets and the word offsets
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# Compute the end index of each character as the cumulative sum of the character repetition counts
为 1
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
chars = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始 = tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼 tokens
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个
char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去 char_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个r_repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组repetitions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
itions = len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([= len(tokens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0okens) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices) * [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
* [1]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始]
# 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和 # 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索 # 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移 过滤掉 self.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典elf.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
lf.pad_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
_token,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char":en,这个 token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char": token 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char": t, "start_offset": sen 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char": t, "start_offset": s, "end_offset": e} for t, s 用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char": t, "start_offset": s, "end_offset": e} for t, s, e in zip(chars用作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# 计算字符的结束索引,使用字符重复次数的累积和
end_indices = np.asarray(char_repetitions).cumsum()
# 计算字符的开始索引,拼接一个 0 和去掉最后一个结束索引的数组
start_indices = np.concatenate(([0], end_indices[:-1]))
# 根据字符、开始索引和结束索引创建偏移字典列表
offsets = [
{"char": t, "start_offset": s, "end_offset": e} for t, s, e in zip(chars, start_indices,作 CTC 的空白 token
processed_chars = list(filter(lambda char: char != self.pad_token, chars))
# 替换分隔符 token
processed_chars = [
self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
]
# 初始化 char_offsets 和 word_offsets 为 None
char_offsets = word_offsets = None
# 如果需要输出字符偏移或单词偏移,计算字符偏移
if output_char_offsets or output_word_offsets:
char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
# 检查 char_offsets 和 processed_chars 长度是否一致
if len(char_offsets) != len(processed_chars):
raise ValueError(
f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
" have to be of the same length, but are: "
f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
f" {len(processed_chars)}"
)
# 将 char_offsets 中的每个字典的 "char" 键更新为对应的处理后的 token
for i, char in enumerate(processed_chars):
char_offsets[i]["char"] = char
# 如果需要输出单词偏移,计算单词偏移
word_offsets = None
if output_word_offsets:
word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
# 如果不需要输出字符偏移,将 char_offsets 设置为 None
if not output_char_offsets:
char_offsets = None
# 将处理后的字符列表连接成一个字符串,特殊 token 之间用空格分隔
join_char = " " if spaces_between_special_tokens else ""
string = join_char.join(processed_chars).strip()
# 如果需要将字符串转换为小写,进行转换
if self.do_lower_case:
string = string.lower()
# 返回一个包含生成的字符串、字符偏移和单词偏移的字典
return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
@staticmethod
def _compute_offsets(
char_repetitions: List[int], chars: List[str], ctc_token: int
) -> List[Dict[str, Union[str, int]]]:
# The end index of each character is the cumulative sum of the repetition counts
end_indices = np.asarray(char_repetitions).cumsum()
# The start indices are a 0 followed by all end indices except the last one
start_indices = np.concatenate(([0], end_indices[:-1]))
# Build one offset dict per character from its start and end index
offsets = [
{"char": t, "start_offset": s, "end_offset": e} for t, s, e in zip(chars, start_indices, end_indices)
]
# Filter out the CTC token
offsets = list(filter(lambda offsets: offsets["char"] != ctc_token, offsets))
# Return the filtered offset list
return offsets
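To make the bookkeeping above concrete, here is a small, hypothetical example (the characters and repetition counts are made up; `"<pad>"` plays the role of the CTC blank token, which is what the caller passes as `self.pad_token`):

```python
from transformers import Wav2Vec2CTCTokenizer

# Hypothetical grouped CTC output: each character and how many consecutive frames it covered.
chars = ["<pad>", "H", "I", "<pad>", "|"]
char_repetitions = [2, 3, 1, 1, 2]

offsets = Wav2Vec2CTCTokenizer._compute_offsets(char_repetitions, chars, ctc_token="<pad>")
# end_indices   = [2, 5, 6, 7, 9]
# start_indices = [0, 2, 5, 6, 7]
# after dropping the "<pad>" entries:
# [{'char': 'H', 'start_offset': 2, 'end_offset': 5},
#  {'char': 'I', 'start_offset': 5, 'end_offset': 6},
#  {'char': '|', 'start_offset': 7, 'end_offset': 9}]
print(offsets)
```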
@staticmethod
def _get_word_offsets(
offsets: Dict[str, Union[str, float]], word_delimiter_char: str = " "
) -> Dict[str, Union[str, float]]:
# Accumulator for the word offset dictionaries
word_offsets = []
# The previous character state starts as "SPACE"
last_state = "SPACE"
# The word currently being built
word = ""
# Start and end offsets of the current word
start_offset = 0
end_offset = 0
# Iterate over the character offsets
for i, offset in enumerate(offsets):
# Current character
char = offset["char"]
# The state is "SPACE" if the character is the word delimiter, otherwise "WORD"
state = "SPACE" if char == word_delimiter_char else "WORD"
# If the state did not change, keep extending the current word
if state == last_state:
# Update the end offset
end_offset = offset["end_offset"]
# Append the character to the current word
word += char
else:
# The state changed, i.e. a word boundary was crossed
if state == "SPACE":
# A word just finished; store it together with its offsets
word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
else:
# A new word starts; reset the offsets and the word buffer
start_offset = offset["start_offset"]
end_offset = offset["end_offset"]
word = char
# Remember the current state for the next iteration
last_state = state
# Handle the last word: if we ended in the "WORD" state, store it as well
if last_state == "WORD":
word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
# Return the list of word offsets
return word_offsets
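A matching sketch of how those character offsets are folded into word offsets; the values below are made up and `"|"` is used as the word delimiter:

```python
from transformers import Wav2Vec2CTCTokenizer

# Hypothetical character offsets for the decoded characters "H I | Y O U".
char_offsets = [
    {"char": "H", "start_offset": 0, "end_offset": 2},
    {"char": "I", "start_offset": 2, "end_offset": 3},
    {"char": "|", "start_offset": 3, "end_offset": 4},
    {"char": "Y", "start_offset": 4, "end_offset": 6},
    {"char": "O", "start_offset": 6, "end_offset": 7},
    {"char": "U", "start_offset": 7, "end_offset": 9},
]

word_offsets = Wav2Vec2CTCTokenizer._get_word_offsets(char_offsets, word_delimiter_char="|")
# [{'word': 'HI', 'start_offset': 0, 'end_offset': 3},
#  {'word': 'YOU', 'start_offset': 4, 'end_offset': 9}]
print(word_offsets)
```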
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# If the text has already been split into words, prepend a space
if is_split_into_words:
text = " " + text
# Return the processed text together with the remaining keyword arguments
return (text, kwargs)
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
group_tokens: bool = True,
spaces_between_special_tokens: bool = False,
output_word_offsets: Optional[bool] = False,
output_char_offsets: Optional[bool] = False,
) -> str:
"""
special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the
same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
the whole token list and not individually on added tokens
"""
# Convert the token ids to tokens, skipping special tokens if requested
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
result = []
# Iterate over the filtered tokens
for token in filtered_tokens:
# Skip the token if special tokens should be skipped and it belongs to the special tokens
if skip_special_tokens and token in self.all_special_ids:
continue
# Otherwise keep the token
result.append(token)
# Convert the remaining tokens into the output string (and offsets)
string_output = self.convert_tokens_to_string(
result,
group_tokens=group_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
output_word_offsets=output_word_offsets,
output_char_offsets=output_char_offsets,
)
# The decoded text
text = string_output["text"]
# Clean up tokenization spaces if requested, falling back to the tokenizer default
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
text = self.clean_up_tokenization(text)
# If word or character offsets were requested, return a `Wav2Vec2CTCTokenizerOutput`
if output_word_offsets or output_char_offsets:
return Wav2Vec2CTCTokenizerOutput(
text=text,
char_offsets=string_output["char_offsets"],
word_offsets=string_output["word_offsets"],
)
else:
# Otherwise return the plain text
return text
# Overridden from `tokenization_utils_base.py` because the tokenizer can output a `ModelOutput`, which should not be a list for batched output, and because `output_char_offsets` needs to be documented here
def batch_decode(
self,
sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
output_char_offsets: bool = False,
output_word_offsets: bool = False,
**kwargs,
) -> List[str]:
"""
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces.
output_char_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output character offsets. Character offsets can be used in combination with the
sampling rate and model downsampling rate to compute the time-stamps of transcribed characters.
<Tip>
Please take a look at the Example of [`~Wav2Vec2CTCTokenizer.decode`] to better understand how to make
use of `output_char_offsets`. [`~Wav2Vec2CTCTokenizer.batch_decode`] works the same way with batched
output.
</Tip>
output_word_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate
and model downsampling rate to compute the time-stamps of transcribed words.
<Tip>
Please take a look at the Example of [`~Wav2Vec2CTCTokenizer.decode`] to better understand how to make
use of `output_word_offsets`. [`~Wav2Vec2CTCTokenizer.batch_decode`] works the same way with batched
output.
</Tip>
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`List[str]` or [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`]: The list of decoded
sentences. Will be a [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`] when
`output_char_offsets == True` or `output_word_offsets == True`.
"""
# Decode each sequence in the batch using the `decode` method of the tokenizer
batch_decoded = [
self.decode(
seq,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
output_char_offsets=output_char_offsets,
output_word_offsets=output_word_offsets,
**kwargs,
)
for seq in sequences
]
# If either `output_char_offsets` or `output_word_offsets` is True
if output_char_offsets or output_word_offsets:
# Transform list of dictionaries to a dictionary of lists
return Wav2Vec2CTCTokenizerOutput({k: [d[k] for d in batch_decoded] for k in batch_decoded[0]})
# Otherwise, return the list of decoded sentences
return batch_decoded
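As the `output_word_offsets` docstring notes, the offsets are expressed in logit frames and only become time stamps once combined with the sampling rate and the model's downsampling ratio. A minimal sketch of that arithmetic with made-up offsets (in real code, read the ratio from the model config's `inputs_to_logits_ratio` and the sampling rate from the feature extractor; 320 and 16 kHz are the usual values for the base checkpoints):

```python
# Hypothetical word offsets as returned by `batch_decode(..., output_word_offsets=True)`.
word_offsets = [
    {"word": "HELLO", "start_offset": 12, "end_offset": 28},
    {"word": "WORLD", "start_offset": 33, "end_offset": 51},
]

# Assumed values: the feature encoder downsamples raw audio by a factor of 320 and the
# feature extractor expects 16 kHz audio, so one logit frame covers 320 / 16000 = 0.02 s.
time_per_frame = 320 / 16_000

word_times = [
    {
        "word": d["word"],
        "start_time": round(d["start_offset"] * time_per_frame, 2),
        "end_time": round(d["end_offset"] * time_per_frame, 2),
    }
    for d in word_offsets
]
print(word_times)
# [{'word': 'HELLO', 'start_time': 0.24, 'end_time': 0.56},
#  {'word': 'WORLD', 'start_time': 0.66, 'end_time': 1.02}]
```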
# Overridden from `tokenization_utils_base.py` because `output_char_offsets` and `output_word_offsets` need to be documented here
def decode(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
output_char_offsets: bool = False,
output_word_offsets: bool = False,
**kwargs,
):
# Delegate to `_decode`, which implements the offset handling documented above
return self._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
output_char_offsets=output_char_offsets,
output_word_offsets=output_word_offsets,
**kwargs,
)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Make sure the save directory exists; otherwise log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the vocabulary file path from the optional prefix and the standard file name
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Write the vocabulary to a JSON file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
# Return a tuple containing the path of the saved vocabulary file
return (vocab_file,)
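For reference, a made-up miniature `vocab.json` in the format that `save_vocabulary` writes out (real wav2vec2 vocabularies map single characters, `"|"` and the special tokens to ids, and are much larger):

```python
import json

# Made-up miniature character-level vocabulary.
vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "A": 5, "B": 6, "C": 7}

with open("vocab.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
```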
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2 tokenizer.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
the superclass for more information regarding such methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
word_delimiter_token (`str`, *optional*, defaults to `"|"`):
The token used for defining the end of a word.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the output when decoding.
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models, *e.g.*,
[wav2vec2-lv60](https://huggingface.co/models?search=lv60).
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether or not [`~Wav2Vec2Tokenizer.__call__`] should return `attention_mask`.
<Tip>
Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
[wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
`attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
should be passed.
For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
[wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
passed for batched inference.
</Tip>
**kwargs
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
# Names of the vocabulary files
vocab_files_names = VOCAB_FILES_NAMES
# Mapping from pretrained checkpoints to their vocabulary and tokenizer config files
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": (
"https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json"
),
},
}
# Names of the model inputs
model_input_names = ["input_values", "attention_mask"]
# Constructor for a new Wav2Vec2Tokenizer object
def __init__(
self,
vocab_file,  # path to the vocabulary file
bos_token="<s>",  # beginning-of-sentence token
eos_token="</s>",  # end-of-sentence token
unk_token="<unk>",  # unknown token
pad_token="<pad>",  # padding token
word_delimiter_token="|",  # word delimiter token
do_lower_case=False,  # whether to lowercase the output when decoding
do_normalize=False,  # whether to zero-mean unit-variance normalize the input
return_attention_mask=False,  # whether `__call__` should return an attention mask
**kwargs,  # additional keyword arguments
):
# Warn that `Wav2Vec2Tokenizer` is deprecated and will be removed in Transformers v5
warnings.warn(
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use"
" `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
FutureWarning,
)
# Word delimiter token
self._word_delimiter_token = word_delimiter_token
# Whether to lowercase the output
self.do_lower_case = do_lower_case
# Whether to return an attention mask
self.return_attention_mask = return_attention_mask
# Whether to normalize the input
self.do_normalize = do_normalize
# Load the vocabulary from the UTF-8 encoded JSON file into the encoder dict
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# Build the decoder dict as the inverse mapping of the encoder
self.decoder = {v: k for k, v in self.encoder.items()}
# Call the parent constructor with the special tokens and the remaining arguments
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
do_normalize=do_normalize,
return_attention_mask=return_attention_mask,
word_delimiter_token=word_delimiter_token,
**kwargs,
)
@property
def word_delimiter_token(self) -> str:
"""
`str`: 单词分隔标记。如果在未设置的情况下使用,记录错误日志。
"""
# 如果单词分隔标记为 None 且 verbose 为 True,则记录错误日志并返回 None
if self._word_delimiter_token is None and self.verbose:
logger.error("Using word_delimiter_token, but it is not set yet.")
return None
# 否则,返回单词分隔标记的字符串形式
return str(self._word_delimiter_token)
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
`Optional[int]`: 单词分隔标记在词汇表中的 id。如果未设置,则返回 None。
"""
# 如果单词分隔标记为 None,则返回 None
if self._word_delimiter_token is None:
return None
# 否则,返回单词分隔标记在词汇表中的 id
return self.convert_tokens_to_ids(self.word_delimiter_token)
@word_delimiter_token.setter
def word_delimiter_token(self, value):
# 设置单词分隔标记的值
self._word_delimiter_token = value
@word_delimiter_token_id.setter
def word_delimiter_token_id(self, value):
# 根据给定的值设置单词分隔标记在词汇表中的 id
self._word_delimiter_token = self.convert_tokens_to_ids(value)
@add_end_docstrings(WAV2VEC2_KWARGS_DOCSTRING)
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
sequences.
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy array or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
"""
# Check whether the input is a batched numpy array (more than one dimension)
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
# More than two dimensions means multi-channel audio, which is not supported
if is_batched_numpy and len(raw_speech.shape) > 2:
raise ValueError(f"Only mono-channel audio is supported for input to {self}")
# The input is batched if it is a 2D numpy array or a list/tuple whose first element is an array, tuple or list
is_batched = is_batched_numpy or (
isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
)
# Make sure the input is a list of numpy arrays
if is_batched and not isinstance(raw_speech[0], np.ndarray):
raw_speech = [np.asarray(speech) for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech)
# Always wrap non-batched input into a single-element batch
if not is_batched:
raw_speech = [raw_speech]
# Optionally normalize the input
if self.do_normalize:
# Zero-mean unit-variance normalization of every sequence
raw_speech = [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in raw_speech]
# Wrap the input so that it can be padded
encoded_inputs = BatchEncoding({"input_values": raw_speech})
# Pad the inputs according to the given arguments
padded_inputs = self.pad(
encoded_inputs,
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=self.return_attention_mask,
return_tensors=return_tensors,
verbose=verbose,
)
return padded_inputs
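The `do_normalize` branch above is plain per-utterance zero-mean / unit-variance scaling. A standalone sketch of the same formula (the waveform values are made up):

```python
import numpy as np

# A made-up mono waveform (e.g. 16 kHz float samples).
x = np.array([0.1, -0.2, 0.3, 0.05, -0.15], dtype=np.float32)

# Same formula as in `__call__`: subtract the mean and divide by the standard deviation,
# with a small epsilon for numerical stability.
normalized = (x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5)

print(np.mean(normalized), np.var(normalized))  # ~0.0 and ~1.0
```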
@property
def vocab_size(self) -> int:
# Size of the vocabulary, i.e. the number of entries in the decoder
return len(self.decoder)
def get_vocab(self) -> Dict:
# Return the vocabulary together with the added special tokens
return dict(self.encoder, **self.added_tokens_encoder)
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an index (integer) using the vocab."""
# Look the token up in the vocabulary, falling back to the unknown token
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
# Look the index up in the reverse vocabulary, falling back to the unknown token
result = self.decoder.get(index, self.unk_token)
return result
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Converts a list of connectionist-temporal-classification (CTC) output tokens into a single string.
Args:
tokens (List[str]): List of tokens to be converted into a string.
Returns:
str: Converted string from tokens.
"""
# Group tokens into non-repeating tokens in CTC style decoding
grouped_tokens = [token_group[0] for token_group in groupby(tokens)]
# Filter out self.pad_token which serves as the CTC-blank token
filtered_tokens = list(filter(lambda token: token != self.pad_token, grouped_tokens))
# Replace delimiter token with spaces and join tokens into a single string
string = "".join([" " if token == self.word_delimiter_token else token for token in filtered_tokens]).strip()
# Convert to lowercase if do_lower_case is True
if self.do_lower_case:
string = string.lower()
return string
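`convert_tokens_to_string` is the usual CTC collapse: merge runs of repeated tokens, drop the blank (pad) token, then turn the word delimiter into a space. A standalone sketch with made-up per-frame predictions:

```python
from itertools import groupby

# Made-up per-frame CTC predictions; "<pad>" is the blank, "|" the word delimiter.
tokens = ["H", "H", "E", "L", "<pad>", "L", "L", "O", "|", "|", "M", "E"]

# 1. collapse runs of identical tokens (CTC-style grouping)
grouped = [key for key, _ in groupby(tokens)]    # ['H', 'E', 'L', '<pad>', 'L', 'O', '|', 'M', 'E']
# 2. drop the CTC blank / pad token
filtered = [t for t in grouped if t != "<pad>"]  # ['H', 'E', 'L', 'L', 'O', '|', 'M', 'E']
# 3. replace the word delimiter with a space and join
text = "".join(" " if t == "|" else t for t in filtered).strip()

print(text)  # HELLO ME
```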
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
"""
Special _decode function for Wav2Vec2Tokenizer to handle added tokens exactly like base vocabulary tokens.
Args:
token_ids (List[int]): List of token IDs to be decoded into a string.
skip_special_tokens (bool): Whether to skip special tokens.
clean_up_tokenization_spaces (bool): Whether to clean up tokenization spaces.
Returns:
str: Decoded string from token IDs.
"""
# Convert token IDs to tokens, filtering out special tokens if skip_special_tokens is True
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
result = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
result.append(token)
# Convert filtered tokens into a single string
text = self.convert_tokens_to_string(result)
# Determine whether to clean up tokenization spaces
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the vocabulary as a JSON file in the specified directory.
Args:
save_directory (str): Directory path where vocabulary JSON should be saved.
filename_prefix (Optional[str]): Optional prefix for the vocabulary JSON file name.
Returns:
Tuple[str]: Tuple containing the path to the saved vocabulary file.
"""
# Ensure save_directory exists; otherwise log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Construct the full path for the vocabulary file
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Write the vocabulary dictionary to the JSON file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
return (vocab_file,)