|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Tokenization classes for python tokenizers. |
|
For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py |
|
""" |
|
|
|
import itertools |
|
import logging |
|
import re |
|
import unicodedata |
|
from typing import Dict, List, Optional, Tuple, Union |
|
|
|
from .file_utils import add_end_docstrings |
|
from .tokenization_utils_base import ( |
|
ENCODE_KWARGS_DOCSTRING, |
|
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, |
|
AddedToken, |
|
BatchEncoding, |
|
EncodedInput, |
|
EncodedInputPair, |
|
PaddingStrategy, |
|
PreTokenizedInput, |
|
PreTokenizedInputPair, |
|
PreTrainedTokenizerBase, |
|
TensorType, |
|
TextInput, |
|
TextInputPair, |
|
TruncationStrategy, |
|
) |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
def _is_whitespace(char): |
|
"""Checks whether `chars` is a whitespace character.""" |
|
|
|
|
|
if char == " " or char == "\t" or char == "\n" or char == "\r": |
|
return True |
|
cat = unicodedata.category(char) |
|
if cat == "Zs": |
|
return True |
|
return False |
|
|
|
|
|
def _is_control(char): |
|
"""Checks whether `chars` is a control character.""" |
|
|
|
|
|
if char == "\t" or char == "\n" or char == "\r": |
|
return False |
|
cat = unicodedata.category(char) |
|
if cat.startswith("C"): |
|
return True |
|
return False |
|
|
|
|
|
def _is_punctuation(char): |
|
"""Checks whether `chars` is a punctuation character.""" |
|
cp = ord(char) |
|
|
|
|
|
|
|
|
|
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): |
|
return True |
|
cat = unicodedata.category(char) |
|
if cat.startswith("P"): |
|
return True |
|
return False |
|
|
|
|
|
def _is_end_of_word(text): |
|
"""Checks whether the last character in text is one of a punctuation, control or whitespace character.""" |
|
last_char = text[-1] |
|
return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) |
|
|
|
|
|
def _is_start_of_word(text): |
|
"""Checks whether the first character in text is one of a punctuation, control or whitespace character.""" |
|
first_char = text[0] |
|
return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) |
|
|
|
|
|
class PreTrainedTokenizer(PreTrainedTokenizerBase): |
|
""" Base class for all slow tokenizers. |
|
|
|
Handle all the shared methods for tokenization and special tokens as well as methods |
|
downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. |
|
|
|
This class also contain the added tokens in a unified way on top of all tokenizers so we don't |
|
have to handle the specific vocabulary augmentation methods of the various underlying |
|
dictionary structures (BPE, sentencepiece...). |
|
|
|
Class attributes (overridden by derived classes): |
|
|
|
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file |
|
required by the model, and as associated values, the filename for saving the associated file (string). |
|
        - ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys
          being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level keys being the
|
`short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the |
|
associated pretrained vocabulary file. |
|
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained |
|
models, and as associated values, the maximum length of the sequence inputs of this model, or None if the |
|
model has no maximum input size. |
|
- ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the |
|
          pretrained models, and as associated values, a dictionary of specific arguments to pass to the
|
          ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the
|
``from_pretrained()`` method. |
|
|
|
Args: |
|
        - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model.
            When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated
            model in ``max_model_input_sizes`` (see above). If no value is provided, or if no associated max_length can be
            found in ``max_model_input_sizes``, this will default to VERY_LARGE_INTEGER (`int(1e30)`).
|
- ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. |
|
Should be selected between ['right', 'left'] |
|
- ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the |
|
model ("token_type_ids", "attention_mask"...). |
|
- ``bos_token``: (`Optional`) string: a beginning of sentence token. |
|
Will be associated to ``self.bos_token`` and ``self.bos_token_id`` |
|
- ``eos_token``: (`Optional`) string: an end of sentence token. |
|
Will be associated to ``self.eos_token`` and ``self.eos_token_id`` |
|
- ``unk_token``: (`Optional`) string: an unknown token. |
|
Will be associated to ``self.unk_token`` and ``self.unk_token_id`` |
|
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). |
|
Will be associated to ``self.sep_token`` and ``self.sep_token_id`` |
|
- ``pad_token``: (`Optional`) string: a padding token. |
|
Will be associated to ``self.pad_token`` and ``self.pad_token_id`` |
|
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence |
|
leveraging self-attention along the full depth of the model). |
|
Will be associated to ``self.cls_token`` and ``self.cls_token_id`` |
|
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language |
|
modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` |
|
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. |
|
            Adding all special tokens here ensures they won't be split by the tokenization process.
|
Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` |
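
    Examples (a minimal sketch using a BERT slow tokenizer; any pretrained shortcut name works the same way)::

        # Load a slow (pure Python) tokenizer from a pretrained checkpoint
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # __call__ tokenizes, adds special tokens and converts tokens to ids in one step
        encoded = tokenizer("Hello, world!")
        encoded["input_ids"]  # e.g. [101, 7592, 1010, 2088, 999, 102] for bert-base-uncased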
|
|
|
|
|
.. automethod:: __call__ |
|
""" |
|
|
|
def __init__(self, **kwargs): |
|
super().__init__(**kwargs) |
|
|
|
|
|
|
|
self.added_tokens_encoder: Dict[str, int] = {} |
|
self.added_tokens_decoder: Dict[int, str] = {} |
|
self.unique_no_split_tokens: List[str] = [] |
|
|
|
@property |
|
def is_fast(self) -> bool: |
|
return False |
|
|
|
@property |
|
def vocab_size(self) -> int: |
|
""" Size of the base vocabulary (without the added tokens) """ |
|
raise NotImplementedError |
|
|
|
def get_vocab(self): |
|
""" Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """ |
|
raise NotImplementedError() |
|
|
|
def get_added_vocab(self) -> Dict[str, int]: |
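        """ Returns the added tokens in the vocabulary as a dict of {token: index}. """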
|
return self.added_tokens_encoder |
|
|
|
def __len__(self): |
|
""" Size of the full vocabulary with the added tokens """ |
|
return self.vocab_size + len(self.added_tokens_encoder) |
|
|
|
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens=False) -> int: |
|
""" |
|
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from the length of the current vocabulary.
|
|
|
Args: |
|
new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not |
|
                already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
|
|
|
Returns: |
|
Number of tokens added to the vocabulary. |
|
|
|
Examples:: |
|
|
|
# Let's see how to increase the vocabulary of Bert model and tokenizer |
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
|
model = BertModel.from_pretrained('bert-base-uncased') |
|
|
|
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) |
|
print('We have added', num_added_toks, 'tokens') |
|
            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
|
""" |
|
new_tokens = [str(tok) for tok in new_tokens] |
|
|
|
tokens_to_add = [] |
|
for token in new_tokens: |
|
assert isinstance(token, str) |
|
if not special_tokens and self.init_kwargs.get("do_lower_case", False): |
|
token = token.lower() |
|
if ( |
|
token != self.unk_token |
|
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) |
|
and token not in tokens_to_add |
|
): |
|
tokens_to_add.append(token) |
|
if self.verbose: |
|
logger.info("Adding %s to the vocabulary", token) |
|
|
|
added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) |
|
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} |
|
self.added_tokens_encoder.update(added_tok_encoder) |
|
self.added_tokens_decoder.update(added_tok_decoder) |
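
        # Make sure `tokenize()` never splits on the added tokens, whether they are special tokens or not.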
|
|
|
|
|
if special_tokens: |
|
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens))) |
|
else: |
|
|
|
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add))) |
|
|
|
return len(tokens_to_add) |
|
|
|
def num_special_tokens_to_add(self, pair=False): |
|
""" |
|
Returns the number of added tokens when encoding a sequence with special tokens. |
|
|
|
Note: |
|
This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this |
|
inside your training loop. |
|
|
|
Args: |
|
            pair: If set to True, returns the number of added tokens for a sequence pair; if set to False,
                returns the number of added tokens for a single sequence.
|
|
|
Returns: |
|
Number of tokens added to sequences |
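
        Example (illustrative; the exact count depends on the model, e.g. a BERT-like tokenizer
        adds ``[CLS]``/``[SEP]`` markers)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.num_special_tokens_to_add(pair=False)  # 2 -> [CLS] ... [SEP]
            tokenizer.num_special_tokens_to_add(pair=True)   # 3 -> [CLS] ... [SEP] ... [SEP]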
|
""" |
|
token_ids_0 = [] |
|
token_ids_1 = [] |
|
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) |
|
|
|
def tokenize(self, text: TextInput, **kwargs): |
|
""" Converts a string in a sequence of tokens (string), using the tokenizer. |
|
Split in words for word-based vocabulary or sub-words for sub-word-based |
|
vocabularies (BPE/SentencePieces/WordPieces). |
|
|
|
Take care of added tokens. |
|
|
|
Args: |
|
text (:obj:`string`): The sequence to be encoded. |
|
**kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method. |
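
            Example (illustrative; the exact sub-word split depends on the pretrained vocabulary)::

                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                tokenizer.tokenize("Hello, tokenization!")
                # e.g. ['hello', ',', 'token', '##ization', '!'] for a WordPiece vocabulary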
|
""" |
|
|
|
all_special_tokens_extended = dict( |
|
(str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) |
|
) |
|
|
|
text, kwargs = self.prepare_for_tokenization(text, **kwargs) |
|
|
|
if kwargs: |
|
logger.warning(f"Keyword arguments {kwargs} not recognized.") |
|
|
|
|
|
if self.init_kwargs.get("do_lower_case", False): |
|
|
|
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens] |
|
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" |
|
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) |
|
|
|
def split_on_token(tok, text): |
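            # Split `text` on the (special/added) token `tok`, honouring the AddedToken options
            # (single_word, lstrip, rstrip) when `tok` was registered as an AddedToken.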
|
result = [] |
|
tok_extended = all_special_tokens_extended.get(tok, None) |
|
split_text = text.split(tok) |
|
full_word = "" |
|
for i, sub_text in enumerate(split_text): |
|
|
|
|
|
|
|
|
|
if isinstance(tok_extended, AddedToken): |
|
if tok_extended.single_word: |
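                        # Try to avoid splitting off a token that sits inside a larger word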
|
|
|
if ( |
|
i < len(split_text) - 1 |
|
and not _is_end_of_word(sub_text) |
|
and not _is_start_of_word(split_text[i + 1]) |
|
): |
|
|
|
full_word += sub_text + tok |
|
elif full_word: |
|
full_word += sub_text |
|
result += [full_word] |
|
full_word = "" |
|
continue |
|
|
|
if tok_extended.rstrip and i > 0: |
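                        # Counter-intuitive, but we strip the *left* of this chunk: `rstrip` on the
                        # token means the token consumes the whitespace on its own right-hand side.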
|
|
|
|
|
sub_text = sub_text.lstrip() |
|
|
|
if tok_extended.lstrip and i < len(split_text) - 1: |
|
sub_text = sub_text.rstrip() |
|
else: |
|
|
|
if i < len(split_text) - 1: |
|
sub_text = sub_text.rstrip() |
|
if i > 0: |
|
sub_text = sub_text.lstrip() |
|
|
|
if i == 0 and not sub_text: |
|
result += [tok] |
|
elif i == len(split_text) - 1: |
|
if sub_text: |
|
result += [sub_text] |
|
else: |
|
pass |
|
else: |
|
if sub_text: |
|
result += [sub_text] |
|
result += [tok] |
|
return result |
|
|
|
def split_on_tokens(tok_list, text): |
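            # Split the text on every "no split" token, then run the model-specific `_tokenize`
            # on the remaining chunks; added/special tokens are kept as single tokens.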
|
if not text.strip(): |
|
return [] |
|
if not tok_list: |
|
return self._tokenize(text) |
|
|
|
tokenized_text = [] |
|
text_list = [text] |
|
for tok in tok_list: |
|
tokenized_text = [] |
|
for sub_text in text_list: |
|
if sub_text not in self.unique_no_split_tokens: |
|
tokenized_text += split_on_token(tok, sub_text) |
|
else: |
|
tokenized_text += [sub_text] |
|
text_list = tokenized_text |
|
|
|
return list( |
|
itertools.chain.from_iterable( |
|
( |
|
self._tokenize(token) if token not in self.unique_no_split_tokens else [token] |
|
for token in tokenized_text |
|
) |
|
) |
|
) |
|
|
|
no_split_token = self.unique_no_split_tokens |
|
tokenized_text = split_on_tokens(no_split_token, text) |
|
return tokenized_text |
|
|
|
def _tokenize(self, text, **kwargs): |
|
""" Converts a string in a sequence of tokens (string), using the tokenizer. |
|
Split in words for word-based vocabulary or sub-words for sub-word-based |
|
vocabularies (BPE/SentencePieces/WordPieces). |
|
|
|
Do NOT take care of added tokens. |
|
""" |
|
raise NotImplementedError |
|
|
|
def convert_tokens_to_ids(self, tokens): |
|
""" Converts a token string (or a sequence of tokens) in a single integer id |
|
(or a sequence of ids), using the vocabulary. |
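
            Example (illustrative; the actual ids depend on the pretrained vocabulary)::

                tokenizer.convert_tokens_to_ids('[CLS]')             # a single id, e.g. 101 for BERT
                tokenizer.convert_tokens_to_ids(['hello', 'world'])  # a list of ids, e.g. [7592, 2088]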
|
""" |
|
if tokens is None: |
|
return None |
|
|
|
if isinstance(tokens, str): |
|
return self._convert_token_to_id_with_added_voc(tokens) |
|
|
|
ids = [] |
|
for token in tokens: |
|
ids.append(self._convert_token_to_id_with_added_voc(token)) |
|
return ids |
|
|
|
def _convert_token_to_id_with_added_voc(self, token): |
|
if token is None: |
|
return None |
|
|
|
if token in self.added_tokens_encoder: |
|
return self.added_tokens_encoder[token] |
|
return self._convert_token_to_id(token) |
|
|
|
def _convert_token_to_id(self, token): |
|
raise NotImplementedError |
|
|
|
def _encode_plus( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
def get_input_ids(text): |
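            # Normalize the input: a raw string is tokenized; with `is_pretokenized=True` a list of
            # words is tokenized word by word; a list of ints is assumed to be already-encoded ids.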
|
if isinstance(text, str): |
|
tokens = self.tokenize(text, **kwargs) |
|
return self.convert_tokens_to_ids(tokens) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): |
|
if is_pretokenized: |
|
tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text))) |
|
return self.convert_tokens_to_ids(tokens) |
|
else: |
|
return self.convert_tokens_to_ids(text) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): |
|
return text |
|
else: |
|
if is_pretokenized: |
|
raise ValueError( |
|
f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`." |
|
) |
|
else: |
|
raise ValueError( |
|
f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." |
|
) |
|
|
|
if return_offsets_mapping: |
|
            raise NotImplementedError(
                "return_offsets_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )
|
|
|
first_ids = get_input_ids(text) |
|
second_ids = get_input_ids(text_pair) if text_pair is not None else None |
|
|
|
return self.prepare_for_model( |
|
first_ids, |
|
pair_ids=second_ids, |
|
add_special_tokens=add_special_tokens, |
|
padding=padding_strategy.value, |
|
truncation=truncation_strategy.value, |
|
max_length=max_length, |
|
stride=stride, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
prepend_batch_axis=True, |
|
return_attention_mask=return_attention_mask, |
|
return_token_type_ids=return_token_type_ids, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_length=return_length, |
|
verbose=verbose, |
|
) |
|
|
|
def _batch_encode_plus( |
|
self, |
|
batch_text_or_text_pairs: Union[ |
|
List[TextInput], |
|
List[TextInputPair], |
|
List[PreTokenizedInput], |
|
List[PreTokenizedInputPair], |
|
List[EncodedInput], |
|
List[EncodedInputPair], |
|
], |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
def get_input_ids(text): |
|
if isinstance(text, str): |
|
tokens = self.tokenize(text, **kwargs) |
|
return self.convert_tokens_to_ids(tokens) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): |
|
if is_pretokenized: |
|
tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text))) |
|
return self.convert_tokens_to_ids(tokens) |
|
else: |
|
return self.convert_tokens_to_ids(text) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): |
|
return text |
|
else: |
|
raise ValueError( |
|
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." |
|
) |
|
|
|
if return_offsets_mapping: |
|
            raise NotImplementedError(
                "return_offsets_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )
|
|
|
input_ids = [] |
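        # Each batch element may be a single sequence or a (sequence, pair) tuple. With pretokenized
        # inputs, a bare list of words is a single sequence, not a pair.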
|
for ids_or_pair_ids in batch_text_or_text_pairs: |
|
if not isinstance(ids_or_pair_ids, (list, tuple)): |
|
ids, pair_ids = ids_or_pair_ids, None |
|
elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)): |
|
ids, pair_ids = ids_or_pair_ids, None |
|
else: |
|
ids, pair_ids = ids_or_pair_ids |
|
|
|
first_ids = get_input_ids(ids) |
|
second_ids = get_input_ids(pair_ids) if pair_ids is not None else None |
|
input_ids.append((first_ids, second_ids)) |
|
|
|
batch_outputs = self._batch_prepare_for_model( |
|
input_ids, |
|
add_special_tokens=add_special_tokens, |
|
padding_strategy=padding_strategy, |
|
truncation_strategy=truncation_strategy, |
|
max_length=max_length, |
|
stride=stride, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
return_token_type_ids=return_token_type_ids, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_length=return_length, |
|
return_tensors=return_tensors, |
|
verbose=verbose, |
|
) |
|
|
|
return BatchEncoding(batch_outputs) |
|
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
|
def _batch_prepare_for_model( |
|
self, |
|
batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[str] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
) -> BatchEncoding: |
|
""" Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. |
|
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and |
|
manages a moving window (with user defined stride) for overflowing tokens |
|
|
|
Args: |
|
batch_ids_pairs: list of tokenized input ids or input ids pairs |
|
""" |
|
|
|
batch_outputs = {} |
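
        # Encode each pair without padding first, then pad the whole batch in a single `pad()` call
        # so that batch-level padding strategies (e.g. pad to the longest sequence) can be applied.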
|
for first_ids, second_ids in batch_ids_pairs: |
|
outputs = self.prepare_for_model( |
|
first_ids, |
|
second_ids, |
|
add_special_tokens=add_special_tokens, |
|
padding=PaddingStrategy.DO_NOT_PAD.value, |
|
truncation=truncation_strategy.value, |
|
max_length=max_length, |
|
stride=stride, |
|
pad_to_multiple_of=None, |
|
return_attention_mask=False, |
|
return_token_type_ids=return_token_type_ids, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_length=return_length, |
|
return_tensors=None, |
|
prepend_batch_axis=False, |
|
verbose=verbose, |
|
) |
|
|
|
for key, value in outputs.items(): |
|
if key not in batch_outputs: |
|
batch_outputs[key] = [] |
|
batch_outputs[key].append(value) |
|
|
|
batch_outputs = self.pad( |
|
batch_outputs, |
|
padding=padding_strategy.value, |
|
max_length=max_length, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
) |
|
|
|
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) |
|
|
|
return batch_outputs |
|
|
|
    def prepare_for_tokenization(self, text: str, is_pretokenized=False, **kwargs) -> Tuple[str, dict]:
        """ Performs any necessary transformations before tokenization.

            This method should pop the arguments it uses from ``kwargs`` and return the remaining ``kwargs``.
            The leftover ``kwargs`` are checked at the end of the encoding process to make sure all arguments have been used.
|
""" |
|
return (text, kwargs) |
|
|
|
def get_special_tokens_mask( |
|
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False |
|
) -> List[int]: |
|
""" |
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding |
|
special tokens using the tokenizer ``prepare_for_model`` method. |
|
|
|
Args: |
|
token_ids_0: list of ids (must not contain special tokens) |
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids |
|
for sequence pairs |
|
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
special tokens for the model |
|
|
|
Returns: |
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. |
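
        Example (this base implementation simply returns zeros; model-specific subclasses override it)::

            tokenizer.get_special_tokens_mask([10, 11, 12])  # -> [0, 0, 0]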
|
""" |
|
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) |
|
|
|
def convert_ids_to_tokens( |
|
self, ids: Union[int, List[int]], skip_special_tokens: bool = False |
|
) -> Union[str, List[str]]: |
|
""" Converts a single index or a sequence of indices (integers) in a token " |
|
(resp.) a sequence of tokens (str), using the vocabulary and added tokens. |
|
|
|
Args: |
|
skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False |
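
            Example (illustrative; the ids shown are the ones used by ``bert-base-uncased``)::

                tokenizer.convert_ids_to_tokens(101)          # e.g. '[CLS]'
                tokenizer.convert_ids_to_tokens([101, 102])   # e.g. ['[CLS]', '[SEP]']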
|
""" |
|
if isinstance(ids, int): |
|
if ids in self.added_tokens_decoder: |
|
return self.added_tokens_decoder[ids] |
|
else: |
|
return self._convert_id_to_token(ids) |
|
tokens = [] |
|
for index in ids: |
|
index = int(index) |
|
if skip_special_tokens and index in self.all_special_ids: |
|
continue |
|
if index in self.added_tokens_decoder: |
|
tokens.append(self.added_tokens_decoder[index]) |
|
else: |
|
tokens.append(self._convert_id_to_token(index)) |
|
return tokens |
|
|
|
def _convert_id_to_token(self, index: int) -> str: |
|
raise NotImplementedError |
|
|
|
def convert_tokens_to_string(self, tokens: List[str]) -> str: |
|
""" Converts a sequence of tokens (string) in a single string. |
|
The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) |
|
but we often want to remove sub-word tokenization artifacts at the same time. |
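
            Example (base implementation; model-specific tokenizers usually override this to undo
            sub-word splitting)::

                tokenizer.convert_tokens_to_string(['hello', ',', 'world'])  # -> 'hello , world'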
|
""" |
|
return " ".join(self.convert_ids_to_tokens(tokens)) |
|
|
|
def decode( |
|
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True |
|
) -> str: |
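        """ Converts a sequence of ids (integers) into a string, using the tokenizer and the vocabulary
            (including added tokens), optionally skipping special tokens and cleaning up tokenization spaces.
        """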
|
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) |
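
        # Added tokens are not part of the underlying vocabulary, so they are kept as-is while the
        # surrounding pieces are converted to strings separately and then joined with spaces.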
|
|
|
|
|
|
|
|
|
sub_texts = [] |
|
current_sub_text = [] |
|
for token in filtered_tokens: |
|
            if skip_special_tokens and token in self.all_special_tokens:
|
continue |
|
if token in self.added_tokens_encoder: |
|
if current_sub_text: |
|
sub_texts.append(self.convert_tokens_to_string(current_sub_text)) |
|
current_sub_text = [] |
|
sub_texts.append(token) |
|
else: |
|
current_sub_text.append(token) |
|
if current_sub_text: |
|
sub_texts.append(self.convert_tokens_to_string(current_sub_text)) |
|
text = " ".join(sub_texts) |
|
|
|
if clean_up_tokenization_spaces: |
|
clean_text = self.clean_up_tokenization(text) |
|
return clean_text |
|
else: |
|
return text |
|
|
|
def save_vocabulary(self, save_directory) -> Tuple[str]: |
|
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens |
|
and special token mappings. |
|
|
|
            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full
            tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained`
            class method.
|
""" |
|
raise NotImplementedError |
|
|