|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Base classes common to both the slow and the fast tokenization classes: |
|
    PreTrainedTokenizerBase (hosts all the user-facing encoding methods),
    SpecialTokensMixin (hosts the special tokens logic) and
    BatchEncoding (wraps the dictionary of outputs with special methods for the fast tokenizers)
|
""" |
|
|
|
import copy |
|
import json |
|
import logging |
|
import os |
|
import warnings |
|
from collections import UserDict |
|
from enum import Enum |
|
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union |
|
|
|
import numpy as np |
|
from tokenizers import AddedToken |
|
from tokenizers import Encoding as EncodingFast |
|
|
|
from .file_utils import ( |
|
add_end_docstrings, |
|
cached_path, |
|
hf_bucket_url, |
|
is_remote_url, |
|
is_tf_available, |
|
is_torch_available, |
|
torch_required, |
|
) |
|
|
|
|
|
if is_tf_available(): |
|
import tensorflow as tf |
|
if is_torch_available(): |
|
import torch |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
VERY_LARGE_INTEGER = int(1e30) |
|
LARGE_INTEGER = int(1e20) |
|
|
|
|
|
TextInput = str |
|
PreTokenizedInput = List[str] |
|
EncodedInput = List[int] |
|
TextInputPair = Tuple[str, str] |
|
PreTokenizedInputPair = Tuple[List[str], List[str]] |
|
EncodedInputPair = Tuple[List[int], List[int]] |
|
|
|
|
|
|
|
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" |
|
ADDED_TOKENS_FILE = "added_tokens.json" |
|
TOKENIZER_CONFIG_FILE = "tokenizer_config.json" |
|
|
|
|
|
FULL_TOKENIZER_FILE = "tokenizer.json" |
|
|
|
|
|
class ExplicitEnum(Enum): |
|
""" Enum with more explicit error message for missing values. |
|
""" |
|
|
|
@classmethod |
|
def _missing_(cls, value): |
|
raise ValueError( |
|
"%r is not a valid %s, please select one of %s" |
|
% (value, cls.__name__, str(list(cls._value2member_map_.keys()))) |
|
) |
|
|
|
|
|
class TruncationStrategy(ExplicitEnum): |
|
ONLY_FIRST = "only_first" |
|
ONLY_SECOND = "only_second" |
|
LONGEST_FIRST = "longest_first" |
|
DO_NOT_TRUNCATE = "do_not_truncate" |
|
|
|
|
|
class PaddingStrategy(ExplicitEnum): |
|
LONGEST = "longest" |
|
MAX_LENGTH = "max_length" |
|
DO_NOT_PAD = "do_not_pad" |
|
|
|
|
|
class TensorType(ExplicitEnum): |
|
PYTORCH = "pt" |
|
TENSORFLOW = "tf" |
|
NUMPY = "np" |
|
|
|
|
|
class CharSpan(NamedTuple): |
|
""" Character span in the original string |
|
|
|
Args: |
|
start: index of the first character in the original string |
|
end: index of the character following the last character in the original string |
|
""" |
|
|
|
start: int |
|
end: int |
|
|
|
|
|
class TokenSpan(NamedTuple): |
|
""" Token span in an encoded string (list of tokens) |
|
|
|
Args: |
|
start: index of the first token in the span |
|
end: index of the token following the last token in the span |
|
""" |
|
|
|
start: int |
|
end: int |
|
|
|
|
|
class BatchEncoding(UserDict): |
|
""" BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc). |
|
        This class is derived from a python dictionary and can be used as a dictionary.
        In addition, this class exposes utility methods to map from word/char space to token space.

        Args:
            data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask', ...)
            encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`):
                If the tokenizer is a fast tokenizer which outputs additional information like the mapping from word/char space to token space,
                the `EncodingFast` instance or list of instances (for batches) holds this information.
            tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`):
                You can give a tensor_type here to convert the lists of integers to PyTorch/TensorFlow/Numpy tensors at initialization.
            prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
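
        Example (a minimal usage sketch; ``BertTokenizerFast`` and the model name are only illustrative
        and assume the pretrained files can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            batch = tokenizer.batch_encode_plus(["Hello world!", "How are you?"], padding=True)

            batch["input_ids"]   # dict-style access to the padded ids
            batch.is_fast        # True, since a fast tokenizer produced backend encodings
            batch.tokens(0)      # tokens of the first sequence in the batch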
|
""" |
|
|
|
def __init__( |
|
self, |
|
data: Optional[Dict[str, Any]] = None, |
|
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, |
|
tensor_type: Union[None, str, TensorType] = None, |
|
prepend_batch_axis: bool = False, |
|
): |
|
super().__init__(data) |
|
|
|
if isinstance(encoding, EncodingFast): |
|
encoding = [encoding] |
|
|
|
self._encodings = encoding |
|
|
|
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) |
|
|
|
@property |
|
def is_fast(self): |
|
""" |
|
        Indicate whether this BatchEncoding was generated from the output of a PreTrainedTokenizerFast.

        Returns: True if generated from a subclass of PreTrainedTokenizerFast, False otherwise.
|
""" |
|
return self._encodings is not None |
|
|
|
def __getitem__(self, item: Union[int, str]) -> EncodingFast: |
|
""" If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...) |
|
If the key is an integer, get the EncodingFast for batch item with index `key` |
|
""" |
|
if isinstance(item, str): |
|
return self.data[item] |
|
elif self._encodings is not None: |
|
return self._encodings[item] |
|
else: |
|
raise KeyError( |
|
"Indexing with integers (to access backend Encoding for a given batch index) " |
|
"is not available when using Python based tokenizers" |
|
) |
|
|
|
def __getattr__(self, item: str): |
|
try: |
|
return self.data[item] |
|
except KeyError: |
|
raise AttributeError |
|
|
|
def __getstate__(self): |
|
return {"data": self.data, "encodings": self._encodings} |
|
|
|
def __setstate__(self, state): |
|
if "data" in state: |
|
self.data = state["data"] |
|
|
|
if "encodings" in state: |
|
self._encodings = state["encodings"] |
|
|
|
def keys(self): |
|
return self.data.keys() |
|
|
|
def values(self): |
|
return self.data.values() |
|
|
|
def items(self): |
|
return self.data.items() |
|
|
|
|
|
|
|
|
|
|
|
@property |
|
def encodings(self) -> Optional[List[EncodingFast]]: |
|
""" |
|
        Return the list of all encodings from the tokenization process.

        Returns: List[EncodingFast] or None if the input was tokenized through a Python (i.e. not fast) tokenizer
|
""" |
|
return self._encodings |
|
|
|
def tokens(self, batch_index: int = 0) -> List[str]: |
|
if not self._encodings: |
|
raise ValueError("tokens() is not available when using Python based tokenizers") |
|
return self._encodings[batch_index].tokens |
|
|
|
def words(self, batch_index: int = 0) -> List[Optional[int]]: |
|
if not self._encodings: |
|
raise ValueError("words() is not available when using Python based tokenizers") |
|
return self._encodings[batch_index].words |
|
|
|
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: |
|
""" |
|
        Get the index of the word corresponding to (i.e. comprising) an encoded token
        in a sequence of the batch.
|
|
|
Can be called as: |
|
|
|
- ``self.token_to_word(token_index)`` if batch size is 1 |
|
- ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1 |
|
|
|
This method is particularly suited when the input sequences are provided as |
|
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows |
|
to easily associate encoded tokens with provided tokenized words. |
|
|
|
Args: |
|
batch_or_token_index (:obj:`int`): |
|
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
|
token_index (:obj:`int`, `optional`): |
|
If a batch index is provided in `batch_or_token_index`, this can be the index |
|
of the token in the sequence. |
|
|
|
Returns: |
|
:obj:`int`: |
|
index of the word in the input sequence. |
|
|
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("token_to_word() is not available when using Python based tokenizers") |
|
if token_index is not None: |
|
batch_index = batch_or_token_index |
|
else: |
|
batch_index = 0 |
|
token_index = batch_or_token_index |
|
if batch_index < 0: |
|
batch_index = self._batch_size + batch_index |
|
if token_index < 0: |
|
token_index = self._seq_len + token_index |
|
return self._encodings[batch_index].token_to_word(token_index) |
|
|
|
def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan: |
|
""" |
|
Get the encoded token span corresponding to a word in the sequence of the batch. |
|
|
|
Token spans are returned as a TokenSpan NamedTuple with: |
|
|
|
- start: index of the first token |
|
- end: index of the token following the last token |
|
|
|
Can be called as: |
|
|
|
- ``self.word_to_tokens(word_index)`` if batch size is 1 |
|
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 |
|
|
|
This method is particularly suited when the input sequences are provided as |
|
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows |
|
to easily associate encoded tokens with provided tokenized words. |
|
|
|
Args: |
|
batch_or_word_index (:obj:`int`): |
|
Index of the sequence in the batch. If the batch only comprises one sequence, |
|
this can be the index of the word in the sequence |
|
word_index (:obj:`int`, `optional`): |
|
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.
|
|
|
Returns: |
|
:obj:`TokenSpan`: |
|
Span of tokens in the encoded sequence. |
|
|
|
:obj:`TokenSpan` are NamedTuple with: |
|
|
|
- start: index of the first token |
|
- end: index of the token following the last token |
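
        Example (a minimal sketch; ``BertTokenizerFast`` and the model name are only illustrative)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            encoded = tokenizer.encode_plus(["My", "name", "is", "Sylvain"], is_pretokenized=True)
            encoded.word_to_tokens(3)   # TokenSpan of the tokens encoding the word "Sylvain"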
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("word_to_tokens() is not available when using Python based tokenizers") |
|
if word_index is not None: |
|
batch_index = batch_or_word_index |
|
else: |
|
batch_index = 0 |
|
word_index = batch_or_word_index |
|
if batch_index < 0: |
|
batch_index = self._batch_size + batch_index |
|
if word_index < 0: |
|
word_index = self._seq_len + word_index |
|
return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index))) |
|
|
|
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: |
|
""" |
|
Get the character span corresponding to an encoded token in a sequence of the batch. |
|
|
|
Character spans are returned as a CharSpan NamedTuple with: |
|
|
|
- start: index of the first character in the original string associated to the token |
|
- end: index of the character following the last character in the original string associated to the token |
|
|
|
Can be called as: |
|
|
|
- ``self.token_to_chars(token_index)`` if batch size is 1 |
|
- ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1 |
|
|
|
Args: |
|
batch_or_token_index (:obj:`int`): |
|
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.
|
|
|
Returns: |
|
:obj:`CharSpan`: |
|
Span of characters in the original string. |
|
|
|
:obj:`CharSpan` are NamedTuple with: |
|
|
|
- start: index of the first character in the original string |
|
- end: index of the character following the last character in the original string |
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("token_to_chars() is not available when using Python based tokenizers") |
|
if token_index is not None: |
|
batch_index = batch_or_token_index |
|
else: |
|
batch_index = 0 |
|
token_index = batch_or_token_index |
|
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) |
|
|
|
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: |
|
""" |
|
Get the index of the token in the encoded output comprising a character |
|
in the original string for a sequence of the batch. |
|
|
|
Can be called as: |
|
|
|
- ``self.char_to_token(char_index)`` if batch size is 1 |
|
- ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 |
|
|
|
This method is particularly suited when the input sequences are provided as |
|
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows |
|
to easily associate encoded tokens with provided tokenized words. |
|
|
|
Args: |
|
batch_or_char_index (:obj:`int`): |
|
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.
|
|
|
|
|
Returns: |
|
:obj:`int`: Index of the token. |
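
        Example (a minimal sketch; ``BertTokenizerFast`` and the model name are only illustrative)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            encoded = tokenizer.encode_plus("Transformers are great")
            encoded.char_to_token(0)   # index of the token covering the first character of the text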
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("char_to_token() is not available when using Python based tokenizers") |
|
if char_index is not None: |
|
batch_index = batch_or_char_index |
|
else: |
|
batch_index = 0 |
|
char_index = batch_or_char_index |
|
return self._encodings[batch_index].char_to_token(char_index) |
|
|
|
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: |
|
""" |
|
Get the character span in the original string corresponding to given word in a sequence |
|
of the batch. |
|
|
|
Character spans are returned as a CharSpan NamedTuple with: |
|
|
|
- start: index of the first character in the original string |
|
- end: index of the character following the last character in the original string |
|
|
|
Can be called as: |
|
|
|
- ``self.word_to_chars(word_index)`` if batch size is 1 |
|
- ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1 |
|
|
|
Args: |
|
batch_or_word_index (:obj:`int`): |
|
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.
|
|
|
Returns: |
|
            :obj:`CharSpan`:
                Span of the corresponding characters in the original string.
                :obj:`CharSpan` is a NamedTuple with:

                - start: index of the first character associated to the word in the original string
                - end: index of the character following the last character associated to the word in the original string
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("word_to_chars() is not available when using Python based tokenizers") |
|
if word_index is not None: |
|
batch_index = batch_or_word_index |
|
else: |
|
batch_index = 0 |
|
word_index = batch_or_word_index |
|
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) |
|
|
|
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: |
|
""" |
|
Get the word in the original string corresponding to a character in the original string of |
|
a sequence of the batch. |
|
|
|
Can be called as: |
|
|
|
- ``self.char_to_word(char_index)`` if batch size is 1 |
|
- ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 |
|
|
|
This method is particularly suited when the input sequences are provided as |
|
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows |
|
to easily associate encoded tokens with provided tokenized words. |
|
|
|
Args: |
|
batch_or_char_index (:obj:`int`): |
|
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.
|
|
|
|
|
Returns: |
|
            :obj:`int`:
                Index of the word in the input sequence comprising the character.
|
""" |
|
|
|
if not self._encodings: |
|
raise ValueError("char_to_word() is not available when using Python based tokenizers") |
|
if char_index is not None: |
|
batch_index = batch_or_char_index |
|
else: |
|
batch_index = 0 |
|
char_index = batch_or_char_index |
|
return self._encodings[batch_index].char_to_word(char_index) |
|
|
|
def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False): |
|
if tensor_type is None: |
|
return self |
|
|
|
|
|
if not isinstance(tensor_type, TensorType): |
|
tensor_type = TensorType(tensor_type) |
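
        # Pick the tensor constructor for the requested framework
        # (tf.constant, torch.tensor or np.asarray).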
|
|
|
|
|
if tensor_type == TensorType.TENSORFLOW and is_tf_available(): |
|
as_tensor = tf.constant |
|
elif tensor_type == TensorType.PYTORCH and is_torch_available(): |
|
as_tensor = torch.tensor |
|
elif tensor_type == TensorType.NUMPY: |
|
as_tensor = np.asarray |
|
else: |
|
raise ImportError( |
|
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( |
|
tensor_type |
|
) |
|
) |
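
        # Do the tensor conversion in batch, optionally prepending a batch axis.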
|
|
|
|
|
for key, value in self.items(): |
|
try: |
|
if prepend_batch_axis: |
|
value = [value] |
|
|
|
tensor = as_tensor(value) |
|
|
|
|
|
if tensor.ndim > 2: |
|
tensor = tensor.squeeze(0) |
|
elif tensor.ndim < 2: |
|
tensor = tensor[None, :] |
|
|
|
self[key] = tensor |
|
            except Exception:
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' and 'truncation=True' to have batched tensors with the same length."
                )
|
|
|
return self |
|
|
|
@torch_required |
|
def to(self, device: str): |
|
"""Send all values to device by calling v.to(device)""" |
|
self.data = {k: v.to(device) for k, v in self.data.items()} |
|
return self |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpecialTokensMixin: |
|
""" SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and |
|
        handles specific behaviors related to special tokens. In particular, this class holds the
        attributes which can be used to directly access these special tokens in a
        model-independent manner and allows setting and updating the special tokens.
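
        Example (a minimal sketch; ``BertTokenizer`` and the model name are only illustrative)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.cls_token       # '[CLS]'
            tokenizer.mask_token_id   # id of the mask token in the vocabulary
            tokenizer.add_special_tokens({'additional_special_tokens': ['<special>']})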
|
""" |
|
|
|
SPECIAL_TOKENS_ATTRIBUTES = [ |
|
"bos_token", |
|
"eos_token", |
|
"unk_token", |
|
"sep_token", |
|
"pad_token", |
|
"cls_token", |
|
"mask_token", |
|
"additional_special_tokens", |
|
] |
|
|
|
def __init__(self, verbose=True, **kwargs): |
|
self._bos_token = None |
|
self._eos_token = None |
|
self._unk_token = None |
|
self._sep_token = None |
|
self._pad_token = None |
|
self._cls_token = None |
|
self._mask_token = None |
|
self._pad_token_type_id = 0 |
|
self._additional_special_tokens = [] |
|
self.verbose = verbose |
|
|
|
|
|
|
|
|
|
for key, value in kwargs.items(): |
|
if key in self.SPECIAL_TOKENS_ATTRIBUTES: |
|
if key == "additional_special_tokens": |
|
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) |
|
setattr(self, key, value) |
|
elif isinstance(value, (str, AddedToken)): |
|
setattr(self, key, value) |
|
else: |
|
raise TypeError( |
|
"special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) |
|
) |
|
|
|
def sanitize_special_tokens(self) -> int: |
|
""" Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token, tokenizer.cls_token, ...) |
|
are in the vocabulary. Add the missing ones to the vocabulary if needed. |
|
|
|
Return: |
|
            Number of tokens added to the vocabulary during the operation.
|
""" |
|
return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) |
|
|
|
def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: |
|
""" |
|
Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them |
|
to class attributes. If special tokens are NOT in the vocabulary, they are added |
|
to it (indexed starting from the last index of the current vocabulary). |
|
|
|
Using `add_special_tokens` will ensure your special tokens can be used in several ways: |
|
|
|
- special tokens are carefully handled by the tokenizer (they are never split) |
|
- you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. |
|
|
|
When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>') |
|
|
|
Args: |
|
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: |
|
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, |
|
``additional_special_tokens``]. |
|
|
|
                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
|
|
|
Returns: |
|
Number of tokens added to the vocabulary. |
|
|
|
Examples:: |
|
|
|
            # Let's see how to add a new classification token to GPT-2
            from transformers import GPT2Tokenizer, GPT2Model

            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            model = GPT2Model.from_pretrained('gpt2')
|
|
|
special_tokens_dict = {'cls_token': '<CLS>'} |
|
|
|
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) |
|
print('We have added', num_added_toks, 'tokens') |
|
model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. |
|
|
|
assert tokenizer.cls_token == '<CLS>' |
|
""" |
|
if not special_tokens_dict: |
|
return 0 |
|
|
|
added_tokens = 0 |
|
for key, value in special_tokens_dict.items(): |
|
assert key in self.SPECIAL_TOKENS_ATTRIBUTES |
|
|
|
if self.verbose: |
|
logger.info("Assigning %s to the %s key of the tokenizer", value, key) |
|
setattr(self, key, value) |
|
|
|
if key == "additional_special_tokens": |
|
assert isinstance(value, (list, tuple)) and all( |
|
isinstance(t, (str, AddedToken)) for t in value |
|
), f"Tokens {value} for key {key} should all be str or AddedToken instances" |
|
added_tokens += self.add_tokens(value, special_tokens=True) |
|
else: |
|
assert isinstance( |
|
value, (str, AddedToken) |
|
), f"Token {value} for key {key} should be a str or an AddedToken instance" |
|
added_tokens += self.add_tokens([value], special_tokens=True) |
|
|
|
return added_tokens |
|
|
|
def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int: |
|
""" |
|
Add a list of new tokens to the tokenizer class. If the new tokens are not in the |
|
vocabulary, they are added to it with indices starting from length of the current vocabulary. |
|
|
|
Args: |
|
            new_tokens: string or list of strings or :class:`~transformers.AddedToken`. Each string is a token to add.
                Tokens are only added if they are not already in the vocabulary. AddedToken wraps a string token to
                let you personalize its behavior (whether this token should only match against a single word, whether
                this token should strip all potential whitespaces on the left side, whether this token should strip
                all potential whitespaces on the right side, ...).
            special_tokens: can be used to specify if the token is a special token. This mostly changes the normalization
                behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance).
|
|
|
See details for :class:`~transformers.AddedToken` in HuggingFace tokenizers library. |
|
|
|
Returns: |
|
Number of tokens added to the vocabulary. |
|
|
|
Examples:: |
|
|
|
            # Let's see how to increase the vocabulary of the Bert model and tokenizer
            from transformers import BertTokenizerFast, BertModel

            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')
|
|
|
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) |
|
print('We have added', num_added_toks, 'tokens') |
|
model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. |
|
""" |
|
if not new_tokens: |
|
return 0 |
|
|
|
if not isinstance(new_tokens, (list, tuple)): |
|
new_tokens = [new_tokens] |
|
|
|
return self._add_tokens(new_tokens, special_tokens=special_tokens) |
|
|
|
@property |
|
def bos_token(self): |
|
""" Beginning of sentence token (string). Log an error if used while not having been set. """ |
|
if self._bos_token is None and self.verbose: |
|
logger.error("Using bos_token, but it is not set yet.") |
|
return None |
|
return str(self._bos_token) |
|
|
|
@property |
|
def eos_token(self): |
|
""" End of sentence token (string). Log an error if used while not having been set. """ |
|
if self._eos_token is None and self.verbose: |
|
logger.error("Using eos_token, but it is not set yet.") |
|
return None |
|
return str(self._eos_token) |
|
|
|
@property |
|
def unk_token(self): |
|
""" Unknown token (string). Log an error if used while not having been set. """ |
|
if self._unk_token is None and self.verbose: |
|
logger.error("Using unk_token, but it is not set yet.") |
|
return None |
|
return str(self._unk_token) |
|
|
|
@property |
|
def sep_token(self): |
|
""" Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ |
|
if self._sep_token is None and self.verbose: |
|
logger.error("Using sep_token, but it is not set yet.") |
|
return None |
|
return str(self._sep_token) |
|
|
|
@property |
|
def pad_token(self): |
|
""" Padding token (string). Log an error if used while not having been set. """ |
|
if self._pad_token is None and self.verbose: |
|
logger.error("Using pad_token, but it is not set yet.") |
|
return None |
|
return str(self._pad_token) |
|
|
|
@property |
|
def cls_token(self): |
|
""" Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ |
|
if self._cls_token is None and self.verbose: |
|
logger.error("Using cls_token, but it is not set yet.") |
|
return None |
|
return str(self._cls_token) |
|
|
|
@property |
|
def mask_token(self): |
|
""" Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ |
|
if self._mask_token is None and self.verbose: |
|
logger.error("Using mask_token, but it is not set yet.") |
|
return None |
|
return str(self._mask_token) |
|
|
|
@property |
|
def additional_special_tokens(self): |
|
""" All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ |
|
if self._additional_special_tokens is None and self.verbose: |
|
logger.error("Using additional_special_tokens, but it is not set yet.") |
|
return None |
|
return [str(tok) for tok in self._additional_special_tokens] |
|
|
|
@bos_token.setter |
|
def bos_token(self, value): |
|
self._bos_token = value |
|
|
|
@eos_token.setter |
|
def eos_token(self, value): |
|
self._eos_token = value |
|
|
|
@unk_token.setter |
|
def unk_token(self, value): |
|
self._unk_token = value |
|
|
|
@sep_token.setter |
|
def sep_token(self, value): |
|
self._sep_token = value |
|
|
|
@pad_token.setter |
|
def pad_token(self, value): |
|
self._pad_token = value |
|
|
|
@cls_token.setter |
|
def cls_token(self, value): |
|
self._cls_token = value |
|
|
|
@mask_token.setter |
|
def mask_token(self, value): |
|
self._mask_token = value |
|
|
|
@additional_special_tokens.setter |
|
def additional_special_tokens(self, value): |
|
self._additional_special_tokens = value |
|
|
|
@property |
|
def bos_token_id(self): |
|
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ |
|
if self._bos_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.bos_token) |
|
|
|
@property |
|
def eos_token_id(self): |
|
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ |
|
if self._eos_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.eos_token) |
|
|
|
@property |
|
def unk_token_id(self): |
|
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ |
|
if self._unk_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.unk_token) |
|
|
|
@property |
|
def sep_token_id(self): |
|
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ |
|
if self._sep_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.sep_token) |
|
|
|
@property |
|
def pad_token_id(self): |
|
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """ |
|
if self._pad_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.pad_token) |
|
|
|
@property |
|
def pad_token_type_id(self): |
|
""" Id of the padding token type in the vocabulary.""" |
|
return self._pad_token_type_id |
|
|
|
@property |
|
def cls_token_id(self): |
|
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ |
|
if self._cls_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.cls_token) |
|
|
|
@property |
|
def mask_token_id(self): |
|
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ |
|
if self._mask_token is None: |
|
return None |
|
return self.convert_tokens_to_ids(self.mask_token) |
|
|
|
@property |
|
def additional_special_tokens_ids(self): |
|
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ |
|
return self.convert_tokens_to_ids(self.additional_special_tokens) |
|
|
|
@property |
|
def special_tokens_map(self): |
|
""" A dictionary mapping special token class attribute (cls_token, unk_token...) to their |
|
            values ('<unk>', '<cls>', ...).
            Tokens of AddedToken type are converted to strings; all returned tokens are strings.
|
""" |
|
set_attr = {} |
|
for attr in self.SPECIAL_TOKENS_ATTRIBUTES: |
|
attr_value = getattr(self, "_" + attr) |
|
if attr_value: |
|
set_attr[attr] = str(attr_value) |
|
return set_attr |
|
|
|
@property |
|
def special_tokens_map_extended(self): |
|
""" A dictionary mapping special token class attribute (cls_token, unk_token...) to their |
|
values ('<unk>', '<cls>'...) |
|
Keep the tokens as AddedToken if they are of this type. |
|
|
|
AddedToken can be used to control more finely how special tokens are tokenized. |
|
""" |
|
set_attr = {} |
|
for attr in self.SPECIAL_TOKENS_ATTRIBUTES: |
|
attr_value = getattr(self, "_" + attr) |
|
if attr_value: |
|
set_attr[attr] = attr_value |
|
return set_attr |
|
|
|
@property |
|
def all_special_tokens(self): |
|
""" List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes |
|
            (cls_token, unk_token, ...).
            Tokens of AddedToken type are converted to strings; all returned tokens are strings.
|
""" |
|
all_toks = [str(s) for s in self.all_special_tokens_extended] |
|
return all_toks |
|
|
|
@property |
|
def all_special_tokens_extended(self): |
|
""" List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes |
|
Keep the tokens as AddedToken if they are of this type. |
|
|
|
AddedToken can be used to control more finely how special tokens are tokenized. |
|
""" |
|
all_toks = [] |
|
set_attr = self.special_tokens_map_extended |
|
for attr_value in set_attr.values(): |
|
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) |
|
all_toks = list(set(all_toks)) |
|
return all_toks |
|
|
|
@property |
|
def all_special_ids(self): |
|
""" List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to |
|
class attributes (cls_token, unk_token...). |
|
""" |
|
all_toks = self.all_special_tokens |
|
all_ids = self.convert_tokens_to_ids(all_toks) |
|
return all_ids |
|
|
|
|
|
ENCODE_KWARGS_DOCSTRING = r""" |
|
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): |
|
If set to ``True``, the sequences will be encoded with the special tokens relative |
|
to their model. |
|
`padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): |
|
Activate and control padding. Accepts the following values: |
|
|
|
                * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence is provided),
|
* `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`) |
|
* `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths) |
|
`truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): |
|
Activate and control truncation. Accepts the following values: |
|
|
|
* `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided, |
|
* `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided, |
|
* `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided, |
|
* `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size) |
|
`max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`): |
|
Control the length for padding/truncation. Accepts the following values |
|
|
|
* `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated. |
|
* `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters. |
|
stride (:obj:`int`, `optional`, defaults to ``0``): |
|
                If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True`
                will contain some tokens from the end of the truncated sequence, to provide some overlap between truncated and overflowing sequences.
                The value of this argument defines the number of overlapping tokens.
|
is_pretokenized (:obj:`bool`, defaults to :obj:`False`): |
|
Set to True to indicate the input is already tokenized |
|
            pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
|
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability |
|
>= 7.5 (Volta). |
|
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): |
|
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, |
|
                PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers.
|
""" |
|
|
|
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" |
|
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): |
|
Whether to return token type IDs. If left to the default, will return the token type IDs according |
|
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. |
|
|
|
`What are token type IDs? <../glossary.html#token-type-ids>`_ |
|
            return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
Whether to return the attention mask. If left to the default, will return the attention mask according |
|
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. |
|
|
|
`What are attention masks? <../glossary.html#attention-mask>`__ |
|
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): |
|
Set to True to return overflowing token sequences (default False). |
|
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): |
|
Set to True to return special tokens mask information (default False). |
|
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): |
|
Set to True to return (char_start, char_end) for each token (default False). |
|
                If using a Python-based (slow) tokenizer, this method will raise NotImplementedError.
                This option is only available on fast tokenizers inheriting from PreTrainedTokenizerFast.
|
**kwargs: passed to the `self.tokenize()` method |
|
|
|
Return: |
|
A Dictionary of shape:: |
|
|
|
{ |
|
input_ids: list[int], |
|
token_type_ids: list[int] if return_token_type_ids is True (default) |
|
attention_mask: list[int] if return_attention_mask is True (default) |
|
                    overflowing_tokens: list[int] if the tokenizer is a slow tokenizer, else a List[List[int]], if a ``max_length`` is specified and ``return_overflowing_tokens=True``
|
                    special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
                    and return_special_tokens_mask is True
|
} |
|
|
|
With the fields: |
|
|
|
- ``input_ids``: list of token ids to be fed to a model |
|
- ``token_type_ids``: list of token type ids to be fed to a model |
|
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model |
|
- ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``. |
|
            - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
              tokens and 0 specifying regular sequence tokens.
|
""" |
|
|
|
|
|
class PreTrainedTokenizerBase(SpecialTokensMixin): |
|
""" Base class for slow and fast tokenizers. |
|
|
|
        Handles shared (mostly boilerplate) methods for slow and fast tokenizers.
|
""" |
|
|
|
vocab_files_names: Dict[str, str] = {} |
|
pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} |
|
pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} |
|
max_model_input_sizes: Dict[str, int] = {} |
|
model_input_names: List[str] = ["token_type_ids", "attention_mask"] |
|
|
|
padding_side: str = "right" |
|
|
|
def __init__(self, **kwargs): |
|
|
|
self.init_inputs = () |
|
self.init_kwargs = kwargs |
|
|
|
|
|
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) |
|
self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER |
|
|
|
|
|
self.padding_side = kwargs.pop("padding_side", self.padding_side) |
|
assert self.padding_side in [ |
|
"right", |
|
"left", |
|
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" |
|
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) |
|
|
|
super().__init__(**kwargs) |
|
|
|
@property |
|
def max_len(self) -> int: |
|
""" Kept here for backward compatibility. |
|
Now renamed to `model_max_length` to avoid ambiguity. |
|
""" |
|
return self.model_max_length |
|
|
|
@property |
|
def max_len_single_sentence(self) -> int: |
|
return self.model_max_length - self.num_special_tokens_to_add(pair=False) |
|
|
|
@property |
|
def max_len_sentences_pair(self) -> int: |
|
return self.model_max_length - self.num_special_tokens_to_add(pair=True) |
|
|
|
@max_len_single_sentence.setter |
|
def max_len_single_sentence(self, value) -> int: |
|
""" For backward compatibility, allow to try to setup 'max_len_single_sentence' """ |
|
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: |
|
logger.warning( |
|
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." |
|
) |
|
else: |
|
raise ValueError( |
|
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." |
|
) |
|
|
|
@max_len_sentences_pair.setter |
|
def max_len_sentences_pair(self, value) -> int: |
|
""" For backward compatibility, allow to try to setup 'max_len_sentences_pair' """ |
|
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: |
|
logger.warning( |
|
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." |
|
) |
|
else: |
|
raise ValueError( |
|
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." |
|
) |
|
|
|
@classmethod |
|
def from_pretrained(cls, *inputs, **kwargs): |
|
r""" |
|
Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. |
|
|
|
Args: |
|
pretrained_model_name_or_path: either: |
|
|
|
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. |
|
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. |
|
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. |
|
- (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. |
|
|
|
cache_dir: (`optional`) string: |
|
Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. |
|
|
|
force_download: (`optional`) boolean, default False: |
|
                Force to (re-)download the vocabulary files and override the cached versions if they exist.
|
|
|
resume_download: (`optional`) boolean, default False: |
|
                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
|
|
proxies: (`optional`) dict, default None: |
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. |
|
The proxies are used on each request. |
|
|
|
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. |
|
|
|
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. |
|
|
|
Examples:: |
|
|
|
# We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer |
|
|
|
# Download vocabulary from S3 and cache. |
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
|
|
|
# Download vocabulary from S3 (user-uploaded) and cache. |
|
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') |
|
|
|
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) |
|
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') |
|
|
|
# If the tokenizer uses a single vocabulary file, you can point directly to this file |
|
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') |
|
|
|
# You can link tokens to special vocabulary when instantiating |
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>') |
|
# You should be sure '<unk>' is in the vocabulary when doing that. |
|
# Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead) |
|
assert tokenizer.unk_token == '<unk>' |
|
|
|
""" |
|
return cls._from_pretrained(*inputs, **kwargs) |
|
|
|
@classmethod |
|
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): |
|
cache_dir = kwargs.pop("cache_dir", None) |
|
force_download = kwargs.pop("force_download", False) |
|
resume_download = kwargs.pop("resume_download", False) |
|
proxies = kwargs.pop("proxies", None) |
|
local_files_only = kwargs.pop("local_files_only", False) |
|
|
|
s3_models = list(cls.max_model_input_sizes.keys()) |
|
vocab_files = {} |
|
init_configuration = {} |
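
        # If the name is a known shortcut, use the predefined vocabulary file URLs and init configuration.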
|
if pretrained_model_name_or_path in s3_models: |
|
|
|
for file_id, map_list in cls.pretrained_vocab_files_map.items(): |
|
vocab_files[file_id] = map_list[pretrained_model_name_or_path] |
|
if ( |
|
cls.pretrained_init_configuration |
|
and pretrained_model_name_or_path in cls.pretrained_init_configuration |
|
): |
|
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() |
|
else: |
|
|
|
logger.info( |
|
"Model name '{}' not found in model shortcut name list ({}). " |
|
"Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( |
|
pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path |
|
) |
|
) |
|
|
|
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): |
|
if len(cls.vocab_files_names) > 1: |
|
                    raise ValueError(
                        "Calling {}.from_pretrained() with the path to a single file or url is not supported. "
                        "Use a model identifier or the path to a directory instead.".format(cls.__name__)
                    )
|
logger.warning( |
|
"Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( |
|
cls.__name__ |
|
) |
|
) |
|
file_id = list(cls.vocab_files_names.keys())[0] |
|
vocab_files[file_id] = pretrained_model_name_or_path |
|
else: |
|
|
|
additional_files_names = { |
|
"added_tokens_file": ADDED_TOKENS_FILE, |
|
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, |
|
"tokenizer_config_file": TOKENIZER_CONFIG_FILE, |
|
"full_tokenizer_file": FULL_TOKENIZER_FILE, |
|
} |
|
|
|
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): |
|
if os.path.isdir(pretrained_model_name_or_path): |
|
full_file_name = os.path.join(pretrained_model_name_or_path, file_name) |
|
if not os.path.exists(full_file_name): |
|
logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) |
|
full_file_name = None |
|
else: |
|
full_file_name = hf_bucket_url( |
|
pretrained_model_name_or_path, filename=file_name, use_cdn=False |
|
) |
|
|
|
vocab_files[file_id] = full_file_name |
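
        # Resolve the tokenizer files: download them, or use the cached / local copies.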
|
|
|
|
|
try: |
|
resolved_vocab_files = {} |
|
for file_id, file_path in vocab_files.items(): |
|
if file_path is None: |
|
resolved_vocab_files[file_id] = None |
|
else: |
|
resolved_vocab_files[file_id] = cached_path( |
|
file_path, |
|
cache_dir=cache_dir, |
|
force_download=force_download, |
|
proxies=proxies, |
|
resume_download=resume_download, |
|
local_files_only=local_files_only, |
|
) |
|
except EnvironmentError: |
|
if pretrained_model_name_or_path in s3_models: |
|
                msg = "Couldn't reach the server to download vocabulary files for '{}'.".format(pretrained_model_name_or_path)
|
else: |
|
msg = ( |
|
"Model name '{}' was not found in tokenizers model name list ({}). " |
|
"We assumed '{}' was a path or url to a directory containing vocabulary files " |
|
"named {}, but couldn't find such vocabulary files at this path or url.".format( |
|
pretrained_model_name_or_path, |
|
", ".join(s3_models), |
|
pretrained_model_name_or_path, |
|
list(cls.vocab_files_names.values()), |
|
) |
|
) |
|
|
|
raise EnvironmentError(msg) |
|
|
|
if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): |
|
raise EnvironmentError( |
|
"Model name '{}' was not found in tokenizers model name list ({}). " |
|
"We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " |
|
"named {} but couldn't find such vocabulary files at this path or url.".format( |
|
pretrained_model_name_or_path, |
|
", ".join(s3_models), |
|
pretrained_model_name_or_path, |
|
list(cls.vocab_files_names.values()), |
|
) |
|
) |
|
|
|
for file_id, file_path in vocab_files.items(): |
|
if file_path == resolved_vocab_files[file_id]: |
|
logger.info("loading file {}".format(file_path)) |
|
else: |
|
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) |
|
|
|
|
|
|
|
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) |
|
if tokenizer_config_file is not None: |
|
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: |
|
init_kwargs = json.load(tokenizer_config_handle) |
|
saved_init_inputs = init_kwargs.pop("init_inputs", ()) |
|
if not init_inputs: |
|
init_inputs = saved_init_inputs |
|
else: |
|
init_kwargs = init_configuration |
|
|
|
|
|
init_kwargs.update(kwargs) |
|
|
|
|
|
if pretrained_model_name_or_path in cls.max_model_input_sizes: |
|
|
|
|
|
model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] |
|
if model_max_length is not None and isinstance(model_max_length, (int, float)): |
|
init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) |
|
|
|
|
|
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) |
|
for args_name, file_path in resolved_vocab_files.items(): |
|
if args_name not in init_kwargs: |
|
init_kwargs[args_name] = file_path |
|
|
|
|
|
try: |
|
tokenizer = cls(*init_inputs, **init_kwargs) |
|
except OSError: |
|
raise OSError( |
|
"Unable to load vocabulary from file. " |
|
"Please check that the provided vocabulary is accessible and not corrupted." |
|
) |
|
|
|
|
|
tokenizer.init_inputs = init_inputs |
|
tokenizer.init_kwargs = init_kwargs |
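
        # If present, load the special tokens map and the added tokens saved alongside the vocabulary.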
|
|
|
|
|
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) |
|
if special_tokens_map_file is not None: |
|
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: |
|
special_tokens_map = json.load(special_tokens_map_handle) |
|
|
|
for key, value in special_tokens_map.items(): |
|
if isinstance(value, dict): |
|
value = AddedToken(**value) |
|
setattr(tokenizer, key, value) |
|
|
|
|
|
special_tokens = tokenizer.all_special_tokens |
|
if added_tokens_file is not None: |
|
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: |
|
added_tok_encoder = json.load(added_tokens_handle) |
|
|
|
|
|
added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) |
|
|
|
for token, index in added_tok_encoder_sorted: |
|
assert index == len(tokenizer), ( |
|
f"Non-consecutive added token '{token}' found. " |
|
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." |
|
) |
|
tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) |
|
|
|
|
|
added_tokens = tokenizer.sanitize_special_tokens() |
|
if added_tokens: |
|
            logger.warning(
                "Special tokens have been added to the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
            )
|
|
|
return tokenizer |
|
|
|
def save_pretrained(self, save_directory) -> Tuple[str]: |
|
""" Save the tokenizer vocabulary files together with: |
|
- added tokens, |
|
- special-tokens-to-class-attributes-mapping, |
|
            - tokenizer instantiation positional and keyword inputs (e.g. do_lower_case for Bert).
|
|
|
Warning: This won't save modifications you may have applied to the tokenizer after the instantiation |
|
(e.g. modifying tokenizer.do_lower_case after creation). |
|
|
|
            This method makes sure the full tokenizer can then be re-loaded using the
|
:func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. |
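
            Example (the directory path is only illustrative)::

                tokenizer.save_pretrained('./my_model_directory/')
                # reload it later with the matching ``from_pretrained`` class method, e.g.
                # tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')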
|
""" |
|
if os.path.isfile(save_directory): |
|
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) |
|
return |
|
os.makedirs(save_directory, exist_ok=True) |
|
|
|
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) |
|
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) |
|
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) |
|
|
|
tokenizer_config = copy.deepcopy(self.init_kwargs) |
|
if len(self.init_inputs) > 0: |
|
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) |
|
for file_id in self.vocab_files_names.keys(): |
|
tokenizer_config.pop(file_id, None) |
|
|
|
with open(tokenizer_config_file, "w", encoding="utf-8") as f: |
|
f.write(json.dumps(tokenizer_config, ensure_ascii=False)) |
|
|
|
with open(special_tokens_map_file, "w", encoding="utf-8") as f: |
|
write_dict = {} |
|
for key, value in self.special_tokens_map_extended.items(): |
|
if isinstance(value, AddedToken): |
|
write_dict[key] = value.__getstate__() |
|
else: |
|
write_dict[key] = value |
|
f.write(json.dumps(write_dict, ensure_ascii=False)) |
|
|
|
added_vocab = self.get_added_vocab() |
|
if added_vocab: |
|
with open(added_tokens_file, "w", encoding="utf-8") as f: |
|
out_str = json.dumps(added_vocab, ensure_ascii=False) |
|
f.write(out_str) |
|
|
|
vocab_files = self.save_vocabulary(save_directory) |
|
|
|
return vocab_files + (special_tokens_map_file, added_tokens_file) |
|
|
|
@add_end_docstrings( |
|
ENCODE_KWARGS_DOCSTRING, |
|
""" |
|
**kwargs: passed to the `self.tokenize()` method. |
|
""", |
|
) |
|
def encode( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str] = False, |
|
truncation: Union[bool, str] = False, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
**kwargs |
|
): |
|
""" |
|
        Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.
|
|
|
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. |
|
|
|
Args: |
|
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): |
|
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using |
|
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` |
|
method) |
|
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): |
|
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized |
|
string using the `tokenize` method) or a list of integers (tokenized string ids using the |
|
`convert_tokens_to_ids` method) |
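
        Example (a minimal sketch; ``BertTokenizer`` and the model name are only illustrative)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world", add_special_tokens=True)
            # `ids` is a list of vocabulary indices, including the model's special tokens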
|
""" |
|
encoded_inputs = self.encode_plus( |
|
text, |
|
text_pair=text_pair, |
|
add_special_tokens=add_special_tokens, |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
stride=stride, |
|
return_tensors=return_tensors, |
|
**kwargs, |
|
) |
|
|
|
return encoded_inputs["input_ids"] |
|
|
|
def num_special_tokens_to_add(self, pair: bool = False) -> int: |
|
raise NotImplementedError |
|
|
|
def _get_padding_truncation_strategies( |
|
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs |
|
): |
|
""" Find the correct padding/truncation strategy with backward compatibility |
|
for old arguments (truncation_strategy and pad_to_max_length) and behaviors. |
|
""" |
|
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") |
|
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) |
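
        # Backward compatibility: if `max_length` is given but truncation and padding are not explicitly
        # activated, default to truncating with the 'longest_first' strategy (previous behavior).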
|
|
|
|
|
|
|
if max_length is not None and padding is False and truncation is False: |
|
if verbose: |
|
                logger.warning(
                    "Truncation was not explicitly activated but `max_length` is provided a specific value, "
                    "please use `truncation=True` to explicitly truncate examples to max length. "
                    "Defaulting to 'longest_first' truncation strategy. "
                    "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                    "more precisely by providing a specific strategy to `truncation`."
                )
|
truncation = "longest_first" |
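
        # Resolve the padding strategy (handling the deprecated `pad_to_max_length` argument).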
|
|
|
|
|
if padding is False and old_pad_to_max_length: |
|
if verbose: |
|
warnings.warn( |
|
"The `pad_to_max_length` argument is deprecated and will be removed in a future version, " |
|
"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " |
|
"use `padding='max_length'` to pad to a max length. In this case, you can give a specific " |
|
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " |
|
"maximal input size of the model (e.g. 512 for Bert).", |
|
DeprecationWarning, |
|
) |
|
if max_length is None: |
|
padding_strategy = PaddingStrategy.LONGEST |
|
else: |
|
padding_strategy = PaddingStrategy.MAX_LENGTH |
|
elif padding is not False: |
|
if padding is True: |
|
padding_strategy = PaddingStrategy.LONGEST |
|
elif not isinstance(padding, PaddingStrategy): |
|
padding_strategy = PaddingStrategy(padding) |
|
else: |
|
padding_strategy = PaddingStrategy.DO_NOT_PAD |
|
|
|
|
|
if truncation is False and old_truncation_strategy != "do_not_truncate": |
|
if verbose: |
|
warnings.warn( |
|
"The `truncation_strategy` argument is deprecated and will be removed in a future version, " |
|
"use `truncation=True` to truncate examples to a max length. You can give a specific " |
|
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " |
|
"maximal input size of the model (e.g. 512 for Bert). " |
|
" If you have pairs of inputs, you can give a specific truncation strategy selected among " |
|
"`truncation='only_first'` (will only truncate the first sentence in the pairs) " |
|
"`truncation='only_second'` (will only truncate the second sentence in the pairs) " |
|
"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", |
|
DeprecationWarning, |
|
) |
|
truncation_strategy = TruncationStrategy(old_truncation_strategy) |
|
elif truncation is not False: |
|
if truncation is True: |
|
truncation_strategy = ( |
|
TruncationStrategy.LONGEST_FIRST |
|
) |
|
elif not isinstance(truncation, TruncationStrategy): |
|
truncation_strategy = TruncationStrategy(truncation) |
|
else: |
|
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE |
|
|
|
|
|
if max_length is None: |
|
if padding_strategy == PaddingStrategy.MAX_LENGTH: |
|
if self.model_max_length > LARGE_INTEGER: |
|
if verbose: |
|
logger.warning( |
|
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " |
|
"Default to no padding." |
|
) |
|
padding_strategy = PaddingStrategy.DO_NOT_PAD |
|
else: |
|
max_length = self.model_max_length |
|
|
|
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: |
|
if self.model_max_length > LARGE_INTEGER: |
|
if verbose: |
|
logger.warning( |
|
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " |
|
"Default to no truncation." |
|
) |
|
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE |
|
else: |
|
max_length = self.model_max_length |
|
|
|
|
|
if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): |
|
raise ValueError( |
|
"Asking to pad but the tokenizer does not have a padding token. " |
|
"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " |
|
"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." |
|
) |
|
|
|
|
|
if ( |
|
truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE |
|
and padding_strategy != PaddingStrategy.DO_NOT_PAD |
|
and pad_to_multiple_of is not None |
|
and max_length is not None |
|
and (max_length % pad_to_multiple_of != 0) |
|
): |
|
raise ValueError( |
|
f"Truncation and padding are both activated but " |
|
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." |
|
) |
|
|
|
return padding_strategy, truncation_strategy, max_length, kwargs |
|
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
|
def __call__( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str] = False, |
|
truncation: Union[bool, str] = False, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
""" |
|
Returns a dictionary containing the encoded sequence or sequence pair and additional information: |
|
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. |
|
|
|
Args: |
|
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
|
The sequence or batch of sequences to be encoded. |
|
Each sequence can be a string or a list of strings (pre-tokenized string). |
|
If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` |
|
(to lift the ambiguity with a batch of sequences) |
|
            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
|
The sequence or batch of sequences to be encoded. |
|
Each sequence can be a string or a list of strings (pre-tokenized string). |
|
If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` |
|
(to lift the ambiguity with a batch of sequences) |
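
        Example (illustrative; ``tokenizer`` is assumed to be an instantiated tokenizer)::

            # single sequence
            encoding = tokenizer("Hello world")
            # batch of sequences, padded to the longest one in the batch
            batch = tokenizer(["Hello world", "How are you?"], padding=True)
            # a single pre-tokenized example must be flagged explicitly
            encoding = tokenizer(["Hello", "world"], is_pretokenized=True)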
|
""" |
|
|
|
assert isinstance(text, str) or ( |
|
isinstance(text, (list, tuple)) |
|
and ( |
|
len(text) == 0 |
|
or ( |
|
isinstance(text[0], str) |
|
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str))) |
|
) |
|
) |
|
), ( |
|
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " |
|
"or `List[List[str]]` (batch of pretokenized examples)." |
|
) |
|
|
|
assert ( |
|
text_pair is None |
|
or isinstance(text_pair, str) |
|
or ( |
|
isinstance(text_pair, (list, tuple)) |
|
and ( |
|
len(text_pair) == 0 |
|
or ( |
|
isinstance(text_pair[0], str) |
|
or ( |
|
isinstance(text_pair[0], (list, tuple)) |
|
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) |
|
) |
|
) |
|
) |
|
) |
|
), ( |
|
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " |
|
"or `List[List[str]]` (batch of pretokenized examples)." |
|
) |
|
|
|
is_batched = bool( |
|
(not is_pretokenized and isinstance(text, (list, tuple))) |
|
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))) |
|
) |
|
|
|
if is_batched: |
|
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text |
|
return self.batch_encode_plus( |
|
batch_text_or_text_pairs=batch_text_or_text_pairs, |
|
add_special_tokens=add_special_tokens, |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
stride=stride, |
|
is_pretokenized=is_pretokenized, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
return_token_type_ids=return_token_type_ids, |
|
return_attention_mask=return_attention_mask, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_offsets_mapping=return_offsets_mapping, |
|
return_length=return_length, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
else: |
|
return self.encode_plus( |
|
text=text, |
|
text_pair=text_pair, |
|
add_special_tokens=add_special_tokens, |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
stride=stride, |
|
is_pretokenized=is_pretokenized, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
return_token_type_ids=return_token_type_ids, |
|
return_attention_mask=return_attention_mask, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_offsets_mapping=return_offsets_mapping, |
|
return_length=return_length, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
|
def encode_plus( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str] = False, |
|
truncation: Union[bool, str] = False, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
""" |
|
Returns a dictionary containing the encoded sequence or sequence pair and additional information: |
|
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. |
|
|
|
Args: |
|
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for non-fast tokenizers)):
|
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using |
|
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` |
|
method) |
|
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): |
|
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized |
|
string using the `tokenize` method) or a list of integers (tokenized string ids using the |
|
`convert_tokens_to_ids` method) |
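
        Example (illustrative sketch; ``tokenizer`` is assumed to be instantiated)::

            encoding = tokenizer.encode_plus(
                "Hello world",
                text_pair="How are you?",
                truncation=True,
                max_length=16,
            )
            # `encoding` is a BatchEncoding containing at least "input_ids" and,
            # depending on the model, "token_type_ids" and/or "attention_mask".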
|
""" |
|
|
|
|
|
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
return self._encode_plus( |
|
text=text, |
|
text_pair=text_pair, |
|
add_special_tokens=add_special_tokens, |
|
padding_strategy=padding_strategy, |
|
truncation_strategy=truncation_strategy, |
|
max_length=max_length, |
|
stride=stride, |
|
is_pretokenized=is_pretokenized, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
return_token_type_ids=return_token_type_ids, |
|
return_attention_mask=return_attention_mask, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_offsets_mapping=return_offsets_mapping, |
|
return_length=return_length, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
def _encode_plus( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
raise NotImplementedError |
|
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
|
def batch_encode_plus( |
|
self, |
|
batch_text_or_text_pairs: Union[ |
|
List[TextInput], |
|
List[TextInputPair], |
|
List[PreTokenizedInput], |
|
List[PreTokenizedInputPair], |
|
List[EncodedInput], |
|
List[EncodedInputPair], |
|
], |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str] = False, |
|
truncation: Union[bool, str] = False, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
""" |
|
Returns a dictionary containing the encoded sequence or sequence pair and additional information: |
|
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. |
|
|
|
Args: |
|
batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, |
|
:obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, |
|
                and for non-fast tokenizers, also:
|
:obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): |
|
Batch of sequences or pair of sequences to be encoded. |
|
                This can be a list of strings/string-sequences/int-sequences or a list of pairs of

                strings/string-sequences/int-sequences (see details in encode_plus)
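
        Example (illustrative; ``tokenizer`` is assumed to be instantiated)::

            batch = tokenizer.batch_encode_plus(
                [("Hello world", "How are you?"), ("A second pair", "Also fine")],
                padding="longest",
                truncation=True,
            )
            # batch["input_ids"] is a list with one entry per input pair.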
|
""" |
|
|
|
|
|
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
return self._batch_encode_plus( |
|
batch_text_or_text_pairs=batch_text_or_text_pairs, |
|
add_special_tokens=add_special_tokens, |
|
padding_strategy=padding_strategy, |
|
truncation_strategy=truncation_strategy, |
|
max_length=max_length, |
|
stride=stride, |
|
is_pretokenized=is_pretokenized, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
return_token_type_ids=return_token_type_ids, |
|
return_attention_mask=return_attention_mask, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_offsets_mapping=return_offsets_mapping, |
|
return_length=return_length, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
def _batch_encode_plus( |
|
self, |
|
batch_text_or_text_pairs: Union[ |
|
List[TextInput], |
|
List[TextInputPair], |
|
List[PreTokenizedInput], |
|
List[PreTokenizedInputPair], |
|
List[EncodedInput], |
|
List[EncodedInputPair], |
|
], |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_pretokenized: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs |
|
) -> BatchEncoding: |
|
raise NotImplementedError |
|
|
|
def pad( |
|
self, |
|
encoded_inputs: Union[ |
|
BatchEncoding, |
|
List[BatchEncoding], |
|
Dict[str, EncodedInput], |
|
Dict[str, List[EncodedInput]], |
|
List[Dict[str, EncodedInput]], |
|
], |
|
padding: Union[bool, str] = True, |
|
max_length: Optional[int] = None, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
verbose: bool = True, |
|
) -> BatchEncoding: |
|
""" Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. |
|
|
|
        Padding side (left/right) and padding token ids are defined at the tokenizer level
|
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) |
|
|
|
Args: |
|
encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs. |
|
Batch of tokenized inputs can be given as dicts of lists or lists of dicts, both work so you can |
|
use ``tokenizer.pad()`` during pre-processing as well as in a PyTorch Dataloader collate function. |
|
(`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`). |
|
padding: Boolean or specific strategy to use for padding. |
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: |
|
                - 'longest' (or `True`, the default): Pad to the longest sequence in the batch

                - 'max_length': Pad to a maximum length specified with `max_length`

                - 'do_not_pad' (or `False`): Do not pad
|
max_length: maximum length of the returned list and optionally padding length (see below). |
|
Will truncate by taking into account the special tokens. |
|
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. |
|
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability |
|
>= 7.5 (Volta). |
|
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) |
|
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): |
|
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, |
|
                PyTorch :obj:`torch.Tensor` or NumPy :obj:`np.ndarray` instead of a list of python integers.
|
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): |
|
Set to ``False`` to avoid printing infos and warnings. |
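
        Example (illustrative; e.g. as a collate function for a PyTorch DataLoader)::

            features = [tokenizer("Hello world"), tokenizer("A longer example sentence")]
            batch = tokenizer.pad(features, padding=True, return_tensors="pt")
            # batch["input_ids"] is rectangular: shorter sequences are padded on
            # `tokenizer.padding_side` with `tokenizer.pad_token_id`.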
|
""" |
|
|
|
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): |
|
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} |
|
|
|
assert "input_ids" in encoded_inputs, ( |
|
"You should supply an encoding or a list of encodings to this method. " |
|
"An encoding is the output of one the encoding methods of the tokenizer, i.e. " |
|
"__call__/encode_plus/batch_encode_plus. " |
|
) |
|
|
|
if not encoded_inputs["input_ids"]: |
|
if return_attention_mask: |
|
encoded_inputs["attention_mask"] = [] |
|
return encoded_inputs |
|
|
|
|
|
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( |
|
padding=padding, max_length=max_length, verbose=verbose |
|
) |
|
|
|
if encoded_inputs["input_ids"] and not isinstance(encoded_inputs["input_ids"][0], (list, tuple)): |
|
encoded_inputs = self._pad( |
|
encoded_inputs, |
|
max_length=max_length, |
|
padding_strategy=padding_strategy, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
) |
|
return BatchEncoding(encoded_inputs, tensor_type=return_tensors) |
|
|
|
batch_size = len(encoded_inputs["input_ids"]) |
|
assert all( |
|
len(v) == batch_size for v in encoded_inputs.values() |
|
), "Some items in the output dictionnary have a different batch size than others." |
|
|
|
if padding_strategy == PaddingStrategy.LONGEST: |
|
max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"]) |
|
padding_strategy = PaddingStrategy.MAX_LENGTH |
|
|
|
batch_outputs = {} |
|
for i in range(batch_size): |
|
inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) |
|
outputs = self._pad( |
|
inputs, |
|
max_length=max_length, |
|
padding_strategy=padding_strategy, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
) |
|
|
|
for key, value in outputs.items(): |
|
if key not in batch_outputs: |
|
batch_outputs[key] = [] |
|
batch_outputs[key].append(value) |
|
|
|
return BatchEncoding(batch_outputs, tensor_type=return_tensors) |
|
|
|
def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]: |
|
if token_ids_1 is None: |
|
return len(token_ids_0) * [0] |
|
return [0] * len(token_ids_0) + [1] * len(token_ids_1) |
|
|
|
def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List: |
|
""" |
|
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
by concatenating and adding special tokens. This implementation does not add special tokens. |
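
        Example (base class behavior; subclasses typically add model specific tokens such as ``[CLS]``/``[SEP]``)::

            tokenizer.build_inputs_with_special_tokens([1, 2], [3, 4])  # -> [1, 2, 3, 4]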
|
""" |
|
if token_ids_1 is None: |
|
return token_ids_0 |
|
return token_ids_0 + token_ids_1 |
|
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
|
def prepare_for_model( |
|
self, |
|
ids: List[int], |
|
pair_ids: Optional[List[int]] = None, |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str] = False, |
|
truncation: Union[bool, str] = False, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
prepend_batch_axis: bool = False, |
|
**kwargs |
|
) -> BatchEncoding: |
|
""" Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. |
|
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and |
|
manages a moving window (with user defined stride) for overflowing tokens |
|
|
|
Args: |
|
ids: list of tokenized input ids. Can be obtained from a string by chaining the |
|
`tokenize` and `convert_tokens_to_ids` methods. |
|
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the |
|
`tokenize` and `convert_tokens_to_ids` methods. |
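
        Example (illustrative sketch; ``tokenizer`` is assumed to be instantiated)::

            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
            encoded = tokenizer.prepare_for_model(ids, add_special_tokens=True, truncation=True, max_length=8)
            # encoded["input_ids"] now contains the model specific special tokens
            # and holds at most 8 token ids.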
|
""" |
|
|
|
if "return_lengths" in kwargs: |
|
if verbose: |
|
warnings.warn( |
|
"The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. " |
|
"Please use `return_length` instead.", |
|
FutureWarning, |
|
) |
|
return_length = kwargs["return_lengths"] |
|
|
|
|
|
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
pair = bool(pair_ids is not None) |
|
len_ids = len(ids) |
|
len_pair_ids = len(pair_ids) if pair else 0 |
|
|
|
|
|
if return_token_type_ids is None: |
|
return_token_type_ids = "token_type_ids" in self.model_input_names |
|
if return_attention_mask is None: |
|
return_attention_mask = "attention_mask" in self.model_input_names |
|
|
|
encoded_inputs = {} |
|
|
|
|
|
total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) |
|
|
|
|
|
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: |
|
ids, pair_ids, overflowing_tokens = self.truncate_sequences( |
|
ids, |
|
pair_ids=pair_ids, |
|
num_tokens_to_remove=total_len - max_length, |
|
truncation_strategy=truncation_strategy, |
|
stride=stride, |
|
) |
|
if return_overflowing_tokens: |
|
encoded_inputs["overflowing_tokens"] = overflowing_tokens |
|
encoded_inputs["num_truncated_tokens"] = total_len - max_length |
|
|
|
|
|
if add_special_tokens: |
|
sequence = self.build_inputs_with_special_tokens(ids, pair_ids) |
|
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) |
|
else: |
|
sequence = ids + pair_ids if pair else ids |
|
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) |
|
|
|
|
|
encoded_inputs["input_ids"] = sequence |
|
if return_token_type_ids: |
|
encoded_inputs["token_type_ids"] = token_type_ids |
|
if return_special_tokens_mask: |
|
if add_special_tokens: |
|
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) |
|
else: |
|
encoded_inputs["special_tokens_mask"] = [0] * len(sequence) |
|
|
|
|
|
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose: |
|
logger.warning( |
|
"Token indices sequence length is longer than the specified maximum sequence length " |
|
"for this model ({} > {}). Running this sequence through the model will result in " |
|
"indexing errors".format(len(ids), self.model_max_length) |
|
) |
|
|
|
|
|
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: |
|
encoded_inputs = self.pad( |
|
encoded_inputs, |
|
max_length=max_length, |
|
padding=padding_strategy.value, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
) |
|
|
|
if return_length: |
|
encoded_inputs["length"] = len(encoded_inputs["input_ids"]) |
|
|
|
batch_outputs = BatchEncoding( |
|
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis |
|
) |
|
|
|
return batch_outputs |
|
|
|
def truncate_sequences( |
|
self, |
|
ids: List[int], |
|
pair_ids: Optional[List[int]] = None, |
|
num_tokens_to_remove: int = 0, |
|
truncation_strategy: Union[str, TruncationStrategy] = "longest_first", |
|
stride: int = 0, |
|
) -> Tuple[List[int], List[int], List[int]]: |
|
""" Truncates a sequence pair in place to the maximum length. |
|
|
|
Args: |
|
ids: list of tokenized input ids. Can be obtained from a string by chaining the |
|
`tokenize` and `convert_tokens_to_ids` methods. |
|
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the |
|
`tokenize` and `convert_tokens_to_ids` methods. |
|
num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): |
|
number of tokens to remove using the truncation strategy |
|
truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"): |
|
String selected in the following options: |
|
|
|
                - 'longest_first' (default): Iteratively reduce the input sequences until the input is under max_length,

                    removing one token at a time from the longest sequence (when there is a pair of input sequences).

                    Overflowing tokens only contain overflow from the first sequence.

                - 'only_first': Only truncate the first sequence. An error is logged if the first sequence is shorter than or equal to num_tokens_to_remove.
|
- 'only_second': Only truncate the second sequence |
|
- 'do_not_truncate' |
|
stride (:obj:`int`, `optional`, defaults to ``0``): |
|
If set to a number along with max_length, the overflowing tokens returned will contain some tokens |
|
from the main sequence returned. The value of this argument defines the number of additional tokens. |
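
        Example (illustrative, with plain integer ids for brevity)::

            ids, pair_ids, overflowing = tokenizer.truncate_sequences(
                [1, 2, 3, 4, 5], pair_ids=[6, 7], num_tokens_to_remove=2, truncation_strategy="longest_first"
            )
            # With the default stride=0: ids == [1, 2, 3], pair_ids == [6, 7], overflowing == [5, 4]
            # (tokens are removed one by one from the end of the longest sequence).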
|
""" |
|
if num_tokens_to_remove <= 0: |
|
return ids, pair_ids, [] |
|
|
|
if not isinstance(truncation_strategy, TruncationStrategy): |
|
truncation_strategy = TruncationStrategy(truncation_strategy) |
|
|
|
overflowing_tokens = [] |
|
if truncation_strategy == TruncationStrategy.LONGEST_FIRST: |
|
for _ in range(num_tokens_to_remove): |
|
if pair_ids is None or len(ids) > len(pair_ids): |
|
if not overflowing_tokens: |
|
window_len = min(len(ids), stride + 1) |
|
else: |
|
window_len = 1 |
|
overflowing_tokens.extend(ids[-window_len:]) |
|
ids = ids[:-1] |
|
else: |
|
if not overflowing_tokens: |
|
window_len = min(len(pair_ids), stride + 1) |
|
else: |
|
window_len = 1 |
|
overflowing_tokens.extend(pair_ids[-window_len:]) |
|
pair_ids = pair_ids[:-1] |
|
elif truncation_strategy == TruncationStrategy.ONLY_FIRST: |
|
if len(ids) > num_tokens_to_remove: |
|
window_len = min(len(ids), stride + num_tokens_to_remove) |
|
overflowing_tokens = ids[-window_len:] |
|
ids = ids[:-num_tokens_to_remove] |
|
else: |
|
logger.error( |
|
f"We need to remove {num_tokens_to_remove} to truncate the input" |
|
f"but the first sequence has a length {len(ids)}. " |
|
f"Please select another truncation strategy than {truncation_strategy}, " |
|
f"for instance 'longest_first' or 'only_second'." |
|
) |
|
elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: |
|
if len(pair_ids) > num_tokens_to_remove: |
|
window_len = min(len(pair_ids), stride + num_tokens_to_remove) |
|
overflowing_tokens = pair_ids[-window_len:] |
|
pair_ids = pair_ids[:-num_tokens_to_remove] |
|
else: |
|
logger.error( |
|
f"We need to remove {num_tokens_to_remove} to truncate the input" |
|
f"but the second sequence has a length {len(pair_ids)}. " |
|
f"Please select another truncation strategy than {truncation_strategy}, " |
|
f"for instance 'longest_first' or 'only_first'." |
|
) |
|
|
|
return (ids, pair_ids, overflowing_tokens) |
|
|
|
def _pad( |
|
self, |
|
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], |
|
max_length: Optional[int] = None, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
) -> dict: |
|
""" Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch) |
|
|
|
Args: |
|
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). |
|
max_length: maximum length of the returned list and optionally padding length (see below). |
|
Will truncate by taking into account the special tokens. |
|
padding_strategy: PaddingStrategy to use for padding. |
|
                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch

                - PaddingStrategy.MAX_LENGTH: Pad to a maximum length specified with ``max_length``

                - PaddingStrategy.DO_NOT_PAD (default): Do not pad
|
The tokenizer padding sides are defined in self.padding_side: |
|
- 'left': pads on the left of the sequences |
|
- 'right': pads on the right of the sequences |
|
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. |
|
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability |
|
>= 7.5 (Volta). |
|
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) |
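
        Example (illustrative, right padding with hypothetical values)::

            # input:  {"input_ids": [5, 6, 7]}, max_length=5, pad_token_id=0
            # output: {"input_ids": [5, 6, 7, 0, 0], "attention_mask": [1, 1, 1, 0, 0]}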
|
""" |
|
|
|
if return_attention_mask is None: |
|
return_attention_mask = "attention_mask" in self.model_input_names |
|
|
|
if padding_strategy == PaddingStrategy.LONGEST: |
|
max_length = len(encoded_inputs["input_ids"]) |
|
|
|
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): |
|
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of |
|
|
|
needs_to_be_padded = ( |
|
padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length |
|
) |
|
|
|
if needs_to_be_padded: |
|
difference = max_length - len(encoded_inputs["input_ids"]) |
|
if self.padding_side == "right": |
|
if return_attention_mask: |
|
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference |
|
if "token_type_ids" in encoded_inputs: |
|
encoded_inputs["token_type_ids"] = ( |
|
encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference |
|
) |
|
if "special_tokens_mask" in encoded_inputs: |
|
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference |
|
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference |
|
elif self.padding_side == "left": |
|
if return_attention_mask: |
|
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) |
|
if "token_type_ids" in encoded_inputs: |
|
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ |
|
"token_type_ids" |
|
] |
|
if "special_tokens_mask" in encoded_inputs: |
|
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] |
|
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] |
|
else: |
|
raise ValueError("Invalid padding strategy:" + str(self.padding_side)) |
|
else: |
|
if return_attention_mask: |
|
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) |
|
|
|
return encoded_inputs |
|
|
|
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]: |
|
return [self.decode(seq, **kwargs) for seq in sequences] |
|
|
|
def decode( |
|
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True |
|
) -> str: |
|
""" |
|
        Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary
|
with options to remove special tokens and clean up tokenization spaces. |
|
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. |
|
|
|
Args: |
|
token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. |
|
            skip_special_tokens: if set to True, will remove special tokens in the decoding.
|
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. |
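
        Example (illustrative; ``tokenizer`` is assumed to be instantiated)::

            ids = tokenizer.encode("Hello world")
            text = tokenizer.decode(ids, skip_special_tokens=True)
            # `text` is typically "Hello world" again, up to tokenizer-specific normalization.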
|
""" |
|
raise NotImplementedError |
|
|
|
def get_special_tokens_mask( |
|
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False |
|
) -> List[int]: |
|
""" |
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding |
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. |
|
|
|
Args: |
|
token_ids_0: list of ids (must not contain special tokens) |
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids |
|
for sequence pairs |
|
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
special tokens for the model |
|
|
|
Returns: |
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. |
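
        Example (illustrative; assumes a tokenizer with ``[CLS]``/``[SEP]``-style special tokens)::

            ids = tokenizer.encode("Hello world")  # e.g. [CLS] hello world [SEP]
            mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
            # e.g. [1, 0, 0, 1]: 1 marks the special tokens, 0 the regular tokens.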
|
""" |
|
assert already_has_special_tokens and token_ids_1 is None, ( |
|
"You cannot use ``already_has_special_tokens=False`` with this tokenizer. " |
|
"Please use a slow (full python) tokenizer to activate this argument." |
|
"Or set `return_special_token_mask=True` when calling the encoding method " |
|
"to get the special tokens mask in any tokenizer. " |
|
) |
|
|
|
all_special_ids = self.all_special_ids |
|
|
|
special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] |
|
|
|
return special_tokens_mask |
|
|
|
@staticmethod |
|
def clean_up_tokenization(out_string: str) -> str: |
|
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. |
|
""" |
|
out_string = ( |
|
out_string.replace(" .", ".") |
|
.replace(" ?", "?") |
|
.replace(" !", "!") |
|
.replace(" ,", ",") |
|
.replace(" ' ", "'") |
|
.replace(" n't", "n't") |
|
.replace(" 'm", "'m") |
|
.replace(" 's", "'s") |
|
.replace(" 've", "'ve") |
|
.replace(" 're", "'re") |
|
) |
|
return out_string |
|
|