Error " Couldn't instantiate the backend tokenizer from one of ... "
#9
by
lucashw
- opened
Hi,
I'm trying to run the commands below, but I keep running into an error.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
I also tried cloning the entire repo and loading the tokenizer files manually from the local path, but the same error occurs. Can anyone please help troubleshoot? Does it require a specific "transformers" version? Mine is 4.34.1.
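For completeness, the local-load attempt was roughly as follows (the path below is just a placeholder for the cloned repo):

# After cloning https://huggingface.co/BAAI/bge-reranker-base locally
tokenizer = AutoTokenizer.from_pretrained("/path/to/local/bge-reranker-base")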
Here is the error.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[61], line 2
1 from transformers import AutoTokenizer
----> 2 tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
File ~/miniconda3/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:751, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
747 if tokenizer_class is None:
748 raise ValueError(
749 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
750 )
--> 751 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
753 # Otherwise we have to be creative.
754 # if model is an encoder decoder, the encoder tokenizer class is used by default
755 if isinstance(config, EncoderDecoderConfig):
File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2017, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
2014 else:
2015 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2017 return cls._from_pretrained(
2018 resolved_vocab_files,
2019 pretrained_model_name_or_path,
2020 init_configuration,
2021 *init_inputs,
2022 token=token,
2023 cache_dir=cache_dir,
2024 local_files_only=local_files_only,
2025 _commit_hash=commit_hash,
2026 _is_local=is_local,
2027 **kwargs,
2028 )
File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2249, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2247 # Instantiate the tokenizer.
2248 try:
-> 2249 tokenizer = cls(*init_inputs, **init_kwargs)
2250 except OSError:
2251 raise OSError(
2252 "Unable to load vocabulary from file. "
2253 "Please check that the provided vocabulary is accessible and not corrupted."
2254 )
File ~/miniconda3/lib/python3.11/site-packages/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py:155, in XLMRobertaTokenizerFast.__init__(self, vocab_file, tokenizer_file, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token, **kwargs)
139 def __init__(
140 self,
141 vocab_file=None,
(...)
151 ):
152 # Mask token behave like a normal word, i.e. include the space before it
153 mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
--> 155 super().__init__(
156 vocab_file,
157 tokenizer_file=tokenizer_file,
158 bos_token=bos_token,
159 eos_token=eos_token,
160 sep_token=sep_token,
161 cls_token=cls_token,
162 unk_token=unk_token,
163 pad_token=pad_token,
164 mask_token=mask_token,
165 **kwargs,
166 )
168 self.vocab_file = vocab_file
File ~/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py:120, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
118 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
119 else:
--> 120 raise ValueError(
121 "Couldn't instantiate the backend tokenizer from one of: \n"
122 "(1) a `tokenizers` library serialization file, \n"
123 "(2) a slow tokenizer instance to convert or \n"
124 "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
125 "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
126 )
128 self._tokenizer = fast_tokenizer
130 if slow_tokenizer is not None:
ValueError: Couldn't instantiate the backend tokenizer from one of:
(1) a `tokenizers` library serialization file,
(2) a slow tokenizer instance to convert or
(3) an equivalent slow tokenizer class to instantiate and convert.
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
Hi, we have not encountered this error, and transformers==4.34.1 works well on our side.
According to the error message, you can try installing the sentencepiece package and running the command again.
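For reference, the error text points at the two ways the fast tokenizer can be built: from a tokenizers serialization file (tokenizer.json) or by converting the slow, sentencepiece-based XLMRobertaTokenizer. A minimal sketch of the suggested fix (the pip line is assumed to run in the same environment as the notebook kernel; the use_fast=False call is only a diagnostic, not part of the suggestion above):

# In the same environment that the notebook kernel uses:
# pip install -U sentencepiece tokenizers

from transformers import AutoTokenizer

# Normal path: fast tokenizer, converted from the slow one when no tokenizer.json is present
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")

# Diagnostic fallback: force the slow sentencepiece tokenizer to see whether
# the failure is specific to the slow-to-fast conversion step
slow_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base", use_fast=False)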
I did install the latest version of sentencepiece, but it didn't help. I'm at a total loss now ...
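A follow-up check that may help here (this is an assumption on my part: with miniconda and a notebook kernel it is easy for pip to install a package into a different environment than the one the kernel actually runs in):

import sys
print(sys.executable)  # should be the ~/miniconda3 Python that appears in the traceback

import sentencepiece, tokenizers, transformers
print(sentencepiece.__version__, tokenizers.__version__, transformers.__version__)

If those imports succeed, restarting the kernel after the install, so the newly installed package is actually picked up, would be the next thing to try.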