Upload utils.py with huggingface_hub
Browse files
utils.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
import logging
|
3 |
+
from transformers import GemmaTokenizer # Import GemmaTokenizer
|
4 |
+
|
5 |
+
# Configure logging
|
6 |
+
# Runs at import time: sets the root logger to DEBUG and formats each record
# as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
|
7 |
+
|
8 |
+
def get_tokenizer(id):
    """Load and return the tokenizer identified by ``id``.

    Identifiers whose lower-cased form contains ``"gemma"`` are loaded with
    ``GemmaTokenizer``; every other identifier goes through ``AutoTokenizer``.

    Args:
        id: Tokenizer identifier passed straight to ``from_pretrained``.

    Returns:
        The tokenizer object returned by ``from_pretrained``.

    Raises:
        Exception: Any error raised while loading is logged and re-raised.
    """
    logging.debug(f"Loading tokenizer: {id}")
    try:
        if "gemma" not in id.lower():
            # NOTE(review): trust_remote_code=True is passed here; confirm that
            # every identifier reaching this branch comes from a trusted source.
            loaded = AutoTokenizer.from_pretrained(id, trust_remote_code=True)
        else:
            loaded = GemmaTokenizer.from_pretrained(id)
        logging.debug(f"Tokenizer loaded: {loaded}")
        return loaded
    except Exception as err:
        # Log the failure for diagnostics, then let the caller handle it.
        logging.error(f"Error loading tokenizer {id}: {err}")
        raise err
|
20 |
+
|
21 |
+
def get_tokenization(tokenizer, text):
    """Return the string tokens that ``tokenizer`` produces for ``text``.

    The text is first encoded to ids with ``tokenizer.encode`` and the ids are
    then mapped back to token strings with ``convert_ids_to_tokens``.

    Args:
        tokenizer: Object exposing ``encode`` and ``convert_ids_to_tokens``.
        text: The text to tokenize.

    Returns:
        Whatever ``convert_ids_to_tokens`` returns for the encoded ids.
    """
    logging.debug(f"Tokenizing text: {text}")
    string_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    logging.debug(f"Tokens: {string_tokens}")
    return string_tokens
|
27 |
+
|
28 |
+
def get_vocab_size(tokenizer):
    """Return the number of entries in the tokenizer's vocabulary.

    Args:
        tokenizer: Object exposing ``get_vocab()``.

    Returns:
        ``len()`` of the object returned by ``tokenizer.get_vocab()``.
    """
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    vocabulary = tokenizer.get_vocab()
    entry_count = len(vocabulary)
    logging.debug(f"Vocabulary size: {entry_count}")
    return entry_count
|
33 |
+
|
34 |
+
def check_latin_support(tokenizer):
    """Report whether ``tokenizer`` handles a Latin-script sample without unknowns.

    A fixed English sentence containing letters, digits and a period is
    tokenized; support is assumed when none of the resulting tokens equals
    ``tokenizer.unk_token``.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and ``unk_token``.

    Returns:
        ``"✅"`` when no token equals the unknown token, ``"❌"`` when at least
        one does or when any exception is raised during the check.
    """
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)
        # If the tokenizer can tokenize the latin text without returning
        # unknown tokens, we consider it as supporting latin. When unk_token
        # is None no string token can equal it, so the check passes.
        if all(token != tokenizer.unk_token for token in tokens):
            logging.debug("Latin support: ✅")
            return "✅"
        logging.debug("Latin support: ❌")
        return "❌"
    except Exception as e:
        # Best-effort check: a failing tokenizer is reported as unsupported
        # rather than propagating the error to the caller.
        logging.error(f"Error checking latin support: {e}")
        return "❌"
|