nouamanetazi HF staff committed on
Commit
f43ae3f
·
verified ·
1 Parent(s): cbb0bd9

Upload utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. utils.py +48 -0
utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ import logging
3
+ from transformers import GemmaTokenizer # Import GemmaTokenizer
4
+
5
# Configure the root logger when this module is imported: emit DEBUG and above,
# with each record formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
7
+
8
def get_tokenizer(id):
    """Load and return the tokenizer for a model identifier.

    Identifiers containing "gemma" (case-insensitive) are loaded with
    ``GemmaTokenizer``; every other identifier goes through
    ``AutoTokenizer`` with ``trust_remote_code=True``.

    Args:
        id: Identifier passed to ``from_pretrained``. The name shadows the
            ``id`` builtin but is kept so that existing keyword callers
            (``get_tokenizer(id=...)``) keep working.

    Returns:
        The tokenizer object returned by ``from_pretrained``.

    Raises:
        Exception: Any error raised while loading is logged at ERROR level
            and then re-raised unchanged.
    """
    logging.debug(f"Loading tokenizer: {id}")
    try:
        if "gemma" in id.lower():
            tokenizer = GemmaTokenizer.from_pretrained(id)
        else:
            # NOTE(review): trust_remote_code=True allows code shipped with the
            # referenced repository to run while loading -- confirm that every
            # identifier reaching this call comes from a trusted source.
            tokenizer = AutoTokenizer.from_pretrained(id, trust_remote_code=True)
        logging.debug(f"Tokenizer loaded: {tokenizer}")
        return tokenizer
    except Exception as e:
        logging.error(f"Error loading tokenizer {id}: {e}")
        # Bare ``raise`` re-raises the exception currently being handled with
        # its original traceback intact.
        raise
20
+
21
def get_tokenization(tokenizer, text):
    """Return the string tokens the tokenizer produces for ``text``.

    The text is turned into ids with ``tokenizer.encode`` and those ids are
    mapped back to token strings with ``tokenizer.convert_ids_to_tokens``.
    Both the input text and the resulting tokens are logged at DEBUG level.
    """
    logging.debug(f"Tokenizing text: {text}")
    pieces = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    logging.debug(f"Tokens: {pieces}")
    return pieces
27
+
28
def get_vocab_size(tokenizer):
    """Return the number of entries in ``tokenizer.get_vocab()``.

    The tokenizer and the computed size are logged at DEBUG level.
    """
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    vocabulary = tokenizer.get_vocab()
    size = len(vocabulary)
    logging.debug(f"Vocabulary size: {size}")
    return size
33
+
34
def check_latin_support(tokenizer):
    """Report whether the tokenizer handles a Latin-script sample sentence.

    A fixed English sentence containing digits is passed to
    ``tokenizer.tokenize``. Support is reported when none of the resulting
    tokens equals ``tokenizer.unk_token``.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and ``unk_token``.

    Returns:
        "✅" when no token equals the unknown token, otherwise "❌".
        Any exception raised during the check is logged at ERROR level and
        also reported as "❌" (the check is deliberately best-effort).
    """
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)
        # If the tokenizer can tokenize the latin text without returning
        # unknown tokens, we consider it as supporting latin.
        supported = all(token != tokenizer.unk_token for token in tokens)
    except Exception as e:
        logging.error(f"Error checking latin support: {e}")
        return "❌"
    # The success marker is the real check-mark character; the previous
    # literal was a mis-decoded (mojibake) rendering of it.
    verdict = "✅" if supported else "❌"
    logging.debug("Latin support: " + verdict)
    return verdict