# File size: 964 Bytes
# 830833d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import tiktoken

# Round-trip check: encode a Hebrew sentence with several tiktoken encodings,
# report how many tokens each one produces, and verify that decoding the
# tokens gives back the original text exactly.
test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"

print(f'Test string = "{test_string}"')

# Encodings to compare, in the order they are reported.
# NOTE(review): "gpt-hebrew-tokenizer" does not look like one of tiktoken's
# built-in encodings; presumably it is registered by a tiktoken_ext plugin
# that must be installed for get_encoding() to find it -- TODO confirm.
ENCODING_NAMES = ("cl100k_base", "gpt2", "gpt-hebrew-tokenizer")

for encoding_name in ENCODING_NAMES:
    enc = tiktoken.get_encoding(encoding_name)
    encoded_text = enc.encode(test_string)
    print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} ({encoding_name})')
    decoded_text = enc.decode(encoded_text)
    # The message identifies which encoding broke the round trip.
    assert decoded_text == test_string, (
        f"round-trip mismatch for {encoding_name}: {decoded_text!r}"
    )