TinyStories-3M-val-Hebrew / tiktoken /tests /test_compare_hebrew.py
Norod78's picture
Add TikToken extension support for the Hebrew Tokenizer
830833d
import tiktoken
# Sample Hebrew sentence used to compare how many tokens each encoding needs.
test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"

# Encodings to compare, in the order their results are printed.
# NOTE(review): "gpt-hebrew-tokenizer" is presumably registered by this
# repository's tiktoken extension rather than shipped with tiktoken itself;
# confirm the extension is installed before running this script.
ENCODING_NAMES = ("cl100k_base", "gpt2", "gpt-hebrew-tokenizer")


def _report_round_trip(encoding_name: str, text: str) -> None:
    """Encode *text* with one encoding, print its size, and check the round trip.

    Args:
        encoding_name: Name passed to ``tiktoken.get_encoding``.
        text: The string to encode and decode again.

    Raises:
        AssertionError: If decoding the encoded tokens does not reproduce
            *text* exactly.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    token_ids = encoding.encode(text)
    print(
        f'num of characters = {len(text)} '
        f'encoded length = {len(token_ids)} ({encoding_name})'
    )
    round_tripped = encoding.decode(token_ids)
    # Name the encoding in the failure message so it is clear which of the
    # compared tokenizers broke the round trip.
    assert round_tripped == text, (
        f"{encoding_name}: decode(encode(text)) returned {round_tripped!r}, "
        f"expected {text!r}"
    )


print(f'Test string = "{test_string}"')
for _encoding_name in ENCODING_NAMES:
    _report_round_trip(_encoding_name, test_string)