video-dubbing / whisper /tests /test_tokenizer.py
artificialguybr's picture
Upload 45 files
9e548ce
raw
history blame
917 Bytes
from whisper.tokenizer import get_tokenizer
def test_tokenizer():
    """Verify both tokenizer variants round-trip Korean text losslessly and
    that the multilingual tokenizer encodes it more compactly than GPT-2's.
    """
    gpt2_tokenizer = get_tokenizer(multilingual=False)
    multilingual_tokenizer = get_tokenizer(multilingual=True)

    # Korean pangram — exercises non-Latin text. NOTE(review): the scraped
    # literal was mojibake (UTF-8 bytes re-decoded through a single-byte
    # codepage); restored to the intended Korean string.
    text = "다람쥐 헌 쳇바퀴에 타고파"
    gpt2_tokens = gpt2_tokenizer.encode(text)
    multilingual_tokens = multilingual_tokenizer.encode(text)

    # Encoding must be lossless for both vocabularies.
    assert gpt2_tokenizer.decode(gpt2_tokens) == text
    assert multilingual_tokenizer.decode(multilingual_tokens) == text
    # The multilingual vocabulary should need fewer tokens for Korean text.
    assert len(gpt2_tokens) > len(multilingual_tokens)
def test_split_on_unicode():
    """Check that split_tokens_on_unicode splits a token sequence at valid
    Unicode boundaries, surfacing a token that decodes to an incomplete
    UTF-8 sequence as U+FFFD (the replacement character).
    """
    multilingual_tokenizer = get_tokenizer(multilingual=True)

    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
    words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)

    # Token 246 alone is not a complete UTF-8 sequence, so it appears as
    # "\ufffd". NOTE(review): the scraped expected-words literal was mojibake;
    # restored "\ufffd" and "é" to their intended characters.
    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]