"""tiktoken encoding constructors for Hebrew GPT tokenizers hosted on the Hugging Face Hub."""

from tiktoken.load import data_gym_to_mergeable_bpe_ranks


def gpt_j_hebrew_tokenizer():
    # GPT-2/GPT-J style BPE ranks built from the merges.txt / vocab.json pair
    # published in the Norod78/gpt-j-hebrew-tokenizer repository.
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/vocab.json",
    )
    return {
        "name": "gpt-j-hebrew-tokenizer",
        "explicit_n_vocab": 50257,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 50256},
    }


def gpt_hebrew_tokenizer():
    # BPE ranks from the Norod78/TinyStories-3M-val-Hebrew repository; this
    # tokenizer adds <|startoftext|> and <|pad|> on top of <|endoftext|>.
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/merges.txt",
        encoder_json_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/vocab.json",
    )
    return {
        "name": "gpt-hebrew-tokenizer",
        "explicit_n_vocab": 50259,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 50256, "<|startoftext|>": 50257, "<|pad|>": 50258},
    }


# Registry consumed by tiktoken's plugin mechanism (tiktoken_ext namespace package).
ENCODING_CONSTRUCTORS = {
    "gpt-j-hebrew-tokenizer": gpt_j_hebrew_tokenizer,
    "gpt-hebrew-tokenizer": gpt_hebrew_tokenizer,
}
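

# A minimal usage sketch. It assumes this module is importable by tiktoken,
# e.g. installed under the `tiktoken_ext` namespace package so that
# tiktoken.get_encoding() can discover ENCODING_CONSTRUCTORS; the constructors
# can also be called directly, as below. The example text is illustrative.
if __name__ == "__main__":
    import tiktoken

    # Build an Encoding straight from one of the constructors above
    # (this downloads merges.txt / vocab.json from the Hugging Face Hub).
    enc = tiktoken.Encoding(**gpt_j_hebrew_tokenizer())

    text = "שלום עולם"  # "Hello world" in Hebrew
    tokens = enc.encode(text)
    print(tokens)
    assert enc.decode(tokens) == text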