"""Wraps a Llama 3-style tiktoken tokenizer and exports it to a simple binary
format (see Tokenizer.export for the record layout)."""

import argparse
import os
import struct
from pathlib import Path
from typing import List

import tiktoken
from tiktoken.load import load_tiktoken_bpe

TOKENIZER_MODEL = "tokenizer.model"  # default path to the tiktoken model file


class Tokenizer:
    # pre-tokenization regex: contractions, letter runs, 1-3 digit number chunks,
    # punctuation runs, and whitespace
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        # load the BPE merge ranks (token bytes -> rank) from the tiktoken model file
        mergeable_ranks = load_tiktoken_bpe(model_path)
        self.model_path = model_path

        # 256 special tokens are reserved on top of the base BPE vocabulary
        num_base_tokens = len(mergeable_ranks)
        num_reserved_special_tokens = 256

        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
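
        # special token ids are appended after the base vocab; for Llama 3's
        # 128,000 BPE tokens, for example, this puts <|begin_of_text|> at
        # id 128000 and <|eot_id|> at 128009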
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.n_words = self.model.n_vocab
        self.bos_id = self.special_tokens["<|begin_of_text|>"]
        self.eos_id = self.special_tokens["<|end_of_text|>"]
        self.pad_id = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }

    def encode(
        self, s: str, bos: bool, eos: bool, allowed_special, disallowed_special
    ) -> List[int]:
        assert isinstance(s, str)
        t = self.model.encode(
            s,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t
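
    # Note: tiktoken refuses to encode special-token text unless it is explicitly
    # permitted; callers can pass e.g. allowed_special="all" to encode "<|...|>"
    # markers as special ids, or disallowed_special=() to skip the check and
    # encode such text as ordinary tokens.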

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)

    def export(self):
        # collect the byte sequence and a "score" for every token; tiktoken has no
        # sentencepiece-style scores, so the token id stands in for the score
        tokens, scores = [], []
        for i in range(self.n_words):
            t = self.model.decode_single_token_bytes(i)
            s = i
            tokens.append(t)
            scores.append(s)

        # record the longest token so a reader can pre-size its buffers
        max_token_length = max(len(t) for t in tokens)

        # write the binary file: a uint32 max-token-length header, then one record
        # per token: float32 score, uint32 byte length, and the raw token bytes
        tokenizer_bin = self.model_path.replace(".model", ".bin")
        with open(tokenizer_bin, "wb") as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
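

# Illustrative sketch, not part of the export flow above: reading the .bin back
# in Python, assuming the "I" header plus repeated "fI"+bytes records that
# Tokenizer.export writes. The helper name is hypothetical.
def load_tokenizer_bin(path: str):
    tokens: List[bytes] = []
    scores: List[float] = []
    with open(path, "rb") as f:
        (max_token_length,) = struct.unpack("I", f.read(4))
        while True:
            header = f.read(8)  # one float32 score + one uint32 byte length
            if not header:
                break
            score, length = struct.unpack("fI", header)
            scores.append(score)
            tokens.append(f.read(length))
    return max_token_length, tokens, scores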


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tokenizer-model",
        type=str,
        help="optional path to a custom tokenizer model",
    )
    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)
    t.export()
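
# Example invocation (the path is hypothetical):
#
#   python tokenizer.py --tokenizer-model /path/to/tokenizer.model
#
# writes /path/to/tokenizer.bin next to the .model file. Round-tripping in Python:
#
#   tok = Tokenizer("/path/to/tokenizer.model")
#   ids = tok.encode("hello world", bos=True, eos=False,
#                    allowed_special="all", disallowed_special=())
#   assert ids[0] == tok.bos_id and tok.decode(ids[1:]) == "hello world"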