import tiktoken


class TrainedBPETokeniser:
    """Wraps a trained tiktoken BPE encoding; appends an end-of-text token on
    encode and optionally shifts all token ids by a fixed offset."""

    def __init__(self, name, pat_str, mergeable_ranks, special_tokens, offset=None) -> None:
        self.tokenizer = tiktoken.Encoding(
            name=name,
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )
        self.offset = offset

    def encode(self, text: str) -> list[int]:
        # note: we append an end-of-text token!
        tokens = self.tokenizer.encode(text) + [self.tokenizer.eot_token]
        if self.offset is not None:
            tokens = [x + self.offset for x in tokens]
        return tokens

    def decode(self, tokens: list[int]) -> str:
        if self.offset is not None:
            tokens = [x - self.offset for x in tokens]
        return self.tokenizer.decode(tokens)

    @property
    def eot_token(self):
        if self.offset is not None:
            return self.tokenizer.eot_token + self.offset
        else:
            return self.tokenizer.eot_token
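

# Usage sketch (not part of the original file): one way to construct the
# tokeniser from an off-the-shelf tiktoken encoding. The "gpt2" encoding name
# and the reliance on tiktoken's private _pat_str / _mergeable_ranks /
# _special_tokens attributes are illustrative assumptions and may differ from
# how this Space actually builds its trained ranks.
if __name__ == "__main__":
    base = tiktoken.get_encoding("gpt2")
    tok = TrainedBPETokeniser(
        name="gpt2-shifted",
        pat_str=base._pat_str,
        mergeable_ranks=base._mergeable_ranks,
        special_tokens=base._special_tokens,
        offset=10,  # hypothetical offset, e.g. to reserve low ids for other symbols
    )

    ids = tok.encode("hello world")
    assert ids[-1] == tok.eot_token  # encode always appends the (shifted) EOT id
    print(tok.decode(ids[:-1]))      # -> "hello world"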