File size: 1,010 Bytes
565faca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import tiktoken


class TrainedBPETokeniser:
    """Wrapper around a ``tiktoken.Encoding`` with an optional token-id offset.

    When ``offset`` is not ``None`` it is added to every id returned by
    ``encode`` (and to ``eot_token``) and subtracted from every id handed to
    ``decode``, so ids seen by callers are shifted relative to the ids used by
    the underlying encoding.
    """

    def __init__(self, name, pat_str, mergeable_ranks, special_tokens, offset=None) -> None:
        """Build the underlying encoding and store the id offset.

        ``name``, ``pat_str``, ``mergeable_ranks`` and ``special_tokens`` are
        forwarded unchanged, as keyword arguments, to ``tiktoken.Encoding``.
        ``offset`` is kept as given; ``None`` means "do not shift ids".
        """
        self.offset = offset
        self.tokenizer = tiktoken.Encoding(
            name=name,
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )

    def encode(self, text: str) -> list[int]:
        """Encode ``text`` and return its ids followed by the end-of-text id.

        The end-of-text id is always appended. If an offset is configured it
        is applied to every returned id, including the trailing end-of-text id.
        """
        # The sequence always ends with the encoding's end-of-text token.
        ids = [*self.tokenizer.encode(text), self.tokenizer.eot_token]

        shift = self.offset
        if shift is None:
            return ids
        return [token_id + shift for token_id in ids]

    def decode(self, tokens: list[int]):
        """Decode ``tokens`` with the underlying encoding.

        If an offset is configured it is first subtracted from every id, which
        undoes the shift applied by ``encode``.
        """
        shift = self.offset
        raw_ids = tokens if shift is None else [token_id - shift for token_id in tokens]
        return self.tokenizer.decode(raw_ids)

    @property
    def eot_token(self):
        """End-of-text id as callers see it, i.e. with the offset applied if set."""
        base_eot = self.tokenizer.eot_token
        return base_eot if self.offset is None else base_eot + self.offset