chansung committed on
Commit de9bfc1
1 Parent(s): f5e2f31

Create tokenizer.py

Files changed (1)
  1. llama/tokenizer.py +40 -0
llama/tokenizer.py ADDED
@@ -0,0 +1,40 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
+
+ from sentencepiece import SentencePieceProcessor
+ from logging import getLogger
+ from typing import List
+ import os
+
+
+ logger = getLogger()
+
+
+ class Tokenizer:
+     def __init__(self, model_path: str):
+         # reload tokenizer
+         assert os.path.isfile(model_path), model_path
+         self.sp_model = SentencePieceProcessor(model_file=model_path)
+         logger.info(f"Reloaded SentencePiece model from {model_path}")
+
+         # BOS / EOS token IDs
+         self.n_words: int = self.sp_model.vocab_size()
+         self.bos_id: int = self.sp_model.bos_id()
+         self.eos_id: int = self.sp_model.eos_id()
+         self.pad_id: int = self.sp_model.pad_id()
+         logger.info(
+             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+         )
+         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+     def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
+         assert type(s) is str
+         t = self.sp_model.encode(s)
+         if bos:
+             t = [self.bos_id] + t
+         if eos:
+             t = t + [self.eos_id]
+         return t
+
+     def decode(self, t: List[int]) -> str:
+         return self.sp_model.decode(t)
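
For context, a minimal usage sketch of the class added in this commit. The import path assumes the repository root is on PYTHONPATH, and "tokenizer.model" is a hypothetical path to a SentencePiece model file; actual token IDs depend on that model.

# Usage sketch (not part of the commit).
from llama.tokenizer import Tokenizer

tok = Tokenizer(model_path="tokenizer.model")  # hypothetical model path

# Encode with a leading BOS token and no trailing EOS token.
ids = tok.encode("Hello world", bos=True, eos=False)
print(ids)  # e.g. a list of ints starting with tok.bos_id

# Decode back to text; SentencePiece does not render control tokens such as BOS.
print(tok.decode(ids))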