Trofish committed on
Commit fa3719a
1 Parent(s): 0ee90b1

Upload syllabletokenizer.py

Files changed (1)
  1. syllabletokenizer.py +88 -0
syllabletokenizer.py ADDED
@@ -0,0 +1,88 @@
+ from transformers import PreTrainedTokenizer
+ import json
+ import logging
+ import os
+
+ logger = logging.getLogger(__name__)
+
+ class SyllableTokenizer(PreTrainedTokenizer):
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         do_basic_tokenize=True,
+         never_split=None,
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         eos_token="[EOS]",
+         bos_token="[BOS]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         tokenize_chinese_chars=True,
+         **kwargs
+     ):
+         # do_lower_case, do_basic_tokenize, never_split and tokenize_chinese_chars are
+         # accepted for BERT-style signature compatibility but are not used here.
+         # Load vocabulary (a JSON mapping from syllable to id)
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             self.vocab = json.load(f)
+         # Initialize special tokens
+         self.mask_token = mask_token
+         self.sep_token = sep_token
+         self.cls_token = cls_token
+         self.pad_token = pad_token
+         self.eos_token = eos_token
+         self.bos_token = bos_token
+         self.unk_token = unk_token
+
+         self.ids_to_tokens = {id: token for token, id in self.vocab.items()}
+         super().__init__(
+             pad_token=self.pad_token,
+             eos_token=self.eos_token,
+             bos_token=self.bos_token,
+             unk_token=self.unk_token,
+             mask_token=self.mask_token,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return dict(self.vocab, **self.added_tokens_encoder)
+
+     def _tokenize(self, text):
+         # Collapse duplicate whitespace, then split into individual characters (syllables)
+         return list(" ".join(text.split()))
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) into an id using the vocab."""
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings) into a single string."""
+         return "".join(tokens).strip()
+
+     def save_vocabulary(self, vocab_path, filename_prefix=None):
+         """
+         Save the tokenizer vocabulary and special tokens file to a directory.
+
+         Args:
+             vocab_path (str): The directory in which to save the vocabulary.
+             filename_prefix (str, optional): A prefix to add to the saved vocabulary filename.
+
+         Returns:
+             Tuple[str]: Paths to the files saved.
+         """
+         index = 0
+         if os.path.isdir(vocab_path):
+             vocab_filename = "vocab.txt" if filename_prefix is None else f"{filename_prefix}_vocab.txt"
+             vocab_file = os.path.join(vocab_path, vocab_filename)
+         else:
+             vocab_file = vocab_path
+
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     logger.warning(
+                         f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive. "
+                         "Please check that the vocabulary is not corrupted!"
+                     )
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+
+         return (vocab_file,)
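
For reference, a minimal usage sketch (not part of the committed file). It assumes syllabletokenizer.py is importable from the working directory and uses a hypothetical toy vocab.json that maps a handful of syllables plus the special tokens to ids; a real vocabulary would cover the full syllable inventory.

import json
import os
import tempfile

from syllabletokenizer import SyllableTokenizer

# Hypothetical toy vocabulary: special tokens plus a few syllables and the space character.
toy_vocab = {
    "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4,
    "[BOS]": 5, "[EOS]": 6,
    " ": 7, "안": 8, "녕": 9, "하": 10, "세": 11, "요": 12,
}

# Write the toy vocabulary to a temporary vocab.json file.
vocab_path = os.path.join(tempfile.mkdtemp(), "vocab.json")
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(toy_vocab, f, ensure_ascii=False)

tokenizer = SyllableTokenizer(vocab_file=vocab_path)

tokens = tokenizer.tokenize("안녕  하세요")        # duplicate spaces are collapsed
print(tokens)                                      # expected: ['안', '녕', ' ', '하', '세', '요']
print(tokenizer.convert_tokens_to_ids(tokens))     # expected: [8, 9, 7, 10, 11, 12]
print(tokenizer.convert_tokens_to_string(tokens))  # expected: 안녕 하세요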