Commit a0e1063 · alirezamsh committed
Parent(s): b9fa279
Update tokenization_small100.py

Files changed: tokenization_small100.py (+16 -15)
tokenization_small100.py CHANGED

@@ -145,19 +145,6 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
             if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
         ]
 
-        super().__init__(
-            tgt_lang=tgt_lang,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            language_codes=language_codes,
-            sp_model_kwargs=self.sp_model_kwargs,
-            num_madeup_words=num_madeup_words,
-            **kwargs,
-        )
-
         self.vocab_file = vocab_file
         self.encoder = load_json(vocab_file)
         self.decoder = {v: k for k, v in self.encoder.items()}
@@ -174,9 +161,23 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
 
         self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
         self.cur_lang_id = self.get_lang_id(self._tgt_lang)
+        self.num_madeup_words = num_madeup_words
+
+        super().__init__(
+            tgt_lang=tgt_lang,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            language_codes=language_codes,
+            sp_model_kwargs=self.sp_model_kwargs,
+            num_madeup_words=num_madeup_words,
+            **kwargs,
+        )
+
         self.set_lang_special_tokens(self._tgt_lang)
 
-        self.num_madeup_words = num_madeup_words
 
     @property
     def vocab_size(self) -> int:
@@ -361,4 +362,4 @@ def load_json(path: str) -> Union[Dict, List]:
 
 def save_json(data, path: str) -> None:
     with open(path, "w") as f:
-        json.dump(data, f, indent=2)
+        json.dump(data, f, indent=2)
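A note on what this change does (my reading of the diff; the commit message does not spell it out): the super().__init__(...) call is moved from the top of SMALL100Tokenizer.__init__ to after the vocabulary, SentencePiece state, and language attributes are set up. Recent transformers releases have PreTrainedTokenizer.__init__ consult the subclass vocabulary (e.g. via get_vocab() / convert_tokens_to_ids()) while registering special tokens, so attributes like self.encoder must already exist when the base initializer runs. Below is a minimal, hypothetical sketch of that ordering constraint; Base stands in for PreTrainedTokenizer and is not the real API.

# Minimal sketch of the init-ordering issue this commit works around.
# "Base" is a stand-in for transformers' PreTrainedTokenizer, whose
# __init__ in recent releases queries the subclass vocabulary.

class Base:
    def __init__(self, **kwargs):
        # Mirrors the base tokenizer touching the vocab during construction.
        self.vocab_at_init = self.get_vocab()


class Tokenizer(Base):
    def __init__(self, vocab, init_first: bool):
        if init_first:
            # Old order: the base __init__ runs before self.encoder exists,
            # so get_vocab() raises AttributeError.
            super().__init__()
            self.encoder = vocab
        else:
            # New order (as in this commit): set up state, then call super().
            self.encoder = vocab
            super().__init__()

    def get_vocab(self):
        return dict(self.encoder)


Tokenizer({"<unk>": 0}, init_first=False)    # works
try:
    Tokenizer({"<unk>": 0}, init_first=True)
except AttributeError as e:
    print(f"old order fails: {e}")           # no attribute 'encoder'

The same reasoning explains why self.num_madeup_words is now assigned before the super() call rather than after it: all state the base class (or the kwargs forwarded to it) might depend on is established first, and only then is control handed to PreTrainedTokenizer.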