Younes Belkada committed
Commit f342985
1 Parent(s): f4ea783

Add tokenizer file

japanese-dummy-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
japanese-dummy-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
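
tokenizer.json holds the full trained vocabulary and tokenization model, which is presumably why the diff is too large to render. A hedged sketch for inspecting it locally, assuming the tokenizers library is installed and the file path from this commit:

# Open the raw tokenizer.json (too large to render above) and report its vocabulary size.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("japanese-dummy-tokenizer/tokenizer.json")
print(tok.get_vocab_size())  # should be close to the 52000 requested in tokenizer.py below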
 
japanese-dummy-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276", "name_or_path": "csebuetnlp/mT5_multilingual_XLSum", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}
tokenizer.py ADDED
@@ -0,0 +1,28 @@
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from huggingface_hub import Repository
+
+ repo = Repository(".", clone_from="ybelkada/japanese-dummy-tokenizer")
+
+ def get_training_corpus(dataset):
+     """
+     Returns the training corpus for the given dataset.
+     """
+     return (element['original_ja'] for element in iter(dataset))
+
+ dataset = load_dataset("snow_simplified_japanese_corpus", streaming=True, split="train")
+
+ train_dataset = dataset.skip(100)
+ val_dataset = dataset.take(100)
+
+ old_tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-small")
+ old_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
+
+ print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
+ new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), 52000)
+
+ print("New Tokenizer:",new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
+ new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
+ repo.git_add()
+ repo.git_commit("Add tokenizer file")
+ repo.git_push()
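
Once pushed, the tokenizer can be loaded straight from the Hub. A minimal usage sketch; the repo id and the sample sentence come from the script above, everything else is assumed:

# Pull the published tokenizer from the Hub and tokenize the same sample sentence.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ybelkada/japanese-dummy-tokenizer")
print(tokenizer.tokenize("誰が一番に着くか私には分かりません。"))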