SaulLu committed on
Commit
bf3a9c0
1 Parent(s): 73b4b8b

tokenizer v2 - fix user defined symbols

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. spiece.model +2 -2
  3. tokenizer_config.json +1 -1
README.md CHANGED
@@ -88,7 +88,7 @@ config = {
88
  "split_by_whitespace": true,
89
  "model_prefix": "./spiece",
90
  "input_sentence_size": 4000000,
91
- "user_defined_symbols": "(,),\",-,.,–,£"
92
  }
93
  spm.SentencePieceTrainer.train(**config)
94
  ```
 
88
  "split_by_whitespace": true,
89
  "model_prefix": "./spiece",
90
  "input_sentence_size": 4000000,
91
+ "user_defined_symbols": "(,),-,.,–,£,।",
92
  }
93
  spm.SentencePieceTrainer.train(**config)
94
  ```
spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0847b3d2eef4dca2710292d7d5f8046b58eea0e68d63de3d5738c4f7ddf7042
3
- size 1159342
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7adde7820956626ed2ab167049ffedaa4d293f8b072dcc8f055e68bb8450cd51
3
+ size 1160663
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"do_lower_case": true, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "tokenizer-bn-v0.0.1"}
 
1
+ {"do_lower_case": true, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "/content/drive/MyDrive/collaborative hub training/tokenizer/tokenizer_bn/oscar_wiki_bn_spm_unigram_4000000_2021_04_23_10_05_27/old"}