Qi Wang
committed on
Commit
•
1167841
1
Parent(s):
3f8acd4
Upload 6 files
Browse files
- tokenizer.bin +2 -2
- tokenizer.model +2 -2
- tokenizer_config.json +1 -1
- tokenizer_word.json +0 -0
- training_log.txt +8 -12
tokenizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd460874cfbf96533f4446124f9be664720d0d37855ec7582c8422841b6810f9
|
3 |
+
size 986807
|
tokenizer.model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ed96452b25663cfa9eb0ac468212eeae9b47f7d6181544a2ff8bf32d1654357
|
3 |
+
size 1051493
|
tokenizer_config.json
CHANGED
@@ -20,7 +20,7 @@
|
|
20 |
},
|
21 |
"legacy": false,
|
22 |
"max_length": 4096,
|
23 |
-
"model_max_length":
|
24 |
"pad_token": null,
|
25 |
"sp_model_kwargs": {},
|
26 |
"spaces_between_special_tokens": false,
|
|
|
20 |
},
|
21 |
"legacy": false,
|
22 |
"max_length": 4096,
|
23 |
+
"model_max_length": 1000000000000000019884624838656,
|
24 |
"pad_token": null,
|
25 |
"sp_model_kwargs": {},
|
26 |
"spaces_between_special_tokens": false,
|
tokenizer_word.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
training_log.txt
CHANGED
@@ -1,6 +1,3 @@
|
|
1 |
-
parameters: Namespace(corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2', model_type='bpe', max_sentence_length=4096, vocab_size=32000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
|
2 |
-
|
3 |
-
trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
|
4 |
trainer_interface.cc(537) LOG(INFO) all chars count=796343461
|
5 |
trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
|
6 |
trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
|
@@ -8,14 +5,13 @@ trainer_interface.cc(559) LOG(INFO) Final character coverage=0.9995
|
|
8 |
trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
|
9 |
trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
|
10 |
trainer_interface.cc(608) LOG(INFO) Done! 1021909
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
95th percentile: 686.0,
|
20 |
-
99th percentile: 722.0,
|
21 |
max: 2657
|
|
|
|
|
|
|
|
|
1 |
trainer_interface.cc(537) LOG(INFO) all chars count=796343461
|
2 |
trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
|
3 |
trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
|
|
|
5 |
trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
|
6 |
trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
|
7 |
trainer_interface.cc(608) LOG(INFO) Done! 1021909
|
8 |
+
parameters: Namespace(corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2/64k', model_type='bpe', max_sentence_length=4096, vocab_size=64000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
|
9 |
|
10 |
+
Total tokens: 1686.54MB
|
11 |
+
Mean: 563.0230508481366, Median: 559.0,
|
12 |
+
5th percentile: 504.0,
|
13 |
+
25th percentile: 535.0,
|
14 |
+
75th percentile: 587.0,
|
15 |
+
95th percentile: 634.0,
|
16 |
+
99th percentile: 669.0,
|
|
|
|
|
17 |
max: 2657
|