Qi Wang
committed on
Commit
•
de5164d
1
Parent(s):
1167841
Upload 5 files
Browse files
- tokenizer.bin +2 -2
- tokenizer.model +2 -2
- tokenizer_config.json +1 -1
- training_log.txt +12 -8
tokenizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d84b01069686fecae45d66b8d1468a2bdaa1b1b7221502e85b8b17bfacbec40
|
3 |
+
size 466508
|
tokenizer.model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38e1f4e133816d9bf7acb1e63a55aff18c3a0f987bf7624552c4be3e5a8f08b6
|
3 |
+
size 499194
|
tokenizer_config.json
CHANGED
@@ -20,7 +20,7 @@
|
|
20 |
},
|
21 |
"legacy": false,
|
22 |
"max_length": 4096,
|
23 |
-
"model_max_length":
|
24 |
"pad_token": null,
|
25 |
"sp_model_kwargs": {},
|
26 |
"spaces_between_special_tokens": false,
|
|
|
20 |
},
|
21 |
"legacy": false,
|
22 |
"max_length": 4096,
|
23 |
+
"model_max_length": 4096,
|
24 |
"pad_token": null,
|
25 |
"sp_model_kwargs": {},
|
26 |
"spaces_between_special_tokens": false,
|
training_log.txt
CHANGED
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
1 |
trainer_interface.cc(537) LOG(INFO) all chars count=796343461
|
2 |
trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
|
3 |
trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
|
@@ -5,13 +8,14 @@ trainer_interface.cc(559) LOG(INFO) Final character coverage=0.9995
|
|
5 |
trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
|
6 |
trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
|
7 |
trainer_interface.cc(608) LOG(INFO) Done! 1021909
|
8 |
-
corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2/64k', model_type='bpe', max_sentence_length=4096, vocab_size=64000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
17 |
max: 2657
|
|
|
1 |
+
parameters: Namespace(corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2', model_type='bpe', max_sentence_length=4096, vocab_size=32000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
|
2 |
+
|
3 |
+
trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
|
4 |
trainer_interface.cc(537) LOG(INFO) all chars count=796343461
|
5 |
trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
|
6 |
trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
|
|
|
8 |
trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
|
9 |
trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
|
10 |
trainer_interface.cc(608) LOG(INFO) Done! 1021909
|
|
|
11 |
|
12 |
+
Raw corpus
|
13 |
+
Total lines: 2995508
|
14 |
+
Total tokens: 1827.13MB
|
15 |
+
Mean: 610, Median: 606.0,
|
16 |
+
5th percentile: 546.0,
|
17 |
+
25th percentile: 580.0,
|
18 |
+
75th percentile: 636.0,
|
19 |
+
95th percentile: 686.0,
|
20 |
+
99th percentile: 722.0,
|
21 |
max: 2657
|