Qi Wang
committed on
Commit • 3f8acd4
Parent(s): 9110136
Upload 6 files
- tokenizer.bin +3 -0
- tokenizer.model +2 -2
- tokenizer_config.json +1 -1
- tokenizer_word.json +0 -0
- training_log.txt +21 -0
tokenizer.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d84b01069686fecae45d66b8d1468a2bdaa1b1b7221502e85b8b17bfacbec40
+size 466508
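For reference, tokenizer.bin is stored as a Git LFS pointer in the form shown above (version / oid sha256:<hex> / size <bytes>). Below is a minimal sketch, using only the Python standard library, of how one might check that a locally downloaded blob matches such a pointer; the file paths in the example are hypothetical and not part of this commit.

import hashlib
import os

def verify_lfs_pointer(pointer_path, blob_path):
    """Check a local blob against an LFS pointer's sha256 oid and size."""
    fields = {}
    with open(pointer_path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    expected_oid = fields["oid"].split(":", 1)[1]   # "sha256:<hex>" -> "<hex>"
    expected_size = int(fields["size"])

    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return (digest.hexdigest() == expected_oid
            and os.path.getsize(blob_path) == expected_size)

# Example (hypothetical paths):
# verify_lfs_pointer("tokenizer.bin.pointer", "tokenizer.bin")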
tokenizer.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:38e1f4e133816d9bf7acb1e63a55aff18c3a0f987bf7624552c4be3e5a8f08b6
+size 499194
tokenizer_config.json
CHANGED
@@ -19,7 +19,7 @@
 "single_word": false
 },
 "legacy": false,
-"
+"max_length": 4096,
 "model_max_length": 4096,
 "pad_token": null,
 "sp_model_kwargs": {},
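The added "max_length": 4096 sits next to the existing "model_max_length": 4096. A minimal sketch, assuming the saved files load with transformers' AutoTokenizer (this commit does not show any loading code), of how model_max_length governs truncation at encode time; the local path is illustrative.

from transformers import AutoTokenizer

# Hypothetical local path; adjust to wherever the tokenizer files live.
tok = AutoTokenizer.from_pretrained("../models/baby-chinese-llama2")

print(tok.model_max_length)  # expected to read 4096 from tokenizer_config.json

# With truncation=True and no explicit max_length, the tokenizer falls back to
# model_max_length, so encodings are capped at 4096 tokens.
ids = tok("这是一段中文测试文本。" * 2000, truncation=True)["input_ids"]
print(len(ids) <= 4096)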
tokenizer_word.json
ADDED
The diff for this file is too large to render.
training_log.txt
ADDED
@@ -0,0 +1,21 @@
+parameters: Namespace(corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2', model_type='bpe', max_sentence_length=4096, vocab_size=32000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
+
+trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
+trainer_interface.cc(537) LOG(INFO) all chars count=796343461
+trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
+trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
+trainer_interface.cc(559) LOG(INFO) Final character coverage=0.9995
+trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
+trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
+trainer_interface.cc(608) LOG(INFO) Done! 1021909
+
+Raw corpus
+Total lines: 2995508
+Total tokens: 1827.13MB
+Mean: 610, Median: 606.0,
+5th percentile: 546.0,
+25th percentile: 580.0,
+75th percentile: 636.0,
+95th percentile: 686.0,
+99th percentile: 722.0,
+max: 2657
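The parameters line in training_log.txt reads like the argparse Namespace of a SentencePiece training script. Below is a minimal sketch of the corresponding sentencepiece call, assuming the logged values map one-to-one onto the standard trainer options; in particular, treating max_lines as input_sentence_size and the model_prefix path are assumptions, not taken from the repository's actual script.

import sentencepiece as spm

# Hypothetical reconstruction of the training call implied by the logged Namespace;
# paths and the max_lines -> input_sentence_size mapping are assumptions.
spm.SentencePieceTrainer.train(
    input='../datasets/online_novel/data.txt',
    model_prefix='../models/baby-chinese-llama2/tokenizer',
    model_type='bpe',
    vocab_size=32000,
    max_sentence_length=4096,
    character_coverage=0.9995,
    normalization_rule_name='identity',
    pad_id=3,
    input_sentence_size=1000000,       # max_lines in the log (assumed mapping)
    shuffle_input_sentence=True,
)

The logged character_coverage=0.9995 and Alphabet size=5013 are consistent with a Chinese corpus, where covering 99.95% of characters already requires several thousand symbols before BPE merges are learned.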