Qi Wang committed
Commit de5164d
1 Parent(s): 1167841

Upload 5 files

Files changed (4)
  1. tokenizer.bin +2 -2
  2. tokenizer.model +2 -2
  3. tokenizer_config.json +1 -1
  4. training_log.txt +12 -8
tokenizer.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dd460874cfbf96533f4446124f9be664720d0d37855ec7582c8422841b6810f9
- size 986807
+ oid sha256:7d84b01069686fecae45d66b8d1468a2bdaa1b1b7221502e85b8b17bfacbec40
+ size 466508
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2ed96452b25663cfa9eb0ac468212eeae9b47f7d6181544a2ff8bf32d1654357
- size 1051493
+ oid sha256:38e1f4e133816d9bf7acb1e63a55aff18c3a0f987bf7624552c4be3e5a8f08b6
+ size 499194
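Both tokenizer artifacts are Git LFS pointers, so the diff shows only the new sha256 and byte size; the roughly halved sizes are consistent with the vocab_size drop from 64000 to 32000 visible in training_log.txt below. A minimal sketch for verifying a downloaded artifact against its pointer; the local path is a placeholder, not part of this repo:

```python
import hashlib
import os

def verify_lfs_pointer(local_path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid/size recorded in its LFS pointer."""
    if os.path.getsize(local_path) != expected_size:
        return False
    sha = hashlib.sha256()
    with open(local_path, "rb") as f:
        # Hash in 1 MiB chunks so large artifacts don't need to fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return sha.hexdigest() == expected_oid

# oid and size taken from the new tokenizer.model pointer above.
print(verify_lfs_pointer(
    "tokenizer.model",
    "38e1f4e133816d9bf7acb1e63a55aff18c3a0f987bf7624552c4be3e5a8f08b6",
    499194,
))
```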
tokenizer_config.json CHANGED
@@ -20,7 +20,7 @@
  },
  "legacy": false,
  "max_length": 4096,
- "model_max_length": 1000000000000000019884624838656,
+ "model_max_length": 4096,
  "pad_token": null,
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
training_log.txt CHANGED
@@ -1,3 +1,6 @@
+ parameters: Namespace(corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2', model_type='bpe', max_sentence_length=4096, vocab_size=32000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')
+
+ trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
  trainer_interface.cc(537) LOG(INFO) all chars count=796343461
  trainer_interface.cc(548) LOG(INFO) Done: 99.95% characters are covered.
  trainer_interface.cc(558) LOG(INFO) Alphabet size=5013
@@ -5,13 +8,14 @@ trainer_interface.cc(559) LOG(INFO) Final character coverage=0.9995
  trainer_interface.cc(591) LOG(INFO) Done! preprocessed 800000 sentences.
  trainer_interface.cc(597) LOG(INFO) Tokenizing input sentences with whitespace: 800000
  trainer_interface.cc(608) LOG(INFO) Done! 1021909
- corpus_dir='../datasets/online_novel//data.txt', output_dir='../models/baby-chinese-llama2/64k', model_type='bpe', max_sentence_length=4096, vocab_size=64000, max_lines=1000000, shuffle_lines=True, pad_id=3, normalization_rule_name='identity', character_coverage=0.9995, action='export')

- Total tokens: 1686.54MB
- Mean: 563.0230508481366, Median: 559.0,
- 5th percentile: 504.0,
- 25th percentile: 535.0,
- 75th percentile: 587.0,
- 95th percentile: 634.0,
- 99th percentile: 669.0,
+ Raw corpus
+ Total lines: 2995508
+ Total tokens: 1827.13MB
+ Mean: 610, Median: 606.0,
+ 5th percentile: 546.0,
+ 25th percentile: 580.0,
+ 75th percentile: 636.0,
+ 95th percentile: 686.0,
+ 99th percentile: 722.0,
  max: 2657
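training_log.txt mixes the SentencePiece C++ trainer output (the trainer_interface.cc lines) with the wrapper script's argparse Namespace and corpus statistics. The export script itself is not part of this commit; the sketch below is a plausible reconstruction of a train call with the logged settings plus a stats pass, assuming SentencePiece's Python bindings, with input_sentence_size/shuffle_input_sentence standing in for the script's max_lines/shuffle_lines:

```python
import numpy as np
import sentencepiece as spm

# Mirror the Namespace from the new log: BPE, 32k vocab, identity normalization.
# max_lines/shuffle_lines are script-level names; input_sentence_size and
# shuffle_input_sentence are the assumed SentencePiece equivalents.
spm.SentencePieceTrainer.train(
    input="../datasets/online_novel//data.txt",
    model_prefix="../models/baby-chinese-llama2/tokenizer",
    model_type="bpe",
    vocab_size=32000,
    character_coverage=0.9995,
    max_sentence_length=4096,
    pad_id=3,
    normalization_rule_name="identity",
    input_sentence_size=1000000,
    shuffle_input_sentence=True,
)

# The "Raw corpus" block reads like per-line token counts over all 2,995,508
# lines, with "MB" apparently meaning millions of tokens
# (2995508 lines * ~610 tokens/line ~= 1827M). One way to reproduce it:
sp = spm.SentencePieceProcessor(model_file="../models/baby-chinese-llama2/tokenizer.model")
lengths = np.array([len(sp.encode(line))
                    for line in open("../datasets/online_novel//data.txt", encoding="utf-8")])
print(f"Total lines: {len(lengths)}")
print(f"Total tokens: {lengths.sum() / 1e6:.2f}MB")
print(f"Mean: {lengths.mean():.0f}, Median: {np.median(lengths)},")
for p in (5, 25, 75, 95, 99):
    print(f"{p}th percentile: {np.percentile(lengths, p)},")
print(f"max: {lengths.max()}")
```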