inflaton commited on
Commit
e88a39b
1 Parent(s): eb52e90

removed 4bit checkpoints to save space

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. competition/08c_InterLM_finetuning_NV4080_p2.ipynb +0 -0
  2. llama-factory/saves/internlm2_5_7b/lora/sft/README.md +0 -69
  3. llama-factory/saves/internlm2_5_7b/lora/sft/adapter_config.json +0 -32
  4. llama-factory/saves/internlm2_5_7b/lora/sft/adapter_model.safetensors +0 -3
  5. llama-factory/saves/internlm2_5_7b/lora/sft/added_tokens.json +0 -8
  6. llama-factory/saves/internlm2_5_7b/lora/sft/all_results.json +0 -12
  7. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/README.md +0 -202
  8. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/adapter_config.json +0 -32
  9. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/adapter_model.safetensors +0 -3
  10. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/added_tokens.json +0 -8
  11. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/optimizer.pt +0 -3
  12. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/rng_state.pth +0 -3
  13. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/scheduler.pt +0 -3
  14. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/special_tokens_map.json +0 -38
  15. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenization_internlm2.py +0 -236
  16. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenization_internlm2_fast.py +0 -214
  17. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer.json +0 -0
  18. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer.model +0 -3
  19. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer_config.json +0 -1640
  20. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/trainer_state.json +0 -126
  21. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/training_args.bin +0 -3
  22. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/README.md +0 -202
  23. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/adapter_config.json +0 -32
  24. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/adapter_model.safetensors +0 -3
  25. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/added_tokens.json +0 -8
  26. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/optimizer.pt +0 -3
  27. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/rng_state.pth +0 -3
  28. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/scheduler.pt +0 -3
  29. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/special_tokens_map.json +0 -38
  30. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenization_internlm2.py +0 -236
  31. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenization_internlm2_fast.py +0 -214
  32. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer.json +0 -0
  33. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer.model +0 -3
  34. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer_config.json +0 -1640
  35. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/trainer_state.json +0 -169
  36. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/training_args.bin +0 -3
  37. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/README.md +0 -202
  38. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/adapter_config.json +0 -32
  39. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/adapter_model.safetensors +0 -3
  40. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/added_tokens.json +0 -8
  41. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/optimizer.pt +0 -3
  42. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/rng_state.pth +0 -3
  43. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/scheduler.pt +0 -3
  44. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/special_tokens_map.json +0 -38
  45. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenization_internlm2.py +0 -236
  46. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenization_internlm2_fast.py +0 -214
  47. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer.json +0 -0
  48. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer.model +0 -3
  49. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer_config.json +0 -1640
  50. llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/trainer_state.json +0 -219
competition/08c_InterLM_finetuning_NV4080_p2.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/internlm2_5_7b/lora/sft/README.md DELETED
@@ -1,69 +0,0 @@
1
- ---
2
- license: other
3
- library_name: peft
4
- tags:
5
- - llama-factory
6
- - lora
7
- - generated_from_trainer
8
- base_model: internlm/internlm2_5-7b-chat-1m
9
- model-index:
10
- - name: sft
11
- results: []
12
- ---
13
-
14
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
- should probably proofread and complete it, then remove this comment. -->
16
-
17
- # sft
18
-
19
- This model is a fine-tuned version of [internlm/internlm2_5-7b-chat-1m](https://huggingface.co/internlm/internlm2_5-7b-chat-1m) on the alpaca_mgtv_p1 dataset.
20
- It achieves the following results on the evaluation set:
21
- - Loss: 0.7247
22
-
23
- ## Model description
24
-
25
- More information needed
26
-
27
- ## Intended uses & limitations
28
-
29
- More information needed
30
-
31
- ## Training and evaluation data
32
-
33
- More information needed
34
-
35
- ## Training procedure
36
-
37
- ### Training hyperparameters
38
-
39
- The following hyperparameters were used during training:
40
- - learning_rate: 0.0001
41
- - train_batch_size: 1
42
- - eval_batch_size: 1
43
- - seed: 42
44
- - gradient_accumulation_steps: 8
45
- - total_train_batch_size: 8
46
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
- - lr_scheduler_type: cosine
48
- - lr_scheduler_warmup_ratio: 0.1
49
- - num_epochs: 6.0
50
-
51
- ### Training results
52
-
53
- | Training Loss | Epoch | Step | Validation Loss |
54
- |:-------------:|:------:|:----:|:---------------:|
55
- | 0.3497 | 0.9991 | 562 | 0.6007 |
56
- | 0.2678 | 1.9982 | 1124 | 0.3570 |
57
- | 0.1949 | 2.9973 | 1686 | 0.4269 |
58
- | 0.1184 | 3.9964 | 2248 | 0.4489 |
59
- | 0.0684 | 4.9956 | 2810 | 0.6156 |
60
- | 0.029 | 5.9947 | 3372 | 0.7247 |
61
-
62
-
63
- ### Framework versions
64
-
65
- - PEFT 0.11.1
66
- - Transformers 4.42.3
67
- - Pytorch 2.3.0+cu121
68
- - Datasets 2.20.0
69
- - Tokenizers 0.19.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/adapter_config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
5
- "bias": "none",
6
- "fan_in_fan_out": false,
7
- "inference_mode": true,
8
- "init_lora_weights": true,
9
- "layer_replication": null,
10
- "layers_pattern": null,
11
- "layers_to_transform": null,
12
- "loftq_config": {},
13
- "lora_alpha": 16,
14
- "lora_dropout": 0.0,
15
- "megatron_config": null,
16
- "megatron_core": "megatron.core",
17
- "modules_to_save": null,
18
- "peft_type": "LORA",
19
- "r": 8,
20
- "rank_pattern": {},
21
- "revision": null,
22
- "target_modules": [
23
- "w3",
24
- "wo",
25
- "w1",
26
- "wqkv",
27
- "w2"
28
- ],
29
- "task_type": "CAUSAL_LM",
30
- "use_dora": false,
31
- "use_rslora": false
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a925360dad2890e02463ad52f4b4968bcde5145f5fea2e900ddca1f84f78740
3
- size 75539712
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/added_tokens.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "[UNUSED_TOKEN_141]": 92544,
3
- "[UNUSED_TOKEN_142]": 92545,
4
- "[UNUSED_TOKEN_143]": 92546,
5
- "[UNUSED_TOKEN_144]": 92547,
6
- "[UNUSED_TOKEN_145]": 92548,
7
- "[UNUSED_TOKEN_146]": 92549
8
- }
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/all_results.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "epoch": 5.994666666666666,
3
- "eval_loss": 0.7247006893157959,
4
- "eval_runtime": 366.243,
5
- "eval_samples_per_second": 1.365,
6
- "eval_steps_per_second": 1.365,
7
- "total_flos": 4.067158614080225e+17,
8
- "train_loss": 0.1883302003604803,
9
- "train_runtime": 53299.0481,
10
- "train_samples_per_second": 0.507,
11
- "train_steps_per_second": 0.063
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/README.md DELETED
@@ -1,202 +0,0 @@
1
- ---
2
- library_name: peft
3
- base_model: internlm/internlm2_5-7b-chat-1m
4
- ---
5
-
6
- # Model Card for Model ID
7
-
8
- <!-- Provide a quick summary of what the model is/does. -->
9
-
10
-
11
-
12
- ## Model Details
13
-
14
- ### Model Description
15
-
16
- <!-- Provide a longer summary of what this model is. -->
17
-
18
-
19
-
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
-
28
- ### Model Sources [optional]
29
-
30
- <!-- Provide the basic links for the model. -->
31
-
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
-
36
- ## Uses
37
-
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
-
40
- ### Direct Use
41
-
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
-
44
- [More Information Needed]
45
-
46
- ### Downstream Use [optional]
47
-
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
-
50
- [More Information Needed]
51
-
52
- ### Out-of-Scope Use
53
-
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
-
56
- [More Information Needed]
57
-
58
- ## Bias, Risks, and Limitations
59
-
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
-
62
- [More Information Needed]
63
-
64
- ### Recommendations
65
-
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
-
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
-
70
- ## How to Get Started with the Model
71
-
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
75
-
76
- ## Training Details
77
-
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
- ### Framework versions
201
-
202
- - PEFT 0.11.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/adapter_config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
5
- "bias": "none",
6
- "fan_in_fan_out": false,
7
- "inference_mode": true,
8
- "init_lora_weights": true,
9
- "layer_replication": null,
10
- "layers_pattern": null,
11
- "layers_to_transform": null,
12
- "loftq_config": {},
13
- "lora_alpha": 16,
14
- "lora_dropout": 0.0,
15
- "megatron_config": null,
16
- "megatron_core": "megatron.core",
17
- "modules_to_save": null,
18
- "peft_type": "LORA",
19
- "r": 8,
20
- "rank_pattern": {},
21
- "revision": null,
22
- "target_modules": [
23
- "w3",
24
- "wo",
25
- "w1",
26
- "wqkv",
27
- "w2"
28
- ],
29
- "task_type": "CAUSAL_LM",
30
- "use_dora": false,
31
- "use_rslora": false
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce0a70697cdc890c7e6ab41e71c8202cca18f704c23c6ea2072aac2dc64665ec
3
- size 75539712
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/added_tokens.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "[UNUSED_TOKEN_141]": 92544,
3
- "[UNUSED_TOKEN_142]": 92545,
4
- "[UNUSED_TOKEN_143]": 92546,
5
- "[UNUSED_TOKEN_144]": 92547,
6
- "[UNUSED_TOKEN_145]": 92548,
7
- "[UNUSED_TOKEN_146]": 92549
8
- }
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1efa824175500db8095dd0244fc72c11d5102f40096ba8dde0fbc0b7217734c4
3
- size 151264058
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
3
- size 14244
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87aaed7cb6dcc2e48d745ccf810d4294a0ed894de1a71242beeebd4a9c4d8393
3
- size 1064
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/special_tokens_map.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|action_start|>",
6
- "<|action_end|>",
7
- "<|interpreter|>",
8
- "<|plugin|>"
9
- ],
10
- "bos_token": {
11
- "content": "<s>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "</s>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- "unk_token": {
32
- "content": "<unk>",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- }
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenization_internlm2.py DELETED
@@ -1,236 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization classes for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, List, Optional, Tuple
22
-
23
- import sentencepiece as spm
24
- from transformers.tokenization_utils import PreTrainedTokenizer
25
- from transformers.utils import logging
26
-
27
- logger = logging.get_logger(__name__)
28
-
29
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
30
-
31
- PRETRAINED_VOCAB_FILES_MAP = {}
32
-
33
-
34
- # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
35
- class InternLM2Tokenizer(PreTrainedTokenizer):
36
- """
37
- Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
-
39
- Args:
40
- vocab_file (`str`):
41
- Path to the vocabulary file.
42
- """
43
-
44
- vocab_files_names = VOCAB_FILES_NAMES
45
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
46
- model_input_names = ["input_ids", "attention_mask"]
47
- _auto_class = "AutoTokenizer"
48
-
49
- def __init__(
50
- self,
51
- vocab_file,
52
- unk_token="<unk>",
53
- bos_token="<s>",
54
- eos_token="</s>",
55
- pad_token="</s>",
56
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
57
- add_bos_token=True,
58
- add_eos_token=False,
59
- decode_with_prefix_space=False,
60
- clean_up_tokenization_spaces=False,
61
- **kwargs,
62
- ):
63
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
64
- self.vocab_file = vocab_file
65
- self.add_bos_token = add_bos_token
66
- self.add_eos_token = add_eos_token
67
- self.decode_with_prefix_space = decode_with_prefix_space
68
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
- self.sp_model.Load(vocab_file)
70
- self._no_prefix_space_tokens = None
71
- super().__init__(
72
- bos_token=bos_token,
73
- eos_token=eos_token,
74
- unk_token=unk_token,
75
- pad_token=pad_token,
76
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
77
- **kwargs,
78
- )
79
-
80
- @property
81
- def no_prefix_space_tokens(self):
82
- if self._no_prefix_space_tokens is None:
83
- vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
- self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
85
- return self._no_prefix_space_tokens
86
-
87
- @property
88
- def vocab_size(self):
89
- """Returns vocab size"""
90
- return self.sp_model.get_piece_size()
91
-
92
- @property
93
- def bos_token_id(self) -> Optional[int]:
94
- return self.sp_model.bos_id()
95
-
96
- @property
97
- def eos_token_id(self) -> Optional[int]:
98
- return self.sp_model.eos_id()
99
-
100
- def get_vocab(self):
101
- """Returns vocab as a dict"""
102
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
103
- vocab.update(self.added_tokens_encoder)
104
- return vocab
105
-
106
- def _tokenize(self, text):
107
- """Returns a tokenized string."""
108
- return self.sp_model.encode(text, out_type=str)
109
-
110
- def _convert_token_to_id(self, token):
111
- """Converts a token (str) in an id using the vocab."""
112
- return self.sp_model.piece_to_id(token)
113
-
114
- def _convert_id_to_token(self, index):
115
- """Converts an index (integer) in a token (str) using the vocab."""
116
- token = self.sp_model.IdToPiece(index)
117
- return token
118
-
119
- def _maybe_add_prefix_space(self, tokens, decoded):
120
- if tokens and tokens[0] not in self.no_prefix_space_tokens:
121
- return " " + decoded
122
- else:
123
- return decoded
124
-
125
- def convert_tokens_to_string(self, tokens):
126
- """Converts a sequence of tokens (string) in a single string."""
127
- current_sub_tokens = []
128
- out_string = ""
129
- prev_is_special = False
130
- for token in tokens:
131
- # make sure that special tokens are not decoded using sentencepiece model
132
- if token in self.all_special_tokens:
133
- if not prev_is_special:
134
- out_string += " "
135
- out_string += self.sp_model.decode(current_sub_tokens) + token
136
- prev_is_special = True
137
- current_sub_tokens = []
138
- else:
139
- current_sub_tokens.append(token)
140
- prev_is_special = False
141
- out_string += self.sp_model.decode(current_sub_tokens)
142
- out_string = self.clean_up_tokenization(out_string)
143
- out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
144
- return out_string[1:]
145
-
146
- def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
- """
148
- Save the vocabulary and special tokens file to a directory.
149
-
150
- Args:
151
- save_directory (`str`):
152
- The directory in which to save the vocabulary.
153
-
154
- Returns:
155
- `Tuple(str)`: Paths to the files saved.
156
- """
157
- if not os.path.isdir(save_directory):
158
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
- return
160
- out_vocab_file = os.path.join(
161
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
- )
163
-
164
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
165
- copyfile(self.vocab_file, out_vocab_file)
166
- elif not os.path.isfile(self.vocab_file):
167
- with open(out_vocab_file, "wb") as fi:
168
- content_spiece_model = self.sp_model.serialized_model_proto()
169
- fi.write(content_spiece_model)
170
-
171
- return (out_vocab_file,)
172
-
173
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
174
- if self.add_bos_token:
175
- bos_token_ids = [self.bos_token_id]
176
- else:
177
- bos_token_ids = []
178
-
179
- output = bos_token_ids + token_ids_0
180
-
181
- if token_ids_1 is not None:
182
- output = output + token_ids_1
183
-
184
- if self.add_eos_token:
185
- output = output + [self.eos_token_id]
186
-
187
- return output
188
-
189
- def get_special_tokens_mask(
190
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
191
- ) -> List[int]:
192
- """
193
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
194
- special tokens using the tokenizer `prepare_for_model` method.
195
-
196
- Args:
197
- token_ids_0 (`List[int]`):
198
- List of IDs.
199
- token_ids_1 (`List[int]`, *optional*):
200
- Optional second list of IDs for sequence pairs.
201
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
202
- Whether or not the token list is already formatted with special tokens for the model.
203
-
204
- Returns:
205
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
206
- """
207
- if already_has_special_tokens:
208
- return super().get_special_tokens_mask(
209
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
210
- )
211
-
212
- if token_ids_1 is None:
213
- return [1] + ([0] * len(token_ids_0)) + [1]
214
- return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
215
-
216
- def create_token_type_ids_from_sequences(
217
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
218
- ) -> List[int]:
219
- """
220
- Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
221
- use of token type ids, therefore a list of zeros is returned.
222
-
223
- Args:
224
- token_ids_0 (`List[int]`):
225
- List of IDs.
226
- token_ids_1 (`List[int]`, *optional*):
227
- Optional second list of IDs for sequence pairs.
228
-
229
- Returns:
230
- `List[int]`: List of zeros.
231
- """
232
- eos = [self.eos_token_id]
233
-
234
- if token_ids_1 is None:
235
- return len(token_ids_0 + eos) * [0]
236
- return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenization_internlm2_fast.py DELETED
@@ -1,214 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization Fast class for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, Optional, Tuple
22
-
23
- from tokenizers import processors, decoders, Tokenizer, normalizers
24
- from tokenizers.models import BPE
25
-
26
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
27
- from transformers.utils import logging
28
-
29
- from transformers.convert_slow_tokenizer import (
30
- SLOW_TO_FAST_CONVERTERS,
31
- SpmConverter,
32
- SentencePieceExtractor,
33
- )
34
-
35
- from .tokenization_internlm2 import InternLM2Tokenizer
36
-
37
- logger = logging.get_logger(__name__)
38
-
39
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
40
-
41
- # Modified from transformers.convert_slow_tokenizer.LlamaConverter
42
- class InternLM2Converter(SpmConverter):
43
- handle_byte_fallback = True
44
-
45
- def vocab(self, proto):
46
- vocab = [
47
- ("<unk>", 0.0),
48
- ("<s>", 0.0),
49
- ("</s>", 0.0),
50
- ]
51
- vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
52
- return vocab
53
-
54
- def unk_id(self, proto):
55
- unk_id = 0
56
- return unk_id
57
-
58
- def decoder(self, replacement, add_prefix_space):
59
- decoders_sequence = [
60
- decoders.Replace("▁", " "),
61
- decoders.ByteFallback(),
62
- decoders.Fuse(),
63
- ]
64
- if self.proto.normalizer_spec.add_dummy_prefix:
65
- decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
- return decoders.Sequence(decoders_sequence)
67
-
68
- def tokenizer(self, proto):
69
- model_type = proto.trainer_spec.model_type
70
- vocab_scores = self.vocab(proto)
71
- # special tokens
72
- added_tokens = self.original_tokenizer.added_tokens_decoder
73
- for i in range(len(vocab_scores)):
74
- piece, score = vocab_scores[i]
75
- if i in added_tokens:
76
- vocab_scores[i] = (added_tokens[i].content, score)
77
- if model_type == 1:
78
- raise RuntimeError("InternLM2 is supposed to be a BPE model!")
79
-
80
- elif model_type == 2:
81
- _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
82
- bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
83
- tokenizer = Tokenizer(
84
- BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
85
- )
86
- tokenizer.add_special_tokens(
87
- [ added_token for index, added_token in added_tokens.items()]
88
- )
89
- else:
90
- raise Exception(
91
- "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
92
- )
93
-
94
- return tokenizer
95
-
96
- def normalizer(self, proto):
97
- normalizers_list = []
98
- if proto.normalizer_spec.add_dummy_prefix:
99
- normalizers_list.append(normalizers.Prepend(prepend="▁"))
100
- normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
101
- return normalizers.Sequence(normalizers_list)
102
-
103
- def pre_tokenizer(self, replacement, add_prefix_space):
104
- return None
105
-
106
- SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
107
-
108
-
109
- # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
110
- class InternLM2TokenizerFast(PreTrainedTokenizerFast):
111
- vocab_files_names = VOCAB_FILES_NAMES
112
- slow_tokenizer_class = InternLM2Tokenizer
113
- padding_side = "left"
114
- model_input_names = ["input_ids", "attention_mask"]
115
- _auto_class = "AutoTokenizer"
116
-
117
- def __init__(
118
- self,
119
- vocab_file,
120
- unk_token="<unk>",
121
- bos_token="<s>",
122
- eos_token="</s>",
123
- pad_token="</s>",
124
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
125
- add_bos_token=True,
126
- add_eos_token=False,
127
- decode_with_prefix_space=False,
128
- clean_up_tokenization_spaces=False,
129
- **kwargs,
130
- ):
131
- super().__init__(
132
- vocab_file=vocab_file,
133
- unk_token=unk_token,
134
- bos_token=bos_token,
135
- eos_token=eos_token,
136
- pad_token=pad_token,
137
- sp_model_kwargs=sp_model_kwargs,
138
- add_bos_token=add_bos_token,
139
- add_eos_token=add_eos_token,
140
- decode_with_prefix_space=decode_with_prefix_space,
141
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
142
- **kwargs,
143
- )
144
- self._add_bos_token = add_bos_token
145
- self._add_eos_token = add_eos_token
146
- self.update_post_processor()
147
- self.vocab_file = vocab_file
148
-
149
- @property
150
- def can_save_slow_tokenizer(self) -> bool:
151
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
152
-
153
- def update_post_processor(self):
154
- """
155
- Updates the underlying post processor with the current `bos_token` and `eos_token`.
156
- """
157
- bos = self.bos_token
158
- bos_token_id = self.bos_token_id
159
- if bos is None and self.add_bos_token:
160
- raise ValueError("add_bos_token = True but bos_token = None")
161
-
162
- eos = self.eos_token
163
- eos_token_id = self.eos_token_id
164
- if eos is None and self.add_eos_token:
165
- raise ValueError("add_eos_token = True but eos_token = None")
166
-
167
- single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
168
- pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
169
-
170
- special_tokens = []
171
- if self.add_bos_token:
172
- special_tokens.append((bos, bos_token_id))
173
- if self.add_eos_token:
174
- special_tokens.append((eos, eos_token_id))
175
- self._tokenizer.post_processor = processors.TemplateProcessing(
176
- single=single, pair=pair, special_tokens=special_tokens
177
- )
178
-
179
- @property
180
- def add_eos_token(self):
181
- return self._add_eos_token
182
-
183
- @property
184
- def add_bos_token(self):
185
- return self._add_bos_token
186
-
187
- @add_eos_token.setter
188
- def add_eos_token(self, value):
189
- self._add_eos_token = value
190
- self.update_post_processor()
191
-
192
- @add_bos_token.setter
193
- def add_bos_token(self, value):
194
- self._add_bos_token = value
195
- self.update_post_processor()
196
-
197
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
198
- if not self.can_save_slow_tokenizer:
199
- raise ValueError(
200
- "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
201
- "tokenizer."
202
- )
203
-
204
- if not os.path.isdir(save_directory):
205
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
206
- return
207
- out_vocab_file = os.path.join(
208
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
209
- )
210
-
211
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
- copyfile(self.vocab_file, out_vocab_file)
213
-
214
- return (out_vocab_file,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
3
- size 1477754
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/tokenizer_config.json DELETED
@@ -1,1640 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "92352": {
30
- "content": "E",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "92353": {
38
- "content": "F",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "92354": {
46
- "content": "G",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "92355": {
54
- "content": "H",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
- "92356": {
62
- "content": "I",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": false
68
- },
69
- "92357": {
70
- "content": "J",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": false
76
- },
77
- "92358": {
78
- "content": "K",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": false
84
- },
85
- "92359": {
86
- "content": "L",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": false
92
- },
93
- "92360": {
94
- "content": "M",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "92361": {
102
- "content": "N",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": false
108
- },
109
- "92362": {
110
- "content": "R",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "92363": {
118
- "content": "U",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "92364": {
126
- "content": "V",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "92365": {
134
- "content": "W",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "92366": {
142
- "content": "X",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "92367": {
150
- "content": "Y",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "92368": {
158
- "content": "Z",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "92369": {
166
- "content": "a",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "92370": {
174
- "content": "b",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "92371": {
182
- "content": "c",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "92372": {
190
- "content": "d",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "92373": {
198
- "content": "e",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "92374": {
206
- "content": "f",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- },
213
- "92375": {
214
- "content": "g",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": false
220
- },
221
- "92376": {
222
- "content": "h",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": false
228
- },
229
- "92377": {
230
- "content": "i",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": false
236
- },
237
- "92378": {
238
- "content": "j",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": false
244
- },
245
- "92379": {
246
- "content": "k",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": false
252
- },
253
- "92380": {
254
- "content": "l",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": false
260
- },
261
- "92381": {
262
- "content": "m",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": false
268
- },
269
- "92382": {
270
- "content": "n",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": false
276
- },
277
- "92383": {
278
- "content": "o",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": false
284
- },
285
- "92384": {
286
- "content": "p",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": false
292
- },
293
- "92385": {
294
- "content": "q",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": false
300
- },
301
- "92386": {
302
- "content": "r",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": false
308
- },
309
- "92387": {
310
- "content": "s",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": false
316
- },
317
- "92388": {
318
- "content": "t",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": false
324
- },
325
- "92389": {
326
- "content": "u",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": false
332
- },
333
- "92390": {
334
- "content": "v",
335
- "lstrip": false,
336
- "normalized": false,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": false
340
- },
341
- "92391": {
342
- "content": "w",
343
- "lstrip": false,
344
- "normalized": false,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": false
348
- },
349
- "92392": {
350
- "content": "x",
351
- "lstrip": false,
352
- "normalized": false,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": false
356
- },
357
- "92393": {
358
- "content": "y",
359
- "lstrip": false,
360
- "normalized": false,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": false
364
- },
365
- "92394": {
366
- "content": "z",
367
- "lstrip": false,
368
- "normalized": false,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": false
372
- },
373
- "92395": {
374
- "content": "——",
375
- "lstrip": false,
376
- "normalized": false,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": false
380
- },
381
- "92396": {
382
- "content": "……",
383
- "lstrip": false,
384
- "normalized": false,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": false
388
- },
389
- "92397": {
390
- "content": "[UNUSED_TOKEN_0]",
391
- "lstrip": false,
392
- "normalized": false,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": false
396
- },
397
- "92398": {
398
- "content": "[UNUSED_TOKEN_1]",
399
- "lstrip": false,
400
- "normalized": false,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": false
404
- },
405
- "92399": {
406
- "content": "[UNUSED_TOKEN_2]",
407
- "lstrip": false,
408
- "normalized": false,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": false
412
- },
413
- "92400": {
414
- "content": "[UNUSED_TOKEN_3]",
415
- "lstrip": false,
416
- "normalized": false,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": false
420
- },
421
- "92401": {
422
- "content": "[UNUSED_TOKEN_4]",
423
- "lstrip": false,
424
- "normalized": false,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": false
428
- },
429
- "92402": {
430
- "content": "[UNUSED_TOKEN_5]",
431
- "lstrip": false,
432
- "normalized": false,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": false
436
- },
437
- "92403": {
438
- "content": "[UNUSED_TOKEN_6]",
439
- "lstrip": false,
440
- "normalized": false,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": false
444
- },
445
- "92404": {
446
- "content": "[UNUSED_TOKEN_7]",
447
- "lstrip": false,
448
- "normalized": false,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": false
452
- },
453
- "92405": {
454
- "content": "[UNUSED_TOKEN_8]",
455
- "lstrip": false,
456
- "normalized": false,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": false
460
- },
461
- "92406": {
462
- "content": "[UNUSED_TOKEN_9]",
463
- "lstrip": false,
464
- "normalized": false,
465
- "rstrip": false,
466
- "single_word": false,
467
- "special": false
468
- },
469
- "92407": {
470
- "content": "[UNUSED_TOKEN_10]",
471
- "lstrip": false,
472
- "normalized": false,
473
- "rstrip": false,
474
- "single_word": false,
475
- "special": false
476
- },
477
- "92408": {
478
- "content": "[UNUSED_TOKEN_11]",
479
- "lstrip": false,
480
- "normalized": false,
481
- "rstrip": false,
482
- "single_word": false,
483
- "special": false
484
- },
485
- "92409": {
486
- "content": "[UNUSED_TOKEN_12]",
487
- "lstrip": false,
488
- "normalized": false,
489
- "rstrip": false,
490
- "single_word": false,
491
- "special": false
492
- },
493
- "92410": {
494
- "content": "[UNUSED_TOKEN_13]",
495
- "lstrip": false,
496
- "normalized": false,
497
- "rstrip": false,
498
- "single_word": false,
499
- "special": false
500
- },
501
- "92411": {
502
- "content": "[UNUSED_TOKEN_14]",
503
- "lstrip": false,
504
- "normalized": false,
505
- "rstrip": false,
506
- "single_word": false,
507
- "special": false
508
- },
509
- "92412": {
510
- "content": "[UNUSED_TOKEN_15]",
511
- "lstrip": false,
512
- "normalized": false,
513
- "rstrip": false,
514
- "single_word": false,
515
- "special": false
516
- },
517
- "92413": {
518
- "content": "[UNUSED_TOKEN_16]",
519
- "lstrip": false,
520
- "normalized": false,
521
- "rstrip": false,
522
- "single_word": false,
523
- "special": false
524
- },
525
- "92414": {
526
- "content": "[UNUSED_TOKEN_17]",
527
- "lstrip": false,
528
- "normalized": false,
529
- "rstrip": false,
530
- "single_word": false,
531
- "special": false
532
- },
533
- "92415": {
534
- "content": "[UNUSED_TOKEN_18]",
535
- "lstrip": false,
536
- "normalized": false,
537
- "rstrip": false,
538
- "single_word": false,
539
- "special": false
540
- },
541
- "92416": {
542
- "content": "[UNUSED_TOKEN_19]",
543
- "lstrip": false,
544
- "normalized": false,
545
- "rstrip": false,
546
- "single_word": false,
547
- "special": false
548
- },
549
- "92417": {
550
- "content": "[UNUSED_TOKEN_20]",
551
- "lstrip": false,
552
- "normalized": false,
553
- "rstrip": false,
554
- "single_word": false,
555
- "special": false
556
- },
557
- "92418": {
558
- "content": "[UNUSED_TOKEN_21]",
559
- "lstrip": false,
560
- "normalized": false,
561
- "rstrip": false,
562
- "single_word": false,
563
- "special": false
564
- },
565
- "92419": {
566
- "content": "[UNUSED_TOKEN_22]",
567
- "lstrip": false,
568
- "normalized": false,
569
- "rstrip": false,
570
- "single_word": false,
571
- "special": false
572
- },
573
- "92420": {
574
- "content": "[UNUSED_TOKEN_23]",
575
- "lstrip": false,
576
- "normalized": false,
577
- "rstrip": false,
578
- "single_word": false,
579
- "special": false
580
- },
581
- "92421": {
582
- "content": "[UNUSED_TOKEN_24]",
583
- "lstrip": false,
584
- "normalized": false,
585
- "rstrip": false,
586
- "single_word": false,
587
- "special": false
588
- },
589
- "92422": {
590
- "content": "[UNUSED_TOKEN_25]",
591
- "lstrip": false,
592
- "normalized": false,
593
- "rstrip": false,
594
- "single_word": false,
595
- "special": false
596
- },
597
- "92423": {
598
- "content": "[UNUSED_TOKEN_26]",
599
- "lstrip": false,
600
- "normalized": false,
601
- "rstrip": false,
602
- "single_word": false,
603
- "special": false
604
- },
605
- "92424": {
606
- "content": "[UNUSED_TOKEN_27]",
607
- "lstrip": false,
608
- "normalized": false,
609
- "rstrip": false,
610
- "single_word": false,
611
- "special": false
612
- },
613
- "92425": {
614
- "content": "[UNUSED_TOKEN_28]",
615
- "lstrip": false,
616
- "normalized": false,
617
- "rstrip": false,
618
- "single_word": false,
619
- "special": false
620
- },
621
- "92426": {
622
- "content": "[UNUSED_TOKEN_29]",
623
- "lstrip": false,
624
- "normalized": false,
625
- "rstrip": false,
626
- "single_word": false,
627
- "special": false
628
- },
629
- "92427": {
630
- "content": "[UNUSED_TOKEN_30]",
631
- "lstrip": false,
632
- "normalized": false,
633
- "rstrip": false,
634
- "single_word": false,
635
- "special": false
636
- },
637
- "92428": {
638
- "content": "[UNUSED_TOKEN_31]",
639
- "lstrip": false,
640
- "normalized": false,
641
- "rstrip": false,
642
- "single_word": false,
643
- "special": false
644
- },
645
- "92429": {
646
- "content": "[UNUSED_TOKEN_32]",
647
- "lstrip": false,
648
- "normalized": false,
649
- "rstrip": false,
650
- "single_word": false,
651
- "special": false
652
- },
653
- "92430": {
654
- "content": "[UNUSED_TOKEN_33]",
655
- "lstrip": false,
656
- "normalized": false,
657
- "rstrip": false,
658
- "single_word": false,
659
- "special": false
660
- },
661
- "92431": {
662
- "content": "[UNUSED_TOKEN_34]",
663
- "lstrip": false,
664
- "normalized": false,
665
- "rstrip": false,
666
- "single_word": false,
667
- "special": false
668
- },
669
- "92432": {
670
- "content": "[UNUSED_TOKEN_35]",
671
- "lstrip": false,
672
- "normalized": false,
673
- "rstrip": false,
674
- "single_word": false,
675
- "special": false
676
- },
677
- "92433": {
678
- "content": "[UNUSED_TOKEN_36]",
679
- "lstrip": false,
680
- "normalized": false,
681
- "rstrip": false,
682
- "single_word": false,
683
- "special": false
684
- },
685
- "92434": {
686
- "content": "[UNUSED_TOKEN_37]",
687
- "lstrip": false,
688
- "normalized": false,
689
- "rstrip": false,
690
- "single_word": false,
691
- "special": false
692
- },
693
- "92435": {
694
- "content": "[UNUSED_TOKEN_38]",
695
- "lstrip": false,
696
- "normalized": false,
697
- "rstrip": false,
698
- "single_word": false,
699
- "special": false
700
- },
701
- "92436": {
702
- "content": "[UNUSED_TOKEN_39]",
703
- "lstrip": false,
704
- "normalized": false,
705
- "rstrip": false,
706
- "single_word": false,
707
- "special": false
708
- },
709
- "92437": {
710
- "content": "[UNUSED_TOKEN_40]",
711
- "lstrip": false,
712
- "normalized": false,
713
- "rstrip": false,
714
- "single_word": false,
715
- "special": false
716
- },
717
- "92438": {
718
- "content": "[UNUSED_TOKEN_41]",
719
- "lstrip": false,
720
- "normalized": false,
721
- "rstrip": false,
722
- "single_word": false,
723
- "special": false
724
- },
725
- "92439": {
726
- "content": "[UNUSED_TOKEN_42]",
727
- "lstrip": false,
728
- "normalized": false,
729
- "rstrip": false,
730
- "single_word": false,
731
- "special": false
732
- },
733
- "92440": {
734
- "content": "[UNUSED_TOKEN_43]",
735
- "lstrip": false,
736
- "normalized": false,
737
- "rstrip": false,
738
- "single_word": false,
739
- "special": false
740
- },
741
- "92441": {
742
- "content": "[UNUSED_TOKEN_44]",
743
- "lstrip": false,
744
- "normalized": false,
745
- "rstrip": false,
746
- "single_word": false,
747
- "special": false
748
- },
749
- "92442": {
750
- "content": "[UNUSED_TOKEN_45]",
751
- "lstrip": false,
752
- "normalized": false,
753
- "rstrip": false,
754
- "single_word": false,
755
- "special": false
756
- },
757
- "92443": {
758
- "content": "[UNUSED_TOKEN_46]",
759
- "lstrip": false,
760
- "normalized": false,
761
- "rstrip": false,
762
- "single_word": false,
763
- "special": false
764
- },
765
- "92444": {
766
- "content": "[UNUSED_TOKEN_47]",
767
- "lstrip": false,
768
- "normalized": false,
769
- "rstrip": false,
770
- "single_word": false,
771
- "special": false
772
- },
773
- "92445": {
774
- "content": "[UNUSED_TOKEN_48]",
775
- "lstrip": false,
776
- "normalized": false,
777
- "rstrip": false,
778
- "single_word": false,
779
- "special": false
780
- },
781
- "92446": {
782
- "content": "[UNUSED_TOKEN_49]",
783
- "lstrip": false,
784
- "normalized": false,
785
- "rstrip": false,
786
- "single_word": false,
787
- "special": false
788
- },
789
- "92447": {
790
- "content": "[UNUSED_TOKEN_50]",
791
- "lstrip": false,
792
- "normalized": false,
793
- "rstrip": false,
794
- "single_word": false,
795
- "special": false
796
- },
797
- "92448": {
798
- "content": "[UNUSED_TOKEN_51]",
799
- "lstrip": false,
800
- "normalized": false,
801
- "rstrip": false,
802
- "single_word": false,
803
- "special": false
804
- },
805
- "92449": {
806
- "content": "[UNUSED_TOKEN_52]",
807
- "lstrip": false,
808
- "normalized": false,
809
- "rstrip": false,
810
- "single_word": false,
811
- "special": false
812
- },
813
- "92450": {
814
- "content": "[UNUSED_TOKEN_53]",
815
- "lstrip": false,
816
- "normalized": false,
817
- "rstrip": false,
818
- "single_word": false,
819
- "special": false
820
- },
821
- "92451": {
822
- "content": "[UNUSED_TOKEN_54]",
823
- "lstrip": false,
824
- "normalized": false,
825
- "rstrip": false,
826
- "single_word": false,
827
- "special": false
828
- },
829
- "92452": {
830
- "content": "[UNUSED_TOKEN_55]",
831
- "lstrip": false,
832
- "normalized": false,
833
- "rstrip": false,
834
- "single_word": false,
835
- "special": false
836
- },
837
- "92453": {
838
- "content": "[UNUSED_TOKEN_56]",
839
- "lstrip": false,
840
- "normalized": false,
841
- "rstrip": false,
842
- "single_word": false,
843
- "special": false
844
- },
845
- "92454": {
846
- "content": "[UNUSED_TOKEN_57]",
847
- "lstrip": false,
848
- "normalized": false,
849
- "rstrip": false,
850
- "single_word": false,
851
- "special": false
852
- },
853
- "92455": {
854
- "content": "[UNUSED_TOKEN_58]",
855
- "lstrip": false,
856
- "normalized": false,
857
- "rstrip": false,
858
- "single_word": false,
859
- "special": false
860
- },
861
- "92456": {
862
- "content": "[UNUSED_TOKEN_59]",
863
- "lstrip": false,
864
- "normalized": false,
865
- "rstrip": false,
866
- "single_word": false,
867
- "special": false
868
- },
869
- "92457": {
870
- "content": "[UNUSED_TOKEN_60]",
871
- "lstrip": false,
872
- "normalized": false,
873
- "rstrip": false,
874
- "single_word": false,
875
- "special": false
876
- },
877
- "92458": {
878
- "content": "[UNUSED_TOKEN_61]",
879
- "lstrip": false,
880
- "normalized": false,
881
- "rstrip": false,
882
- "single_word": false,
883
- "special": false
884
- },
885
- "92459": {
886
- "content": "[UNUSED_TOKEN_62]",
887
- "lstrip": false,
888
- "normalized": false,
889
- "rstrip": false,
890
- "single_word": false,
891
- "special": false
892
- },
893
- "92460": {
894
- "content": "[UNUSED_TOKEN_63]",
895
- "lstrip": false,
896
- "normalized": false,
897
- "rstrip": false,
898
- "single_word": false,
899
- "special": false
900
- },
901
- "92461": {
902
- "content": "[UNUSED_TOKEN_64]",
903
- "lstrip": false,
904
- "normalized": false,
905
- "rstrip": false,
906
- "single_word": false,
907
- "special": false
908
- },
909
- "92462": {
910
- "content": "[UNUSED_TOKEN_65]",
911
- "lstrip": false,
912
- "normalized": false,
913
- "rstrip": false,
914
- "single_word": false,
915
- "special": false
916
- },
917
- "92463": {
918
- "content": "[UNUSED_TOKEN_66]",
919
- "lstrip": false,
920
- "normalized": false,
921
- "rstrip": false,
922
- "single_word": false,
923
- "special": false
924
- },
925
- "92464": {
926
- "content": "[UNUSED_TOKEN_67]",
927
- "lstrip": false,
928
- "normalized": false,
929
- "rstrip": false,
930
- "single_word": false,
931
- "special": false
932
- },
933
- "92465": {
934
- "content": "[UNUSED_TOKEN_68]",
935
- "lstrip": false,
936
- "normalized": false,
937
- "rstrip": false,
938
- "single_word": false,
939
- "special": false
940
- },
941
- "92466": {
942
- "content": "[UNUSED_TOKEN_69]",
943
- "lstrip": false,
944
- "normalized": false,
945
- "rstrip": false,
946
- "single_word": false,
947
- "special": false
948
- },
949
- "92467": {
950
- "content": "[UNUSED_TOKEN_70]",
951
- "lstrip": false,
952
- "normalized": false,
953
- "rstrip": false,
954
- "single_word": false,
955
- "special": false
956
- },
957
- "92468": {
958
- "content": "[UNUSED_TOKEN_71]",
959
- "lstrip": false,
960
- "normalized": false,
961
- "rstrip": false,
962
- "single_word": false,
963
- "special": false
964
- },
965
- "92469": {
966
- "content": "[UNUSED_TOKEN_72]",
967
- "lstrip": false,
968
- "normalized": false,
969
- "rstrip": false,
970
- "single_word": false,
971
- "special": false
972
- },
973
- "92470": {
974
- "content": "[UNUSED_TOKEN_73]",
975
- "lstrip": false,
976
- "normalized": false,
977
- "rstrip": false,
978
- "single_word": false,
979
- "special": false
980
- },
981
- "92471": {
982
- "content": "[UNUSED_TOKEN_74]",
983
- "lstrip": false,
984
- "normalized": false,
985
- "rstrip": false,
986
- "single_word": false,
987
- "special": false
988
- },
989
- "92472": {
990
- "content": "[UNUSED_TOKEN_75]",
991
- "lstrip": false,
992
- "normalized": false,
993
- "rstrip": false,
994
- "single_word": false,
995
- "special": false
996
- },
997
- "92473": {
998
- "content": "[UNUSED_TOKEN_76]",
999
- "lstrip": false,
1000
- "normalized": false,
1001
- "rstrip": false,
1002
- "single_word": false,
1003
- "special": false
1004
- },
1005
- "92474": {
1006
- "content": "[UNUSED_TOKEN_77]",
1007
- "lstrip": false,
1008
- "normalized": false,
1009
- "rstrip": false,
1010
- "single_word": false,
1011
- "special": false
1012
- },
1013
- "92475": {
1014
- "content": "[UNUSED_TOKEN_78]",
1015
- "lstrip": false,
1016
- "normalized": false,
1017
- "rstrip": false,
1018
- "single_word": false,
1019
- "special": false
1020
- },
1021
- "92476": {
1022
- "content": "[UNUSED_TOKEN_79]",
1023
- "lstrip": false,
1024
- "normalized": false,
1025
- "rstrip": false,
1026
- "single_word": false,
1027
- "special": false
1028
- },
1029
- "92477": {
1030
- "content": "[UNUSED_TOKEN_80]",
1031
- "lstrip": false,
1032
- "normalized": false,
1033
- "rstrip": false,
1034
- "single_word": false,
1035
- "special": false
1036
- },
1037
- "92478": {
1038
- "content": "[UNUSED_TOKEN_81]",
1039
- "lstrip": false,
1040
- "normalized": false,
1041
- "rstrip": false,
1042
- "single_word": false,
1043
- "special": false
1044
- },
1045
- "92479": {
1046
- "content": "[UNUSED_TOKEN_82]",
1047
- "lstrip": false,
1048
- "normalized": false,
1049
- "rstrip": false,
1050
- "single_word": false,
1051
- "special": false
1052
- },
1053
- "92480": {
1054
- "content": "[UNUSED_TOKEN_83]",
1055
- "lstrip": false,
1056
- "normalized": false,
1057
- "rstrip": false,
1058
- "single_word": false,
1059
- "special": false
1060
- },
1061
- "92481": {
1062
- "content": "[UNUSED_TOKEN_84]",
1063
- "lstrip": false,
1064
- "normalized": false,
1065
- "rstrip": false,
1066
- "single_word": false,
1067
- "special": false
1068
- },
1069
- "92482": {
1070
- "content": "[UNUSED_TOKEN_85]",
1071
- "lstrip": false,
1072
- "normalized": false,
1073
- "rstrip": false,
1074
- "single_word": false,
1075
- "special": false
1076
- },
1077
- "92483": {
1078
- "content": "[UNUSED_TOKEN_86]",
1079
- "lstrip": false,
1080
- "normalized": false,
1081
- "rstrip": false,
1082
- "single_word": false,
1083
- "special": false
1084
- },
1085
- "92484": {
1086
- "content": "[UNUSED_TOKEN_87]",
1087
- "lstrip": false,
1088
- "normalized": false,
1089
- "rstrip": false,
1090
- "single_word": false,
1091
- "special": false
1092
- },
1093
- "92485": {
1094
- "content": "[UNUSED_TOKEN_88]",
1095
- "lstrip": false,
1096
- "normalized": false,
1097
- "rstrip": false,
1098
- "single_word": false,
1099
- "special": false
1100
- },
1101
- "92486": {
1102
- "content": "[UNUSED_TOKEN_89]",
1103
- "lstrip": false,
1104
- "normalized": false,
1105
- "rstrip": false,
1106
- "single_word": false,
1107
- "special": false
1108
- },
1109
- "92487": {
1110
- "content": "[UNUSED_TOKEN_90]",
1111
- "lstrip": false,
1112
- "normalized": false,
1113
- "rstrip": false,
1114
- "single_word": false,
1115
- "special": false
1116
- },
1117
- "92488": {
1118
- "content": "[UNUSED_TOKEN_91]",
1119
- "lstrip": false,
1120
- "normalized": false,
1121
- "rstrip": false,
1122
- "single_word": false,
1123
- "special": false
1124
- },
1125
- "92489": {
1126
- "content": "[UNUSED_TOKEN_92]",
1127
- "lstrip": false,
1128
- "normalized": false,
1129
- "rstrip": false,
1130
- "single_word": false,
1131
- "special": false
1132
- },
1133
- "92490": {
1134
- "content": "[UNUSED_TOKEN_93]",
1135
- "lstrip": false,
1136
- "normalized": false,
1137
- "rstrip": false,
1138
- "single_word": false,
1139
- "special": false
1140
- },
1141
- "92491": {
1142
- "content": "[UNUSED_TOKEN_94]",
1143
- "lstrip": false,
1144
- "normalized": false,
1145
- "rstrip": false,
1146
- "single_word": false,
1147
- "special": false
1148
- },
1149
- "92492": {
1150
- "content": "[UNUSED_TOKEN_95]",
1151
- "lstrip": false,
1152
- "normalized": false,
1153
- "rstrip": false,
1154
- "single_word": false,
1155
- "special": false
1156
- },
1157
- "92493": {
1158
- "content": "[UNUSED_TOKEN_96]",
1159
- "lstrip": false,
1160
- "normalized": false,
1161
- "rstrip": false,
1162
- "single_word": false,
1163
- "special": false
1164
- },
1165
- "92494": {
1166
- "content": "[UNUSED_TOKEN_97]",
1167
- "lstrip": false,
1168
- "normalized": false,
1169
- "rstrip": false,
1170
- "single_word": false,
1171
- "special": false
1172
- },
1173
- "92495": {
1174
- "content": "[UNUSED_TOKEN_98]",
1175
- "lstrip": false,
1176
- "normalized": false,
1177
- "rstrip": false,
1178
- "single_word": false,
1179
- "special": false
1180
- },
1181
- "92496": {
1182
- "content": "[UNUSED_TOKEN_99]",
1183
- "lstrip": false,
1184
- "normalized": false,
1185
- "rstrip": false,
1186
- "single_word": false,
1187
- "special": false
1188
- },
1189
- "92497": {
1190
- "content": "[UNUSED_TOKEN_100]",
1191
- "lstrip": false,
1192
- "normalized": false,
1193
- "rstrip": false,
1194
- "single_word": false,
1195
- "special": false
1196
- },
1197
- "92498": {
1198
- "content": "[UNUSED_TOKEN_101]",
1199
- "lstrip": false,
1200
- "normalized": false,
1201
- "rstrip": false,
1202
- "single_word": false,
1203
- "special": false
1204
- },
1205
- "92499": {
1206
- "content": "[UNUSED_TOKEN_102]",
1207
- "lstrip": false,
1208
- "normalized": false,
1209
- "rstrip": false,
1210
- "single_word": false,
1211
- "special": false
1212
- },
1213
- "92500": {
1214
- "content": "[UNUSED_TOKEN_103]",
1215
- "lstrip": false,
1216
- "normalized": false,
1217
- "rstrip": false,
1218
- "single_word": false,
1219
- "special": false
1220
- },
1221
- "92501": {
1222
- "content": "[UNUSED_TOKEN_104]",
1223
- "lstrip": false,
1224
- "normalized": false,
1225
- "rstrip": false,
1226
- "single_word": false,
1227
- "special": false
1228
- },
1229
- "92502": {
1230
- "content": "[UNUSED_TOKEN_105]",
1231
- "lstrip": false,
1232
- "normalized": false,
1233
- "rstrip": false,
1234
- "single_word": false,
1235
- "special": false
1236
- },
1237
- "92503": {
1238
- "content": "[UNUSED_TOKEN_106]",
1239
- "lstrip": false,
1240
- "normalized": false,
1241
- "rstrip": false,
1242
- "single_word": false,
1243
- "special": false
1244
- },
1245
- "92504": {
1246
- "content": "[UNUSED_TOKEN_107]",
1247
- "lstrip": false,
1248
- "normalized": false,
1249
- "rstrip": false,
1250
- "single_word": false,
1251
- "special": false
1252
- },
1253
- "92505": {
1254
- "content": "[UNUSED_TOKEN_108]",
1255
- "lstrip": false,
1256
- "normalized": false,
1257
- "rstrip": false,
1258
- "single_word": false,
1259
- "special": false
1260
- },
1261
- "92506": {
1262
- "content": "[UNUSED_TOKEN_109]",
1263
- "lstrip": false,
1264
- "normalized": false,
1265
- "rstrip": false,
1266
- "single_word": false,
1267
- "special": false
1268
- },
1269
- "92507": {
1270
- "content": "[UNUSED_TOKEN_110]",
1271
- "lstrip": false,
1272
- "normalized": false,
1273
- "rstrip": false,
1274
- "single_word": false,
1275
- "special": false
1276
- },
1277
- "92508": {
1278
- "content": "[UNUSED_TOKEN_111]",
1279
- "lstrip": false,
1280
- "normalized": false,
1281
- "rstrip": false,
1282
- "single_word": false,
1283
- "special": false
1284
- },
1285
- "92509": {
1286
- "content": "[UNUSED_TOKEN_112]",
1287
- "lstrip": false,
1288
- "normalized": false,
1289
- "rstrip": false,
1290
- "single_word": false,
1291
- "special": false
1292
- },
1293
- "92510": {
1294
- "content": "[UNUSED_TOKEN_113]",
1295
- "lstrip": false,
1296
- "normalized": false,
1297
- "rstrip": false,
1298
- "single_word": false,
1299
- "special": false
1300
- },
1301
- "92511": {
1302
- "content": "[UNUSED_TOKEN_114]",
1303
- "lstrip": false,
1304
- "normalized": false,
1305
- "rstrip": false,
1306
- "single_word": false,
1307
- "special": false
1308
- },
1309
- "92512": {
1310
- "content": "[UNUSED_TOKEN_115]",
1311
- "lstrip": false,
1312
- "normalized": false,
1313
- "rstrip": false,
1314
- "single_word": false,
1315
- "special": false
1316
- },
1317
- "92513": {
1318
- "content": "[UNUSED_TOKEN_116]",
1319
- "lstrip": false,
1320
- "normalized": false,
1321
- "rstrip": false,
1322
- "single_word": false,
1323
- "special": false
1324
- },
1325
- "92514": {
1326
- "content": "[UNUSED_TOKEN_117]",
1327
- "lstrip": false,
1328
- "normalized": false,
1329
- "rstrip": false,
1330
- "single_word": false,
1331
- "special": false
1332
- },
1333
- "92515": {
1334
- "content": "[UNUSED_TOKEN_118]",
1335
- "lstrip": false,
1336
- "normalized": false,
1337
- "rstrip": false,
1338
- "single_word": false,
1339
- "special": false
1340
- },
1341
- "92516": {
1342
- "content": "[UNUSED_TOKEN_119]",
1343
- "lstrip": false,
1344
- "normalized": false,
1345
- "rstrip": false,
1346
- "single_word": false,
1347
- "special": false
1348
- },
1349
- "92517": {
1350
- "content": "[UNUSED_TOKEN_120]",
1351
- "lstrip": false,
1352
- "normalized": false,
1353
- "rstrip": false,
1354
- "single_word": false,
1355
- "special": false
1356
- },
1357
- "92518": {
1358
- "content": "[UNUSED_TOKEN_121]",
1359
- "lstrip": false,
1360
- "normalized": false,
1361
- "rstrip": false,
1362
- "single_word": false,
1363
- "special": false
1364
- },
1365
- "92519": {
1366
- "content": "[UNUSED_TOKEN_122]",
1367
- "lstrip": false,
1368
- "normalized": false,
1369
- "rstrip": false,
1370
- "single_word": false,
1371
- "special": false
1372
- },
1373
- "92520": {
1374
- "content": "[UNUSED_TOKEN_123]",
1375
- "lstrip": false,
1376
- "normalized": false,
1377
- "rstrip": false,
1378
- "single_word": false,
1379
- "special": false
1380
- },
1381
- "92521": {
1382
- "content": "[UNUSED_TOKEN_124]",
1383
- "lstrip": false,
1384
- "normalized": false,
1385
- "rstrip": false,
1386
- "single_word": false,
1387
- "special": false
1388
- },
1389
- "92522": {
1390
- "content": "[UNUSED_TOKEN_125]",
1391
- "lstrip": false,
1392
- "normalized": false,
1393
- "rstrip": false,
1394
- "single_word": false,
1395
- "special": false
1396
- },
1397
- "92523": {
1398
- "content": "[UNUSED_TOKEN_126]",
1399
- "lstrip": false,
1400
- "normalized": false,
1401
- "rstrip": false,
1402
- "single_word": false,
1403
- "special": false
1404
- },
1405
- "92524": {
1406
- "content": "[UNUSED_TOKEN_127]",
1407
- "lstrip": false,
1408
- "normalized": false,
1409
- "rstrip": false,
1410
- "single_word": false,
1411
- "special": false
1412
- },
1413
- "92525": {
1414
- "content": "[UNUSED_TOKEN_128]",
1415
- "lstrip": false,
1416
- "normalized": false,
1417
- "rstrip": false,
1418
- "single_word": false,
1419
- "special": false
1420
- },
1421
- "92526": {
1422
- "content": "[UNUSED_TOKEN_129]",
1423
- "lstrip": false,
1424
- "normalized": false,
1425
- "rstrip": false,
1426
- "single_word": false,
1427
- "special": false
1428
- },
1429
- "92527": {
1430
- "content": "[UNUSED_TOKEN_130]",
1431
- "lstrip": false,
1432
- "normalized": false,
1433
- "rstrip": false,
1434
- "single_word": false,
1435
- "special": false
1436
- },
1437
- "92528": {
1438
- "content": "[UNUSED_TOKEN_131]",
1439
- "lstrip": false,
1440
- "normalized": false,
1441
- "rstrip": false,
1442
- "single_word": false,
1443
- "special": false
1444
- },
1445
- "92529": {
1446
- "content": "[UNUSED_TOKEN_132]",
1447
- "lstrip": false,
1448
- "normalized": false,
1449
- "rstrip": false,
1450
- "single_word": false,
1451
- "special": false
1452
- },
1453
- "92530": {
1454
- "content": "[UNUSED_TOKEN_133]",
1455
- "lstrip": false,
1456
- "normalized": false,
1457
- "rstrip": false,
1458
- "single_word": false,
1459
- "special": false
1460
- },
1461
- "92531": {
1462
- "content": "[UNUSED_TOKEN_134]",
1463
- "lstrip": false,
1464
- "normalized": false,
1465
- "rstrip": false,
1466
- "single_word": false,
1467
- "special": false
1468
- },
1469
- "92532": {
1470
- "content": "[UNUSED_TOKEN_135]",
1471
- "lstrip": false,
1472
- "normalized": false,
1473
- "rstrip": false,
1474
- "single_word": false,
1475
- "special": false
1476
- },
1477
- "92533": {
1478
- "content": "[UNUSED_TOKEN_136]",
1479
- "lstrip": false,
1480
- "normalized": false,
1481
- "rstrip": false,
1482
- "single_word": false,
1483
- "special": false
1484
- },
1485
- "92534": {
1486
- "content": "[UNUSED_TOKEN_137]",
1487
- "lstrip": false,
1488
- "normalized": false,
1489
- "rstrip": false,
1490
- "single_word": false,
1491
- "special": false
1492
- },
1493
- "92535": {
1494
- "content": "[UNUSED_TOKEN_138]",
1495
- "lstrip": false,
1496
- "normalized": false,
1497
- "rstrip": false,
1498
- "single_word": false,
1499
- "special": false
1500
- },
1501
- "92536": {
1502
- "content": "[UNUSED_TOKEN_139]",
1503
- "lstrip": false,
1504
- "normalized": false,
1505
- "rstrip": false,
1506
- "single_word": false,
1507
- "special": false
1508
- },
1509
- "92537": {
1510
- "content": "[UNUSED_TOKEN_140]",
1511
- "lstrip": false,
1512
- "normalized": false,
1513
- "rstrip": false,
1514
- "single_word": false,
1515
- "special": false
1516
- },
1517
- "92538": {
1518
- "content": "<|plugin|>",
1519
- "lstrip": false,
1520
- "normalized": false,
1521
- "rstrip": false,
1522
- "single_word": false,
1523
- "special": true
1524
- },
1525
- "92539": {
1526
- "content": "<|interpreter|>",
1527
- "lstrip": false,
1528
- "normalized": false,
1529
- "rstrip": false,
1530
- "single_word": false,
1531
- "special": true
1532
- },
1533
- "92540": {
1534
- "content": "<|action_end|>",
1535
- "lstrip": false,
1536
- "normalized": false,
1537
- "rstrip": false,
1538
- "single_word": false,
1539
- "special": true
1540
- },
1541
- "92541": {
1542
- "content": "<|action_start|>",
1543
- "lstrip": false,
1544
- "normalized": false,
1545
- "rstrip": false,
1546
- "single_word": false,
1547
- "special": true
1548
- },
1549
- "92542": {
1550
- "content": "<|im_end|>",
1551
- "lstrip": false,
1552
- "normalized": false,
1553
- "rstrip": false,
1554
- "single_word": false,
1555
- "special": true
1556
- },
1557
- "92543": {
1558
- "content": "<|im_start|>",
1559
- "lstrip": false,
1560
- "normalized": false,
1561
- "rstrip": false,
1562
- "single_word": false,
1563
- "special": true
1564
- },
1565
- "92544": {
1566
- "content": "[UNUSED_TOKEN_141]",
1567
- "lstrip": false,
1568
- "normalized": false,
1569
- "rstrip": false,
1570
- "single_word": false,
1571
- "special": false
1572
- },
1573
- "92545": {
1574
- "content": "[UNUSED_TOKEN_142]",
1575
- "lstrip": false,
1576
- "normalized": false,
1577
- "rstrip": false,
1578
- "single_word": false,
1579
- "special": false
1580
- },
1581
- "92546": {
1582
- "content": "[UNUSED_TOKEN_143]",
1583
- "lstrip": false,
1584
- "normalized": false,
1585
- "rstrip": false,
1586
- "single_word": false,
1587
- "special": false
1588
- },
1589
- "92547": {
1590
- "content": "[UNUSED_TOKEN_144]",
1591
- "lstrip": false,
1592
- "normalized": false,
1593
- "rstrip": false,
1594
- "single_word": false,
1595
- "special": false
1596
- },
1597
- "92548": {
1598
- "content": "[UNUSED_TOKEN_145]",
1599
- "lstrip": false,
1600
- "normalized": false,
1601
- "rstrip": false,
1602
- "single_word": false,
1603
- "special": false
1604
- },
1605
- "92549": {
1606
- "content": "[UNUSED_TOKEN_146]",
1607
- "lstrip": false,
1608
- "normalized": false,
1609
- "rstrip": false,
1610
- "single_word": false,
1611
- "special": false
1612
- }
1613
- },
1614
- "additional_special_tokens": [
1615
- "<|im_start|>",
1616
- "<|im_end|>",
1617
- "<|action_start|>",
1618
- "<|action_end|>",
1619
- "<|interpreter|>",
1620
- "<|plugin|>"
1621
- ],
1622
- "auto_map": {
1623
- "AutoTokenizer": [
1624
- "tokenization_internlm2.InternLM2Tokenizer",
1625
- "tokenization_internlm2_fast.InternLM2TokenizerFast"
1626
- ]
1627
- },
1628
- "bos_token": "<s>",
1629
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
1630
- "clean_up_tokenization_spaces": false,
1631
- "decode_with_prefix_space": false,
1632
- "eos_token": "<|im_end|>",
1633
- "model_max_length": 1000000000000000019884624838656,
1634
- "pad_token": "</s>",
1635
- "padding_side": "right",
1636
- "sp_model_kwargs": null,
1637
- "split_special_tokens": false,
1638
- "tokenizer_class": "InternLM2Tokenizer",
1639
- "unk_token": "<unk>"
1640
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/trainer_state.json DELETED
@@ -1,126 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.9982222222222221,
5
- "eval_steps": 562,
6
- "global_step": 1124,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.17777777777777778,
13
- "grad_norm": 4.040336608886719,
14
- "learning_rate": 2.958579881656805e-05,
15
- "loss": 0.4641,
16
- "step": 100
17
- },
18
- {
19
- "epoch": 0.35555555555555557,
20
- "grad_norm": 3.9950191974639893,
21
- "learning_rate": 5.91715976331361e-05,
22
- "loss": 0.3704,
23
- "step": 200
24
- },
25
- {
26
- "epoch": 0.5333333333333333,
27
- "grad_norm": 3.9038829803466797,
28
- "learning_rate": 8.875739644970414e-05,
29
- "loss": 0.3751,
30
- "step": 300
31
- },
32
- {
33
- "epoch": 0.7111111111111111,
34
- "grad_norm": 3.4998044967651367,
35
- "learning_rate": 9.989699867437137e-05,
36
- "loss": 0.3928,
37
- "step": 400
38
- },
39
- {
40
- "epoch": 0.8888888888888888,
41
- "grad_norm": 9.73261833190918,
42
- "learning_rate": 9.92981892269398e-05,
43
- "loss": 0.3497,
44
- "step": 500
45
- },
46
- {
47
- "epoch": 0.9991111111111111,
48
- "eval_loss": 0.6006748080253601,
49
- "eval_runtime": 411.5999,
50
- "eval_samples_per_second": 1.215,
51
- "eval_steps_per_second": 1.215,
52
- "step": 562
53
- },
54
- {
55
- "epoch": 1.0666666666666667,
56
- "grad_norm": 2.8831710815429688,
57
- "learning_rate": 9.817128546774103e-05,
58
- "loss": 0.3383,
59
- "step": 600
60
- },
61
- {
62
- "epoch": 1.2444444444444445,
63
- "grad_norm": 6.632827281951904,
64
- "learning_rate": 9.652835906663704e-05,
65
- "loss": 0.3167,
66
- "step": 700
67
- },
68
- {
69
- "epoch": 1.4222222222222223,
70
- "grad_norm": 6.977548122406006,
71
- "learning_rate": 9.438700945477697e-05,
72
- "loss": 0.3165,
73
- "step": 800
74
- },
75
- {
76
- "epoch": 1.6,
77
- "grad_norm": 10.037060737609863,
78
- "learning_rate": 9.177017529516772e-05,
79
- "loss": 0.2927,
80
- "step": 900
81
- },
82
- {
83
- "epoch": 1.7777777777777777,
84
- "grad_norm": 6.976019859313965,
85
- "learning_rate": 8.870588875808164e-05,
86
- "loss": 0.3062,
87
- "step": 1000
88
- },
89
- {
90
- "epoch": 1.9555555555555557,
91
- "grad_norm": 2.106227159500122,
92
- "learning_rate": 8.522697523356319e-05,
93
- "loss": 0.2678,
94
- "step": 1100
95
- },
96
- {
97
- "epoch": 1.9982222222222221,
98
- "eval_loss": 0.3569962680339813,
99
- "eval_runtime": 367.3155,
100
- "eval_samples_per_second": 1.361,
101
- "eval_steps_per_second": 1.361,
102
- "step": 1124
103
- }
104
- ],
105
- "logging_steps": 100,
106
- "max_steps": 3372,
107
- "num_input_tokens_seen": 0,
108
- "num_train_epochs": 6,
109
- "save_steps": 562,
110
- "stateful_callbacks": {
111
- "TrainerControl": {
112
- "args": {
113
- "should_epoch_stop": false,
114
- "should_evaluate": false,
115
- "should_log": false,
116
- "should_save": true,
117
- "should_training_stop": false
118
- },
119
- "attributes": {}
120
- }
121
- },
122
- "total_flos": 1.3557561305392742e+17,
123
- "train_batch_size": 1,
124
- "trial_name": null,
125
- "trial_params": null
126
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1124/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aee0526e63b02d5d5f300a2e1dfcfcc13e168aae73493f8d596712a07178876b
3
- size 5304
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/README.md DELETED
@@ -1,202 +0,0 @@
1
- ---
2
- library_name: peft
3
- base_model: internlm/internlm2_5-7b-chat-1m
4
- ---
5
-
6
- # Model Card for Model ID
7
-
8
- <!-- Provide a quick summary of what the model is/does. -->
9
-
10
-
11
-
12
- ## Model Details
13
-
14
- ### Model Description
15
-
16
- <!-- Provide a longer summary of what this model is. -->
17
-
18
-
19
-
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
-
28
- ### Model Sources [optional]
29
-
30
- <!-- Provide the basic links for the model. -->
31
-
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
-
36
- ## Uses
37
-
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
-
40
- ### Direct Use
41
-
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
-
44
- [More Information Needed]
45
-
46
- ### Downstream Use [optional]
47
-
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
-
50
- [More Information Needed]
51
-
52
- ### Out-of-Scope Use
53
-
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
-
56
- [More Information Needed]
57
-
58
- ## Bias, Risks, and Limitations
59
-
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
-
62
- [More Information Needed]
63
-
64
- ### Recommendations
65
-
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
-
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
-
70
- ## How to Get Started with the Model
71
-
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
75
-
76
- ## Training Details
77
-
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
- ### Framework versions
201
-
202
- - PEFT 0.11.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/adapter_config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
5
- "bias": "none",
6
- "fan_in_fan_out": false,
7
- "inference_mode": true,
8
- "init_lora_weights": true,
9
- "layer_replication": null,
10
- "layers_pattern": null,
11
- "layers_to_transform": null,
12
- "loftq_config": {},
13
- "lora_alpha": 16,
14
- "lora_dropout": 0.0,
15
- "megatron_config": null,
16
- "megatron_core": "megatron.core",
17
- "modules_to_save": null,
18
- "peft_type": "LORA",
19
- "r": 8,
20
- "rank_pattern": {},
21
- "revision": null,
22
- "target_modules": [
23
- "w3",
24
- "wo",
25
- "w1",
26
- "wqkv",
27
- "w2"
28
- ],
29
- "task_type": "CAUSAL_LM",
30
- "use_dora": false,
31
- "use_rslora": false
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2db59fc1969809c41246edf4ce25225643730120516d17662a7d1bb14d7cfe39
3
- size 75539712
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/added_tokens.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "[UNUSED_TOKEN_141]": 92544,
3
- "[UNUSED_TOKEN_142]": 92545,
4
- "[UNUSED_TOKEN_143]": 92546,
5
- "[UNUSED_TOKEN_144]": 92547,
6
- "[UNUSED_TOKEN_145]": 92548,
7
- "[UNUSED_TOKEN_146]": 92549
8
- }
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1646f40df6fb41e1f3d0b5332c11e3ae9119600089df0a51ade6a7157c4b212
3
- size 151264058
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
3
- size 14244
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:905747c81bda26664cb2ffce8f8ce9044aa6bf92fd1cd473dd32e646b88e5e1a
3
- size 1064
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/special_tokens_map.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|action_start|>",
6
- "<|action_end|>",
7
- "<|interpreter|>",
8
- "<|plugin|>"
9
- ],
10
- "bos_token": {
11
- "content": "<s>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "</s>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- "unk_token": {
32
- "content": "<unk>",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- }
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenization_internlm2.py DELETED
@@ -1,236 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization classes for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, List, Optional, Tuple
22
-
23
- import sentencepiece as spm
24
- from transformers.tokenization_utils import PreTrainedTokenizer
25
- from transformers.utils import logging
26
-
27
- logger = logging.get_logger(__name__)
28
-
29
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
30
-
31
- PRETRAINED_VOCAB_FILES_MAP = {}
32
-
33
-
34
- # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
35
- class InternLM2Tokenizer(PreTrainedTokenizer):
36
- """
37
- Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
-
39
- Args:
40
- vocab_file (`str`):
41
- Path to the vocabulary file.
42
- """
43
-
44
- vocab_files_names = VOCAB_FILES_NAMES
45
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
46
- model_input_names = ["input_ids", "attention_mask"]
47
- _auto_class = "AutoTokenizer"
48
-
49
- def __init__(
50
- self,
51
- vocab_file,
52
- unk_token="<unk>",
53
- bos_token="<s>",
54
- eos_token="</s>",
55
- pad_token="</s>",
56
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
57
- add_bos_token=True,
58
- add_eos_token=False,
59
- decode_with_prefix_space=False,
60
- clean_up_tokenization_spaces=False,
61
- **kwargs,
62
- ):
63
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
64
- self.vocab_file = vocab_file
65
- self.add_bos_token = add_bos_token
66
- self.add_eos_token = add_eos_token
67
- self.decode_with_prefix_space = decode_with_prefix_space
68
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
- self.sp_model.Load(vocab_file)
70
- self._no_prefix_space_tokens = None
71
- super().__init__(
72
- bos_token=bos_token,
73
- eos_token=eos_token,
74
- unk_token=unk_token,
75
- pad_token=pad_token,
76
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
77
- **kwargs,
78
- )
79
-
80
- @property
81
- def no_prefix_space_tokens(self):
82
- if self._no_prefix_space_tokens is None:
83
- vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
- self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
85
- return self._no_prefix_space_tokens
86
-
87
- @property
88
- def vocab_size(self):
89
- """Returns vocab size"""
90
- return self.sp_model.get_piece_size()
91
-
92
- @property
93
- def bos_token_id(self) -> Optional[int]:
94
- return self.sp_model.bos_id()
95
-
96
- @property
97
- def eos_token_id(self) -> Optional[int]:
98
- return self.sp_model.eos_id()
99
-
100
- def get_vocab(self):
101
- """Returns vocab as a dict"""
102
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
103
- vocab.update(self.added_tokens_encoder)
104
- return vocab
105
-
106
- def _tokenize(self, text):
107
- """Returns a tokenized string."""
108
- return self.sp_model.encode(text, out_type=str)
109
-
110
- def _convert_token_to_id(self, token):
111
- """Converts a token (str) in an id using the vocab."""
112
- return self.sp_model.piece_to_id(token)
113
-
114
- def _convert_id_to_token(self, index):
115
- """Converts an index (integer) in a token (str) using the vocab."""
116
- token = self.sp_model.IdToPiece(index)
117
- return token
118
-
119
- def _maybe_add_prefix_space(self, tokens, decoded):
120
- if tokens and tokens[0] not in self.no_prefix_space_tokens:
121
- return " " + decoded
122
- else:
123
- return decoded
124
-
125
- def convert_tokens_to_string(self, tokens):
126
- """Converts a sequence of tokens (string) in a single string."""
127
- current_sub_tokens = []
128
- out_string = ""
129
- prev_is_special = False
130
- for token in tokens:
131
- # make sure that special tokens are not decoded using sentencepiece model
132
- if token in self.all_special_tokens:
133
- if not prev_is_special:
134
- out_string += " "
135
- out_string += self.sp_model.decode(current_sub_tokens) + token
136
- prev_is_special = True
137
- current_sub_tokens = []
138
- else:
139
- current_sub_tokens.append(token)
140
- prev_is_special = False
141
- out_string += self.sp_model.decode(current_sub_tokens)
142
- out_string = self.clean_up_tokenization(out_string)
143
- out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
144
- return out_string[1:]
145
-
146
- def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
- """
148
- Save the vocabulary and special tokens file to a directory.
149
-
150
- Args:
151
- save_directory (`str`):
152
- The directory in which to save the vocabulary.
153
-
154
- Returns:
155
- `Tuple(str)`: Paths to the files saved.
156
- """
157
- if not os.path.isdir(save_directory):
158
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
- return
160
- out_vocab_file = os.path.join(
161
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
- )
163
-
164
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
165
- copyfile(self.vocab_file, out_vocab_file)
166
- elif not os.path.isfile(self.vocab_file):
167
- with open(out_vocab_file, "wb") as fi:
168
- content_spiece_model = self.sp_model.serialized_model_proto()
169
- fi.write(content_spiece_model)
170
-
171
- return (out_vocab_file,)
172
-
173
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
174
- if self.add_bos_token:
175
- bos_token_ids = [self.bos_token_id]
176
- else:
177
- bos_token_ids = []
178
-
179
- output = bos_token_ids + token_ids_0
180
-
181
- if token_ids_1 is not None:
182
- output = output + token_ids_1
183
-
184
- if self.add_eos_token:
185
- output = output + [self.eos_token_id]
186
-
187
- return output
188
-
189
- def get_special_tokens_mask(
190
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
191
- ) -> List[int]:
192
- """
193
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
194
- special tokens using the tokenizer `prepare_for_model` method.
195
-
196
- Args:
197
- token_ids_0 (`List[int]`):
198
- List of IDs.
199
- token_ids_1 (`List[int]`, *optional*):
200
- Optional second list of IDs for sequence pairs.
201
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
202
- Whether or not the token list is already formatted with special tokens for the model.
203
-
204
- Returns:
205
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
206
- """
207
- if already_has_special_tokens:
208
- return super().get_special_tokens_mask(
209
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
210
- )
211
-
212
- if token_ids_1 is None:
213
- return [1] + ([0] * len(token_ids_0)) + [1]
214
- return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
215
-
216
- def create_token_type_ids_from_sequences(
217
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
218
- ) -> List[int]:
219
- """
220
- Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
221
- use of token type ids, therefore a list of zeros is returned.
222
-
223
- Args:
224
- token_ids_0 (`List[int]`):
225
- List of IDs.
226
- token_ids_1 (`List[int]`, *optional*):
227
- Optional second list of IDs for sequence pairs.
228
-
229
- Returns:
230
- `List[int]`: List of zeros.
231
- """
232
- eos = [self.eos_token_id]
233
-
234
- if token_ids_1 is None:
235
- return len(token_ids_0 + eos) * [0]
236
- return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenization_internlm2_fast.py DELETED
@@ -1,214 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization Fast class for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, Optional, Tuple
22
-
23
- from tokenizers import processors, decoders, Tokenizer, normalizers
24
- from tokenizers.models import BPE
25
-
26
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
27
- from transformers.utils import logging
28
-
29
- from transformers.convert_slow_tokenizer import (
30
- SLOW_TO_FAST_CONVERTERS,
31
- SpmConverter,
32
- SentencePieceExtractor,
33
- )
34
-
35
- from .tokenization_internlm2 import InternLM2Tokenizer
36
-
37
- logger = logging.get_logger(__name__)
38
-
39
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
40
-
41
- # Modified from transformers.convert_slow_tokenizer.LlamaConverter
42
- class InternLM2Converter(SpmConverter):
43
- handle_byte_fallback = True
44
-
45
- def vocab(self, proto):
46
- vocab = [
47
- ("<unk>", 0.0),
48
- ("<s>", 0.0),
49
- ("</s>", 0.0),
50
- ]
51
- vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
52
- return vocab
53
-
54
- def unk_id(self, proto):
55
- unk_id = 0
56
- return unk_id
57
-
58
- def decoder(self, replacement, add_prefix_space):
59
- decoders_sequence = [
60
- decoders.Replace("▁", " "),
61
- decoders.ByteFallback(),
62
- decoders.Fuse(),
63
- ]
64
- if self.proto.normalizer_spec.add_dummy_prefix:
65
- decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
- return decoders.Sequence(decoders_sequence)
67
-
68
- def tokenizer(self, proto):
69
- model_type = proto.trainer_spec.model_type
70
- vocab_scores = self.vocab(proto)
71
- # special tokens
72
- added_tokens = self.original_tokenizer.added_tokens_decoder
73
- for i in range(len(vocab_scores)):
74
- piece, score = vocab_scores[i]
75
- if i in added_tokens:
76
- vocab_scores[i] = (added_tokens[i].content, score)
77
- if model_type == 1:
78
- raise RuntimeError("InternLM2 is supposed to be a BPE model!")
79
-
80
- elif model_type == 2:
81
- _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
82
- bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
83
- tokenizer = Tokenizer(
84
- BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
85
- )
86
- tokenizer.add_special_tokens(
87
- [ added_token for index, added_token in added_tokens.items()]
88
- )
89
- else:
90
- raise Exception(
91
- "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
92
- )
93
-
94
- return tokenizer
95
-
96
- def normalizer(self, proto):
97
- normalizers_list = []
98
- if proto.normalizer_spec.add_dummy_prefix:
99
- normalizers_list.append(normalizers.Prepend(prepend="▁"))
100
- normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
101
- return normalizers.Sequence(normalizers_list)
102
-
103
- def pre_tokenizer(self, replacement, add_prefix_space):
104
- return None
105
-
106
- SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
107
-
108
-
109
- # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
110
- class InternLM2TokenizerFast(PreTrainedTokenizerFast):
111
- vocab_files_names = VOCAB_FILES_NAMES
112
- slow_tokenizer_class = InternLM2Tokenizer
113
- padding_side = "left"
114
- model_input_names = ["input_ids", "attention_mask"]
115
- _auto_class = "AutoTokenizer"
116
-
117
- def __init__(
118
- self,
119
- vocab_file,
120
- unk_token="<unk>",
121
- bos_token="<s>",
122
- eos_token="</s>",
123
- pad_token="</s>",
124
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
125
- add_bos_token=True,
126
- add_eos_token=False,
127
- decode_with_prefix_space=False,
128
- clean_up_tokenization_spaces=False,
129
- **kwargs,
130
- ):
131
- super().__init__(
132
- vocab_file=vocab_file,
133
- unk_token=unk_token,
134
- bos_token=bos_token,
135
- eos_token=eos_token,
136
- pad_token=pad_token,
137
- sp_model_kwargs=sp_model_kwargs,
138
- add_bos_token=add_bos_token,
139
- add_eos_token=add_eos_token,
140
- decode_with_prefix_space=decode_with_prefix_space,
141
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
142
- **kwargs,
143
- )
144
- self._add_bos_token = add_bos_token
145
- self._add_eos_token = add_eos_token
146
- self.update_post_processor()
147
- self.vocab_file = vocab_file
148
-
149
- @property
150
- def can_save_slow_tokenizer(self) -> bool:
151
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
152
-
153
- def update_post_processor(self):
154
- """
155
- Updates the underlying post processor with the current `bos_token` and `eos_token`.
156
- """
157
- bos = self.bos_token
158
- bos_token_id = self.bos_token_id
159
- if bos is None and self.add_bos_token:
160
- raise ValueError("add_bos_token = True but bos_token = None")
161
-
162
- eos = self.eos_token
163
- eos_token_id = self.eos_token_id
164
- if eos is None and self.add_eos_token:
165
- raise ValueError("add_eos_token = True but eos_token = None")
166
-
167
- single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
168
- pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
169
-
170
- special_tokens = []
171
- if self.add_bos_token:
172
- special_tokens.append((bos, bos_token_id))
173
- if self.add_eos_token:
174
- special_tokens.append((eos, eos_token_id))
175
- self._tokenizer.post_processor = processors.TemplateProcessing(
176
- single=single, pair=pair, special_tokens=special_tokens
177
- )
178
-
179
- @property
180
- def add_eos_token(self):
181
- return self._add_eos_token
182
-
183
- @property
184
- def add_bos_token(self):
185
- return self._add_bos_token
186
-
187
- @add_eos_token.setter
188
- def add_eos_token(self, value):
189
- self._add_eos_token = value
190
- self.update_post_processor()
191
-
192
- @add_bos_token.setter
193
- def add_bos_token(self, value):
194
- self._add_bos_token = value
195
- self.update_post_processor()
196
-
197
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
198
- if not self.can_save_slow_tokenizer:
199
- raise ValueError(
200
- "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
201
- "tokenizer."
202
- )
203
-
204
- if not os.path.isdir(save_directory):
205
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
206
- return
207
- out_vocab_file = os.path.join(
208
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
209
- )
210
-
211
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
- copyfile(self.vocab_file, out_vocab_file)
213
-
214
- return (out_vocab_file,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
3
- size 1477754
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/tokenizer_config.json DELETED
@@ -1,1640 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "92352": {
30
- "content": "E",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "92353": {
38
- "content": "F",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "92354": {
46
- "content": "G",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "92355": {
54
- "content": "H",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
- "92356": {
62
- "content": "I",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": false
68
- },
69
- "92357": {
70
- "content": "J",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": false
76
- },
77
- "92358": {
78
- "content": "K",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": false
84
- },
85
- "92359": {
86
- "content": "L",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": false
92
- },
93
- "92360": {
94
- "content": "M",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "92361": {
102
- "content": "N",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": false
108
- },
109
- "92362": {
110
- "content": "R",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "92363": {
118
- "content": "U",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "92364": {
126
- "content": "V",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "92365": {
134
- "content": "W",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "92366": {
142
- "content": "X",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "92367": {
150
- "content": "Y",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "92368": {
158
- "content": "Z",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "92369": {
166
- "content": "a",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "92370": {
174
- "content": "b",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "92371": {
182
- "content": "c",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "92372": {
190
- "content": "d",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "92373": {
198
- "content": "e",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "92374": {
206
- "content": "f",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- },
213
- "92375": {
214
- "content": "g",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": false
220
- },
221
- "92376": {
222
- "content": "h",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": false
228
- },
229
- "92377": {
230
- "content": "i",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": false
236
- },
237
- "92378": {
238
- "content": "j",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": false
244
- },
245
- "92379": {
246
- "content": "k",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": false
252
- },
253
- "92380": {
254
- "content": "l",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": false
260
- },
261
- "92381": {
262
- "content": "m",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": false
268
- },
269
- "92382": {
270
- "content": "n",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": false
276
- },
277
- "92383": {
278
- "content": "o",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": false
284
- },
285
- "92384": {
286
- "content": "p",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": false
292
- },
293
- "92385": {
294
- "content": "q",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": false
300
- },
301
- "92386": {
302
- "content": "r",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": false
308
- },
309
- "92387": {
310
- "content": "s",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": false
316
- },
317
- "92388": {
318
- "content": "t",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": false
324
- },
325
- "92389": {
326
- "content": "u",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": false
332
- },
333
- "92390": {
334
- "content": "v",
335
- "lstrip": false,
336
- "normalized": false,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": false
340
- },
341
- "92391": {
342
- "content": "w",
343
- "lstrip": false,
344
- "normalized": false,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": false
348
- },
349
- "92392": {
350
- "content": "x",
351
- "lstrip": false,
352
- "normalized": false,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": false
356
- },
357
- "92393": {
358
- "content": "y",
359
- "lstrip": false,
360
- "normalized": false,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": false
364
- },
365
- "92394": {
366
- "content": "z",
367
- "lstrip": false,
368
- "normalized": false,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": false
372
- },
373
- "92395": {
374
- "content": "——",
375
- "lstrip": false,
376
- "normalized": false,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": false
380
- },
381
- "92396": {
382
- "content": "……",
383
- "lstrip": false,
384
- "normalized": false,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": false
388
- },
389
- "92397": {
390
- "content": "[UNUSED_TOKEN_0]",
391
- "lstrip": false,
392
- "normalized": false,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": false
396
- },
397
- "92398": {
398
- "content": "[UNUSED_TOKEN_1]",
399
- "lstrip": false,
400
- "normalized": false,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": false
404
- },
405
- "92399": {
406
- "content": "[UNUSED_TOKEN_2]",
407
- "lstrip": false,
408
- "normalized": false,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": false
412
- },
413
- "92400": {
414
- "content": "[UNUSED_TOKEN_3]",
415
- "lstrip": false,
416
- "normalized": false,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": false
420
- },
421
- "92401": {
422
- "content": "[UNUSED_TOKEN_4]",
423
- "lstrip": false,
424
- "normalized": false,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": false
428
- },
429
- "92402": {
430
- "content": "[UNUSED_TOKEN_5]",
431
- "lstrip": false,
432
- "normalized": false,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": false
436
- },
437
- "92403": {
438
- "content": "[UNUSED_TOKEN_6]",
439
- "lstrip": false,
440
- "normalized": false,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": false
444
- },
445
- "92404": {
446
- "content": "[UNUSED_TOKEN_7]",
447
- "lstrip": false,
448
- "normalized": false,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": false
452
- },
453
- "92405": {
454
- "content": "[UNUSED_TOKEN_8]",
455
- "lstrip": false,
456
- "normalized": false,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": false
460
- },
461
- "92406": {
462
- "content": "[UNUSED_TOKEN_9]",
463
- "lstrip": false,
464
- "normalized": false,
465
- "rstrip": false,
466
- "single_word": false,
467
- "special": false
468
- },
469
- "92407": {
470
- "content": "[UNUSED_TOKEN_10]",
471
- "lstrip": false,
472
- "normalized": false,
473
- "rstrip": false,
474
- "single_word": false,
475
- "special": false
476
- },
477
- "92408": {
478
- "content": "[UNUSED_TOKEN_11]",
479
- "lstrip": false,
480
- "normalized": false,
481
- "rstrip": false,
482
- "single_word": false,
483
- "special": false
484
- },
485
- "92409": {
486
- "content": "[UNUSED_TOKEN_12]",
487
- "lstrip": false,
488
- "normalized": false,
489
- "rstrip": false,
490
- "single_word": false,
491
- "special": false
492
- },
493
- "92410": {
494
- "content": "[UNUSED_TOKEN_13]",
495
- "lstrip": false,
496
- "normalized": false,
497
- "rstrip": false,
498
- "single_word": false,
499
- "special": false
500
- },
501
- "92411": {
502
- "content": "[UNUSED_TOKEN_14]",
503
- "lstrip": false,
504
- "normalized": false,
505
- "rstrip": false,
506
- "single_word": false,
507
- "special": false
508
- },
509
- "92412": {
510
- "content": "[UNUSED_TOKEN_15]",
511
- "lstrip": false,
512
- "normalized": false,
513
- "rstrip": false,
514
- "single_word": false,
515
- "special": false
516
- },
517
- "92413": {
518
- "content": "[UNUSED_TOKEN_16]",
519
- "lstrip": false,
520
- "normalized": false,
521
- "rstrip": false,
522
- "single_word": false,
523
- "special": false
524
- },
525
- "92414": {
526
- "content": "[UNUSED_TOKEN_17]",
527
- "lstrip": false,
528
- "normalized": false,
529
- "rstrip": false,
530
- "single_word": false,
531
- "special": false
532
- },
533
- "92415": {
534
- "content": "[UNUSED_TOKEN_18]",
535
- "lstrip": false,
536
- "normalized": false,
537
- "rstrip": false,
538
- "single_word": false,
539
- "special": false
540
- },
541
- "92416": {
542
- "content": "[UNUSED_TOKEN_19]",
543
- "lstrip": false,
544
- "normalized": false,
545
- "rstrip": false,
546
- "single_word": false,
547
- "special": false
548
- },
549
- "92417": {
550
- "content": "[UNUSED_TOKEN_20]",
551
- "lstrip": false,
552
- "normalized": false,
553
- "rstrip": false,
554
- "single_word": false,
555
- "special": false
556
- },
557
- "92418": {
558
- "content": "[UNUSED_TOKEN_21]",
559
- "lstrip": false,
560
- "normalized": false,
561
- "rstrip": false,
562
- "single_word": false,
563
- "special": false
564
- },
565
- "92419": {
566
- "content": "[UNUSED_TOKEN_22]",
567
- "lstrip": false,
568
- "normalized": false,
569
- "rstrip": false,
570
- "single_word": false,
571
- "special": false
572
- },
573
- "92420": {
574
- "content": "[UNUSED_TOKEN_23]",
575
- "lstrip": false,
576
- "normalized": false,
577
- "rstrip": false,
578
- "single_word": false,
579
- "special": false
580
- },
581
- "92421": {
582
- "content": "[UNUSED_TOKEN_24]",
583
- "lstrip": false,
584
- "normalized": false,
585
- "rstrip": false,
586
- "single_word": false,
587
- "special": false
588
- },
589
- "92422": {
590
- "content": "[UNUSED_TOKEN_25]",
591
- "lstrip": false,
592
- "normalized": false,
593
- "rstrip": false,
594
- "single_word": false,
595
- "special": false
596
- },
597
- "92423": {
598
- "content": "[UNUSED_TOKEN_26]",
599
- "lstrip": false,
600
- "normalized": false,
601
- "rstrip": false,
602
- "single_word": false,
603
- "special": false
604
- },
605
- "92424": {
606
- "content": "[UNUSED_TOKEN_27]",
607
- "lstrip": false,
608
- "normalized": false,
609
- "rstrip": false,
610
- "single_word": false,
611
- "special": false
612
- },
613
- "92425": {
614
- "content": "[UNUSED_TOKEN_28]",
615
- "lstrip": false,
616
- "normalized": false,
617
- "rstrip": false,
618
- "single_word": false,
619
- "special": false
620
- },
621
- "92426": {
622
- "content": "[UNUSED_TOKEN_29]",
623
- "lstrip": false,
624
- "normalized": false,
625
- "rstrip": false,
626
- "single_word": false,
627
- "special": false
628
- },
629
- "92427": {
630
- "content": "[UNUSED_TOKEN_30]",
631
- "lstrip": false,
632
- "normalized": false,
633
- "rstrip": false,
634
- "single_word": false,
635
- "special": false
636
- },
637
- "92428": {
638
- "content": "[UNUSED_TOKEN_31]",
639
- "lstrip": false,
640
- "normalized": false,
641
- "rstrip": false,
642
- "single_word": false,
643
- "special": false
644
- },
645
- "92429": {
646
- "content": "[UNUSED_TOKEN_32]",
647
- "lstrip": false,
648
- "normalized": false,
649
- "rstrip": false,
650
- "single_word": false,
651
- "special": false
652
- },
653
- "92430": {
654
- "content": "[UNUSED_TOKEN_33]",
655
- "lstrip": false,
656
- "normalized": false,
657
- "rstrip": false,
658
- "single_word": false,
659
- "special": false
660
- },
661
- "92431": {
662
- "content": "[UNUSED_TOKEN_34]",
663
- "lstrip": false,
664
- "normalized": false,
665
- "rstrip": false,
666
- "single_word": false,
667
- "special": false
668
- },
669
- "92432": {
670
- "content": "[UNUSED_TOKEN_35]",
671
- "lstrip": false,
672
- "normalized": false,
673
- "rstrip": false,
674
- "single_word": false,
675
- "special": false
676
- },
677
- "92433": {
678
- "content": "[UNUSED_TOKEN_36]",
679
- "lstrip": false,
680
- "normalized": false,
681
- "rstrip": false,
682
- "single_word": false,
683
- "special": false
684
- },
685
- "92434": {
686
- "content": "[UNUSED_TOKEN_37]",
687
- "lstrip": false,
688
- "normalized": false,
689
- "rstrip": false,
690
- "single_word": false,
691
- "special": false
692
- },
693
- "92435": {
694
- "content": "[UNUSED_TOKEN_38]",
695
- "lstrip": false,
696
- "normalized": false,
697
- "rstrip": false,
698
- "single_word": false,
699
- "special": false
700
- },
701
- "92436": {
702
- "content": "[UNUSED_TOKEN_39]",
703
- "lstrip": false,
704
- "normalized": false,
705
- "rstrip": false,
706
- "single_word": false,
707
- "special": false
708
- },
709
- "92437": {
710
- "content": "[UNUSED_TOKEN_40]",
711
- "lstrip": false,
712
- "normalized": false,
713
- "rstrip": false,
714
- "single_word": false,
715
- "special": false
716
- },
717
- "92438": {
718
- "content": "[UNUSED_TOKEN_41]",
719
- "lstrip": false,
720
- "normalized": false,
721
- "rstrip": false,
722
- "single_word": false,
723
- "special": false
724
- },
725
- "92439": {
726
- "content": "[UNUSED_TOKEN_42]",
727
- "lstrip": false,
728
- "normalized": false,
729
- "rstrip": false,
730
- "single_word": false,
731
- "special": false
732
- },
733
- "92440": {
734
- "content": "[UNUSED_TOKEN_43]",
735
- "lstrip": false,
736
- "normalized": false,
737
- "rstrip": false,
738
- "single_word": false,
739
- "special": false
740
- },
741
- "92441": {
742
- "content": "[UNUSED_TOKEN_44]",
743
- "lstrip": false,
744
- "normalized": false,
745
- "rstrip": false,
746
- "single_word": false,
747
- "special": false
748
- },
749
- "92442": {
750
- "content": "[UNUSED_TOKEN_45]",
751
- "lstrip": false,
752
- "normalized": false,
753
- "rstrip": false,
754
- "single_word": false,
755
- "special": false
756
- },
757
- "92443": {
758
- "content": "[UNUSED_TOKEN_46]",
759
- "lstrip": false,
760
- "normalized": false,
761
- "rstrip": false,
762
- "single_word": false,
763
- "special": false
764
- },
765
- "92444": {
766
- "content": "[UNUSED_TOKEN_47]",
767
- "lstrip": false,
768
- "normalized": false,
769
- "rstrip": false,
770
- "single_word": false,
771
- "special": false
772
- },
773
- "92445": {
774
- "content": "[UNUSED_TOKEN_48]",
775
- "lstrip": false,
776
- "normalized": false,
777
- "rstrip": false,
778
- "single_word": false,
779
- "special": false
780
- },
781
- "92446": {
782
- "content": "[UNUSED_TOKEN_49]",
783
- "lstrip": false,
784
- "normalized": false,
785
- "rstrip": false,
786
- "single_word": false,
787
- "special": false
788
- },
789
- "92447": {
790
- "content": "[UNUSED_TOKEN_50]",
791
- "lstrip": false,
792
- "normalized": false,
793
- "rstrip": false,
794
- "single_word": false,
795
- "special": false
796
- },
797
- "92448": {
798
- "content": "[UNUSED_TOKEN_51]",
799
- "lstrip": false,
800
- "normalized": false,
801
- "rstrip": false,
802
- "single_word": false,
803
- "special": false
804
- },
805
- "92449": {
806
- "content": "[UNUSED_TOKEN_52]",
807
- "lstrip": false,
808
- "normalized": false,
809
- "rstrip": false,
810
- "single_word": false,
811
- "special": false
812
- },
813
- "92450": {
814
- "content": "[UNUSED_TOKEN_53]",
815
- "lstrip": false,
816
- "normalized": false,
817
- "rstrip": false,
818
- "single_word": false,
819
- "special": false
820
- },
821
- "92451": {
822
- "content": "[UNUSED_TOKEN_54]",
823
- "lstrip": false,
824
- "normalized": false,
825
- "rstrip": false,
826
- "single_word": false,
827
- "special": false
828
- },
829
- "92452": {
830
- "content": "[UNUSED_TOKEN_55]",
831
- "lstrip": false,
832
- "normalized": false,
833
- "rstrip": false,
834
- "single_word": false,
835
- "special": false
836
- },
837
- "92453": {
838
- "content": "[UNUSED_TOKEN_56]",
839
- "lstrip": false,
840
- "normalized": false,
841
- "rstrip": false,
842
- "single_word": false,
843
- "special": false
844
- },
845
- "92454": {
846
- "content": "[UNUSED_TOKEN_57]",
847
- "lstrip": false,
848
- "normalized": false,
849
- "rstrip": false,
850
- "single_word": false,
851
- "special": false
852
- },
853
- "92455": {
854
- "content": "[UNUSED_TOKEN_58]",
855
- "lstrip": false,
856
- "normalized": false,
857
- "rstrip": false,
858
- "single_word": false,
859
- "special": false
860
- },
861
- "92456": {
862
- "content": "[UNUSED_TOKEN_59]",
863
- "lstrip": false,
864
- "normalized": false,
865
- "rstrip": false,
866
- "single_word": false,
867
- "special": false
868
- },
869
- "92457": {
870
- "content": "[UNUSED_TOKEN_60]",
871
- "lstrip": false,
872
- "normalized": false,
873
- "rstrip": false,
874
- "single_word": false,
875
- "special": false
876
- },
877
- "92458": {
878
- "content": "[UNUSED_TOKEN_61]",
879
- "lstrip": false,
880
- "normalized": false,
881
- "rstrip": false,
882
- "single_word": false,
883
- "special": false
884
- },
885
- "92459": {
886
- "content": "[UNUSED_TOKEN_62]",
887
- "lstrip": false,
888
- "normalized": false,
889
- "rstrip": false,
890
- "single_word": false,
891
- "special": false
892
- },
893
- "92460": {
894
- "content": "[UNUSED_TOKEN_63]",
895
- "lstrip": false,
896
- "normalized": false,
897
- "rstrip": false,
898
- "single_word": false,
899
- "special": false
900
- },
901
- "92461": {
902
- "content": "[UNUSED_TOKEN_64]",
903
- "lstrip": false,
904
- "normalized": false,
905
- "rstrip": false,
906
- "single_word": false,
907
- "special": false
908
- },
909
- "92462": {
910
- "content": "[UNUSED_TOKEN_65]",
911
- "lstrip": false,
912
- "normalized": false,
913
- "rstrip": false,
914
- "single_word": false,
915
- "special": false
916
- },
917
- "92463": {
918
- "content": "[UNUSED_TOKEN_66]",
919
- "lstrip": false,
920
- "normalized": false,
921
- "rstrip": false,
922
- "single_word": false,
923
- "special": false
924
- },
925
- "92464": {
926
- "content": "[UNUSED_TOKEN_67]",
927
- "lstrip": false,
928
- "normalized": false,
929
- "rstrip": false,
930
- "single_word": false,
931
- "special": false
932
- },
933
- "92465": {
934
- "content": "[UNUSED_TOKEN_68]",
935
- "lstrip": false,
936
- "normalized": false,
937
- "rstrip": false,
938
- "single_word": false,
939
- "special": false
940
- },
941
- "92466": {
942
- "content": "[UNUSED_TOKEN_69]",
943
- "lstrip": false,
944
- "normalized": false,
945
- "rstrip": false,
946
- "single_word": false,
947
- "special": false
948
- },
949
- "92467": {
950
- "content": "[UNUSED_TOKEN_70]",
951
- "lstrip": false,
952
- "normalized": false,
953
- "rstrip": false,
954
- "single_word": false,
955
- "special": false
956
- },
957
- "92468": {
958
- "content": "[UNUSED_TOKEN_71]",
959
- "lstrip": false,
960
- "normalized": false,
961
- "rstrip": false,
962
- "single_word": false,
963
- "special": false
964
- },
965
- "92469": {
966
- "content": "[UNUSED_TOKEN_72]",
967
- "lstrip": false,
968
- "normalized": false,
969
- "rstrip": false,
970
- "single_word": false,
971
- "special": false
972
- },
973
- "92470": {
974
- "content": "[UNUSED_TOKEN_73]",
975
- "lstrip": false,
976
- "normalized": false,
977
- "rstrip": false,
978
- "single_word": false,
979
- "special": false
980
- },
981
- "92471": {
982
- "content": "[UNUSED_TOKEN_74]",
983
- "lstrip": false,
984
- "normalized": false,
985
- "rstrip": false,
986
- "single_word": false,
987
- "special": false
988
- },
989
- "92472": {
990
- "content": "[UNUSED_TOKEN_75]",
991
- "lstrip": false,
992
- "normalized": false,
993
- "rstrip": false,
994
- "single_word": false,
995
- "special": false
996
- },
997
- "92473": {
998
- "content": "[UNUSED_TOKEN_76]",
999
- "lstrip": false,
1000
- "normalized": false,
1001
- "rstrip": false,
1002
- "single_word": false,
1003
- "special": false
1004
- },
1005
- "92474": {
1006
- "content": "[UNUSED_TOKEN_77]",
1007
- "lstrip": false,
1008
- "normalized": false,
1009
- "rstrip": false,
1010
- "single_word": false,
1011
- "special": false
1012
- },
1013
- "92475": {
1014
- "content": "[UNUSED_TOKEN_78]",
1015
- "lstrip": false,
1016
- "normalized": false,
1017
- "rstrip": false,
1018
- "single_word": false,
1019
- "special": false
1020
- },
1021
- "92476": {
1022
- "content": "[UNUSED_TOKEN_79]",
1023
- "lstrip": false,
1024
- "normalized": false,
1025
- "rstrip": false,
1026
- "single_word": false,
1027
- "special": false
1028
- },
1029
- "92477": {
1030
- "content": "[UNUSED_TOKEN_80]",
1031
- "lstrip": false,
1032
- "normalized": false,
1033
- "rstrip": false,
1034
- "single_word": false,
1035
- "special": false
1036
- },
1037
- "92478": {
1038
- "content": "[UNUSED_TOKEN_81]",
1039
- "lstrip": false,
1040
- "normalized": false,
1041
- "rstrip": false,
1042
- "single_word": false,
1043
- "special": false
1044
- },
1045
- "92479": {
1046
- "content": "[UNUSED_TOKEN_82]",
1047
- "lstrip": false,
1048
- "normalized": false,
1049
- "rstrip": false,
1050
- "single_word": false,
1051
- "special": false
1052
- },
1053
- "92480": {
1054
- "content": "[UNUSED_TOKEN_83]",
1055
- "lstrip": false,
1056
- "normalized": false,
1057
- "rstrip": false,
1058
- "single_word": false,
1059
- "special": false
1060
- },
1061
- "92481": {
1062
- "content": "[UNUSED_TOKEN_84]",
1063
- "lstrip": false,
1064
- "normalized": false,
1065
- "rstrip": false,
1066
- "single_word": false,
1067
- "special": false
1068
- },
1069
- "92482": {
1070
- "content": "[UNUSED_TOKEN_85]",
1071
- "lstrip": false,
1072
- "normalized": false,
1073
- "rstrip": false,
1074
- "single_word": false,
1075
- "special": false
1076
- },
1077
- "92483": {
1078
- "content": "[UNUSED_TOKEN_86]",
1079
- "lstrip": false,
1080
- "normalized": false,
1081
- "rstrip": false,
1082
- "single_word": false,
1083
- "special": false
1084
- },
1085
- "92484": {
1086
- "content": "[UNUSED_TOKEN_87]",
1087
- "lstrip": false,
1088
- "normalized": false,
1089
- "rstrip": false,
1090
- "single_word": false,
1091
- "special": false
1092
- },
1093
- "92485": {
1094
- "content": "[UNUSED_TOKEN_88]",
1095
- "lstrip": false,
1096
- "normalized": false,
1097
- "rstrip": false,
1098
- "single_word": false,
1099
- "special": false
1100
- },
1101
- "92486": {
1102
- "content": "[UNUSED_TOKEN_89]",
1103
- "lstrip": false,
1104
- "normalized": false,
1105
- "rstrip": false,
1106
- "single_word": false,
1107
- "special": false
1108
- },
1109
- "92487": {
1110
- "content": "[UNUSED_TOKEN_90]",
1111
- "lstrip": false,
1112
- "normalized": false,
1113
- "rstrip": false,
1114
- "single_word": false,
1115
- "special": false
1116
- },
1117
- "92488": {
1118
- "content": "[UNUSED_TOKEN_91]",
1119
- "lstrip": false,
1120
- "normalized": false,
1121
- "rstrip": false,
1122
- "single_word": false,
1123
- "special": false
1124
- },
1125
- "92489": {
1126
- "content": "[UNUSED_TOKEN_92]",
1127
- "lstrip": false,
1128
- "normalized": false,
1129
- "rstrip": false,
1130
- "single_word": false,
1131
- "special": false
1132
- },
1133
- "92490": {
1134
- "content": "[UNUSED_TOKEN_93]",
1135
- "lstrip": false,
1136
- "normalized": false,
1137
- "rstrip": false,
1138
- "single_word": false,
1139
- "special": false
1140
- },
1141
- "92491": {
1142
- "content": "[UNUSED_TOKEN_94]",
1143
- "lstrip": false,
1144
- "normalized": false,
1145
- "rstrip": false,
1146
- "single_word": false,
1147
- "special": false
1148
- },
1149
- "92492": {
1150
- "content": "[UNUSED_TOKEN_95]",
1151
- "lstrip": false,
1152
- "normalized": false,
1153
- "rstrip": false,
1154
- "single_word": false,
1155
- "special": false
1156
- },
1157
- "92493": {
1158
- "content": "[UNUSED_TOKEN_96]",
1159
- "lstrip": false,
1160
- "normalized": false,
1161
- "rstrip": false,
1162
- "single_word": false,
1163
- "special": false
1164
- },
1165
- "92494": {
1166
- "content": "[UNUSED_TOKEN_97]",
1167
- "lstrip": false,
1168
- "normalized": false,
1169
- "rstrip": false,
1170
- "single_word": false,
1171
- "special": false
1172
- },
1173
- "92495": {
1174
- "content": "[UNUSED_TOKEN_98]",
1175
- "lstrip": false,
1176
- "normalized": false,
1177
- "rstrip": false,
1178
- "single_word": false,
1179
- "special": false
1180
- },
1181
- "92496": {
1182
- "content": "[UNUSED_TOKEN_99]",
1183
- "lstrip": false,
1184
- "normalized": false,
1185
- "rstrip": false,
1186
- "single_word": false,
1187
- "special": false
1188
- },
1189
- "92497": {
1190
- "content": "[UNUSED_TOKEN_100]",
1191
- "lstrip": false,
1192
- "normalized": false,
1193
- "rstrip": false,
1194
- "single_word": false,
1195
- "special": false
1196
- },
1197
- "92498": {
1198
- "content": "[UNUSED_TOKEN_101]",
1199
- "lstrip": false,
1200
- "normalized": false,
1201
- "rstrip": false,
1202
- "single_word": false,
1203
- "special": false
1204
- },
1205
- "92499": {
1206
- "content": "[UNUSED_TOKEN_102]",
1207
- "lstrip": false,
1208
- "normalized": false,
1209
- "rstrip": false,
1210
- "single_word": false,
1211
- "special": false
1212
- },
1213
- "92500": {
1214
- "content": "[UNUSED_TOKEN_103]",
1215
- "lstrip": false,
1216
- "normalized": false,
1217
- "rstrip": false,
1218
- "single_word": false,
1219
- "special": false
1220
- },
1221
- "92501": {
1222
- "content": "[UNUSED_TOKEN_104]",
1223
- "lstrip": false,
1224
- "normalized": false,
1225
- "rstrip": false,
1226
- "single_word": false,
1227
- "special": false
1228
- },
1229
- "92502": {
1230
- "content": "[UNUSED_TOKEN_105]",
1231
- "lstrip": false,
1232
- "normalized": false,
1233
- "rstrip": false,
1234
- "single_word": false,
1235
- "special": false
1236
- },
1237
- "92503": {
1238
- "content": "[UNUSED_TOKEN_106]",
1239
- "lstrip": false,
1240
- "normalized": false,
1241
- "rstrip": false,
1242
- "single_word": false,
1243
- "special": false
1244
- },
1245
- "92504": {
1246
- "content": "[UNUSED_TOKEN_107]",
1247
- "lstrip": false,
1248
- "normalized": false,
1249
- "rstrip": false,
1250
- "single_word": false,
1251
- "special": false
1252
- },
1253
- "92505": {
1254
- "content": "[UNUSED_TOKEN_108]",
1255
- "lstrip": false,
1256
- "normalized": false,
1257
- "rstrip": false,
1258
- "single_word": false,
1259
- "special": false
1260
- },
1261
- "92506": {
1262
- "content": "[UNUSED_TOKEN_109]",
1263
- "lstrip": false,
1264
- "normalized": false,
1265
- "rstrip": false,
1266
- "single_word": false,
1267
- "special": false
1268
- },
1269
- "92507": {
1270
- "content": "[UNUSED_TOKEN_110]",
1271
- "lstrip": false,
1272
- "normalized": false,
1273
- "rstrip": false,
1274
- "single_word": false,
1275
- "special": false
1276
- },
1277
- "92508": {
1278
- "content": "[UNUSED_TOKEN_111]",
1279
- "lstrip": false,
1280
- "normalized": false,
1281
- "rstrip": false,
1282
- "single_word": false,
1283
- "special": false
1284
- },
1285
- "92509": {
1286
- "content": "[UNUSED_TOKEN_112]",
1287
- "lstrip": false,
1288
- "normalized": false,
1289
- "rstrip": false,
1290
- "single_word": false,
1291
- "special": false
1292
- },
1293
- "92510": {
1294
- "content": "[UNUSED_TOKEN_113]",
1295
- "lstrip": false,
1296
- "normalized": false,
1297
- "rstrip": false,
1298
- "single_word": false,
1299
- "special": false
1300
- },
1301
- "92511": {
1302
- "content": "[UNUSED_TOKEN_114]",
1303
- "lstrip": false,
1304
- "normalized": false,
1305
- "rstrip": false,
1306
- "single_word": false,
1307
- "special": false
1308
- },
1309
- "92512": {
1310
- "content": "[UNUSED_TOKEN_115]",
1311
- "lstrip": false,
1312
- "normalized": false,
1313
- "rstrip": false,
1314
- "single_word": false,
1315
- "special": false
1316
- },
1317
- "92513": {
1318
- "content": "[UNUSED_TOKEN_116]",
1319
- "lstrip": false,
1320
- "normalized": false,
1321
- "rstrip": false,
1322
- "single_word": false,
1323
- "special": false
1324
- },
1325
- "92514": {
1326
- "content": "[UNUSED_TOKEN_117]",
1327
- "lstrip": false,
1328
- "normalized": false,
1329
- "rstrip": false,
1330
- "single_word": false,
1331
- "special": false
1332
- },
1333
- "92515": {
1334
- "content": "[UNUSED_TOKEN_118]",
1335
- "lstrip": false,
1336
- "normalized": false,
1337
- "rstrip": false,
1338
- "single_word": false,
1339
- "special": false
1340
- },
1341
- "92516": {
1342
- "content": "[UNUSED_TOKEN_119]",
1343
- "lstrip": false,
1344
- "normalized": false,
1345
- "rstrip": false,
1346
- "single_word": false,
1347
- "special": false
1348
- },
1349
- "92517": {
1350
- "content": "[UNUSED_TOKEN_120]",
1351
- "lstrip": false,
1352
- "normalized": false,
1353
- "rstrip": false,
1354
- "single_word": false,
1355
- "special": false
1356
- },
1357
- "92518": {
1358
- "content": "[UNUSED_TOKEN_121]",
1359
- "lstrip": false,
1360
- "normalized": false,
1361
- "rstrip": false,
1362
- "single_word": false,
1363
- "special": false
1364
- },
1365
- "92519": {
1366
- "content": "[UNUSED_TOKEN_122]",
1367
- "lstrip": false,
1368
- "normalized": false,
1369
- "rstrip": false,
1370
- "single_word": false,
1371
- "special": false
1372
- },
1373
- "92520": {
1374
- "content": "[UNUSED_TOKEN_123]",
1375
- "lstrip": false,
1376
- "normalized": false,
1377
- "rstrip": false,
1378
- "single_word": false,
1379
- "special": false
1380
- },
1381
- "92521": {
1382
- "content": "[UNUSED_TOKEN_124]",
1383
- "lstrip": false,
1384
- "normalized": false,
1385
- "rstrip": false,
1386
- "single_word": false,
1387
- "special": false
1388
- },
1389
- "92522": {
1390
- "content": "[UNUSED_TOKEN_125]",
1391
- "lstrip": false,
1392
- "normalized": false,
1393
- "rstrip": false,
1394
- "single_word": false,
1395
- "special": false
1396
- },
1397
- "92523": {
1398
- "content": "[UNUSED_TOKEN_126]",
1399
- "lstrip": false,
1400
- "normalized": false,
1401
- "rstrip": false,
1402
- "single_word": false,
1403
- "special": false
1404
- },
1405
- "92524": {
1406
- "content": "[UNUSED_TOKEN_127]",
1407
- "lstrip": false,
1408
- "normalized": false,
1409
- "rstrip": false,
1410
- "single_word": false,
1411
- "special": false
1412
- },
1413
- "92525": {
1414
- "content": "[UNUSED_TOKEN_128]",
1415
- "lstrip": false,
1416
- "normalized": false,
1417
- "rstrip": false,
1418
- "single_word": false,
1419
- "special": false
1420
- },
1421
- "92526": {
1422
- "content": "[UNUSED_TOKEN_129]",
1423
- "lstrip": false,
1424
- "normalized": false,
1425
- "rstrip": false,
1426
- "single_word": false,
1427
- "special": false
1428
- },
1429
- "92527": {
1430
- "content": "[UNUSED_TOKEN_130]",
1431
- "lstrip": false,
1432
- "normalized": false,
1433
- "rstrip": false,
1434
- "single_word": false,
1435
- "special": false
1436
- },
1437
- "92528": {
1438
- "content": "[UNUSED_TOKEN_131]",
1439
- "lstrip": false,
1440
- "normalized": false,
1441
- "rstrip": false,
1442
- "single_word": false,
1443
- "special": false
1444
- },
1445
- "92529": {
1446
- "content": "[UNUSED_TOKEN_132]",
1447
- "lstrip": false,
1448
- "normalized": false,
1449
- "rstrip": false,
1450
- "single_word": false,
1451
- "special": false
1452
- },
1453
- "92530": {
1454
- "content": "[UNUSED_TOKEN_133]",
1455
- "lstrip": false,
1456
- "normalized": false,
1457
- "rstrip": false,
1458
- "single_word": false,
1459
- "special": false
1460
- },
1461
- "92531": {
1462
- "content": "[UNUSED_TOKEN_134]",
1463
- "lstrip": false,
1464
- "normalized": false,
1465
- "rstrip": false,
1466
- "single_word": false,
1467
- "special": false
1468
- },
1469
- "92532": {
1470
- "content": "[UNUSED_TOKEN_135]",
1471
- "lstrip": false,
1472
- "normalized": false,
1473
- "rstrip": false,
1474
- "single_word": false,
1475
- "special": false
1476
- },
1477
- "92533": {
1478
- "content": "[UNUSED_TOKEN_136]",
1479
- "lstrip": false,
1480
- "normalized": false,
1481
- "rstrip": false,
1482
- "single_word": false,
1483
- "special": false
1484
- },
1485
- "92534": {
1486
- "content": "[UNUSED_TOKEN_137]",
1487
- "lstrip": false,
1488
- "normalized": false,
1489
- "rstrip": false,
1490
- "single_word": false,
1491
- "special": false
1492
- },
1493
- "92535": {
1494
- "content": "[UNUSED_TOKEN_138]",
1495
- "lstrip": false,
1496
- "normalized": false,
1497
- "rstrip": false,
1498
- "single_word": false,
1499
- "special": false
1500
- },
1501
- "92536": {
1502
- "content": "[UNUSED_TOKEN_139]",
1503
- "lstrip": false,
1504
- "normalized": false,
1505
- "rstrip": false,
1506
- "single_word": false,
1507
- "special": false
1508
- },
1509
- "92537": {
1510
- "content": "[UNUSED_TOKEN_140]",
1511
- "lstrip": false,
1512
- "normalized": false,
1513
- "rstrip": false,
1514
- "single_word": false,
1515
- "special": false
1516
- },
1517
- "92538": {
1518
- "content": "<|plugin|>",
1519
- "lstrip": false,
1520
- "normalized": false,
1521
- "rstrip": false,
1522
- "single_word": false,
1523
- "special": true
1524
- },
1525
- "92539": {
1526
- "content": "<|interpreter|>",
1527
- "lstrip": false,
1528
- "normalized": false,
1529
- "rstrip": false,
1530
- "single_word": false,
1531
- "special": true
1532
- },
1533
- "92540": {
1534
- "content": "<|action_end|>",
1535
- "lstrip": false,
1536
- "normalized": false,
1537
- "rstrip": false,
1538
- "single_word": false,
1539
- "special": true
1540
- },
1541
- "92541": {
1542
- "content": "<|action_start|>",
1543
- "lstrip": false,
1544
- "normalized": false,
1545
- "rstrip": false,
1546
- "single_word": false,
1547
- "special": true
1548
- },
1549
- "92542": {
1550
- "content": "<|im_end|>",
1551
- "lstrip": false,
1552
- "normalized": false,
1553
- "rstrip": false,
1554
- "single_word": false,
1555
- "special": true
1556
- },
1557
- "92543": {
1558
- "content": "<|im_start|>",
1559
- "lstrip": false,
1560
- "normalized": false,
1561
- "rstrip": false,
1562
- "single_word": false,
1563
- "special": true
1564
- },
1565
- "92544": {
1566
- "content": "[UNUSED_TOKEN_141]",
1567
- "lstrip": false,
1568
- "normalized": false,
1569
- "rstrip": false,
1570
- "single_word": false,
1571
- "special": false
1572
- },
1573
- "92545": {
1574
- "content": "[UNUSED_TOKEN_142]",
1575
- "lstrip": false,
1576
- "normalized": false,
1577
- "rstrip": false,
1578
- "single_word": false,
1579
- "special": false
1580
- },
1581
- "92546": {
1582
- "content": "[UNUSED_TOKEN_143]",
1583
- "lstrip": false,
1584
- "normalized": false,
1585
- "rstrip": false,
1586
- "single_word": false,
1587
- "special": false
1588
- },
1589
- "92547": {
1590
- "content": "[UNUSED_TOKEN_144]",
1591
- "lstrip": false,
1592
- "normalized": false,
1593
- "rstrip": false,
1594
- "single_word": false,
1595
- "special": false
1596
- },
1597
- "92548": {
1598
- "content": "[UNUSED_TOKEN_145]",
1599
- "lstrip": false,
1600
- "normalized": false,
1601
- "rstrip": false,
1602
- "single_word": false,
1603
- "special": false
1604
- },
1605
- "92549": {
1606
- "content": "[UNUSED_TOKEN_146]",
1607
- "lstrip": false,
1608
- "normalized": false,
1609
- "rstrip": false,
1610
- "single_word": false,
1611
- "special": false
1612
- }
1613
- },
1614
- "additional_special_tokens": [
1615
- "<|im_start|>",
1616
- "<|im_end|>",
1617
- "<|action_start|>",
1618
- "<|action_end|>",
1619
- "<|interpreter|>",
1620
- "<|plugin|>"
1621
- ],
1622
- "auto_map": {
1623
- "AutoTokenizer": [
1624
- "tokenization_internlm2.InternLM2Tokenizer",
1625
- "tokenization_internlm2_fast.InternLM2TokenizerFast"
1626
- ]
1627
- },
1628
- "bos_token": "<s>",
1629
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
1630
- "clean_up_tokenization_spaces": false,
1631
- "decode_with_prefix_space": false,
1632
- "eos_token": "<|im_end|>",
1633
- "model_max_length": 1000000000000000019884624838656,
1634
- "pad_token": "</s>",
1635
- "padding_side": "right",
1636
- "sp_model_kwargs": null,
1637
- "split_special_tokens": false,
1638
- "tokenizer_class": "InternLM2Tokenizer",
1639
- "unk_token": "<unk>"
1640
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/trainer_state.json DELETED
@@ -1,169 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.997333333333333,
5
- "eval_steps": 562,
6
- "global_step": 1686,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.17777777777777778,
13
- "grad_norm": 4.040336608886719,
14
- "learning_rate": 2.958579881656805e-05,
15
- "loss": 0.4641,
16
- "step": 100
17
- },
18
- {
19
- "epoch": 0.35555555555555557,
20
- "grad_norm": 3.9950191974639893,
21
- "learning_rate": 5.91715976331361e-05,
22
- "loss": 0.3704,
23
- "step": 200
24
- },
25
- {
26
- "epoch": 0.5333333333333333,
27
- "grad_norm": 3.9038829803466797,
28
- "learning_rate": 8.875739644970414e-05,
29
- "loss": 0.3751,
30
- "step": 300
31
- },
32
- {
33
- "epoch": 0.7111111111111111,
34
- "grad_norm": 3.4998044967651367,
35
- "learning_rate": 9.989699867437137e-05,
36
- "loss": 0.3928,
37
- "step": 400
38
- },
39
- {
40
- "epoch": 0.8888888888888888,
41
- "grad_norm": 9.73261833190918,
42
- "learning_rate": 9.92981892269398e-05,
43
- "loss": 0.3497,
44
- "step": 500
45
- },
46
- {
47
- "epoch": 0.9991111111111111,
48
- "eval_loss": 0.6006748080253601,
49
- "eval_runtime": 411.5999,
50
- "eval_samples_per_second": 1.215,
51
- "eval_steps_per_second": 1.215,
52
- "step": 562
53
- },
54
- {
55
- "epoch": 1.0666666666666667,
56
- "grad_norm": 2.8831710815429688,
57
- "learning_rate": 9.817128546774103e-05,
58
- "loss": 0.3383,
59
- "step": 600
60
- },
61
- {
62
- "epoch": 1.2444444444444445,
63
- "grad_norm": 6.632827281951904,
64
- "learning_rate": 9.652835906663704e-05,
65
- "loss": 0.3167,
66
- "step": 700
67
- },
68
- {
69
- "epoch": 1.4222222222222223,
70
- "grad_norm": 6.977548122406006,
71
- "learning_rate": 9.438700945477697e-05,
72
- "loss": 0.3165,
73
- "step": 800
74
- },
75
- {
76
- "epoch": 1.6,
77
- "grad_norm": 10.037060737609863,
78
- "learning_rate": 9.177017529516772e-05,
79
- "loss": 0.2927,
80
- "step": 900
81
- },
82
- {
83
- "epoch": 1.7777777777777777,
84
- "grad_norm": 6.976019859313965,
85
- "learning_rate": 8.870588875808164e-05,
86
- "loss": 0.3062,
87
- "step": 1000
88
- },
89
- {
90
- "epoch": 1.9555555555555557,
91
- "grad_norm": 2.106227159500122,
92
- "learning_rate": 8.522697523356319e-05,
93
- "loss": 0.2678,
94
- "step": 1100
95
- },
96
- {
97
- "epoch": 1.9982222222222221,
98
- "eval_loss": 0.3569962680339813,
99
- "eval_runtime": 367.3155,
100
- "eval_samples_per_second": 1.361,
101
- "eval_steps_per_second": 1.361,
102
- "step": 1124
103
- },
104
- {
105
- "epoch": 2.1333333333333333,
106
- "grad_norm": 2.639159679412842,
107
- "learning_rate": 8.137070169778812e-05,
108
- "loss": 0.2198,
109
- "step": 1200
110
- },
111
- {
112
- "epoch": 2.311111111111111,
113
- "grad_norm": 3.6480841636657715,
114
- "learning_rate": 7.717837750006106e-05,
115
- "loss": 0.2412,
116
- "step": 1300
117
- },
118
- {
119
- "epoch": 2.488888888888889,
120
- "grad_norm": 2.240994930267334,
121
- "learning_rate": 7.269491184691924e-05,
122
- "loss": 0.1952,
123
- "step": 1400
124
- },
125
- {
126
- "epoch": 2.6666666666666665,
127
- "grad_norm": 13.413070678710938,
128
- "learning_rate": 6.79683327236813e-05,
129
- "loss": 0.2262,
130
- "step": 1500
131
- },
132
- {
133
- "epoch": 2.8444444444444446,
134
- "grad_norm": 0.8566445112228394,
135
- "learning_rate": 6.304927240687181e-05,
136
- "loss": 0.1949,
137
- "step": 1600
138
- },
139
- {
140
- "epoch": 2.997333333333333,
141
- "eval_loss": 0.42686474323272705,
142
- "eval_runtime": 365.7769,
143
- "eval_samples_per_second": 1.367,
144
- "eval_steps_per_second": 1.367,
145
- "step": 1686
146
- }
147
- ],
148
- "logging_steps": 100,
149
- "max_steps": 3372,
150
- "num_input_tokens_seen": 0,
151
- "num_train_epochs": 6,
152
- "save_steps": 562,
153
- "stateful_callbacks": {
154
- "TrainerControl": {
155
- "args": {
156
- "should_epoch_stop": false,
157
- "should_evaluate": false,
158
- "should_log": false,
159
- "should_save": true,
160
- "should_training_stop": false
161
- },
162
- "attributes": {}
163
- }
164
- },
165
- "total_flos": 2.033572224618332e+17,
166
- "train_batch_size": 1,
167
- "trial_name": null,
168
- "trial_params": null
169
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-1686/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aee0526e63b02d5d5f300a2e1dfcfcc13e168aae73493f8d596712a07178876b
3
- size 5304
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/README.md DELETED
@@ -1,202 +0,0 @@
1
- ---
2
- library_name: peft
3
- base_model: internlm/internlm2_5-7b-chat-1m
4
- ---
5
-
6
- # Model Card for Model ID
7
-
8
- <!-- Provide a quick summary of what the model is/does. -->
9
-
10
-
11
-
12
- ## Model Details
13
-
14
- ### Model Description
15
-
16
- <!-- Provide a longer summary of what this model is. -->
17
-
18
-
19
-
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
-
28
- ### Model Sources [optional]
29
-
30
- <!-- Provide the basic links for the model. -->
31
-
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
-
36
- ## Uses
37
-
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
-
40
- ### Direct Use
41
-
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
-
44
- [More Information Needed]
45
-
46
- ### Downstream Use [optional]
47
-
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
-
50
- [More Information Needed]
51
-
52
- ### Out-of-Scope Use
53
-
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
-
56
- [More Information Needed]
57
-
58
- ## Bias, Risks, and Limitations
59
-
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
-
62
- [More Information Needed]
63
-
64
- ### Recommendations
65
-
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
-
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
-
70
- ## How to Get Started with the Model
71
-
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
75
-
76
- ## Training Details
77
-
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
- ### Framework versions
201
-
202
- - PEFT 0.11.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/adapter_config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
5
- "bias": "none",
6
- "fan_in_fan_out": false,
7
- "inference_mode": true,
8
- "init_lora_weights": true,
9
- "layer_replication": null,
10
- "layers_pattern": null,
11
- "layers_to_transform": null,
12
- "loftq_config": {},
13
- "lora_alpha": 16,
14
- "lora_dropout": 0.0,
15
- "megatron_config": null,
16
- "megatron_core": "megatron.core",
17
- "modules_to_save": null,
18
- "peft_type": "LORA",
19
- "r": 8,
20
- "rank_pattern": {},
21
- "revision": null,
22
- "target_modules": [
23
- "w3",
24
- "wo",
25
- "w1",
26
- "wqkv",
27
- "w2"
28
- ],
29
- "task_type": "CAUSAL_LM",
30
- "use_dora": false,
31
- "use_rslora": false
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c9c1e3de94ce17bac71870c9fdb4f38ee793dedf63408a300be4f77a3e99d52
3
- size 75539712
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/added_tokens.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "[UNUSED_TOKEN_141]": 92544,
3
- "[UNUSED_TOKEN_142]": 92545,
4
- "[UNUSED_TOKEN_143]": 92546,
5
- "[UNUSED_TOKEN_144]": 92547,
6
- "[UNUSED_TOKEN_145]": 92548,
7
- "[UNUSED_TOKEN_146]": 92549
8
- }
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4637115ef19bf33a9b74063e48b805549dfd0582b7a622f402223671dbfc201c
3
- size 151264058
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9899ccda7f0d8d9511991180b93aab508ce6e8489de708c88ad1188e7e1d90d6
3
- size 14244
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:999d23797fcc8ea5f6c9314d2f36a0519a6c95f473b2a0839184c31f854432ee
3
- size 1064
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/special_tokens_map.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|action_start|>",
6
- "<|action_end|>",
7
- "<|interpreter|>",
8
- "<|plugin|>"
9
- ],
10
- "bos_token": {
11
- "content": "<s>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "</s>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- "unk_token": {
32
- "content": "<unk>",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- }
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenization_internlm2.py DELETED
@@ -1,236 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization classes for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, List, Optional, Tuple
22
-
23
- import sentencepiece as spm
24
- from transformers.tokenization_utils import PreTrainedTokenizer
25
- from transformers.utils import logging
26
-
27
- logger = logging.get_logger(__name__)
28
-
29
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
30
-
31
- PRETRAINED_VOCAB_FILES_MAP = {}
32
-
33
-
34
- # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
35
- class InternLM2Tokenizer(PreTrainedTokenizer):
36
- """
37
- Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
-
39
- Args:
40
- vocab_file (`str`):
41
- Path to the vocabulary file.
42
- """
43
-
44
- vocab_files_names = VOCAB_FILES_NAMES
45
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
46
- model_input_names = ["input_ids", "attention_mask"]
47
- _auto_class = "AutoTokenizer"
48
-
49
- def __init__(
50
- self,
51
- vocab_file,
52
- unk_token="<unk>",
53
- bos_token="<s>",
54
- eos_token="</s>",
55
- pad_token="</s>",
56
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
57
- add_bos_token=True,
58
- add_eos_token=False,
59
- decode_with_prefix_space=False,
60
- clean_up_tokenization_spaces=False,
61
- **kwargs,
62
- ):
63
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
64
- self.vocab_file = vocab_file
65
- self.add_bos_token = add_bos_token
66
- self.add_eos_token = add_eos_token
67
- self.decode_with_prefix_space = decode_with_prefix_space
68
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
- self.sp_model.Load(vocab_file)
70
- self._no_prefix_space_tokens = None
71
- super().__init__(
72
- bos_token=bos_token,
73
- eos_token=eos_token,
74
- unk_token=unk_token,
75
- pad_token=pad_token,
76
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
77
- **kwargs,
78
- )
79
-
80
- @property
81
- def no_prefix_space_tokens(self):
82
- if self._no_prefix_space_tokens is None:
83
- vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
- self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
85
- return self._no_prefix_space_tokens
86
-
87
- @property
88
- def vocab_size(self):
89
- """Returns vocab size"""
90
- return self.sp_model.get_piece_size()
91
-
92
- @property
93
- def bos_token_id(self) -> Optional[int]:
94
- return self.sp_model.bos_id()
95
-
96
- @property
97
- def eos_token_id(self) -> Optional[int]:
98
- return self.sp_model.eos_id()
99
-
100
- def get_vocab(self):
101
- """Returns vocab as a dict"""
102
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
103
- vocab.update(self.added_tokens_encoder)
104
- return vocab
105
-
106
- def _tokenize(self, text):
107
- """Returns a tokenized string."""
108
- return self.sp_model.encode(text, out_type=str)
109
-
110
- def _convert_token_to_id(self, token):
111
- """Converts a token (str) in an id using the vocab."""
112
- return self.sp_model.piece_to_id(token)
113
-
114
- def _convert_id_to_token(self, index):
115
- """Converts an index (integer) in a token (str) using the vocab."""
116
- token = self.sp_model.IdToPiece(index)
117
- return token
118
-
119
- def _maybe_add_prefix_space(self, tokens, decoded):
120
- if tokens and tokens[0] not in self.no_prefix_space_tokens:
121
- return " " + decoded
122
- else:
123
- return decoded
124
-
125
- def convert_tokens_to_string(self, tokens):
126
- """Converts a sequence of tokens (string) in a single string."""
127
- current_sub_tokens = []
128
- out_string = ""
129
- prev_is_special = False
130
- for token in tokens:
131
- # make sure that special tokens are not decoded using sentencepiece model
132
- if token in self.all_special_tokens:
133
- if not prev_is_special:
134
- out_string += " "
135
- out_string += self.sp_model.decode(current_sub_tokens) + token
136
- prev_is_special = True
137
- current_sub_tokens = []
138
- else:
139
- current_sub_tokens.append(token)
140
- prev_is_special = False
141
- out_string += self.sp_model.decode(current_sub_tokens)
142
- out_string = self.clean_up_tokenization(out_string)
143
- out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
144
- return out_string[1:]
145
-
146
- def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
- """
148
- Save the vocabulary and special tokens file to a directory.
149
-
150
- Args:
151
- save_directory (`str`):
152
- The directory in which to save the vocabulary.
153
-
154
- Returns:
155
- `Tuple(str)`: Paths to the files saved.
156
- """
157
- if not os.path.isdir(save_directory):
158
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
- return
160
- out_vocab_file = os.path.join(
161
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
- )
163
-
164
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
165
- copyfile(self.vocab_file, out_vocab_file)
166
- elif not os.path.isfile(self.vocab_file):
167
- with open(out_vocab_file, "wb") as fi:
168
- content_spiece_model = self.sp_model.serialized_model_proto()
169
- fi.write(content_spiece_model)
170
-
171
- return (out_vocab_file,)
172
-
173
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
174
- if self.add_bos_token:
175
- bos_token_ids = [self.bos_token_id]
176
- else:
177
- bos_token_ids = []
178
-
179
- output = bos_token_ids + token_ids_0
180
-
181
- if token_ids_1 is not None:
182
- output = output + token_ids_1
183
-
184
- if self.add_eos_token:
185
- output = output + [self.eos_token_id]
186
-
187
- return output
188
-
189
- def get_special_tokens_mask(
190
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
191
- ) -> List[int]:
192
- """
193
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
194
- special tokens using the tokenizer `prepare_for_model` method.
195
-
196
- Args:
197
- token_ids_0 (`List[int]`):
198
- List of IDs.
199
- token_ids_1 (`List[int]`, *optional*):
200
- Optional second list of IDs for sequence pairs.
201
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
202
- Whether or not the token list is already formatted with special tokens for the model.
203
-
204
- Returns:
205
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
206
- """
207
- if already_has_special_tokens:
208
- return super().get_special_tokens_mask(
209
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
210
- )
211
-
212
- if token_ids_1 is None:
213
- return [1] + ([0] * len(token_ids_0)) + [1]
214
- return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
215
-
216
- def create_token_type_ids_from_sequences(
217
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
218
- ) -> List[int]:
219
- """
220
- Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
221
- use of token type ids, therefore a list of zeros is returned.
222
-
223
- Args:
224
- token_ids_0 (`List[int]`):
225
- List of IDs.
226
- token_ids_1 (`List[int]`, *optional*):
227
- Optional second list of IDs for sequence pairs.
228
-
229
- Returns:
230
- `List[int]`: List of zeros.
231
- """
232
- eos = [self.eos_token_id]
233
-
234
- if token_ids_1 is None:
235
- return len(token_ids_0 + eos) * [0]
236
- return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenization_internlm2_fast.py DELETED
@@ -1,214 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization Fast class for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, Optional, Tuple
22
-
23
- from tokenizers import processors, decoders, Tokenizer, normalizers
24
- from tokenizers.models import BPE
25
-
26
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
27
- from transformers.utils import logging
28
-
29
- from transformers.convert_slow_tokenizer import (
30
- SLOW_TO_FAST_CONVERTERS,
31
- SpmConverter,
32
- SentencePieceExtractor,
33
- )
34
-
35
- from .tokenization_internlm2 import InternLM2Tokenizer
36
-
37
- logger = logging.get_logger(__name__)
38
-
39
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
40
-
41
- # Modified from transformers.convert_slow_tokenizer.LlamaConverter
42
- class InternLM2Converter(SpmConverter):
43
- handle_byte_fallback = True
44
-
45
- def vocab(self, proto):
46
- vocab = [
47
- ("<unk>", 0.0),
48
- ("<s>", 0.0),
49
- ("</s>", 0.0),
50
- ]
51
- vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
52
- return vocab
53
-
54
- def unk_id(self, proto):
55
- unk_id = 0
56
- return unk_id
57
-
58
- def decoder(self, replacement, add_prefix_space):
59
- decoders_sequence = [
60
- decoders.Replace("▁", " "),
61
- decoders.ByteFallback(),
62
- decoders.Fuse(),
63
- ]
64
- if self.proto.normalizer_spec.add_dummy_prefix:
65
- decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
- return decoders.Sequence(decoders_sequence)
67
-
68
- def tokenizer(self, proto):
69
- model_type = proto.trainer_spec.model_type
70
- vocab_scores = self.vocab(proto)
71
- # special tokens
72
- added_tokens = self.original_tokenizer.added_tokens_decoder
73
- for i in range(len(vocab_scores)):
74
- piece, score = vocab_scores[i]
75
- if i in added_tokens:
76
- vocab_scores[i] = (added_tokens[i].content, score)
77
- if model_type == 1:
78
- raise RuntimeError("InternLM2 is supposed to be a BPE model!")
79
-
80
- elif model_type == 2:
81
- _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
82
- bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
83
- tokenizer = Tokenizer(
84
- BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
85
- )
86
- tokenizer.add_special_tokens(
87
- [ added_token for index, added_token in added_tokens.items()]
88
- )
89
- else:
90
- raise Exception(
91
- "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
92
- )
93
-
94
- return tokenizer
95
-
96
- def normalizer(self, proto):
97
- normalizers_list = []
98
- if proto.normalizer_spec.add_dummy_prefix:
99
- normalizers_list.append(normalizers.Prepend(prepend="▁"))
100
- normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
101
- return normalizers.Sequence(normalizers_list)
102
-
103
- def pre_tokenizer(self, replacement, add_prefix_space):
104
- return None
105
-
106
- SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
107
-
108
-
109
- # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
110
- class InternLM2TokenizerFast(PreTrainedTokenizerFast):
111
- vocab_files_names = VOCAB_FILES_NAMES
112
- slow_tokenizer_class = InternLM2Tokenizer
113
- padding_side = "left"
114
- model_input_names = ["input_ids", "attention_mask"]
115
- _auto_class = "AutoTokenizer"
116
-
117
- def __init__(
118
- self,
119
- vocab_file,
120
- unk_token="<unk>",
121
- bos_token="<s>",
122
- eos_token="</s>",
123
- pad_token="</s>",
124
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
125
- add_bos_token=True,
126
- add_eos_token=False,
127
- decode_with_prefix_space=False,
128
- clean_up_tokenization_spaces=False,
129
- **kwargs,
130
- ):
131
- super().__init__(
132
- vocab_file=vocab_file,
133
- unk_token=unk_token,
134
- bos_token=bos_token,
135
- eos_token=eos_token,
136
- pad_token=pad_token,
137
- sp_model_kwargs=sp_model_kwargs,
138
- add_bos_token=add_bos_token,
139
- add_eos_token=add_eos_token,
140
- decode_with_prefix_space=decode_with_prefix_space,
141
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
142
- **kwargs,
143
- )
144
- self._add_bos_token = add_bos_token
145
- self._add_eos_token = add_eos_token
146
- self.update_post_processor()
147
- self.vocab_file = vocab_file
148
-
149
- @property
150
- def can_save_slow_tokenizer(self) -> bool:
151
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
152
-
153
- def update_post_processor(self):
154
- """
155
- Updates the underlying post processor with the current `bos_token` and `eos_token`.
156
- """
157
- bos = self.bos_token
158
- bos_token_id = self.bos_token_id
159
- if bos is None and self.add_bos_token:
160
- raise ValueError("add_bos_token = True but bos_token = None")
161
-
162
- eos = self.eos_token
163
- eos_token_id = self.eos_token_id
164
- if eos is None and self.add_eos_token:
165
- raise ValueError("add_eos_token = True but eos_token = None")
166
-
167
- single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
168
- pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
169
-
170
- special_tokens = []
171
- if self.add_bos_token:
172
- special_tokens.append((bos, bos_token_id))
173
- if self.add_eos_token:
174
- special_tokens.append((eos, eos_token_id))
175
- self._tokenizer.post_processor = processors.TemplateProcessing(
176
- single=single, pair=pair, special_tokens=special_tokens
177
- )
178
-
179
- @property
180
- def add_eos_token(self):
181
- return self._add_eos_token
182
-
183
- @property
184
- def add_bos_token(self):
185
- return self._add_bos_token
186
-
187
- @add_eos_token.setter
188
- def add_eos_token(self, value):
189
- self._add_eos_token = value
190
- self.update_post_processor()
191
-
192
- @add_bos_token.setter
193
- def add_bos_token(self, value):
194
- self._add_bos_token = value
195
- self.update_post_processor()
196
-
197
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
198
- if not self.can_save_slow_tokenizer:
199
- raise ValueError(
200
- "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
201
- "tokenizer."
202
- )
203
-
204
- if not os.path.isdir(save_directory):
205
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
206
- return
207
- out_vocab_file = os.path.join(
208
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
209
- )
210
-
211
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
- copyfile(self.vocab_file, out_vocab_file)
213
-
214
- return (out_vocab_file,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
3
- size 1477754
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/tokenizer_config.json DELETED
@@ -1,1640 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "92352": {
30
- "content": "E",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "92353": {
38
- "content": "F",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "92354": {
46
- "content": "G",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "92355": {
54
- "content": "H",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
- "92356": {
62
- "content": "I",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": false
68
- },
69
- "92357": {
70
- "content": "J",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": false
76
- },
77
- "92358": {
78
- "content": "K",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": false
84
- },
85
- "92359": {
86
- "content": "L",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": false
92
- },
93
- "92360": {
94
- "content": "M",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "92361": {
102
- "content": "N",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": false
108
- },
109
- "92362": {
110
- "content": "R",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "92363": {
118
- "content": "U",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "92364": {
126
- "content": "V",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "92365": {
134
- "content": "W",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "92366": {
142
- "content": "X",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "92367": {
150
- "content": "Y",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "92368": {
158
- "content": "Z",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "92369": {
166
- "content": "a",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "92370": {
174
- "content": "b",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "92371": {
182
- "content": "c",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "92372": {
190
- "content": "d",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "92373": {
198
- "content": "e",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "92374": {
206
- "content": "f",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- },
213
- "92375": {
214
- "content": "g",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": false
220
- },
221
- "92376": {
222
- "content": "h",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": false
228
- },
229
- "92377": {
230
- "content": "i",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": false
236
- },
237
- "92378": {
238
- "content": "j",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": false
244
- },
245
- "92379": {
246
- "content": "k",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": false
252
- },
253
- "92380": {
254
- "content": "l",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": false
260
- },
261
- "92381": {
262
- "content": "m",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": false
268
- },
269
- "92382": {
270
- "content": "n",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": false
276
- },
277
- "92383": {
278
- "content": "o",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": false
284
- },
285
- "92384": {
286
- "content": "p",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": false
292
- },
293
- "92385": {
294
- "content": "q",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": false
300
- },
301
- "92386": {
302
- "content": "r",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": false
308
- },
309
- "92387": {
310
- "content": "s",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": false
316
- },
317
- "92388": {
318
- "content": "t",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": false
324
- },
325
- "92389": {
326
- "content": "u",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": false
332
- },
333
- "92390": {
334
- "content": "v",
335
- "lstrip": false,
336
- "normalized": false,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": false
340
- },
341
- "92391": {
342
- "content": "w",
343
- "lstrip": false,
344
- "normalized": false,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": false
348
- },
349
- "92392": {
350
- "content": "x",
351
- "lstrip": false,
352
- "normalized": false,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": false
356
- },
357
- "92393": {
358
- "content": "y",
359
- "lstrip": false,
360
- "normalized": false,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": false
364
- },
365
- "92394": {
366
- "content": "z",
367
- "lstrip": false,
368
- "normalized": false,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": false
372
- },
373
- "92395": {
374
- "content": "——",
375
- "lstrip": false,
376
- "normalized": false,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": false
380
- },
381
- "92396": {
382
- "content": "……",
383
- "lstrip": false,
384
- "normalized": false,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": false
388
- },
389
- "92397": {
390
- "content": "[UNUSED_TOKEN_0]",
391
- "lstrip": false,
392
- "normalized": false,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": false
396
- },
397
- "92398": {
398
- "content": "[UNUSED_TOKEN_1]",
399
- "lstrip": false,
400
- "normalized": false,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": false
404
- },
405
- "92399": {
406
- "content": "[UNUSED_TOKEN_2]",
407
- "lstrip": false,
408
- "normalized": false,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": false
412
- },
413
- "92400": {
414
- "content": "[UNUSED_TOKEN_3]",
415
- "lstrip": false,
416
- "normalized": false,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": false
420
- },
421
- "92401": {
422
- "content": "[UNUSED_TOKEN_4]",
423
- "lstrip": false,
424
- "normalized": false,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": false
428
- },
429
- "92402": {
430
- "content": "[UNUSED_TOKEN_5]",
431
- "lstrip": false,
432
- "normalized": false,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": false
436
- },
437
- "92403": {
438
- "content": "[UNUSED_TOKEN_6]",
439
- "lstrip": false,
440
- "normalized": false,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": false
444
- },
445
- "92404": {
446
- "content": "[UNUSED_TOKEN_7]",
447
- "lstrip": false,
448
- "normalized": false,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": false
452
- },
453
- "92405": {
454
- "content": "[UNUSED_TOKEN_8]",
455
- "lstrip": false,
456
- "normalized": false,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": false
460
- },
461
- "92406": {
462
- "content": "[UNUSED_TOKEN_9]",
463
- "lstrip": false,
464
- "normalized": false,
465
- "rstrip": false,
466
- "single_word": false,
467
- "special": false
468
- },
469
- "92407": {
470
- "content": "[UNUSED_TOKEN_10]",
471
- "lstrip": false,
472
- "normalized": false,
473
- "rstrip": false,
474
- "single_word": false,
475
- "special": false
476
- },
477
- "92408": {
478
- "content": "[UNUSED_TOKEN_11]",
479
- "lstrip": false,
480
- "normalized": false,
481
- "rstrip": false,
482
- "single_word": false,
483
- "special": false
484
- },
485
- "92409": {
486
- "content": "[UNUSED_TOKEN_12]",
487
- "lstrip": false,
488
- "normalized": false,
489
- "rstrip": false,
490
- "single_word": false,
491
- "special": false
492
- },
493
- "92410": {
494
- "content": "[UNUSED_TOKEN_13]",
495
- "lstrip": false,
496
- "normalized": false,
497
- "rstrip": false,
498
- "single_word": false,
499
- "special": false
500
- },
501
- "92411": {
502
- "content": "[UNUSED_TOKEN_14]",
503
- "lstrip": false,
504
- "normalized": false,
505
- "rstrip": false,
506
- "single_word": false,
507
- "special": false
508
- },
509
- "92412": {
510
- "content": "[UNUSED_TOKEN_15]",
511
- "lstrip": false,
512
- "normalized": false,
513
- "rstrip": false,
514
- "single_word": false,
515
- "special": false
516
- },
517
- "92413": {
518
- "content": "[UNUSED_TOKEN_16]",
519
- "lstrip": false,
520
- "normalized": false,
521
- "rstrip": false,
522
- "single_word": false,
523
- "special": false
524
- },
525
- "92414": {
526
- "content": "[UNUSED_TOKEN_17]",
527
- "lstrip": false,
528
- "normalized": false,
529
- "rstrip": false,
530
- "single_word": false,
531
- "special": false
532
- },
533
- "92415": {
534
- "content": "[UNUSED_TOKEN_18]",
535
- "lstrip": false,
536
- "normalized": false,
537
- "rstrip": false,
538
- "single_word": false,
539
- "special": false
540
- },
541
- "92416": {
542
- "content": "[UNUSED_TOKEN_19]",
543
- "lstrip": false,
544
- "normalized": false,
545
- "rstrip": false,
546
- "single_word": false,
547
- "special": false
548
- },
549
- "92417": {
550
- "content": "[UNUSED_TOKEN_20]",
551
- "lstrip": false,
552
- "normalized": false,
553
- "rstrip": false,
554
- "single_word": false,
555
- "special": false
556
- },
557
- "92418": {
558
- "content": "[UNUSED_TOKEN_21]",
559
- "lstrip": false,
560
- "normalized": false,
561
- "rstrip": false,
562
- "single_word": false,
563
- "special": false
564
- },
565
- "92419": {
566
- "content": "[UNUSED_TOKEN_22]",
567
- "lstrip": false,
568
- "normalized": false,
569
- "rstrip": false,
570
- "single_word": false,
571
- "special": false
572
- },
573
- "92420": {
574
- "content": "[UNUSED_TOKEN_23]",
575
- "lstrip": false,
576
- "normalized": false,
577
- "rstrip": false,
578
- "single_word": false,
579
- "special": false
580
- },
581
- "92421": {
582
- "content": "[UNUSED_TOKEN_24]",
583
- "lstrip": false,
584
- "normalized": false,
585
- "rstrip": false,
586
- "single_word": false,
587
- "special": false
588
- },
589
- "92422": {
590
- "content": "[UNUSED_TOKEN_25]",
591
- "lstrip": false,
592
- "normalized": false,
593
- "rstrip": false,
594
- "single_word": false,
595
- "special": false
596
- },
597
- "92423": {
598
- "content": "[UNUSED_TOKEN_26]",
599
- "lstrip": false,
600
- "normalized": false,
601
- "rstrip": false,
602
- "single_word": false,
603
- "special": false
604
- },
605
- "92424": {
606
- "content": "[UNUSED_TOKEN_27]",
607
- "lstrip": false,
608
- "normalized": false,
609
- "rstrip": false,
610
- "single_word": false,
611
- "special": false
612
- },
613
- "92425": {
614
- "content": "[UNUSED_TOKEN_28]",
615
- "lstrip": false,
616
- "normalized": false,
617
- "rstrip": false,
618
- "single_word": false,
619
- "special": false
620
- },
621
- "92426": {
622
- "content": "[UNUSED_TOKEN_29]",
623
- "lstrip": false,
624
- "normalized": false,
625
- "rstrip": false,
626
- "single_word": false,
627
- "special": false
628
- },
629
- "92427": {
630
- "content": "[UNUSED_TOKEN_30]",
631
- "lstrip": false,
632
- "normalized": false,
633
- "rstrip": false,
634
- "single_word": false,
635
- "special": false
636
- },
637
- "92428": {
638
- "content": "[UNUSED_TOKEN_31]",
639
- "lstrip": false,
640
- "normalized": false,
641
- "rstrip": false,
642
- "single_word": false,
643
- "special": false
644
- },
645
- "92429": {
646
- "content": "[UNUSED_TOKEN_32]",
647
- "lstrip": false,
648
- "normalized": false,
649
- "rstrip": false,
650
- "single_word": false,
651
- "special": false
652
- },
653
- "92430": {
654
- "content": "[UNUSED_TOKEN_33]",
655
- "lstrip": false,
656
- "normalized": false,
657
- "rstrip": false,
658
- "single_word": false,
659
- "special": false
660
- },
661
- "92431": {
662
- "content": "[UNUSED_TOKEN_34]",
663
- "lstrip": false,
664
- "normalized": false,
665
- "rstrip": false,
666
- "single_word": false,
667
- "special": false
668
- },
669
- "92432": {
670
- "content": "[UNUSED_TOKEN_35]",
671
- "lstrip": false,
672
- "normalized": false,
673
- "rstrip": false,
674
- "single_word": false,
675
- "special": false
676
- },
677
- "92433": {
678
- "content": "[UNUSED_TOKEN_36]",
679
- "lstrip": false,
680
- "normalized": false,
681
- "rstrip": false,
682
- "single_word": false,
683
- "special": false
684
- },
685
- "92434": {
686
- "content": "[UNUSED_TOKEN_37]",
687
- "lstrip": false,
688
- "normalized": false,
689
- "rstrip": false,
690
- "single_word": false,
691
- "special": false
692
- },
693
- "92435": {
694
- "content": "[UNUSED_TOKEN_38]",
695
- "lstrip": false,
696
- "normalized": false,
697
- "rstrip": false,
698
- "single_word": false,
699
- "special": false
700
- },
701
- "92436": {
702
- "content": "[UNUSED_TOKEN_39]",
703
- "lstrip": false,
704
- "normalized": false,
705
- "rstrip": false,
706
- "single_word": false,
707
- "special": false
708
- },
709
- "92437": {
710
- "content": "[UNUSED_TOKEN_40]",
711
- "lstrip": false,
712
- "normalized": false,
713
- "rstrip": false,
714
- "single_word": false,
715
- "special": false
716
- },
717
- "92438": {
718
- "content": "[UNUSED_TOKEN_41]",
719
- "lstrip": false,
720
- "normalized": false,
721
- "rstrip": false,
722
- "single_word": false,
723
- "special": false
724
- },
725
- "92439": {
726
- "content": "[UNUSED_TOKEN_42]",
727
- "lstrip": false,
728
- "normalized": false,
729
- "rstrip": false,
730
- "single_word": false,
731
- "special": false
732
- },
733
- "92440": {
734
- "content": "[UNUSED_TOKEN_43]",
735
- "lstrip": false,
736
- "normalized": false,
737
- "rstrip": false,
738
- "single_word": false,
739
- "special": false
740
- },
741
- "92441": {
742
- "content": "[UNUSED_TOKEN_44]",
743
- "lstrip": false,
744
- "normalized": false,
745
- "rstrip": false,
746
- "single_word": false,
747
- "special": false
748
- },
749
- "92442": {
750
- "content": "[UNUSED_TOKEN_45]",
751
- "lstrip": false,
752
- "normalized": false,
753
- "rstrip": false,
754
- "single_word": false,
755
- "special": false
756
- },
757
- "92443": {
758
- "content": "[UNUSED_TOKEN_46]",
759
- "lstrip": false,
760
- "normalized": false,
761
- "rstrip": false,
762
- "single_word": false,
763
- "special": false
764
- },
765
- "92444": {
766
- "content": "[UNUSED_TOKEN_47]",
767
- "lstrip": false,
768
- "normalized": false,
769
- "rstrip": false,
770
- "single_word": false,
771
- "special": false
772
- },
773
- "92445": {
774
- "content": "[UNUSED_TOKEN_48]",
775
- "lstrip": false,
776
- "normalized": false,
777
- "rstrip": false,
778
- "single_word": false,
779
- "special": false
780
- },
781
- "92446": {
782
- "content": "[UNUSED_TOKEN_49]",
783
- "lstrip": false,
784
- "normalized": false,
785
- "rstrip": false,
786
- "single_word": false,
787
- "special": false
788
- },
789
- "92447": {
790
- "content": "[UNUSED_TOKEN_50]",
791
- "lstrip": false,
792
- "normalized": false,
793
- "rstrip": false,
794
- "single_word": false,
795
- "special": false
796
- },
797
- "92448": {
798
- "content": "[UNUSED_TOKEN_51]",
799
- "lstrip": false,
800
- "normalized": false,
801
- "rstrip": false,
802
- "single_word": false,
803
- "special": false
804
- },
805
- "92449": {
806
- "content": "[UNUSED_TOKEN_52]",
807
- "lstrip": false,
808
- "normalized": false,
809
- "rstrip": false,
810
- "single_word": false,
811
- "special": false
812
- },
813
- "92450": {
814
- "content": "[UNUSED_TOKEN_53]",
815
- "lstrip": false,
816
- "normalized": false,
817
- "rstrip": false,
818
- "single_word": false,
819
- "special": false
820
- },
821
- "92451": {
822
- "content": "[UNUSED_TOKEN_54]",
823
- "lstrip": false,
824
- "normalized": false,
825
- "rstrip": false,
826
- "single_word": false,
827
- "special": false
828
- },
829
- "92452": {
830
- "content": "[UNUSED_TOKEN_55]",
831
- "lstrip": false,
832
- "normalized": false,
833
- "rstrip": false,
834
- "single_word": false,
835
- "special": false
836
- },
837
- "92453": {
838
- "content": "[UNUSED_TOKEN_56]",
839
- "lstrip": false,
840
- "normalized": false,
841
- "rstrip": false,
842
- "single_word": false,
843
- "special": false
844
- },
845
- "92454": {
846
- "content": "[UNUSED_TOKEN_57]",
847
- "lstrip": false,
848
- "normalized": false,
849
- "rstrip": false,
850
- "single_word": false,
851
- "special": false
852
- },
853
- "92455": {
854
- "content": "[UNUSED_TOKEN_58]",
855
- "lstrip": false,
856
- "normalized": false,
857
- "rstrip": false,
858
- "single_word": false,
859
- "special": false
860
- },
861
- "92456": {
862
- "content": "[UNUSED_TOKEN_59]",
863
- "lstrip": false,
864
- "normalized": false,
865
- "rstrip": false,
866
- "single_word": false,
867
- "special": false
868
- },
869
- "92457": {
870
- "content": "[UNUSED_TOKEN_60]",
871
- "lstrip": false,
872
- "normalized": false,
873
- "rstrip": false,
874
- "single_word": false,
875
- "special": false
876
- },
877
- "92458": {
878
- "content": "[UNUSED_TOKEN_61]",
879
- "lstrip": false,
880
- "normalized": false,
881
- "rstrip": false,
882
- "single_word": false,
883
- "special": false
884
- },
885
- "92459": {
886
- "content": "[UNUSED_TOKEN_62]",
887
- "lstrip": false,
888
- "normalized": false,
889
- "rstrip": false,
890
- "single_word": false,
891
- "special": false
892
- },
893
- "92460": {
894
- "content": "[UNUSED_TOKEN_63]",
895
- "lstrip": false,
896
- "normalized": false,
897
- "rstrip": false,
898
- "single_word": false,
899
- "special": false
900
- },
901
- "92461": {
902
- "content": "[UNUSED_TOKEN_64]",
903
- "lstrip": false,
904
- "normalized": false,
905
- "rstrip": false,
906
- "single_word": false,
907
- "special": false
908
- },
909
- "92462": {
910
- "content": "[UNUSED_TOKEN_65]",
911
- "lstrip": false,
912
- "normalized": false,
913
- "rstrip": false,
914
- "single_word": false,
915
- "special": false
916
- },
917
- "92463": {
918
- "content": "[UNUSED_TOKEN_66]",
919
- "lstrip": false,
920
- "normalized": false,
921
- "rstrip": false,
922
- "single_word": false,
923
- "special": false
924
- },
925
- "92464": {
926
- "content": "[UNUSED_TOKEN_67]",
927
- "lstrip": false,
928
- "normalized": false,
929
- "rstrip": false,
930
- "single_word": false,
931
- "special": false
932
- },
933
- "92465": {
934
- "content": "[UNUSED_TOKEN_68]",
935
- "lstrip": false,
936
- "normalized": false,
937
- "rstrip": false,
938
- "single_word": false,
939
- "special": false
940
- },
941
- "92466": {
942
- "content": "[UNUSED_TOKEN_69]",
943
- "lstrip": false,
944
- "normalized": false,
945
- "rstrip": false,
946
- "single_word": false,
947
- "special": false
948
- },
949
- "92467": {
950
- "content": "[UNUSED_TOKEN_70]",
951
- "lstrip": false,
952
- "normalized": false,
953
- "rstrip": false,
954
- "single_word": false,
955
- "special": false
956
- },
957
- "92468": {
958
- "content": "[UNUSED_TOKEN_71]",
959
- "lstrip": false,
960
- "normalized": false,
961
- "rstrip": false,
962
- "single_word": false,
963
- "special": false
964
- },
965
- "92469": {
966
- "content": "[UNUSED_TOKEN_72]",
967
- "lstrip": false,
968
- "normalized": false,
969
- "rstrip": false,
970
- "single_word": false,
971
- "special": false
972
- },
973
- "92470": {
974
- "content": "[UNUSED_TOKEN_73]",
975
- "lstrip": false,
976
- "normalized": false,
977
- "rstrip": false,
978
- "single_word": false,
979
- "special": false
980
- },
981
- "92471": {
982
- "content": "[UNUSED_TOKEN_74]",
983
- "lstrip": false,
984
- "normalized": false,
985
- "rstrip": false,
986
- "single_word": false,
987
- "special": false
988
- },
989
- "92472": {
990
- "content": "[UNUSED_TOKEN_75]",
991
- "lstrip": false,
992
- "normalized": false,
993
- "rstrip": false,
994
- "single_word": false,
995
- "special": false
996
- },
997
- "92473": {
998
- "content": "[UNUSED_TOKEN_76]",
999
- "lstrip": false,
1000
- "normalized": false,
1001
- "rstrip": false,
1002
- "single_word": false,
1003
- "special": false
1004
- },
1005
- "92474": {
1006
- "content": "[UNUSED_TOKEN_77]",
1007
- "lstrip": false,
1008
- "normalized": false,
1009
- "rstrip": false,
1010
- "single_word": false,
1011
- "special": false
1012
- },
1013
- "92475": {
1014
- "content": "[UNUSED_TOKEN_78]",
1015
- "lstrip": false,
1016
- "normalized": false,
1017
- "rstrip": false,
1018
- "single_word": false,
1019
- "special": false
1020
- },
1021
- "92476": {
1022
- "content": "[UNUSED_TOKEN_79]",
1023
- "lstrip": false,
1024
- "normalized": false,
1025
- "rstrip": false,
1026
- "single_word": false,
1027
- "special": false
1028
- },
1029
- "92477": {
1030
- "content": "[UNUSED_TOKEN_80]",
1031
- "lstrip": false,
1032
- "normalized": false,
1033
- "rstrip": false,
1034
- "single_word": false,
1035
- "special": false
1036
- },
1037
- "92478": {
1038
- "content": "[UNUSED_TOKEN_81]",
1039
- "lstrip": false,
1040
- "normalized": false,
1041
- "rstrip": false,
1042
- "single_word": false,
1043
- "special": false
1044
- },
1045
- "92479": {
1046
- "content": "[UNUSED_TOKEN_82]",
1047
- "lstrip": false,
1048
- "normalized": false,
1049
- "rstrip": false,
1050
- "single_word": false,
1051
- "special": false
1052
- },
1053
- "92480": {
1054
- "content": "[UNUSED_TOKEN_83]",
1055
- "lstrip": false,
1056
- "normalized": false,
1057
- "rstrip": false,
1058
- "single_word": false,
1059
- "special": false
1060
- },
1061
- "92481": {
1062
- "content": "[UNUSED_TOKEN_84]",
1063
- "lstrip": false,
1064
- "normalized": false,
1065
- "rstrip": false,
1066
- "single_word": false,
1067
- "special": false
1068
- },
1069
- "92482": {
1070
- "content": "[UNUSED_TOKEN_85]",
1071
- "lstrip": false,
1072
- "normalized": false,
1073
- "rstrip": false,
1074
- "single_word": false,
1075
- "special": false
1076
- },
1077
- "92483": {
1078
- "content": "[UNUSED_TOKEN_86]",
1079
- "lstrip": false,
1080
- "normalized": false,
1081
- "rstrip": false,
1082
- "single_word": false,
1083
- "special": false
1084
- },
1085
- "92484": {
1086
- "content": "[UNUSED_TOKEN_87]",
1087
- "lstrip": false,
1088
- "normalized": false,
1089
- "rstrip": false,
1090
- "single_word": false,
1091
- "special": false
1092
- },
1093
- "92485": {
1094
- "content": "[UNUSED_TOKEN_88]",
1095
- "lstrip": false,
1096
- "normalized": false,
1097
- "rstrip": false,
1098
- "single_word": false,
1099
- "special": false
1100
- },
1101
- "92486": {
1102
- "content": "[UNUSED_TOKEN_89]",
1103
- "lstrip": false,
1104
- "normalized": false,
1105
- "rstrip": false,
1106
- "single_word": false,
1107
- "special": false
1108
- },
1109
- "92487": {
1110
- "content": "[UNUSED_TOKEN_90]",
1111
- "lstrip": false,
1112
- "normalized": false,
1113
- "rstrip": false,
1114
- "single_word": false,
1115
- "special": false
1116
- },
1117
- "92488": {
1118
- "content": "[UNUSED_TOKEN_91]",
1119
- "lstrip": false,
1120
- "normalized": false,
1121
- "rstrip": false,
1122
- "single_word": false,
1123
- "special": false
1124
- },
1125
- "92489": {
1126
- "content": "[UNUSED_TOKEN_92]",
1127
- "lstrip": false,
1128
- "normalized": false,
1129
- "rstrip": false,
1130
- "single_word": false,
1131
- "special": false
1132
- },
1133
- "92490": {
1134
- "content": "[UNUSED_TOKEN_93]",
1135
- "lstrip": false,
1136
- "normalized": false,
1137
- "rstrip": false,
1138
- "single_word": false,
1139
- "special": false
1140
- },
1141
- "92491": {
1142
- "content": "[UNUSED_TOKEN_94]",
1143
- "lstrip": false,
1144
- "normalized": false,
1145
- "rstrip": false,
1146
- "single_word": false,
1147
- "special": false
1148
- },
1149
- "92492": {
1150
- "content": "[UNUSED_TOKEN_95]",
1151
- "lstrip": false,
1152
- "normalized": false,
1153
- "rstrip": false,
1154
- "single_word": false,
1155
- "special": false
1156
- },
1157
- "92493": {
1158
- "content": "[UNUSED_TOKEN_96]",
1159
- "lstrip": false,
1160
- "normalized": false,
1161
- "rstrip": false,
1162
- "single_word": false,
1163
- "special": false
1164
- },
1165
- "92494": {
1166
- "content": "[UNUSED_TOKEN_97]",
1167
- "lstrip": false,
1168
- "normalized": false,
1169
- "rstrip": false,
1170
- "single_word": false,
1171
- "special": false
1172
- },
1173
- "92495": {
1174
- "content": "[UNUSED_TOKEN_98]",
1175
- "lstrip": false,
1176
- "normalized": false,
1177
- "rstrip": false,
1178
- "single_word": false,
1179
- "special": false
1180
- },
1181
- "92496": {
1182
- "content": "[UNUSED_TOKEN_99]",
1183
- "lstrip": false,
1184
- "normalized": false,
1185
- "rstrip": false,
1186
- "single_word": false,
1187
- "special": false
1188
- },
1189
- "92497": {
1190
- "content": "[UNUSED_TOKEN_100]",
1191
- "lstrip": false,
1192
- "normalized": false,
1193
- "rstrip": false,
1194
- "single_word": false,
1195
- "special": false
1196
- },
1197
- "92498": {
1198
- "content": "[UNUSED_TOKEN_101]",
1199
- "lstrip": false,
1200
- "normalized": false,
1201
- "rstrip": false,
1202
- "single_word": false,
1203
- "special": false
1204
- },
1205
- "92499": {
1206
- "content": "[UNUSED_TOKEN_102]",
1207
- "lstrip": false,
1208
- "normalized": false,
1209
- "rstrip": false,
1210
- "single_word": false,
1211
- "special": false
1212
- },
1213
- "92500": {
1214
- "content": "[UNUSED_TOKEN_103]",
1215
- "lstrip": false,
1216
- "normalized": false,
1217
- "rstrip": false,
1218
- "single_word": false,
1219
- "special": false
1220
- },
1221
- "92501": {
1222
- "content": "[UNUSED_TOKEN_104]",
1223
- "lstrip": false,
1224
- "normalized": false,
1225
- "rstrip": false,
1226
- "single_word": false,
1227
- "special": false
1228
- },
1229
- "92502": {
1230
- "content": "[UNUSED_TOKEN_105]",
1231
- "lstrip": false,
1232
- "normalized": false,
1233
- "rstrip": false,
1234
- "single_word": false,
1235
- "special": false
1236
- },
1237
- "92503": {
1238
- "content": "[UNUSED_TOKEN_106]",
1239
- "lstrip": false,
1240
- "normalized": false,
1241
- "rstrip": false,
1242
- "single_word": false,
1243
- "special": false
1244
- },
1245
- "92504": {
1246
- "content": "[UNUSED_TOKEN_107]",
1247
- "lstrip": false,
1248
- "normalized": false,
1249
- "rstrip": false,
1250
- "single_word": false,
1251
- "special": false
1252
- },
1253
- "92505": {
1254
- "content": "[UNUSED_TOKEN_108]",
1255
- "lstrip": false,
1256
- "normalized": false,
1257
- "rstrip": false,
1258
- "single_word": false,
1259
- "special": false
1260
- },
1261
- "92506": {
1262
- "content": "[UNUSED_TOKEN_109]",
1263
- "lstrip": false,
1264
- "normalized": false,
1265
- "rstrip": false,
1266
- "single_word": false,
1267
- "special": false
1268
- },
1269
- "92507": {
1270
- "content": "[UNUSED_TOKEN_110]",
1271
- "lstrip": false,
1272
- "normalized": false,
1273
- "rstrip": false,
1274
- "single_word": false,
1275
- "special": false
1276
- },
1277
- "92508": {
1278
- "content": "[UNUSED_TOKEN_111]",
1279
- "lstrip": false,
1280
- "normalized": false,
1281
- "rstrip": false,
1282
- "single_word": false,
1283
- "special": false
1284
- },
1285
- "92509": {
1286
- "content": "[UNUSED_TOKEN_112]",
1287
- "lstrip": false,
1288
- "normalized": false,
1289
- "rstrip": false,
1290
- "single_word": false,
1291
- "special": false
1292
- },
1293
- "92510": {
1294
- "content": "[UNUSED_TOKEN_113]",
1295
- "lstrip": false,
1296
- "normalized": false,
1297
- "rstrip": false,
1298
- "single_word": false,
1299
- "special": false
1300
- },
1301
- "92511": {
1302
- "content": "[UNUSED_TOKEN_114]",
1303
- "lstrip": false,
1304
- "normalized": false,
1305
- "rstrip": false,
1306
- "single_word": false,
1307
- "special": false
1308
- },
1309
- "92512": {
1310
- "content": "[UNUSED_TOKEN_115]",
1311
- "lstrip": false,
1312
- "normalized": false,
1313
- "rstrip": false,
1314
- "single_word": false,
1315
- "special": false
1316
- },
1317
- "92513": {
1318
- "content": "[UNUSED_TOKEN_116]",
1319
- "lstrip": false,
1320
- "normalized": false,
1321
- "rstrip": false,
1322
- "single_word": false,
1323
- "special": false
1324
- },
1325
- "92514": {
1326
- "content": "[UNUSED_TOKEN_117]",
1327
- "lstrip": false,
1328
- "normalized": false,
1329
- "rstrip": false,
1330
- "single_word": false,
1331
- "special": false
1332
- },
1333
- "92515": {
1334
- "content": "[UNUSED_TOKEN_118]",
1335
- "lstrip": false,
1336
- "normalized": false,
1337
- "rstrip": false,
1338
- "single_word": false,
1339
- "special": false
1340
- },
1341
- "92516": {
1342
- "content": "[UNUSED_TOKEN_119]",
1343
- "lstrip": false,
1344
- "normalized": false,
1345
- "rstrip": false,
1346
- "single_word": false,
1347
- "special": false
1348
- },
1349
- "92517": {
1350
- "content": "[UNUSED_TOKEN_120]",
1351
- "lstrip": false,
1352
- "normalized": false,
1353
- "rstrip": false,
1354
- "single_word": false,
1355
- "special": false
1356
- },
1357
- "92518": {
1358
- "content": "[UNUSED_TOKEN_121]",
1359
- "lstrip": false,
1360
- "normalized": false,
1361
- "rstrip": false,
1362
- "single_word": false,
1363
- "special": false
1364
- },
1365
- "92519": {
1366
- "content": "[UNUSED_TOKEN_122]",
1367
- "lstrip": false,
1368
- "normalized": false,
1369
- "rstrip": false,
1370
- "single_word": false,
1371
- "special": false
1372
- },
1373
- "92520": {
1374
- "content": "[UNUSED_TOKEN_123]",
1375
- "lstrip": false,
1376
- "normalized": false,
1377
- "rstrip": false,
1378
- "single_word": false,
1379
- "special": false
1380
- },
1381
- "92521": {
1382
- "content": "[UNUSED_TOKEN_124]",
1383
- "lstrip": false,
1384
- "normalized": false,
1385
- "rstrip": false,
1386
- "single_word": false,
1387
- "special": false
1388
- },
1389
- "92522": {
1390
- "content": "[UNUSED_TOKEN_125]",
1391
- "lstrip": false,
1392
- "normalized": false,
1393
- "rstrip": false,
1394
- "single_word": false,
1395
- "special": false
1396
- },
1397
- "92523": {
1398
- "content": "[UNUSED_TOKEN_126]",
1399
- "lstrip": false,
1400
- "normalized": false,
1401
- "rstrip": false,
1402
- "single_word": false,
1403
- "special": false
1404
- },
1405
- "92524": {
1406
- "content": "[UNUSED_TOKEN_127]",
1407
- "lstrip": false,
1408
- "normalized": false,
1409
- "rstrip": false,
1410
- "single_word": false,
1411
- "special": false
1412
- },
1413
- "92525": {
1414
- "content": "[UNUSED_TOKEN_128]",
1415
- "lstrip": false,
1416
- "normalized": false,
1417
- "rstrip": false,
1418
- "single_word": false,
1419
- "special": false
1420
- },
1421
- "92526": {
1422
- "content": "[UNUSED_TOKEN_129]",
1423
- "lstrip": false,
1424
- "normalized": false,
1425
- "rstrip": false,
1426
- "single_word": false,
1427
- "special": false
1428
- },
1429
- "92527": {
1430
- "content": "[UNUSED_TOKEN_130]",
1431
- "lstrip": false,
1432
- "normalized": false,
1433
- "rstrip": false,
1434
- "single_word": false,
1435
- "special": false
1436
- },
1437
- "92528": {
1438
- "content": "[UNUSED_TOKEN_131]",
1439
- "lstrip": false,
1440
- "normalized": false,
1441
- "rstrip": false,
1442
- "single_word": false,
1443
- "special": false
1444
- },
1445
- "92529": {
1446
- "content": "[UNUSED_TOKEN_132]",
1447
- "lstrip": false,
1448
- "normalized": false,
1449
- "rstrip": false,
1450
- "single_word": false,
1451
- "special": false
1452
- },
1453
- "92530": {
1454
- "content": "[UNUSED_TOKEN_133]",
1455
- "lstrip": false,
1456
- "normalized": false,
1457
- "rstrip": false,
1458
- "single_word": false,
1459
- "special": false
1460
- },
1461
- "92531": {
1462
- "content": "[UNUSED_TOKEN_134]",
1463
- "lstrip": false,
1464
- "normalized": false,
1465
- "rstrip": false,
1466
- "single_word": false,
1467
- "special": false
1468
- },
1469
- "92532": {
1470
- "content": "[UNUSED_TOKEN_135]",
1471
- "lstrip": false,
1472
- "normalized": false,
1473
- "rstrip": false,
1474
- "single_word": false,
1475
- "special": false
1476
- },
1477
- "92533": {
1478
- "content": "[UNUSED_TOKEN_136]",
1479
- "lstrip": false,
1480
- "normalized": false,
1481
- "rstrip": false,
1482
- "single_word": false,
1483
- "special": false
1484
- },
1485
- "92534": {
1486
- "content": "[UNUSED_TOKEN_137]",
1487
- "lstrip": false,
1488
- "normalized": false,
1489
- "rstrip": false,
1490
- "single_word": false,
1491
- "special": false
1492
- },
1493
- "92535": {
1494
- "content": "[UNUSED_TOKEN_138]",
1495
- "lstrip": false,
1496
- "normalized": false,
1497
- "rstrip": false,
1498
- "single_word": false,
1499
- "special": false
1500
- },
1501
- "92536": {
1502
- "content": "[UNUSED_TOKEN_139]",
1503
- "lstrip": false,
1504
- "normalized": false,
1505
- "rstrip": false,
1506
- "single_word": false,
1507
- "special": false
1508
- },
1509
- "92537": {
1510
- "content": "[UNUSED_TOKEN_140]",
1511
- "lstrip": false,
1512
- "normalized": false,
1513
- "rstrip": false,
1514
- "single_word": false,
1515
- "special": false
1516
- },
1517
- "92538": {
1518
- "content": "<|plugin|>",
1519
- "lstrip": false,
1520
- "normalized": false,
1521
- "rstrip": false,
1522
- "single_word": false,
1523
- "special": true
1524
- },
1525
- "92539": {
1526
- "content": "<|interpreter|>",
1527
- "lstrip": false,
1528
- "normalized": false,
1529
- "rstrip": false,
1530
- "single_word": false,
1531
- "special": true
1532
- },
1533
- "92540": {
1534
- "content": "<|action_end|>",
1535
- "lstrip": false,
1536
- "normalized": false,
1537
- "rstrip": false,
1538
- "single_word": false,
1539
- "special": true
1540
- },
1541
- "92541": {
1542
- "content": "<|action_start|>",
1543
- "lstrip": false,
1544
- "normalized": false,
1545
- "rstrip": false,
1546
- "single_word": false,
1547
- "special": true
1548
- },
1549
- "92542": {
1550
- "content": "<|im_end|>",
1551
- "lstrip": false,
1552
- "normalized": false,
1553
- "rstrip": false,
1554
- "single_word": false,
1555
- "special": true
1556
- },
1557
- "92543": {
1558
- "content": "<|im_start|>",
1559
- "lstrip": false,
1560
- "normalized": false,
1561
- "rstrip": false,
1562
- "single_word": false,
1563
- "special": true
1564
- },
1565
- "92544": {
1566
- "content": "[UNUSED_TOKEN_141]",
1567
- "lstrip": false,
1568
- "normalized": false,
1569
- "rstrip": false,
1570
- "single_word": false,
1571
- "special": false
1572
- },
1573
- "92545": {
1574
- "content": "[UNUSED_TOKEN_142]",
1575
- "lstrip": false,
1576
- "normalized": false,
1577
- "rstrip": false,
1578
- "single_word": false,
1579
- "special": false
1580
- },
1581
- "92546": {
1582
- "content": "[UNUSED_TOKEN_143]",
1583
- "lstrip": false,
1584
- "normalized": false,
1585
- "rstrip": false,
1586
- "single_word": false,
1587
- "special": false
1588
- },
1589
- "92547": {
1590
- "content": "[UNUSED_TOKEN_144]",
1591
- "lstrip": false,
1592
- "normalized": false,
1593
- "rstrip": false,
1594
- "single_word": false,
1595
- "special": false
1596
- },
1597
- "92548": {
1598
- "content": "[UNUSED_TOKEN_145]",
1599
- "lstrip": false,
1600
- "normalized": false,
1601
- "rstrip": false,
1602
- "single_word": false,
1603
- "special": false
1604
- },
1605
- "92549": {
1606
- "content": "[UNUSED_TOKEN_146]",
1607
- "lstrip": false,
1608
- "normalized": false,
1609
- "rstrip": false,
1610
- "single_word": false,
1611
- "special": false
1612
- }
1613
- },
1614
- "additional_special_tokens": [
1615
- "<|im_start|>",
1616
- "<|im_end|>",
1617
- "<|action_start|>",
1618
- "<|action_end|>",
1619
- "<|interpreter|>",
1620
- "<|plugin|>"
1621
- ],
1622
- "auto_map": {
1623
- "AutoTokenizer": [
1624
- "tokenization_internlm2.InternLM2Tokenizer",
1625
- "tokenization_internlm2_fast.InternLM2TokenizerFast"
1626
- ]
1627
- },
1628
- "bos_token": "<s>",
1629
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
1630
- "clean_up_tokenization_spaces": false,
1631
- "decode_with_prefix_space": false,
1632
- "eos_token": "<|im_end|>",
1633
- "model_max_length": 1000000000000000019884624838656,
1634
- "pad_token": "</s>",
1635
- "padding_side": "right",
1636
- "sp_model_kwargs": null,
1637
- "split_special_tokens": false,
1638
- "tokenizer_class": "InternLM2Tokenizer",
1639
- "unk_token": "<unk>"
1640
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama-factory/saves/internlm2_5_7b/lora/sft/checkpoint-2248/trainer_state.json DELETED
@@ -1,219 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 3.9964444444444442,
5
- "eval_steps": 562,
6
- "global_step": 2248,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.17777777777777778,
13
- "grad_norm": 4.040336608886719,
14
- "learning_rate": 2.958579881656805e-05,
15
- "loss": 0.4641,
16
- "step": 100
17
- },
18
- {
19
- "epoch": 0.35555555555555557,
20
- "grad_norm": 3.9950191974639893,
21
- "learning_rate": 5.91715976331361e-05,
22
- "loss": 0.3704,
23
- "step": 200
24
- },
25
- {
26
- "epoch": 0.5333333333333333,
27
- "grad_norm": 3.9038829803466797,
28
- "learning_rate": 8.875739644970414e-05,
29
- "loss": 0.3751,
30
- "step": 300
31
- },
32
- {
33
- "epoch": 0.7111111111111111,
34
- "grad_norm": 3.4998044967651367,
35
- "learning_rate": 9.989699867437137e-05,
36
- "loss": 0.3928,
37
- "step": 400
38
- },
39
- {
40
- "epoch": 0.8888888888888888,
41
- "grad_norm": 9.73261833190918,
42
- "learning_rate": 9.92981892269398e-05,
43
- "loss": 0.3497,
44
- "step": 500
45
- },
46
- {
47
- "epoch": 0.9991111111111111,
48
- "eval_loss": 0.6006748080253601,
49
- "eval_runtime": 411.5999,
50
- "eval_samples_per_second": 1.215,
51
- "eval_steps_per_second": 1.215,
52
- "step": 562
53
- },
54
- {
55
- "epoch": 1.0666666666666667,
56
- "grad_norm": 2.8831710815429688,
57
- "learning_rate": 9.817128546774103e-05,
58
- "loss": 0.3383,
59
- "step": 600
60
- },
61
- {
62
- "epoch": 1.2444444444444445,
63
- "grad_norm": 6.632827281951904,
64
- "learning_rate": 9.652835906663704e-05,
65
- "loss": 0.3167,
66
- "step": 700
67
- },
68
- {
69
- "epoch": 1.4222222222222223,
70
- "grad_norm": 6.977548122406006,
71
- "learning_rate": 9.438700945477697e-05,
72
- "loss": 0.3165,
73
- "step": 800
74
- },
75
- {
76
- "epoch": 1.6,
77
- "grad_norm": 10.037060737609863,
78
- "learning_rate": 9.177017529516772e-05,
79
- "loss": 0.2927,
80
- "step": 900
81
- },
82
- {
83
- "epoch": 1.7777777777777777,
84
- "grad_norm": 6.976019859313965,
85
- "learning_rate": 8.870588875808164e-05,
86
- "loss": 0.3062,
87
- "step": 1000
88
- },
89
- {
90
- "epoch": 1.9555555555555557,
91
- "grad_norm": 2.106227159500122,
92
- "learning_rate": 8.522697523356319e-05,
93
- "loss": 0.2678,
94
- "step": 1100
95
- },
96
- {
97
- "epoch": 1.9982222222222221,
98
- "eval_loss": 0.3569962680339813,
99
- "eval_runtime": 367.3155,
100
- "eval_samples_per_second": 1.361,
101
- "eval_steps_per_second": 1.361,
102
- "step": 1124
103
- },
104
- {
105
- "epoch": 2.1333333333333333,
106
- "grad_norm": 2.639159679412842,
107
- "learning_rate": 8.137070169778812e-05,
108
- "loss": 0.2198,
109
- "step": 1200
110
- },
111
- {
112
- "epoch": 2.311111111111111,
113
- "grad_norm": 3.6480841636657715,
114
- "learning_rate": 7.717837750006106e-05,
115
- "loss": 0.2412,
116
- "step": 1300
117
- },
118
- {
119
- "epoch": 2.488888888888889,
120
- "grad_norm": 2.240994930267334,
121
- "learning_rate": 7.269491184691924e-05,
122
- "loss": 0.1952,
123
- "step": 1400
124
- },
125
- {
126
- "epoch": 2.6666666666666665,
127
- "grad_norm": 13.413070678710938,
128
- "learning_rate": 6.79683327236813e-05,
129
- "loss": 0.2262,
130
- "step": 1500
131
- },
132
- {
133
- "epoch": 2.8444444444444446,
134
- "grad_norm": 0.8566445112228394,
135
- "learning_rate": 6.304927240687181e-05,
136
- "loss": 0.1949,
137
- "step": 1600
138
- },
139
- {
140
- "epoch": 2.997333333333333,
141
- "eval_loss": 0.42686474323272705,
142
- "eval_runtime": 365.7769,
143
- "eval_samples_per_second": 1.367,
144
- "eval_steps_per_second": 1.367,
145
- "step": 1686
146
- },
147
- {
148
- "epoch": 3.022222222222222,
149
- "grad_norm": 0.11057106405496597,
150
- "learning_rate": 5.799042507883874e-05,
151
- "loss": 0.2061,
152
- "step": 1700
153
- },
154
- {
155
- "epoch": 3.2,
156
- "grad_norm": 4.739655017852783,
157
- "learning_rate": 5.284598235472912e-05,
158
- "loss": 0.1207,
159
- "step": 1800
160
- },
161
- {
162
- "epoch": 3.3777777777777778,
163
- "grad_norm": 0.25179940462112427,
164
- "learning_rate": 4.7671052768596945e-05,
165
- "loss": 0.1683,
166
- "step": 1900
167
- },
168
- {
169
- "epoch": 3.5555555555555554,
170
- "grad_norm": 0.13251110911369324,
171
- "learning_rate": 4.2521071437250546e-05,
172
- "loss": 0.1325,
173
- "step": 2000
174
- },
175
- {
176
- "epoch": 3.7333333333333334,
177
- "grad_norm": 2.5385780334472656,
178
- "learning_rate": 3.7451206225665035e-05,
179
- "loss": 0.1328,
180
- "step": 2100
181
- },
182
- {
183
- "epoch": 3.911111111111111,
184
- "grad_norm": 0.5059184432029724,
185
- "learning_rate": 3.251576677526236e-05,
186
- "loss": 0.1184,
187
- "step": 2200
188
- },
189
- {
190
- "epoch": 3.9964444444444442,
191
- "eval_loss": 0.448859840631485,
192
- "eval_runtime": 364.0973,
193
- "eval_samples_per_second": 1.373,
194
- "eval_steps_per_second": 1.373,
195
- "step": 2248
196
- }
197
- ],
198
- "logging_steps": 100,
199
- "max_steps": 3372,
200
- "num_input_tokens_seen": 0,
201
- "num_train_epochs": 6,
202
- "save_steps": 562,
203
- "stateful_callbacks": {
204
- "TrainerControl": {
205
- "args": {
206
- "should_epoch_stop": false,
207
- "should_evaluate": false,
208
- "should_log": false,
209
- "should_save": true,
210
- "should_training_stop": false
211
- },
212
- "attributes": {}
213
- }
214
- },
215
- "total_flos": 2.7113989423300608e+17,
216
- "train_batch_size": 1,
217
- "trial_name": null,
218
- "trial_params": null
219
- }