Spaces:
Build error
Build error
09_InterLM_finetuning_colab epoch 3 ready
Browse files- competition/09_InterLM_finetuning_colab.ipynb +0 -0
- llama-factory/config/internlm2_5_7b_lora_sft_bf16.yaml +47 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/README.md +202 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/adapter_config.json +32 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/adapter_model.safetensors +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/optimizer.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/rng_state.pth +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/scheduler.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/special_tokens_map.json +38 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenization_internlm2.py +236 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenization_internlm2_fast.py +214 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer.json +0 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer.model +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer_config.json +104 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/trainer_state.json +128 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/training_args.bin +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/README.md +202 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/adapter_config.json +32 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/adapter_model.safetensors +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/optimizer.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/rng_state.pth +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/scheduler.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/special_tokens_map.json +38 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenization_internlm2.py +236 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenization_internlm2_fast.py +214 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer.json +0 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer.model +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer_config.json +104 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/trainer_state.json +172 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/training_args.bin +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/README.md +202 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/adapter_config.json +32 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/adapter_model.safetensors +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/optimizer.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/rng_state.pth +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/scheduler.pt +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/special_tokens_map.json +38 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenization_internlm2.py +236 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenization_internlm2_fast.py +214 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer.json +0 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer.model +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer_config.json +104 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/trainer_state.json +77 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/training_args.bin +3 -0
- llama-factory/saves/internlm2_5_7b/lora/sft_bf16/trainer_log.jsonl +23 -0
- llm_toolkit/logical_reasoning_utils.py +71 -1
competition/09_InterLM_finetuning_colab.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/config/internlm2_5_7b_lora_sft_bf16.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### model
|
2 |
+
model_name_or_path: internlm/internlm2_5-7b-chat-1m
|
3 |
+
|
4 |
+
### method
|
5 |
+
stage: sft
|
6 |
+
do_train: true
|
7 |
+
finetuning_type: lora
|
8 |
+
lora_target: all
|
9 |
+
# quantization_bit: 4 # use 4-bit QLoRA
|
10 |
+
loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
|
11 |
+
# use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
|
12 |
+
upcast_layernorm: true
|
13 |
+
|
14 |
+
### dataset
|
15 |
+
dataset: alpaca_mgtv_p1
|
16 |
+
template: chatml
|
17 |
+
cutoff_len: 1024
|
18 |
+
max_samples: 5000
|
19 |
+
overwrite_cache: true
|
20 |
+
preprocessing_num_workers: 16
|
21 |
+
|
22 |
+
### output
|
23 |
+
output_dir: saves/internlm2_5_7b/lora/sft_bf16
|
24 |
+
logging_steps: 100
|
25 |
+
save_steps: 562
|
26 |
+
plot_loss: true
|
27 |
+
overwrite_output_dir: true
|
28 |
+
# resume_from_checkpoint: true
|
29 |
+
|
30 |
+
### train
|
31 |
+
per_device_train_batch_size: 1
|
32 |
+
gradient_accumulation_steps: 8
|
33 |
+
learning_rate: 1.0e-4
|
34 |
+
num_train_epochs: 6.0
|
35 |
+
lr_scheduler_type: cosine
|
36 |
+
warmup_ratio: 0.1
|
37 |
+
bf16: true
|
38 |
+
ddp_timeout: 180000000
|
39 |
+
|
40 |
+
### eval
|
41 |
+
val_size: 0.1
|
42 |
+
per_device_eval_batch_size: 1
|
43 |
+
eval_strategy: steps
|
44 |
+
eval_steps: 562
|
45 |
+
|
46 |
+
report_to: none
|
47 |
+
run_name: internlm2_5_7b # optional
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: internlm/internlm2_5-7b-chat-1m
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.11.1
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/adapter_config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.0,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"wo",
|
24 |
+
"w2",
|
25 |
+
"wqkv",
|
26 |
+
"w1",
|
27 |
+
"w3"
|
28 |
+
],
|
29 |
+
"task_type": "CAUSAL_LM",
|
30 |
+
"use_dora": false,
|
31 |
+
"use_rslora": false
|
32 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e5b5bb607e0aa880c7324e419aad561c560e52477ec2e550e61a001a69944bf5
|
3 |
+
size 75539712
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2cc01aab47d78db84d5d05eefdf4485043f65d6d7ba2c57e413cd64f1459ec5f
|
3 |
+
size 151264058
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
|
3 |
+
size 14244
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:87aaed7cb6dcc2e48d745ccf810d4294a0ed894de1a71242beeebd4a9c4d8393
|
3 |
+
size 1064
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/special_tokens_map.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|action_start|>",
|
6 |
+
"<|action_end|>",
|
7 |
+
"<|interpreter|>",
|
8 |
+
"<|plugin|>"
|
9 |
+
],
|
10 |
+
"bos_token": {
|
11 |
+
"content": "<s>",
|
12 |
+
"lstrip": false,
|
13 |
+
"normalized": false,
|
14 |
+
"rstrip": false,
|
15 |
+
"single_word": false
|
16 |
+
},
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "</s>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
},
|
31 |
+
"unk_token": {
|
32 |
+
"content": "<unk>",
|
33 |
+
"lstrip": false,
|
34 |
+
"normalized": false,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false
|
37 |
+
}
|
38 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenization_internlm2.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization classes for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, List, Optional, Tuple
|
22 |
+
|
23 |
+
import sentencepiece as spm
|
24 |
+
from transformers.tokenization_utils import PreTrainedTokenizer
|
25 |
+
from transformers.utils import logging
|
26 |
+
|
27 |
+
logger = logging.get_logger(__name__)
|
28 |
+
|
29 |
+
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
30 |
+
|
31 |
+
PRETRAINED_VOCAB_FILES_MAP = {}
|
32 |
+
|
33 |
+
|
34 |
+
# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
|
35 |
+
class InternLM2Tokenizer(PreTrainedTokenizer):
|
36 |
+
"""
|
37 |
+
Construct an InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
|
38 |
+
|
39 |
+
Args:
|
40 |
+
vocab_file (`str`):
|
41 |
+
Path to the vocabulary file.
|
42 |
+
"""
|
43 |
+
|
44 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
45 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
46 |
+
model_input_names = ["input_ids", "attention_mask"]
|
47 |
+
_auto_class = "AutoTokenizer"
|
48 |
+
|
49 |
+
def __init__(
|
50 |
+
self,
|
51 |
+
vocab_file,
|
52 |
+
unk_token="<unk>",
|
53 |
+
bos_token="<s>",
|
54 |
+
eos_token="</s>",
|
55 |
+
pad_token="</s>",
|
56 |
+
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
57 |
+
add_bos_token=True,
|
58 |
+
add_eos_token=False,
|
59 |
+
decode_with_prefix_space=False,
|
60 |
+
clean_up_tokenization_spaces=False,
|
61 |
+
**kwargs,
|
62 |
+
):
|
63 |
+
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
64 |
+
self.vocab_file = vocab_file
|
65 |
+
self.add_bos_token = add_bos_token
|
66 |
+
self.add_eos_token = add_eos_token
|
67 |
+
self.decode_with_prefix_space = decode_with_prefix_space
|
68 |
+
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
69 |
+
self.sp_model.Load(vocab_file)
|
70 |
+
self._no_prefix_space_tokens = None
|
71 |
+
super().__init__(
|
72 |
+
bos_token=bos_token,
|
73 |
+
eos_token=eos_token,
|
74 |
+
unk_token=unk_token,
|
75 |
+
pad_token=pad_token,
|
76 |
+
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
77 |
+
**kwargs,
|
78 |
+
)
|
79 |
+
|
80 |
+
@property
|
81 |
+
def no_prefix_space_tokens(self):
|
82 |
+
if self._no_prefix_space_tokens is None:
|
83 |
+
vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
|
84 |
+
self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
|
85 |
+
return self._no_prefix_space_tokens
|
86 |
+
|
87 |
+
@property
|
88 |
+
def vocab_size(self):
|
89 |
+
"""Returns vocab size"""
|
90 |
+
return self.sp_model.get_piece_size()
|
91 |
+
|
92 |
+
@property
|
93 |
+
def bos_token_id(self) -> Optional[int]:
|
94 |
+
return self.sp_model.bos_id()
|
95 |
+
|
96 |
+
@property
|
97 |
+
def eos_token_id(self) -> Optional[int]:
|
98 |
+
return self.sp_model.eos_id()
|
99 |
+
|
100 |
+
def get_vocab(self):
|
101 |
+
"""Returns vocab as a dict"""
|
102 |
+
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
103 |
+
vocab.update(self.added_tokens_encoder)
|
104 |
+
return vocab
|
105 |
+
|
106 |
+
def _tokenize(self, text):
|
107 |
+
"""Returns a tokenized string."""
|
108 |
+
return self.sp_model.encode(text, out_type=str)
|
109 |
+
|
110 |
+
def _convert_token_to_id(self, token):
|
111 |
+
"""Converts a token (str) to an id using the vocab."""
|
112 |
+
return self.sp_model.piece_to_id(token)
|
113 |
+
|
114 |
+
def _convert_id_to_token(self, index):
|
115 |
+
"""Converts an index (integer) to a token (str) using the vocab."""
|
116 |
+
token = self.sp_model.IdToPiece(index)
|
117 |
+
return token
|
118 |
+
|
119 |
+
def _maybe_add_prefix_space(self, tokens, decoded):
|
120 |
+
if tokens and tokens[0] not in self.no_prefix_space_tokens:
|
121 |
+
return " " + decoded
|
122 |
+
else:
|
123 |
+
return decoded
|
124 |
+
|
125 |
+
def convert_tokens_to_string(self, tokens):
|
126 |
+
"""Converts a sequence of tokens (string) into a single string."""
|
127 |
+
current_sub_tokens = []
|
128 |
+
out_string = ""
|
129 |
+
prev_is_special = False
|
130 |
+
for token in tokens:
|
131 |
+
# make sure that special tokens are not decoded using sentencepiece model
|
132 |
+
if token in self.all_special_tokens:
|
133 |
+
if not prev_is_special:
|
134 |
+
out_string += " "
|
135 |
+
out_string += self.sp_model.decode(current_sub_tokens) + token
|
136 |
+
prev_is_special = True
|
137 |
+
current_sub_tokens = []
|
138 |
+
else:
|
139 |
+
current_sub_tokens.append(token)
|
140 |
+
prev_is_special = False
|
141 |
+
out_string += self.sp_model.decode(current_sub_tokens)
|
142 |
+
out_string = self.clean_up_tokenization(out_string)
|
143 |
+
out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
|
144 |
+
return out_string[1:]
|
145 |
+
|
146 |
+
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
147 |
+
"""
|
148 |
+
Save the vocabulary and special tokens file to a directory.
|
149 |
+
|
150 |
+
Args:
|
151 |
+
save_directory (`str`):
|
152 |
+
The directory in which to save the vocabulary.
|
153 |
+
|
154 |
+
Returns:
|
155 |
+
`Tuple(str)`: Paths to the files saved.
|
156 |
+
"""
|
157 |
+
if not os.path.isdir(save_directory):
|
158 |
+
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
159 |
+
return
|
160 |
+
out_vocab_file = os.path.join(
|
161 |
+
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
162 |
+
)
|
163 |
+
|
164 |
+
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
|
165 |
+
copyfile(self.vocab_file, out_vocab_file)
|
166 |
+
elif not os.path.isfile(self.vocab_file):
|
167 |
+
with open(out_vocab_file, "wb") as fi:
|
168 |
+
content_spiece_model = self.sp_model.serialized_model_proto()
|
169 |
+
fi.write(content_spiece_model)
|
170 |
+
|
171 |
+
return (out_vocab_file,)
|
172 |
+
|
173 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
174 |
+
if self.add_bos_token:
|
175 |
+
bos_token_ids = [self.bos_token_id]
|
176 |
+
else:
|
177 |
+
bos_token_ids = []
|
178 |
+
|
179 |
+
output = bos_token_ids + token_ids_0
|
180 |
+
|
181 |
+
if token_ids_1 is not None:
|
182 |
+
output = output + token_ids_1
|
183 |
+
|
184 |
+
if self.add_eos_token:
|
185 |
+
output = output + [self.eos_token_id]
|
186 |
+
|
187 |
+
return output
|
188 |
+
|
189 |
+
def get_special_tokens_mask(
|
190 |
+
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
191 |
+
) -> List[int]:
|
192 |
+
"""
|
193 |
+
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
194 |
+
special tokens using the tokenizer `prepare_for_model` method.
|
195 |
+
|
196 |
+
Args:
|
197 |
+
token_ids_0 (`List[int]`):
|
198 |
+
List of IDs.
|
199 |
+
token_ids_1 (`List[int]`, *optional*):
|
200 |
+
Optional second list of IDs for sequence pairs.
|
201 |
+
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
202 |
+
Whether or not the token list is already formatted with special tokens for the model.
|
203 |
+
|
204 |
+
Returns:
|
205 |
+
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
206 |
+
"""
|
207 |
+
if already_has_special_tokens:
|
208 |
+
return super().get_special_tokens_mask(
|
209 |
+
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
|
210 |
+
)
|
211 |
+
|
212 |
+
if token_ids_1 is None:
|
213 |
+
return [1] + ([0] * len(token_ids_0)) + [1]
|
214 |
+
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
215 |
+
|
216 |
+
def create_token_type_ids_from_sequences(
|
217 |
+
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
218 |
+
) -> List[int]:
|
219 |
+
"""
|
220 |
+
Create a mask from the two sequences passed to be used in a sequence-pair classification task. InternLM2 does not make
|
221 |
+
use of token type ids, therefore a list of zeros is returned.
|
222 |
+
|
223 |
+
Args:
|
224 |
+
token_ids_0 (`List[int]`):
|
225 |
+
List of IDs.
|
226 |
+
token_ids_1 (`List[int]`, *optional*):
|
227 |
+
Optional second list of IDs for sequence pairs.
|
228 |
+
|
229 |
+
Returns:
|
230 |
+
`List[int]`: List of zeros.
|
231 |
+
"""
|
232 |
+
eos = [self.eos_token_id]
|
233 |
+
|
234 |
+
if token_ids_1 is None:
|
235 |
+
return len(token_ids_0 + eos) * [0]
|
236 |
+
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenization_internlm2_fast.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization Fast class for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, Optional, Tuple
|
22 |
+
|
23 |
+
from tokenizers import processors, decoders, Tokenizer, normalizers
|
24 |
+
from tokenizers.models import BPE
|
25 |
+
|
26 |
+
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
|
27 |
+
from transformers.utils import logging
|
28 |
+
|
29 |
+
from transformers.convert_slow_tokenizer import (
|
30 |
+
SLOW_TO_FAST_CONVERTERS,
|
31 |
+
SpmConverter,
|
32 |
+
SentencePieceExtractor,
|
33 |
+
)
|
34 |
+
|
35 |
+
from .tokenization_internlm2 import InternLM2Tokenizer
|
36 |
+
|
37 |
+
logger = logging.get_logger(__name__)
|
38 |
+
|
39 |
+
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
40 |
+
|
41 |
+
# Modified from transformers.convert_slow_tokenizer.LlamaConverter
|
42 |
+
class InternLM2Converter(SpmConverter):
    """Convert the slow SentencePiece `InternLM2Tokenizer` into a `tokenizers` BPE tokenizer.

    Only SentencePiece models whose `trainer_spec.model_type` is 2 (handled here
    as BPE) are supported; every other model type raises `RuntimeError`.
    """

    handle_byte_fallback = True

    def vocab(self, proto):
        """Return the `(piece, score)` list used to build the fast vocabulary.

        The first three entries are fixed to `<unk>`, `<s>` and `</s>` with a
        score of 0.0; the remaining entries are taken from `proto.pieces[3:]`.
        """
        vocab = [
            ("<unk>", 0.0),
            ("<s>", 0.0),
            ("</s>", 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        """Return the id of the unknown token, which is always 0 here."""
        return 0

    def decoder(self, replacement, add_prefix_space):
        """Build the decoder pipeline.

        `replacement` and `add_prefix_space` are accepted for interface
        compatibility but are not used; the metaspace character and the
        dummy-prefix handling are taken from the literal below and from
        `self.proto.normalizer_spec`.
        """
        decoders_sequence = [
            decoders.Replace("▁", " "),
            decoders.ByteFallback(),
            decoders.Fuse(),
        ]
        # Undo the dummy prefix that the normalizer prepends when it is enabled.
        if self.proto.normalizer_spec.add_dummy_prefix:
            decoders_sequence.append(decoders.Strip(content=" ", left=1))
        return decoders.Sequence(decoders_sequence)

    def tokenizer(self, proto):
        """Build the `tokenizers.Tokenizer` backed by a BPE model.

        Raises:
            RuntimeError: if the SentencePiece model was not trained as BPE
                (`proto.trainer_spec.model_type != 2`).
        """
        model_type = proto.trainer_spec.model_type
        vocab_scores = self.vocab(proto)

        # Special tokens: use the content registered on the slow tokenizer for
        # every id that appears in its `added_tokens_decoder`.
        added_tokens = self.original_tokenizer.added_tokens_decoder
        for index, (_piece, score) in enumerate(vocab_scores):
            if index in added_tokens:
                vocab_scores[index] = (added_tokens[index].content, score)

        if model_type == 1:
            raise RuntimeError("InternLM2 is supposed to be a BPE model!")
        if model_type != 2:
            # The original message was copied from the Unigram converter and
            # described the opposite situation; this converter only supports BPE.
            raise RuntimeError(
                "InternLM2 is supposed to be a BPE model, but the SentencePiece file was trained with a "
                f"different algorithm (model_type={model_type})."
            )

        _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
        bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
        tokenizer = Tokenizer(
            BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
        )
        tokenizer.add_special_tokens(list(added_tokens.values()))
        return tokenizer

    def normalizer(self, proto):
        """Build the normalizer: optional "▁" prefix, then spaces replaced by "▁"."""
        normalizers_list = []
        if proto.normalizer_spec.add_dummy_prefix:
            normalizers_list.append(normalizers.Prepend(prepend="▁"))
        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
        return normalizers.Sequence(normalizers_list)

    def pre_tokenizer(self, replacement, add_prefix_space):
        """Return `None`: this converter configures no pre-tokenizer."""
        return None
|
105 |
+
|
106 |
+
# Register the converter in transformers' slow-to-fast table under the slow
# tokenizer's class name.
SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
|
107 |
+
|
108 |
+
|
109 |
+
# Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
|
110 |
+
class InternLM2TokenizerFast(PreTrainedTokenizerFast):
    """Fast InternLM2 tokenizer built on `PreTrainedTokenizerFast`.

    BOS/EOS insertion is implemented with a `TemplateProcessing` post processor
    that is rebuilt whenever `add_bos_token` or `add_eos_token` changes.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = InternLM2Tokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        """Initialize the tokenizer and install the BOS/EOS post processor.

        All arguments are forwarded to the base class; `add_bos_token` and
        `add_eos_token` are additionally stored on the instance and control
        the template built by `update_post_processor`.
        """
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            decode_with_prefix_space=decode_with_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # Set the private fields directly; the property setters would rebuild
        # the post processor once per assignment.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Whether `vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.

        Raises:
            ValueError: if BOS (or EOS) insertion is enabled but the
                corresponding token is `None`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # Templates for a single sequence ($A) and a pair ($A, $B); the suffix
        # after ":" is the type id assigned to that part of the template.
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether the EOS token is appended by the post processor."""
        return self._add_eos_token

    @property
    def add_bos_token(self):
        """Whether the BOS token is prepended by the post processor."""
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Copy the SentencePiece model file into `save_directory`.

        Args:
            save_directory: Existing directory that receives the file.
            filename_prefix: Optional prefix; when given the file is written as
                `<prefix>-tokenizer.model`.

        Returns:
            A one-element tuple with the output path. If `save_directory` is
            not a directory an error is logged and `None` is returned.

        Raises:
            ValueError: if the original vocabulary file is not available
                (`can_save_slow_tokenizer` is false).
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # VOCAB_FILES_NAMES stores "./tokenizer.model". Concatenating a prefix
        # with that value produced "<prefix>-./tokenizer.model", i.e. a path
        # inside a non-existent "<prefix>-." directory, so only the base name
        # is used. Without a prefix the resulting path names the same file.
        vocab_file_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + vocab_file_name
        )

        # Skip the copy when source and destination are the same file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
|
3 |
+
size 1477754
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/tokenizer_config.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"92538": {
|
30 |
+
"content": "<|plugin|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"92539": {
|
38 |
+
"content": "<|interpreter|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"92540": {
|
46 |
+
"content": "<|action_end|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"92541": {
|
54 |
+
"content": "<|action_start|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"92542": {
|
62 |
+
"content": "<|im_end|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"92543": {
|
70 |
+
"content": "<|im_start|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"additional_special_tokens": [
|
79 |
+
"<|im_start|>",
|
80 |
+
"<|im_end|>",
|
81 |
+
"<|action_start|>",
|
82 |
+
"<|action_end|>",
|
83 |
+
"<|interpreter|>",
|
84 |
+
"<|plugin|>"
|
85 |
+
],
|
86 |
+
"auto_map": {
|
87 |
+
"AutoTokenizer": [
|
88 |
+
"tokenization_internlm2.InternLM2Tokenizer",
|
89 |
+
"tokenization_internlm2_fast.InternLM2TokenizerFast"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
"bos_token": "<s>",
|
93 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
94 |
+
"clean_up_tokenization_spaces": false,
|
95 |
+
"decode_with_prefix_space": false,
|
96 |
+
"eos_token": "<|im_end|>",
|
97 |
+
"model_max_length": 1000000000000000019884624838656,
|
98 |
+
"pad_token": "</s>",
|
99 |
+
"padding_side": "right",
|
100 |
+
"sp_model_kwargs": null,
|
101 |
+
"split_special_tokens": false,
|
102 |
+
"tokenizer_class": "InternLM2Tokenizer",
|
103 |
+
"unk_token": "<unk>"
|
104 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/trainer_state.json
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.9982222222222221,
|
5 |
+
"eval_steps": 562,
|
6 |
+
"global_step": 1124,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.17777777777777778,
|
13 |
+
"grad_norm": 2.4078445434570312,
|
14 |
+
"learning_rate": 2.958579881656805e-05,
|
15 |
+
"loss": 0.4734,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.35555555555555557,
|
20 |
+
"grad_norm": 4.478668689727783,
|
21 |
+
"learning_rate": 5.91715976331361e-05,
|
22 |
+
"loss": 0.3667,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.5333333333333333,
|
27 |
+
"grad_norm": 4.706894397735596,
|
28 |
+
"learning_rate": 8.875739644970414e-05,
|
29 |
+
"loss": 0.3744,
|
30 |
+
"step": 300
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.7111111111111111,
|
34 |
+
"grad_norm": 2.9772543907165527,
|
35 |
+
"learning_rate": 9.989699867437137e-05,
|
36 |
+
"loss": 0.3797,
|
37 |
+
"step": 400
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.8888888888888888,
|
41 |
+
"grad_norm": 6.316380023956299,
|
42 |
+
"learning_rate": 9.92981892269398e-05,
|
43 |
+
"loss": 0.3443,
|
44 |
+
"step": 500
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.9991111111111111,
|
48 |
+
"eval_accuracy": 0.7856666666666667,
|
49 |
+
"eval_loss": 0.7739760279655457,
|
50 |
+
"eval_runtime": 87.6689,
|
51 |
+
"eval_samples_per_second": 5.703,
|
52 |
+
"eval_steps_per_second": 5.703,
|
53 |
+
"step": 562
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"epoch": 1.0666666666666667,
|
57 |
+
"grad_norm": 3.866260051727295,
|
58 |
+
"learning_rate": 9.817128546774103e-05,
|
59 |
+
"loss": 0.3644,
|
60 |
+
"step": 600
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"epoch": 1.2444444444444445,
|
64 |
+
"grad_norm": 2.369489908218384,
|
65 |
+
"learning_rate": 9.652835906663704e-05,
|
66 |
+
"loss": 0.3146,
|
67 |
+
"step": 700
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"epoch": 1.4222222222222223,
|
71 |
+
"grad_norm": 4.548018932342529,
|
72 |
+
"learning_rate": 9.438700945477697e-05,
|
73 |
+
"loss": 0.3365,
|
74 |
+
"step": 800
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"epoch": 1.6,
|
78 |
+
"grad_norm": 12.139796257019043,
|
79 |
+
"learning_rate": 9.177017529516772e-05,
|
80 |
+
"loss": 0.2878,
|
81 |
+
"step": 900
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 1.7777777777777777,
|
85 |
+
"grad_norm": 0.8868021965026855,
|
86 |
+
"learning_rate": 8.870588875808164e-05,
|
87 |
+
"loss": 0.3291,
|
88 |
+
"step": 1000
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"epoch": 1.9555555555555557,
|
92 |
+
"grad_norm": 0.3065238893032074,
|
93 |
+
"learning_rate": 8.522697523356319e-05,
|
94 |
+
"loss": 0.2604,
|
95 |
+
"step": 1100
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"epoch": 1.9982222222222221,
|
99 |
+
"eval_accuracy": 0.8555,
|
100 |
+
"eval_loss": 0.38267847895622253,
|
101 |
+
"eval_runtime": 86.9945,
|
102 |
+
"eval_samples_per_second": 5.747,
|
103 |
+
"eval_steps_per_second": 5.747,
|
104 |
+
"step": 1124
|
105 |
+
}
|
106 |
+
],
|
107 |
+
"logging_steps": 100,
|
108 |
+
"max_steps": 3372,
|
109 |
+
"num_input_tokens_seen": 0,
|
110 |
+
"num_train_epochs": 6,
|
111 |
+
"save_steps": 562,
|
112 |
+
"stateful_callbacks": {
|
113 |
+
"TrainerControl": {
|
114 |
+
"args": {
|
115 |
+
"should_epoch_stop": false,
|
116 |
+
"should_evaluate": false,
|
117 |
+
"should_log": false,
|
118 |
+
"should_save": true,
|
119 |
+
"should_training_stop": false
|
120 |
+
},
|
121 |
+
"attributes": {}
|
122 |
+
}
|
123 |
+
},
|
124 |
+
"total_flos": 1.3557561305392742e+17,
|
125 |
+
"train_batch_size": 1,
|
126 |
+
"trial_name": null,
|
127 |
+
"trial_params": null
|
128 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1124/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba59bb13ccbc1536fe7be63d6753c0657b8ec8334daccb6ec740876a5104de43
|
3 |
+
size 5304
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: internlm/internlm2_5-7b-chat-1m
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.11.1
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/adapter_config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.0,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"wo",
|
24 |
+
"w2",
|
25 |
+
"wqkv",
|
26 |
+
"w1",
|
27 |
+
"w3"
|
28 |
+
],
|
29 |
+
"task_type": "CAUSAL_LM",
|
30 |
+
"use_dora": false,
|
31 |
+
"use_rslora": false
|
32 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e99ac1cf2f0143c7c7fde32133960887326a9b172d080113b4b7ab4e19d0f583
|
3 |
+
size 75539712
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03c3ffb0b44ab6683a224c84b8440fd86167ecc71bc063ce0c8b4b88705f51e7
|
3 |
+
size 151264058
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
|
3 |
+
size 14244
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:905747c81bda26664cb2ffce8f8ce9044aa6bf92fd1cd473dd32e646b88e5e1a
|
3 |
+
size 1064
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/special_tokens_map.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|action_start|>",
|
6 |
+
"<|action_end|>",
|
7 |
+
"<|interpreter|>",
|
8 |
+
"<|plugin|>"
|
9 |
+
],
|
10 |
+
"bos_token": {
|
11 |
+
"content": "<s>",
|
12 |
+
"lstrip": false,
|
13 |
+
"normalized": false,
|
14 |
+
"rstrip": false,
|
15 |
+
"single_word": false
|
16 |
+
},
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "</s>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
},
|
31 |
+
"unk_token": {
|
32 |
+
"content": "<unk>",
|
33 |
+
"lstrip": false,
|
34 |
+
"normalized": false,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false
|
37 |
+
}
|
38 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenization_internlm2.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization classes for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, List, Optional, Tuple
|
22 |
+
|
23 |
+
import sentencepiece as spm
|
24 |
+
from transformers.tokenization_utils import PreTrainedTokenizer
|
25 |
+
from transformers.utils import logging
|
26 |
+
|
27 |
+
# Module-level logger, obtained through the `transformers.utils.logging` helper.
logger = logging.get_logger(__name__)
|
28 |
+
|
29 |
+
# File name of the SentencePiece model, keyed by the `vocab_file` constructor
# argument. Used as `vocab_files_names` on the tokenizer class and to build the
# output path in `save_vocabulary`.
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
30 |
+
|
31 |
+
# Intentionally empty; assigned to the tokenizer class as
# `pretrained_vocab_files_map`.
PRETRAINED_VOCAB_FILES_MAP = {}
|
32 |
+
|
33 |
+
|
34 |
+
# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
|
35 |
+
class InternLM2Tokenizer(PreTrainedTokenizer):
|
36 |
+
"""
|
37 |
+
Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
|
38 |
+
|
39 |
+
Args:
|
40 |
+
vocab_file (`str`):
|
41 |
+
Path to the vocabulary file.
|
42 |
+
"""
|
43 |
+
|
44 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
45 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
46 |
+
model_input_names = ["input_ids", "attention_mask"]
|
47 |
+
_auto_class = "AutoTokenizer"
|
48 |
+
|
49 |
+
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="</s>",
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    add_bos_token=True,
    add_eos_token=False,
    decode_with_prefix_space=False,
    clean_up_tokenization_spaces=False,
    **kwargs,
):
    """Load the SentencePiece model from `vocab_file` and initialize the base tokenizer.

    The SentencePiece processor is created and loaded before the base
    initializer runs (presumably because the base class may query the
    vocabulary during construction; confirm against `PreTrainedTokenizer`).
    """
    # Plain configuration fields.
    self.vocab_file = vocab_file
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.decode_with_prefix_space = decode_with_prefix_space
    self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}

    # SentencePiece backend.
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    # Cache for `no_prefix_space_tokens`; filled on first access.
    self._no_prefix_space_tokens = None

    super().__init__(
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
        pad_token=pad_token,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        **kwargs,
    )
|
79 |
+
|
80 |
+
@property
def no_prefix_space_tokens(self):
    """Set of vocabulary ids whose piece does not start with "▁" (computed once, then cached)."""
    cached = self._no_prefix_space_tokens
    if cached is not None:
        return cached

    pieces = self.convert_ids_to_tokens(list(range(self.vocab_size)))
    cached = set()
    for index, piece in enumerate(pieces):
        if not piece.startswith("▁"):
            cached.add(index)
    self._no_prefix_space_tokens = cached
    return cached
|
86 |
+
|
87 |
+
@property
def vocab_size(self):
    """Number of pieces reported by the SentencePiece model."""
    piece_count = self.sp_model.get_piece_size()
    return piece_count
|
91 |
+
|
92 |
+
@property
def bos_token_id(self) -> Optional[int]:
    """Id of the beginning-of-sequence token as reported by the SentencePiece model."""
    bos = self.sp_model.bos_id()
    return bos
|
95 |
+
|
96 |
+
@property
def eos_token_id(self) -> Optional[int]:
    """Id of the end-of-sequence token as reported by the SentencePiece model."""
    eos = self.sp_model.eos_id()
    return eos
|
99 |
+
|
100 |
+
def get_vocab(self):
    """Return a token -> id dict covering the base vocabulary plus added tokens.

    Entries from `added_tokens_encoder` are applied last and therefore win
    over base-vocabulary entries with the same token.
    """
    mapping = {}
    for token_id in range(self.vocab_size):
        mapping[self.convert_ids_to_tokens(token_id)] = token_id
    mapping.update(self.added_tokens_encoder)
    return mapping
|
105 |
+
|
106 |
+
def _tokenize(self, text):
|
107 |
+
"""Returns a tokenized string."""
|
108 |
+
return self.sp_model.encode(text, out_type=str)
|
109 |
+
|
110 |
+
def _convert_token_to_id(self, token):
|
111 |
+
"""Converts a token (str) in an id using the vocab."""
|
112 |
+
return self.sp_model.piece_to_id(token)
|
113 |
+
|
114 |
+
def _convert_id_to_token(self, index):
|
115 |
+
"""Converts an index (integer) in a token (str) using the vocab."""
|
116 |
+
token = self.sp_model.IdToPiece(index)
|
117 |
+
return token
|
118 |
+
|
119 |
+
def _maybe_add_prefix_space(self, tokens, decoded):
|
120 |
+
if tokens and tokens[0] not in self.no_prefix_space_tokens:
|
121 |
+
return " " + decoded
|
122 |
+
else:
|
123 |
+
return decoded
|
124 |
+
|
125 |
+
def convert_tokens_to_string(self, tokens):
    """
    Join a sequence of tokens (str) back into one string.

    Runs of ordinary tokens are decoded together through `self.sp_model`; special
    tokens are copied verbatim. A single space is inserted before a special token
    unless the previous token was special too.
    """
    special_tokens = self.all_special_tokens
    decode = self.sp_model.decode
    parts = []
    pending = []
    after_special = False
    for tok in tokens:
        if tok not in special_tokens:
            pending.append(tok)
            after_special = False
            continue
        # special tokens must not go through the sentencepiece model
        if not after_special:
            parts.append(" ")
        parts.append(decode(pending) + tok)
        pending = []
        after_special = True
    parts.append(decode(pending))
    text = self.clean_up_tokenization("".join(parts))
    text = self._maybe_add_prefix_space(tokens=tokens, decoded=text)
    # drop the leading character of the result
    return text[1:]
|
145 |
+
|
146 |
+
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the tokenizer's vocabulary file into a directory.

    Args:
        save_directory (`str`):
            Existing directory that receives the vocabulary file.
        filename_prefix (`str`, *optional*):
            When given, it is put in front of the file name, followed by "-".

    Returns:
        `Tuple(str)`: One-element tuple holding the path of the saved file. When `save_directory` is not a
        directory, an error is logged and `None` is returned instead.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    source = self.vocab_file
    is_other_path = os.path.abspath(source) != os.path.abspath(out_vocab_file)
    if is_other_path and os.path.isfile(source):
        # the original file still exists somewhere else: copy it over
        copyfile(source, out_vocab_file)
    elif not os.path.isfile(source):
        # original file is gone: dump the in-memory model instead
        with open(out_vocab_file, "wb") as handle:
            handle.write(self.sp_model.serialized_model_proto())

    return (out_vocab_file,)
|
172 |
+
|
173 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Assemble model inputs as `[bos] + token_ids_0 (+ token_ids_1) + [eos]`.

    The BOS id is only included when `self.add_bos_token` is set, the EOS id only
    when `self.add_eos_token` is set. Nothing is inserted between the two
    sequences of a pair.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    sequence = prefix + token_ids_0
    if token_ids_1 is not None:
        sequence = sequence + token_ids_1
    if self.add_eos_token:
        sequence = sequence + [self.eos_token_id]
    return sequence
|
188 |
+
|
189 |
+
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve the special-tokens mask for a token list that has no special tokens added. This method is called when
    adding special tokens using the tokenizer `prepare_for_model` method.

    The mask mirrors the layout produced by `build_inputs_with_special_tokens`: an optional BOS (only when
    `add_bos_token` is set), the first sequence, the optional second sequence with nothing in between, and an
    optional EOS (only when `add_eos_token` is set).

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    # Only mark the special tokens that `build_inputs_with_special_tokens` actually adds,
    # so the mask always has the same length as the built input ids.
    bos_mask = [1] if self.add_bos_token else []
    eos_mask = [1] if self.add_eos_token else []
    sequence_mask = [0] * len(token_ids_0)
    if token_ids_1 is not None:
        sequence_mask += [0] * len(token_ids_1)
    return bos_mask + sequence_mask + eos_mask
|
215 |
+
|
216 |
+
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create the token type IDs for a sequence or a sequence pair. Every position gets type 0; the list has the same
    length as the output of `build_inputs_with_special_tokens` for the same arguments (optional BOS, both
    sequences, optional EOS).

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros.
    """
    length = len(token_ids_0)
    if token_ids_1 is not None:
        length += len(token_ids_1)
    # Count only the special tokens that are really added around the sequence(s).
    if self.add_bos_token:
        length += 1
    if self.add_eos_token:
        length += 1
    return [0] * length
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenization_internlm2_fast.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization Fast class for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, Optional, Tuple
|
22 |
+
|
23 |
+
from tokenizers import processors, decoders, Tokenizer, normalizers
|
24 |
+
from tokenizers.models import BPE
|
25 |
+
|
26 |
+
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
|
27 |
+
from transformers.utils import logging
|
28 |
+
|
29 |
+
from transformers.convert_slow_tokenizer import (
|
30 |
+
SLOW_TO_FAST_CONVERTERS,
|
31 |
+
SpmConverter,
|
32 |
+
SentencePieceExtractor,
|
33 |
+
)
|
34 |
+
|
35 |
+
from .tokenization_internlm2 import InternLM2Tokenizer
|
36 |
+
|
37 |
+
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)
|
38 |
+
|
39 |
+
# File name of the tokenizer model file. Exposed as `vocab_files_names` on the fast
# tokenizer class and used by `save_vocabulary` to build the output path.
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
40 |
+
|
41 |
+
# Modified from transformers.convert_slow_tokenizer.LlamaConverter
|
42 |
+
class InternLM2Converter(SpmConverter):
    """
    Converter that turns the slow InternLM2 tokenizer into a `tokenizers` BPE tokenizer.

    Only model type 2 of the SentencePiece trainer spec is accepted (the branch that builds a `BPE` model);
    every other model type raises.
    """

    handle_byte_fallback = True

    def vocab(self, proto):
        """Return `(piece, score)` pairs; ids 0-2 are pinned to `<unk>`, `<s>`, `</s>` with score 0.0."""
        vocab = [
            ("<unk>", 0.0),
            ("<s>", 0.0),
            ("</s>", 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        """Id of the unknown token: always 0, matching the first entry of `vocab`."""
        return 0

    def decoder(self, replacement, add_prefix_space):
        """Build the decoder chain: "▁" -> " ", byte fallback, fuse, and an optional left strip."""
        decoders_sequence = [
            decoders.Replace("▁", " "),
            decoders.ByteFallback(),
            decoders.Fuse(),
        ]
        if self.proto.normalizer_spec.add_dummy_prefix:
            # Remove the one leading space that stems from the dummy prefix.
            decoders_sequence.append(decoders.Strip(content=" ", left=1))
        return decoders.Sequence(decoders_sequence)

    def tokenizer(self, proto):
        """
        Build the `tokenizers.Tokenizer` from the SentencePiece proto.

        Raises:
            RuntimeError: if the proto's model type is 1.
            Exception: if the proto's model type is neither 1 nor 2.
        """
        model_type = proto.trainer_spec.model_type
        vocab_scores = self.vocab(proto)
        # special tokens: use the content registered on the slow tokenizer for those ids
        added_tokens = self.original_tokenizer.added_tokens_decoder
        for i, (_piece, score) in enumerate(vocab_scores):
            if i in added_tokens:
                vocab_scores[i] = (added_tokens[i].content, score)
        if model_type == 1:
            raise RuntimeError("InternLM2 is supposed to be a BPE model!")

        elif model_type == 2:
            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
            tokenizer = Tokenizer(
                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
            )
            tokenizer.add_special_tokens(list(added_tokens.values()))
        else:
            raise Exception(
                "You're trying to run a `BPE` model but your file was trained with a different algorithm"
            )

        return tokenizer

    def normalizer(self, proto):
        """Prepend "▁" when the proto uses a dummy prefix, then replace every space with "▁"."""
        normalizers_list = []
        if proto.normalizer_spec.add_dummy_prefix:
            normalizers_list.append(normalizers.Prepend(prepend="▁"))
        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
        return normalizers.Sequence(normalizers_list)

    def pre_tokenizer(self, replacement, add_prefix_space):
        """No pre-tokenizer is used."""
        return None
|
105 |
+
|
106 |
+
# Register the converter under the slow tokenizer's class name in the
# SLOW_TO_FAST_CONVERTERS table imported from transformers.convert_slow_tokenizer.
SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
|
107 |
+
|
108 |
+
|
109 |
+
# Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
|
110 |
+
class InternLM2TokenizerFast(PreTrainedTokenizerFast):
    """
    Fast tokenizer for InternLM2.

    The post processor is rebuilt whenever `add_bos_token` or `add_eos_token` changes, so that the BOS/EOS
    tokens are added (or not) around encoded sequences accordingly.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = InternLM2Tokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            decode_with_prefix_space=decode_with_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # Set the private fields directly, then build the post processor once.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """`True` only when `vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.

        Raises:
            ValueError: if a token is requested (`add_bos_token` / `add_eos_token`) but not defined.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # Template strings for TemplateProcessing; ":0" / ":1" are the type ids of the two segments.
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether an EOS token is appended by the post processor."""
        return self._add_eos_token

    @property
    def add_bos_token(self):
        """Whether a BOS token is prepended by the post processor."""
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the vocabulary file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory that receives the vocabulary file.
            filename_prefix (`str`, *optional*):
                When given, it is put in front of the file name, followed by "-".

        Returns:
            `Tuple(str)`: One-element tuple holding the path of the saved file. When `save_directory` is not a
            directory, an error is logged and `None` is returned instead.

        Raises:
            ValueError: if `can_save_slow_tokenizer` is false.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # VOCAB_FILES_NAMES stores "./tokenizer.model"; take the bare file name so that a prefix yields
        # "<prefix>-tokenizer.model" instead of a path inside a non-existent "<prefix>-." directory.
        vocab_file_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + vocab_file_name
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
|
3 |
+
size 1477754
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/tokenizer_config.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"92538": {
|
30 |
+
"content": "<|plugin|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"92539": {
|
38 |
+
"content": "<|interpreter|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"92540": {
|
46 |
+
"content": "<|action_end|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"92541": {
|
54 |
+
"content": "<|action_start|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"92542": {
|
62 |
+
"content": "<|im_end|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"92543": {
|
70 |
+
"content": "<|im_start|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"additional_special_tokens": [
|
79 |
+
"<|im_start|>",
|
80 |
+
"<|im_end|>",
|
81 |
+
"<|action_start|>",
|
82 |
+
"<|action_end|>",
|
83 |
+
"<|interpreter|>",
|
84 |
+
"<|plugin|>"
|
85 |
+
],
|
86 |
+
"auto_map": {
|
87 |
+
"AutoTokenizer": [
|
88 |
+
"tokenization_internlm2.InternLM2Tokenizer",
|
89 |
+
"tokenization_internlm2_fast.InternLM2TokenizerFast"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
"bos_token": "<s>",
|
93 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
94 |
+
"clean_up_tokenization_spaces": false,
|
95 |
+
"decode_with_prefix_space": false,
|
96 |
+
"eos_token": "<|im_end|>",
|
97 |
+
"model_max_length": 1000000000000000019884624838656,
|
98 |
+
"pad_token": "</s>",
|
99 |
+
"padding_side": "right",
|
100 |
+
"sp_model_kwargs": null,
|
101 |
+
"split_special_tokens": false,
|
102 |
+
"tokenizer_class": "InternLM2Tokenizer",
|
103 |
+
"unk_token": "<unk>"
|
104 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/trainer_state.json
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 2.997333333333333,
|
5 |
+
"eval_steps": 562,
|
6 |
+
"global_step": 1686,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.17777777777777778,
|
13 |
+
"grad_norm": 2.4078445434570312,
|
14 |
+
"learning_rate": 2.958579881656805e-05,
|
15 |
+
"loss": 0.4734,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.35555555555555557,
|
20 |
+
"grad_norm": 4.478668689727783,
|
21 |
+
"learning_rate": 5.91715976331361e-05,
|
22 |
+
"loss": 0.3667,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.5333333333333333,
|
27 |
+
"grad_norm": 4.706894397735596,
|
28 |
+
"learning_rate": 8.875739644970414e-05,
|
29 |
+
"loss": 0.3744,
|
30 |
+
"step": 300
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.7111111111111111,
|
34 |
+
"grad_norm": 2.9772543907165527,
|
35 |
+
"learning_rate": 9.989699867437137e-05,
|
36 |
+
"loss": 0.3797,
|
37 |
+
"step": 400
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.8888888888888888,
|
41 |
+
"grad_norm": 6.316380023956299,
|
42 |
+
"learning_rate": 9.92981892269398e-05,
|
43 |
+
"loss": 0.3443,
|
44 |
+
"step": 500
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.9991111111111111,
|
48 |
+
"eval_accuracy": 0.7856666666666667,
|
49 |
+
"eval_loss": 0.7739760279655457,
|
50 |
+
"eval_runtime": 87.6689,
|
51 |
+
"eval_samples_per_second": 5.703,
|
52 |
+
"eval_steps_per_second": 5.703,
|
53 |
+
"step": 562
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"epoch": 1.0666666666666667,
|
57 |
+
"grad_norm": 3.866260051727295,
|
58 |
+
"learning_rate": 9.817128546774103e-05,
|
59 |
+
"loss": 0.3644,
|
60 |
+
"step": 600
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"epoch": 1.2444444444444445,
|
64 |
+
"grad_norm": 2.369489908218384,
|
65 |
+
"learning_rate": 9.652835906663704e-05,
|
66 |
+
"loss": 0.3146,
|
67 |
+
"step": 700
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"epoch": 1.4222222222222223,
|
71 |
+
"grad_norm": 4.548018932342529,
|
72 |
+
"learning_rate": 9.438700945477697e-05,
|
73 |
+
"loss": 0.3365,
|
74 |
+
"step": 800
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"epoch": 1.6,
|
78 |
+
"grad_norm": 12.139796257019043,
|
79 |
+
"learning_rate": 9.177017529516772e-05,
|
80 |
+
"loss": 0.2878,
|
81 |
+
"step": 900
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 1.7777777777777777,
|
85 |
+
"grad_norm": 0.8868021965026855,
|
86 |
+
"learning_rate": 8.870588875808164e-05,
|
87 |
+
"loss": 0.3291,
|
88 |
+
"step": 1000
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"epoch": 1.9555555555555557,
|
92 |
+
"grad_norm": 0.3065238893032074,
|
93 |
+
"learning_rate": 8.522697523356319e-05,
|
94 |
+
"loss": 0.2604,
|
95 |
+
"step": 1100
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"epoch": 1.9982222222222221,
|
99 |
+
"eval_accuracy": 0.8555,
|
100 |
+
"eval_loss": 0.38267847895622253,
|
101 |
+
"eval_runtime": 86.9945,
|
102 |
+
"eval_samples_per_second": 5.747,
|
103 |
+
"eval_steps_per_second": 5.747,
|
104 |
+
"step": 1124
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"epoch": 2.1333333333333333,
|
108 |
+
"grad_norm": 1.911468744277954,
|
109 |
+
"learning_rate": 8.137070169778812e-05,
|
110 |
+
"loss": 0.2122,
|
111 |
+
"step": 1200
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 2.311111111111111,
|
115 |
+
"grad_norm": 3.41011381149292,
|
116 |
+
"learning_rate": 7.717837750006106e-05,
|
117 |
+
"loss": 0.2322,
|
118 |
+
"step": 1300
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"epoch": 2.488888888888889,
|
122 |
+
"grad_norm": 4.826447486877441,
|
123 |
+
"learning_rate": 7.269491184691924e-05,
|
124 |
+
"loss": 0.2091,
|
125 |
+
"step": 1400
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"epoch": 2.6666666666666665,
|
129 |
+
"grad_norm": 5.958643913269043,
|
130 |
+
"learning_rate": 6.79683327236813e-05,
|
131 |
+
"loss": 0.229,
|
132 |
+
"step": 1500
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"epoch": 2.8444444444444446,
|
136 |
+
"grad_norm": 0.9083921909332275,
|
137 |
+
"learning_rate": 6.304927240687181e-05,
|
138 |
+
"loss": 0.2278,
|
139 |
+
"step": 1600
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"epoch": 2.997333333333333,
|
143 |
+
"eval_accuracy": 0.867,
|
144 |
+
"eval_loss": 0.4158739745616913,
|
145 |
+
"eval_runtime": 87.1063,
|
146 |
+
"eval_samples_per_second": 5.74,
|
147 |
+
"eval_steps_per_second": 5.74,
|
148 |
+
"step": 1686
|
149 |
+
}
|
150 |
+
],
|
151 |
+
"logging_steps": 100,
|
152 |
+
"max_steps": 3372,
|
153 |
+
"num_input_tokens_seen": 0,
|
154 |
+
"num_train_epochs": 6,
|
155 |
+
"save_steps": 562,
|
156 |
+
"stateful_callbacks": {
|
157 |
+
"TrainerControl": {
|
158 |
+
"args": {
|
159 |
+
"should_epoch_stop": false,
|
160 |
+
"should_evaluate": false,
|
161 |
+
"should_log": false,
|
162 |
+
"should_save": true,
|
163 |
+
"should_training_stop": false
|
164 |
+
},
|
165 |
+
"attributes": {}
|
166 |
+
}
|
167 |
+
},
|
168 |
+
"total_flos": 2.033572224618332e+17,
|
169 |
+
"train_batch_size": 1,
|
170 |
+
"trial_name": null,
|
171 |
+
"trial_params": null
|
172 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-1686/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba59bb13ccbc1536fe7be63d6753c0657b8ec8334daccb6ec740876a5104de43
|
3 |
+
size 5304
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: internlm/internlm2_5-7b-chat-1m
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.11.1
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/adapter_config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "internlm/internlm2_5-7b-chat-1m",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.0,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"wo",
|
24 |
+
"w2",
|
25 |
+
"wqkv",
|
26 |
+
"w1",
|
27 |
+
"w3"
|
28 |
+
],
|
29 |
+
"task_type": "CAUSAL_LM",
|
30 |
+
"use_dora": false,
|
31 |
+
"use_rslora": false
|
32 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b277bee0209c45ea1420f055de02a7a042fcfdf2ad56f282ac6a8e8761c9a96c
|
3 |
+
size 75539712
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91bf9474a2ce630f35349cc4bed2d0abff6089338725016db52c7934e05282f8
|
3 |
+
size 151264058
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
|
3 |
+
size 14244
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eeb45374edbc927ec21316a30c15fe7e5f3e224c067c49fa377d0dbe48eada2
|
3 |
+
size 1064
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/special_tokens_map.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|action_start|>",
|
6 |
+
"<|action_end|>",
|
7 |
+
"<|interpreter|>",
|
8 |
+
"<|plugin|>"
|
9 |
+
],
|
10 |
+
"bos_token": {
|
11 |
+
"content": "<s>",
|
12 |
+
"lstrip": false,
|
13 |
+
"normalized": false,
|
14 |
+
"rstrip": false,
|
15 |
+
"single_word": false
|
16 |
+
},
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "</s>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
},
|
31 |
+
"unk_token": {
|
32 |
+
"content": "<unk>",
|
33 |
+
"lstrip": false,
|
34 |
+
"normalized": false,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false
|
37 |
+
}
|
38 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenization_internlm2.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization classes for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, List, Optional, Tuple
|
22 |
+
|
23 |
+
import sentencepiece as spm
|
24 |
+
from transformers.tokenization_utils import PreTrainedTokenizer
|
25 |
+
from transformers.utils import logging
|
26 |
+
|
27 |
+
logger = logging.get_logger(__name__)
|
28 |
+
|
29 |
+
# File-name map for this tokenizer: the "vocab_file" entry is the SentencePiece model file name
# that `save_vocabulary` writes into the target directory.
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
30 |
+
|
31 |
+
PRETRAINED_VOCAB_FILES_MAP = {}
|
32 |
+
|
33 |
+
|
34 |
+
# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
|
35 |
+
class InternLM2Tokenizer(PreTrainedTokenizer):
    """
    Construct an InternLM2 tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file; it is loaded in `__init__`.
        unk_token, bos_token, eos_token, pad_token (`str`):
            Special tokens forwarded to `PreTrainedTokenizer.__init__`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments forwarded to `sentencepiece.SentencePieceProcessor`.
        add_bos_token (`bool`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id.
        add_eos_token (`bool`):
            Whether `build_inputs_with_special_tokens` appends the EOS id.
        decode_with_prefix_space (`bool`):
            Stored on the instance; not read by any method of this class.
        clean_up_tokenization_spaces (`bool`):
            Forwarded to `PreTrainedTokenizer.__init__`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.decode_with_prefix_space = decode_with_prefix_space
        # The SentencePiece model must be loaded before super().__init__ runs, because the
        # vocabulary accessors below (vocab_size, get_vocab, ...) read from self.sp_model.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self._no_prefix_space_tokens = None  # lazily filled by `no_prefix_space_tokens`
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def no_prefix_space_tokens(self):
        """Set of vocabulary *ids* whose piece does not start with "▁" (computed once, then cached)."""
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
        return self._no_prefix_space_tokens

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    @property
    def bos_token_id(self) -> Optional[int]:
        # Read directly from the SentencePiece model rather than from the configured `bos_token`.
        return self.sp_model.bos_id()

    @property
    def eos_token_id(self) -> Optional[int]:
        # Read directly from the SentencePiece model rather than from the configured `eos_token`.
        return self.sp_model.eos_id()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def _maybe_add_prefix_space(self, tokens, decoded):
        """Return `decoded` with a leading space unless `tokens[0]` is in `no_prefix_space_tokens`."""
        # NOTE(review): `no_prefix_space_tokens` holds integer ids, while `convert_tokens_to_string`
        # passes token strings here, so this membership test looks like it never matches and the
        # space is always added (and then stripped again by the caller) — confirm before changing.
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        else:
            return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # Drop the first character (the prefix space added just above).
        return out_string[1:]

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                Prefix prepended (followed by "-") to the saved file name.

        Returns:
            `Tuple(str)`: Paths to the files saved. If `save_directory` is not a directory, an error
            is logged and `None` is returned.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_name = VOCAB_FILES_NAMES["vocab_file"]
        if filename_prefix:
            # The configured name starts with "./"; prefixing it verbatim would yield
            # "<prefix>-./tokenizer.model", i.e. a file inside a non-existent "<prefix>-." directory.
            vocab_name = filename_prefix + "-" + os.path.basename(vocab_name)
        out_vocab_file = os.path.join(save_directory, vocab_name)

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original model file is gone: re-serialize the in-memory SentencePiece model.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Concatenate the sequence(s) and add special tokens.

        The layout is `[bos] + token_ids_0 (+ token_ids_1) + [eos]`, where BOS is present only when
        `add_bos_token` is set and EOS only when `add_eos_token` is set.
        """
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is not None:
            output = output + token_ids_1

        if self.add_eos_token:
            output = output + [self.eos_token_id]

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        The mask mirrors the layout produced by `build_inputs_with_special_tokens`, so it has the
        same length as the ids that method returns.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only mark the special tokens that build_inputs_with_special_tokens actually inserts.
        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []
        sequence_mask = [0] * len(token_ids_0)
        if token_ids_1 is not None:
            sequence_mask += [0] * len(token_ids_1)
        return bos_mask + sequence_mask + eos_mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type ids for the sequence(s). This tokenizer does not distinguish segments,
        so a list of zeros is returned, one per id produced by `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        # Derive the length from the real input layout so both always agree.
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)) * [0]
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenization_internlm2_fast.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
"""Tokenization Fast class for InternLM."""
|
19 |
+
import os
|
20 |
+
from shutil import copyfile
|
21 |
+
from typing import Any, Dict, Optional, Tuple
|
22 |
+
|
23 |
+
from tokenizers import processors, decoders, Tokenizer, normalizers
|
24 |
+
from tokenizers.models import BPE
|
25 |
+
|
26 |
+
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
|
27 |
+
from transformers.utils import logging
|
28 |
+
|
29 |
+
from transformers.convert_slow_tokenizer import (
|
30 |
+
SLOW_TO_FAST_CONVERTERS,
|
31 |
+
SpmConverter,
|
32 |
+
SentencePieceExtractor,
|
33 |
+
)
|
34 |
+
|
35 |
+
from .tokenization_internlm2 import InternLM2Tokenizer
|
36 |
+
|
37 |
+
logger = logging.get_logger(__name__)
|
38 |
+
|
39 |
+
# File-name map for this tokenizer: the "vocab_file" entry is the SentencePiece model file name
# that `save_vocabulary` copies into the target directory.
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
40 |
+
|
41 |
+
# Modified from transformers.convert_slow_tokenizer.LlamaConverter
|
42 |
+
class InternLM2Converter(SpmConverter):
    """Converter that builds a `tokenizers` BPE tokenizer from a slow InternLM2 SentencePiece tokenizer."""

    handle_byte_fallback = True

    def vocab(self, proto):
        """Return `(piece, score)` pairs: three fixed special tokens, then `proto.pieces[3:]`."""
        vocab = [
            ("<unk>", 0.0),
            ("<s>", 0.0),
            ("</s>", 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        """Return the id of the unknown token (index 0 in `vocab`)."""
        return 0

    def decoder(self, replacement, add_prefix_space):
        """Build the decoder: "▁" -> space, byte fallback, fuse, and optional strip of one leading space."""
        decoders_sequence = [
            decoders.Replace("▁", " "),
            decoders.ByteFallback(),
            decoders.Fuse(),
        ]
        # The normalizer prepends "▁" when add_dummy_prefix is set, so strip the resulting space.
        if self.proto.normalizer_spec.add_dummy_prefix:
            decoders_sequence.append(decoders.Strip(content=" ", left=1))
        return decoders.Sequence(decoders_sequence)

    def tokenizer(self, proto):
        """
        Build the `tokenizers.Tokenizer` for a BPE SentencePiece model.

        Raises:
            RuntimeError: if `proto.trainer_spec.model_type` is 1.
            Exception: if the model type is neither 1 nor 2.
        """
        model_type = proto.trainer_spec.model_type
        # special tokens: pieces at ids that the slow tokenizer registered as added tokens are
        # replaced by the added token's content, keeping the original score.
        added_tokens = self.original_tokenizer.added_tokens_decoder
        vocab_scores = [
            (added_tokens[i].content, score) if i in added_tokens else (piece, score)
            for i, (piece, score) in enumerate(self.vocab(proto))
        ]
        if model_type == 1:
            raise RuntimeError("InternLM2 is supposed to be a BPE model!")

        elif model_type == 2:
            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
            tokenizer = Tokenizer(
                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
            )
            tokenizer.add_special_tokens(list(added_tokens.values()))
        else:
            # This converter only supports BPE, so say so instead of mentioning `Unigram`.
            raise Exception(
                "InternLM2 is supposed to be a BPE model, but your file was trained with a different algorithm"
            )

        return tokenizer

    def normalizer(self, proto):
        """Build the normalizer: optional "▁" prefix, then every space replaced by "▁"."""
        normalizers_list = []
        if proto.normalizer_spec.add_dummy_prefix:
            normalizers_list.append(normalizers.Prepend(prepend="▁"))
        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
        return normalizers.Sequence(normalizers_list)

    def pre_tokenizer(self, replacement, add_prefix_space):
        """No pre-tokenizer is used."""
        return None
|
105 |
+
|
106 |
+
# Register the converter in transformers' slow-to-fast converter table, keyed by the slow
# tokenizer's class name.
SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
|
107 |
+
|
108 |
+
|
109 |
+
# Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
|
110 |
+
class InternLM2TokenizerFast(PreTrainedTokenizerFast):
    """
    Fast InternLM2 tokenizer built on `PreTrainedTokenizerFast`.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file; kept so the slow vocabulary can be saved.
        unk_token, bos_token, eos_token, pad_token (`str`):
            Special tokens forwarded to `PreTrainedTokenizerFast.__init__`.
        sp_model_kwargs (`dict`, *optional*):
            Forwarded to `PreTrainedTokenizerFast.__init__`.
        add_bos_token (`bool`):
            Whether the post processor prepends the BOS token.
        add_eos_token (`bool`):
            Whether the post processor appends the EOS token.
        decode_with_prefix_space, clean_up_tokenization_spaces (`bool`):
            Forwarded to `PreTrainedTokenizerFast.__init__`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = InternLM2Tokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            decode_with_prefix_space=decode_with_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # Write the private fields directly: the public setters would rebuild the post processor
        # once per flag; it is rebuilt a single time just below instead.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """True when `vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.

        Raises:
            ValueError: if a BOS/EOS token is requested via `add_bos_token`/`add_eos_token` but the
                corresponding token is `None`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # TemplateProcessing templates: "$A"/"$B" are the sequences, ":0"/":1" their type ids.
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether the post processor appends the EOS token."""
        return self._add_eos_token

    @property
    def add_bos_token(self):
        """Whether the post processor prepends the BOS token."""
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece model file into `save_directory`.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                Prefix prepended (followed by "-") to the saved file name.

        Returns:
            `Tuple(str)`: Paths to the files saved. If `save_directory` is not a directory, an error
            is logged and `None` is returned.

        Raises:
            ValueError: if the original vocabulary file is not available (`can_save_slow_tokenizer`
                is false).
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_name = VOCAB_FILES_NAMES["vocab_file"]
        if filename_prefix:
            # The configured name starts with "./"; prefixing it verbatim would yield
            # "<prefix>-./tokenizer.model", i.e. a file inside a non-existent "<prefix>-." directory.
            vocab_name = filename_prefix + "-" + os.path.basename(vocab_name)
        out_vocab_file = os.path.join(save_directory, vocab_name)

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
|
3 |
+
size 1477754
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/tokenizer_config.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"92538": {
|
30 |
+
"content": "<|plugin|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"92539": {
|
38 |
+
"content": "<|interpreter|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"92540": {
|
46 |
+
"content": "<|action_end|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"92541": {
|
54 |
+
"content": "<|action_start|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"92542": {
|
62 |
+
"content": "<|im_end|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"92543": {
|
70 |
+
"content": "<|im_start|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"additional_special_tokens": [
|
79 |
+
"<|im_start|>",
|
80 |
+
"<|im_end|>",
|
81 |
+
"<|action_start|>",
|
82 |
+
"<|action_end|>",
|
83 |
+
"<|interpreter|>",
|
84 |
+
"<|plugin|>"
|
85 |
+
],
|
86 |
+
"auto_map": {
|
87 |
+
"AutoTokenizer": [
|
88 |
+
"tokenization_internlm2.InternLM2Tokenizer",
|
89 |
+
"tokenization_internlm2_fast.InternLM2TokenizerFast"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
"bos_token": "<s>",
|
93 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
94 |
+
"clean_up_tokenization_spaces": false,
|
95 |
+
"decode_with_prefix_space": false,
|
96 |
+
"eos_token": "<|im_end|>",
|
97 |
+
"model_max_length": 1000000000000000019884624838656,
|
98 |
+
"pad_token": "</s>",
|
99 |
+
"padding_side": "right",
|
100 |
+
"sp_model_kwargs": null,
|
101 |
+
"split_special_tokens": false,
|
102 |
+
"tokenizer_class": "InternLM2Tokenizer",
|
103 |
+
"unk_token": "<unk>"
|
104 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/trainer_state.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.9991111111111111,
|
5 |
+
"eval_steps": 562,
|
6 |
+
"global_step": 562,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.17777777777777778,
|
13 |
+
"grad_norm": 2.4078445434570312,
|
14 |
+
"learning_rate": 2.958579881656805e-05,
|
15 |
+
"loss": 0.4734,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.35555555555555557,
|
20 |
+
"grad_norm": 4.478668689727783,
|
21 |
+
"learning_rate": 5.91715976331361e-05,
|
22 |
+
"loss": 0.3667,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.5333333333333333,
|
27 |
+
"grad_norm": 4.706894397735596,
|
28 |
+
"learning_rate": 8.875739644970414e-05,
|
29 |
+
"loss": 0.3744,
|
30 |
+
"step": 300
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.7111111111111111,
|
34 |
+
"grad_norm": 2.9772543907165527,
|
35 |
+
"learning_rate": 9.989699867437137e-05,
|
36 |
+
"loss": 0.3797,
|
37 |
+
"step": 400
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.8888888888888888,
|
41 |
+
"grad_norm": 6.316380023956299,
|
42 |
+
"learning_rate": 9.92981892269398e-05,
|
43 |
+
"loss": 0.3443,
|
44 |
+
"step": 500
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.9991111111111111,
|
48 |
+
"eval_accuracy": 0.7856666666666667,
|
49 |
+
"eval_loss": 0.7739760279655457,
|
50 |
+
"eval_runtime": 87.6689,
|
51 |
+
"eval_samples_per_second": 5.703,
|
52 |
+
"eval_steps_per_second": 5.703,
|
53 |
+
"step": 562
|
54 |
+
}
|
55 |
+
],
|
56 |
+
"logging_steps": 100,
|
57 |
+
"max_steps": 3372,
|
58 |
+
"num_input_tokens_seen": 0,
|
59 |
+
"num_train_epochs": 6,
|
60 |
+
"save_steps": 562,
|
61 |
+
"stateful_callbacks": {
|
62 |
+
"TrainerControl": {
|
63 |
+
"args": {
|
64 |
+
"should_epoch_stop": false,
|
65 |
+
"should_evaluate": false,
|
66 |
+
"should_log": false,
|
67 |
+
"should_save": true,
|
68 |
+
"should_training_stop": false
|
69 |
+
},
|
70 |
+
"attributes": {}
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"total_flos": 6.778373413443994e+16,
|
74 |
+
"train_batch_size": 1,
|
75 |
+
"trial_name": null,
|
76 |
+
"trial_params": null
|
77 |
+
}
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/checkpoint-562/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba59bb13ccbc1536fe7be63d6753c0657b8ec8334daccb6ec740876a5104de43
|
3 |
+
size 5304
|
llama-factory/saves/internlm2_5_7b/lora/sft_bf16/trainer_log.jsonl
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"current_steps": 100, "total_steps": 3372, "loss": 0.4734, "learning_rate": 2.958579881656805e-05, "epoch": 0.17777777777777778, "percentage": 2.97, "elapsed_time": "0:07:18", "remaining_time": "3:59:15", "throughput": "0.00", "total_tokens": 0}
|
2 |
+
{"current_steps": 200, "total_steps": 3372, "loss": 0.3667, "learning_rate": 5.91715976331361e-05, "epoch": 0.35555555555555557, "percentage": 5.93, "elapsed_time": "0:14:40", "remaining_time": "3:52:41", "throughput": "0.00", "total_tokens": 0}
|
3 |
+
{"current_steps": 300, "total_steps": 3372, "loss": 0.3744, "learning_rate": 8.875739644970414e-05, "epoch": 0.5333333333333333, "percentage": 8.9, "elapsed_time": "0:22:01", "remaining_time": "3:45:32", "throughput": "0.00", "total_tokens": 0}
|
4 |
+
{"current_steps": 400, "total_steps": 3372, "loss": 0.3797, "learning_rate": 9.989699867437137e-05, "epoch": 0.7111111111111111, "percentage": 11.86, "elapsed_time": "0:29:21", "remaining_time": "3:38:10", "throughput": "0.00", "total_tokens": 0}
|
5 |
+
{"current_steps": 500, "total_steps": 3372, "loss": 0.3443, "learning_rate": 9.92981892269398e-05, "epoch": 0.8888888888888888, "percentage": 14.83, "elapsed_time": "0:36:42", "remaining_time": "3:30:48", "throughput": "0.00", "total_tokens": 0}
|
6 |
+
{"current_steps": 562, "total_steps": 3372, "eval_loss": 0.7739760279655457, "epoch": 0.9991111111111111, "percentage": 16.67, "elapsed_time": "0:42:43", "remaining_time": "3:33:37", "throughput": "0.00", "total_tokens": 0}
|
7 |
+
{"current_steps": 600, "total_steps": 3372, "loss": 0.3644, "learning_rate": 9.817128546774103e-05, "epoch": 1.0666666666666667, "percentage": 17.79, "elapsed_time": "0:45:32", "remaining_time": "3:30:25", "throughput": "0.00", "total_tokens": 0}
|
8 |
+
{"current_steps": 700, "total_steps": 3372, "loss": 0.3146, "learning_rate": 9.652835906663704e-05, "epoch": 1.2444444444444445, "percentage": 20.76, "elapsed_time": "0:52:52", "remaining_time": "3:21:49", "throughput": "0.00", "total_tokens": 0}
|
9 |
+
{"current_steps": 800, "total_steps": 3372, "loss": 0.3365, "learning_rate": 9.438700945477697e-05, "epoch": 1.4222222222222223, "percentage": 23.72, "elapsed_time": "1:00:11", "remaining_time": "3:13:29", "throughput": "0.00", "total_tokens": 0}
|
10 |
+
{"current_steps": 900, "total_steps": 3372, "loss": 0.2878, "learning_rate": 9.177017529516772e-05, "epoch": 1.6, "percentage": 26.69, "elapsed_time": "1:07:29", "remaining_time": "3:05:23", "throughput": "0.00", "total_tokens": 0}
|
11 |
+
{"current_steps": 1000, "total_steps": 3372, "loss": 0.3291, "learning_rate": 8.870588875808164e-05, "epoch": 1.7777777777777777, "percentage": 29.66, "elapsed_time": "1:14:48", "remaining_time": "2:57:27", "throughput": "0.00", "total_tokens": 0}
|
12 |
+
{"current_steps": 1100, "total_steps": 3372, "loss": 0.2604, "learning_rate": 8.522697523356319e-05, "epoch": 1.9555555555555557, "percentage": 32.62, "elapsed_time": "1:22:07", "remaining_time": "2:49:37", "throughput": "0.00", "total_tokens": 0}
|
13 |
+
{"current_steps": 1124, "total_steps": 3372, "eval_loss": 0.38267847895622253, "epoch": 1.9982222222222221, "percentage": 33.33, "elapsed_time": "1:25:19", "remaining_time": "2:50:39", "throughput": "0.00", "total_tokens": 0}
|
14 |
+
{"current_steps": 1200, "total_steps": 3372, "loss": 0.2122, "learning_rate": 8.137070169778812e-05, "epoch": 2.1333333333333333, "percentage": 35.59, "elapsed_time": "1:30:54", "remaining_time": "2:44:32", "throughput": "0.00", "total_tokens": 0}
|
15 |
+
{"current_steps": 1300, "total_steps": 3372, "loss": 0.2322, "learning_rate": 7.717837750006106e-05, "epoch": 2.311111111111111, "percentage": 38.55, "elapsed_time": "1:38:13", "remaining_time": "2:36:32", "throughput": "0.00", "total_tokens": 0}
|
16 |
+
{"current_steps": 1400, "total_steps": 3372, "loss": 0.2091, "learning_rate": 7.269491184691924e-05, "epoch": 2.488888888888889, "percentage": 41.52, "elapsed_time": "1:45:32", "remaining_time": "2:28:39", "throughput": "0.00", "total_tokens": 0}
|
17 |
+
{"current_steps": 1500, "total_steps": 3372, "loss": 0.229, "learning_rate": 6.79683327236813e-05, "epoch": 2.6666666666666665, "percentage": 44.48, "elapsed_time": "1:52:50", "remaining_time": "2:20:48", "throughput": "0.00", "total_tokens": 0}
|
18 |
+
{"current_steps": 1600, "total_steps": 3372, "loss": 0.2278, "learning_rate": 6.304927240687181e-05, "epoch": 2.8444444444444446, "percentage": 47.45, "elapsed_time": "2:00:08", "remaining_time": "2:13:02", "throughput": "0.00", "total_tokens": 0}
|
19 |
+
{"current_steps": 1686, "total_steps": 3372, "eval_loss": 0.4158739745616913, "epoch": 2.997333333333333, "percentage": 50.0, "elapsed_time": "2:07:51", "remaining_time": "2:07:51", "throughput": "0.00", "total_tokens": 0}
|
20 |
+
{"current_steps": 1700, "total_steps": 3372, "loss": 0.1859, "learning_rate": 5.799042507883874e-05, "epoch": 3.022222222222222, "percentage": 50.42, "elapsed_time": "2:08:54", "remaining_time": "2:06:47", "throughput": "0.00", "total_tokens": 0}
|
21 |
+
{"current_steps": 1800, "total_steps": 3372, "loss": 0.1403, "learning_rate": 5.284598235472912e-05, "epoch": 3.2, "percentage": 53.38, "elapsed_time": "2:16:13", "remaining_time": "1:58:58", "throughput": "0.00", "total_tokens": 0}
|
22 |
+
{"current_steps": 1900, "total_steps": 3372, "loss": 0.156, "learning_rate": 4.7671052768596945e-05, "epoch": 3.3777777777777778, "percentage": 56.35, "elapsed_time": "2:23:31", "remaining_time": "1:51:11", "throughput": "0.00", "total_tokens": 0}
|
23 |
+
{"current_steps": 2000, "total_steps": 3372, "loss": 0.1221, "learning_rate": 4.2521071437250546e-05, "epoch": 3.5555555555555554, "percentage": 59.31, "elapsed_time": "2:30:49", "remaining_time": "1:43:27", "throughput": "0.00", "total_tokens": 0}
|
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -4,7 +4,6 @@ import pandas as pd
|
|
4 |
import seaborn as sns
|
5 |
import matplotlib.pyplot as plt
|
6 |
from datasets import load_dataset
|
7 |
-
from llm_toolkit.llm_utils import extract_answer
|
8 |
from tqdm import tqdm
|
9 |
|
10 |
print(f"loading {__file__}")
|
@@ -48,6 +47,44 @@ P2 = """你是一个情景猜谜游戏的主持人。游戏规则如下:
|
|
48 |
"""
|
49 |
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def calc_metrics(references, predictions, debug=False):
|
52 |
assert len(references) == len(
|
53 |
predictions
|
@@ -270,3 +307,36 @@ def get_metrics(df):
|
|
270 |
metrics_df["all_metrics"] = all_metrics
|
271 |
|
272 |
return metrics_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import seaborn as sns
|
5 |
import matplotlib.pyplot as plt
|
6 |
from datasets import load_dataset
|
|
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
print(f"loading {__file__}")
|
|
|
47 |
"""
|
48 |
|
49 |
|
50 |
+
def extract_answer(text, debug=False):
    """Strip chat-template scaffolding from raw model output and return the answer.

    The clean-up runs as an ordered pipeline; the order matters because the
    first stage can leave an ``end_header_id|>`` remnant that the third stage
    is responsible for removing:

    1. drop everything up to and including the ``assistant`` / ``[/INST]``
       marker, plus the characters that follow it up to the next word boundary;
    2. drop the first ``<...>`` tag and everything after it;
    3. drop everything up to and including ``end_header_id|>`` followed by a
       blank line;
    4. keep only the text before the first ``.`` and trim whitespace;
    5. drop a leading ``Response:`` prefix.

    Args:
        text: Raw generated text. Falsy values (``None``, ``""``) are returned
            unchanged.
        debug: When true, print the intermediate text after every stage.

    Returns:
        The cleaned answer string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text

    regex_flags = re.DOTALL | re.MULTILINE

    # Each stage maps the current string to the next one; applied in order.
    stages = (
        lambda s: re.sub(
            r".*?(assistant|\[/INST\]).+?\b", "", s, flags=regex_flags
        ),
        lambda s: re.sub(r"<.+?>.*", "", s, flags=regex_flags),
        lambda s: re.sub(r".*?end_header_id\|>\n\n", "", s, flags=regex_flags),
        lambda s: s.split(".")[0].strip(),
        lambda s: re.sub(r"^Response:.+?\b", "", s, flags=regex_flags),
    )

    for step_no, stage in enumerate(stages, start=1):
        text = stage(text)
        if debug:
            print(f"--------\nstep {step_no}:", text)

    return text
|
86 |
+
|
87 |
+
|
88 |
def calc_metrics(references, predictions, debug=False):
|
89 |
assert len(references) == len(
|
90 |
predictions
|
|
|
307 |
metrics_df["all_metrics"] = all_metrics
|
308 |
|
309 |
return metrics_df
|
310 |
+
|
311 |
+
|
312 |
+
def load_alpaca_data(data_path, using_p1=True, use_english_datasets=False):
    """Return the training split in Alpaca format, building and caching it on first use.

    If the cached JSON file for the selected prompt already exists it is read
    and returned as-is. Otherwise the training split is loaded through
    ``load_logical_reasoning_dataset``, every row is rendered into an
    instruction with the ``P1`` or ``P2`` template, and the result is written
    to the cache file before being returned.

    NOTE: the cache file name depends only on ``using_p1``. A cache built with
    one ``use_english_datasets`` setting is therefore reused for the other;
    delete the file to force a rebuild.

    Args:
        data_path: Location of the raw dataset, forwarded to
            ``load_logical_reasoning_dataset``.
        using_p1: Use the ``P1`` template (and its cache file) when true,
            otherwise ``P2``.
        use_english_datasets: When true the dataset is loaded with
            ``chinese_prompt=False``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``instruction``, ``input``
        (always empty) and ``output``.
    """
    alpaca_data_path = (
        "llama-factory/data/alpaca_mgtv_p1.json"
        if using_p1
        else "llama-factory/data/alpaca_mgtv_p2.json"
    )

    if os.path.exists(alpaca_data_path):
        print("loading existing data from:", alpaca_data_path)
        return pd.read_json(alpaca_data_path, orient="records", lines=False)

    # Report the real source of the data; the cache file does not exist yet.
    print("loading new data from:", data_path)
    datasets = load_logical_reasoning_dataset(
        data_path, chinese_prompt=not use_english_datasets
    )

    prompt_template = P1 if using_p1 else P2
    df_train = datasets["train"].to_pandas()
    df_train["instruction"] = df_train.apply(
        lambda row: prompt_template.format(row["puzzle"], row["truth"], row["text"]),
        axis=1,
    )

    df_alpaca = pd.DataFrame(
        {"instruction": [""] * len(df_train), "input": [""] * len(df_train)}
    )
    df_alpaca["instruction"] = df_train["instruction"]
    df_alpaca["output"] = df_train["label"]

    # Make sure the cache directory exists so the first run on a fresh
    # checkout does not fail when writing the file.
    os.makedirs(os.path.dirname(alpaca_data_path), exist_ok=True)
    df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)

    return df_alpaca
|