Update README.md
README.md CHANGED
@@ -19,10 +19,11 @@ language:
 
 ### Model Sources [optional]
 
-Built on qwen1.5 14b, fine-tuned with LoRA
+Built on qwen1.5 14b, fine-tuned with LoRA using the llamafactory framework
 
 The training parameters are as follows:
 
+```yaml
 quantization_bit: 4
 
 stage: sft
@@ -54,12 +55,11 @@ overwrite_output_dir: true
 flash_attn: fa2
 per_device_train_batch_size: 2
 gradient_accumulation_steps: 8
-# The earlier 3e-4 learning rate seemed a bit too high; the loss oscillated quite a lot
 learning_rate: 0.0001
 num_train_epochs: 3
 weight_decay: 0.01
 optim: adamw_torch
-
+# The 8-bit optimizer seems to have problems
 lr_scheduler_type: cosine
 warmup_steps: 0.01
 bf16: true
@@ -69,7 +69,7 @@ val_size: 0.001
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 250
-
+```
 
 ## Uses
 
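The hunks above edit a LLaMA-Factory SFT config embedded in the model card. For reference, below is a minimal sketch of what the complete YAML might look like after this change; the base model path, dataset, template, LoRA target, and output directory do not appear in the diff, so those keys are placeholders rather than the author's actual values.

```yaml
# Hypothetical completion of the config fragments shown in the diff.
# Keys marked "placeholder" are not visible in the hunks and are assumptions.
model_name_or_path: Qwen/Qwen1.5-14B-Chat   # placeholder base model
quantization_bit: 4                          # QLoRA-style 4-bit quantized base weights
stage: sft
do_train: true
finetuning_type: lora
lora_target: all                             # placeholder; actual target modules not shown
dataset: my_sft_dataset                      # placeholder dataset name
template: qwen
output_dir: saves/qwen1.5-14b-lora-sft       # placeholder path
overwrite_output_dir: true

flash_attn: fa2
per_device_train_batch_size: 2
gradient_accumulation_steps: 8               # effective batch size of 16 per device
learning_rate: 0.0001                        # lowered from 3e-4 after the loss oscillated
num_train_epochs: 3
weight_decay: 0.01
optim: adamw_torch                           # the commit note says the 8-bit optimizer seemed problematic
lr_scheduler_type: cosine
warmup_steps: 0.01                           # fractional value suggests it is treated as a warmup ratio
bf16: true

val_size: 0.001
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 250
```

With LLaMA-Factory installed, a config like this is typically launched with `llamafactory-cli train qwen1.5-14b-lora-sft.yaml`.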