---
license: apache-2.0
base_model:
  - meta-llama/Llama-3.1-8B-Instruct
---

This is a preview version of FuseChat-3.0 that is still under testing.

Training configuration:

```yaml
### model
model_name_or_path: meta-llama/Llama-3.1-8B-Instruct

### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z3_config.json

### dataset
dataset: FuseChat-Mixture-v3-SFT
template: llama3
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: LLaMA-Factory/saves/llama31/FuseChat-Llama-3.1-8B-SFT-preview
logging_steps: 10
save_steps: 10086
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 8
gradient_accumulation_steps: 2
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### custom
do_eval: false
packing: false
train_on_prompt: false
flash_attn: fa2
# Kept quoted: a bare `no` is read as boolean false by YAML 1.1 loaders.
save_strategy: "no"
save_total_limit: 1
seed: 42
save_only_model: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  # Must be nested here; at column 0 it would be a separate top-level key
  # and gradient_checkpointing_kwargs would be null.
  use_reentrant: false
```