---
# Training configuration for the `sft-mistral-v1-clean-valid` run.
# Keys are kept in alphabetical order so diffs stay stable. Each line is a
# plain `key: value` pair; the stray `|` table delimiters were removed because
# they would otherwise become part of every value.

cache_dir: ./cache
ddp_find_unused_parameters: false
ddp_timeout: 30000
device_map: auto
do_eval: true
do_train: true
eval_steps: 1000
evaluation_strategy: steps
fp16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
# `use_reentrant` must be indented under this key to form a nested mapping;
# at column 0 it would be a separate top-level key and this one would be null.
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: hllj/sft-mistral-v1-clean-valid
hub_strategy: every_save
learning_rate: 3.0e-05
log_level: info
logging_first_step: true
logging_steps: 10
logging_strategy: steps
lora_alpha: 128
lora_dropout: 0.05
lora_r: 256
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
lr_scheduler_type: cosine
max_seq_length: 1024
model_name_or_path: hllj/mistral-vi-math
model_type: auto
num_train_epochs: 2
output_dir: outputs-sft-mistral-v1-clean-valid
overwrite_output_dir: true
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
preprocessing_num_workers: 4
push_to_hub: true
report_to: wandb
run_name: sft-mistral-v1-clean-valid
save_steps: 1000
save_strategy: steps
save_total_limit: 13
seed: 42
# SECURITY: never commit an access token to this file. A literal Hugging Face
# token was previously stored here in plain text; treat it as compromised and
# revoke it. Provide credentials outside version control instead (for example
# an environment variable or a prior `huggingface-cli login`).
# NOTE(review): confirm the consuming script falls back to ambient/cached
# credentials when `token` is null; `push_to_hub: true` needs write access.
token: null
torch_dtype: float16
train_file_dir: datasets/finetune
use_peft: true
validation_file_dir: datasets/validation
warmup_ratio: 0.05
weight_decay: 0.05
|