Fizzarolli committed on
Commit a9b7b4e
1 Parent(s): 05e7a35

someone asked for the training config

Files changed (1)
  1. axolotl_config.yml +88 -0
axolotl_config.yml ADDED
@@ -0,0 +1,88 @@
+ base_model: Fizzarolli/phi3-4x4b-uninitialized
+ model_type: MixtralForCausalLM
+ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+   - path: NeelNanda/pile-10k
+     type: completion
+     field: text
+   - path: BEE-spoke-data/gutenberg-en-v1-clean
+     type: completion
+     field: text
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.05
+ output_dir: ./lora-out
+
+ adapter: qlora
+ lora_model_dir:
+
+ sequence_len: 4072
+ sample_packing: true
+ pad_to_sequence_len: true
+
+ lora_r: 32
+ lora_alpha: 64
+ lora_dropout: 0.05
+ lora_target_modules:
+   - k_proj
+   - q_proj
+   - v_proj
+   - o_proj
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project: phixtral3
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 6
+ num_epochs: 1
+ optimizer: adamw_torch
+ lr_scheduler: cosine
+ learning_rate: 0.000001
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+   use_reentrant: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ warmup_steps: 10
+ evals_per_epoch: 0
+ eval_table_size:
+ saves_per_epoch: 20
+ debug:
+ fsdp:
+   - full_shard
+   - auto_wrap
+ fsdp_config:
+   fsdp_limit_all_gathers: true
+   fsdp_sync_module_states: true
+   fsdp_offload_params: true
+   fsdp_use_orig_params: false
+   fsdp_cpu_ram_efficient_loading: true
+   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
+   fsdp_state_dict_type: FULL_STATE_DICT
+   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+   fsdp_sharding_strategy: FULL_SHARD
+   fsdp_forward_prefetch: false
+   fsdp_backward_prefetch: BACKWARD_PRE
+ weight_decay: 0.0
+ special_tokens: