Crystalcareai committed on
Commit 2bebe0c
1 Parent(s): f3a237e

Added comments

Files changed (1)
  1. Training-Configs/AxolotlConfig.yml +7 -9
Training-Configs/AxolotlConfig.yml CHANGED
@@ -1,18 +1,18 @@
- base_model: Crystalcareai/Qwen-1.5-8x7B
- model_type: Qwen2ForCausalLM
- tokenizer_type: Qwen2Tokenizer
+ base_model: Crystalcareai/Qwen-1.5-8x7B # this is the raw (randomly gated) model straight out of mergekit. Change this to "Crystalcareai/Qwen1.5-8x7b" to train the SFT'd model.
+ model_type: Qwen2ForCausalLM # don't use HF auto config
+ tokenizer_type: Qwen2Tokenizer # don't use HF auto config
  trust_remote_code: true


  load_in_8bit: false
- load_in_4bit: true
+ load_in_4bit: true # Mixtral-style MoE models still chug VRAM in axolotl, so qlora is required at the moment.
  strict: false


  datasets:
    - path: Crystalcareai/MoD
      type: sharegpt
- dataset_prepared_path: last_run_prepared
+ dataset_prepared_path: last_run_prepared # preprocess your dataset to save VRAM: "python -m axolotl.cli.preprocess examples/Qwen/YOURCONFIG.yml"
  val_set_size: 0.0
  output_dir: ./qlora-out

@@ -23,8 +23,6 @@ model_config:

  adapter: qlora
  lora_model_dir:
-
-
  sequence_len: 32768
  sample_packing: true
  pad_to_sequence_len: true
@@ -42,7 +40,7 @@ micro_batch_size: 2
  num_epochs: 4
  optimizer: adamw_bnb_8bit
  lr_scheduler: cosine
- learning_rate: 0.0002
+ learning_rate: 0.0002 # anything from 2e-4 to 5e-4 is acceptable


  train_on_inputs: false
@@ -53,7 +51,7 @@ tf32: false


  gradient_checkpointing: true
- early_stopping_patience:
+ early_stopping_patience:
  resume_from_checkpoint:
  local_rank:
  logging_steps: 1
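
For reference, a minimal usage sketch (run from the repo root; the preprocess step is the one referenced in the dataset_prepared_path comment above, and the train invocation assumes the standard axolotl CLI entry points):

  # pre-tokenize and pack the dataset once so the training run itself uses less VRAM
  python -m axolotl.cli.preprocess Training-Configs/AxolotlConfig.yml

  # launch the QLoRA fine-tune described by this config; adapter checkpoints land in ./qlora-out
  accelerate launch -m axolotl.cli.train Training-Configs/AxolotlConfig.yml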