jeiku committed on
Commit 9c606a9
1 Parent(s): 1e0ba03

Update README.md

Files changed (1)
  1. README.md +27 -264
README.md CHANGED
@@ -60,274 +60,28 @@ Coming soon...

  ## Training Configuration

- <details><summary>Click here for Axolotl configs</summary>

- Completion SFT

  ```yaml
- base_model: IntervitensInc/Llama-3.1-Minitron-4B-Width-Base-chatml
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- hub_model_id: jeiku/completion4B
- hub_strategy: "all_checkpoints"
- push_dataset_to_hub:
- hf_use_auth_token: true
-
- datasets:
-   - path: Mielikki/Erebus-87k
-     type: completion
-     field: body
-
- shuffle_merged_datasets: true
- val_set_size: 0.0025
- output_dir: ./outputs/out
-
- adapter:
- lora_r:
- lora_alpha:
- lora_dropout:
- lora_target_linear:
-
- sequence_len: 8192
- sample_packing: true
- eval_sample_packing: false
- pad_to_sequence_len: true
-
- plugins:
-   - axolotl.integrations.liger.LigerPlugin
- liger_rope: true
- liger_rms_norm: true
- liger_swiglu: true
- liger_fused_linear_cross_entropy: true
-
- wandb_project: EXP4B
- wandb_entity:
- wandb_watch:
- wandb_name: EXP4B
- wandb_log_model:
-
- gradient_accumulation_steps: 12
- micro_batch_size: 3
- num_epochs: 1
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.00001
- weight_decay: 0.05
-
- train_on_inputs: false
- group_by_length: false
- bf16: auto
- fp16:
- tf32: true
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_ratio: 0.1
- evals_per_epoch: 4
- eval_table_size:
- eval_max_new_tokens: 128
- saves_per_epoch: 1
-
- debug:
- deepspeed: deepspeed_configs/zero3_bf16.json
- fsdp:
- fsdp_config:
-
- special_tokens:
-   pad_token: <|finetune_right_pad_id|>
- ```
-
- Instruct SFT
-
- ```yaml
- base_model: jeiku/completion4B
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- hub_model_id: jeiku/instructered4B
- hub_strategy: "all_checkpoints"
- push_dataset_to_hub:
- hf_use_auth_token: true
-
- datasets:
-   - path: FourOhFour/Instruct_Phase
-     type: sharegpt
-     conversation: chatml
-
- chat_template: chatml
-
- shuffle_merged_datasets: true
- val_set_size: 0.0025
- output_dir: ./outputs/out
-
- adapter:
- lora_r:
- lora_alpha:
- lora_dropout:
- lora_target_linear:
-
- sequence_len: 8192
- sample_packing: true
- eval_sample_packing: false
- pad_to_sequence_len: true
-
- plugins:
-   - axolotl.integrations.liger.LigerPlugin
- liger_rope: true
- liger_rms_norm: true
- liger_swiglu: true
- liger_fused_linear_cross_entropy: true
-
- wandb_project: EXP4B
- wandb_entity:
- wandb_watch:
- wandb_name: EXP4B
- wandb_log_model:
-
- gradient_accumulation_steps: 12
- micro_batch_size: 3
- num_epochs: 2
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.00001
- weight_decay: 0.05
-
- train_on_inputs: false
- group_by_length: false
- bf16: auto
- fp16:
- tf32: true
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_ratio: 0.1
- evals_per_epoch: 4
- eval_table_size:
- eval_max_new_tokens: 128
- saves_per_epoch: 2
-
- debug:
- deepspeed: deepspeed_configs/zero3_bf16.json
- fsdp:
- fsdp_config:
-
- special_tokens:
-   pad_token: <|finetune_right_pad_id|>
- ```
-
- Roleplaying SFT
-
- ```yaml
- base_model: jeiku/instructered4B
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- hub_model_id: jeiku/TheBest4B
- hub_strategy: "all_checkpoints"
- push_dataset_to_hub:
- hf_use_auth_token: true
-
- datasets:
-   - path: FourOhFour/RP_Phase
-     type: sharegpt
-     conversation: chatml
-
- chat_template: chatml
-
- shuffle_merged_datasets: true
- val_set_size: 0.0025
- output_dir: ./outputs/out
-
- adapter:
- lora_r:
- lora_alpha:
- lora_dropout:
- lora_target_linear:
-
- sequence_len: 8192
- sample_packing: true
- eval_sample_packing: false
- pad_to_sequence_len: true
-
- plugins:
-   - axolotl.integrations.liger.LigerPlugin
- liger_rope: true
- liger_rms_norm: true
- liger_swiglu: true
- liger_fused_linear_cross_entropy: true
-
- wandb_project: EXP4B
- wandb_entity:
- wandb_watch:
- wandb_name: EXP4B
- wandb_log_model:
-
- gradient_accumulation_steps: 12
- micro_batch_size: 3
- num_epochs: 2
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.00001
- weight_decay: 0.05
-
- train_on_inputs: false
- group_by_length: false
- bf16: auto
- fp16:
- tf32: true
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_ratio: 0.1
- evals_per_epoch: 4
- eval_table_size:
- eval_max_new_tokens: 128
- saves_per_epoch: 2
-
- debug:
- deepspeed: deepspeed_configs/zero3_bf16.json
- fsdp:
- fsdp_config:
-
- special_tokens:
-   pad_token: <|finetune_right_pad_id|>
  ```

  KTO

  ```yaml
- base_model: FourOhFour/Crispy_Crab_4B
  model_type: AutoModelForCausalLM
  tokenizer_type: AutoTokenizer

@@ -335,7 +89,7 @@ load_in_8bit: false
  load_in_4bit: false
  strict: false

- hub_model_id: jeiku/aura4bkto
  hub_strategy: "all_checkpoints"
  push_dataset_to_hub:
  hf_use_auth_token: true
@@ -354,15 +108,24 @@ shuffle_merged_datasets: true
  val_set_size: 0.0
  output_dir: ./outputs/out

  sequence_len: 8192
  sample_packing: false
  eval_sample_packing: false
  pad_to_sequence_len: false

- wandb_project: Aura-4B
  wandb_entity:
  wandb_watch:
- wandb_name: Aura-4B
  wandb_log_model:

  gradient_accumulation_steps: 16
@@ -372,7 +135,7 @@ max_steps: 500

  optimizer: adamw_8bit
  lr_scheduler: cosine
- learning_rate: 0.00001
  weight_decay: 0.05

  train_on_inputs: false
 

  ## Training Configuration

+ <details><summary>Click here for Mergekit and Axolotl configs</summary>

+ MoE Merge

  ```yaml
+ base_model: FourOhFour/Crispy_Crab_4B
+ gate_mode: hidden
+ dtype: bfloat16
+ experts_per_token: 1
+ experts:
+   - source_model: FourOhFour/Crispy_Crab_4B
+     positive_prompts:
+       - "You are a roleplaying powerhouse, reply as {{char}} in this ongoing conversation with {{user}}."
+   - source_model: FourOhFour/Zenith_4B
+     positive_prompts:
+       - "You are a helpful assistant designed to perform tasks for the user."
  ```
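
For context, here is a minimal sketch of how a merge like the one above could be produced and sanity-checked, assuming mergekit and transformers are installed. The config filename `moe.yaml` and the output directory `./2x4Bmoe` are illustrative (the KTO config further down loads the published result from `jeiku/2x4Bmoe`); this is not necessarily the exact command the author ran.

```python
# Hypothetical driver, not the author's published pipeline: save the YAML block
# above as moe.yaml, build the 2-expert MoE with mergekit's CLI, then sanity-load it.
import subprocess

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# gate_mode: hidden derives router weights from expert hidden states on the
# positive_prompts, so a GPU is typically needed for this step.
subprocess.run(["mergekit-moe", "moe.yaml", "./2x4Bmoe"], check=True)

tokenizer = AutoTokenizer.from_pretrained("./2x4Bmoe")
model = AutoModelForCausalLM.from_pretrained(
    "./2x4Bmoe", torch_dtype=torch.bfloat16, device_map="auto"
)
print(f"loaded {model.config.model_type} with {model.num_parameters() / 1e9:.1f}B parameters")
```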

  KTO

  ```yaml
+ base_model: jeiku/2x4Bmoe
  model_type: AutoModelForCausalLM
  tokenizer_type: AutoTokenizer

@@ -335,7 +89,7 @@ load_in_8bit: false
  load_in_4bit: false
  strict: false

+ hub_model_id: jeiku/moekto
  hub_strategy: "all_checkpoints"
  push_dataset_to_hub:
  hf_use_auth_token: true
@@ -354,15 +108,24 @@ shuffle_merged_datasets: true
  val_set_size: 0.0
  output_dir: ./outputs/out

+ adapter: lora
+ lora_model_dir:
+
+ lora_r: 32
+ lora_alpha: 64
+ lora_dropout: 0.05
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
  sequence_len: 8192
  sample_packing: false
  eval_sample_packing: false
  pad_to_sequence_len: false

+ wandb_project: moekto
  wandb_entity:
  wandb_watch:
+ wandb_name: moekto
  wandb_log_model:

  gradient_accumulation_steps: 16
@@ -372,7 +135,7 @@ max_steps: 500

  optimizer: adamw_8bit
  lr_scheduler: cosine
+ learning_rate: 0.0001
  weight_decay: 0.05

  train_on_inputs: false
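
Because the new KTO pass trains a LoRA adapter (`adapter: lora`, r=32) on top of `jeiku/2x4Bmoe` and pushes checkpoints to `jeiku/moekto`, that output would have to be attached to the MoE base before inference unless it was merged separately. Below is a minimal sketch of that step with peft and transformers, using ChatML to match the `chatml` chat template from the earlier configs; the repo ids come from the configs above, everything else is illustrative.

```python
# Hedged sketch, not the published recipe: attach the KTO LoRA adapter
# (hub_model_id: jeiku/moekto) to the merged MoE base (base_model: jeiku/2x4Bmoe)
# and run one ChatML-formatted prompt through the result.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "jeiku/2x4Bmoe", torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, "jeiku/moekto")
model = model.merge_and_unload()  # optionally fold the adapter into the base weights

tokenizer = AutoTokenizer.from_pretrained("jeiku/2x4Bmoe")
messages = [
    {"role": "system", "content": "You are a helpful assistant designed to perform tasks for the user."},
    {"role": "user", "content": "Introduce yourself in two sentences."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```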