---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-0.5B-Instruct
tags:
- generated_from_trainer
datasets:
- PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
- PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
- PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
- PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
- PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
model-index:
- name: Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
  results: []
---

![image/png](https://cdn-uploads.huggingface.co/production/uploads/63559199805be5a8f30f6505/616DQIBD3QK8KFYjtYF8K.png)

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.6.0`
```yaml
# Weights and Biases logging config
wandb_project: Qwen2.5-QwQ-RP-Draft-0.5B
wandb_entity:
wandb_watch:
wandb_name: Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
wandb_log_model:

# Model checkpointing config
output_dir: ./Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
resume_from_checkpoint:
save_steps: 10
save_safetensors: true
save_total_limit: 3
save_only_model: false

# Model architecture config
base_model: Qwen/Qwen2.5-0.5B-Instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 8192
min_sample_len: 256
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
train_on_inputs: false
group_by_length: false

# LoRA adapter config
adapter: lora
lora_model_dir:
lora_r: 128
lora_alpha: 128
lora_dropout: 0.125
peft_layers_to_transform:
peft_use_dora:
peft_use_rslora:
peft_layer_replication:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:

# Fix uninitialized tokens (such as <|start_header_id|> on the base L3 models)
fix_untrained_tokens:

# Dataset config
# RP: https://github.com/xzuyn/axolotl/blob/prompt_formats/src/axolotl/prompt_strategies/customchatml-regex-last-only.py
datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[128:] # Everything except the first 128
    type: customchatml-regex-last-only
test_datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[:128] # Only the first 128
    type: customchatml-regex-last-only
val_set_size: 0
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/Qwen2.5-QwQ-Draft-0.5B-customchatml-regex-newer
shuffle_merged_datasets: true
dataset_processes:

# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 16
eval_batch_size: 16
warmup_steps: 0
optimizer: came_pytorch
optim_args:
optim_target_modules:
lr_scheduler: rex
learning_rate: 1e-5
cosine_min_lr_ratio:
loraplus_lr_ratio:
loraplus_lr_embedding:
weight_decay: 0.1
max_grad_norm: 1
logging_steps: 1

# Model optimization
gradient_checkpointing: unsloth
sdp_attention: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

# DeepSpeed
deepspeed:

# Garbage Collection
gc_steps: 1

# Debug config
debug: true
seed: 42

# Token config
special_tokens:
  eos_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
tokens:

```

</details><br>
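
For readers more familiar with PEFT than axolotl, the adapter block in the config above maps roughly onto the following `LoraConfig` (a minimal sketch of the equivalent settings, not the exact objects axolotl constructs internally):

```python
from peft import LoraConfig

# Rough PEFT equivalent of the axolotl adapter settings above.
lora_config = LoraConfig(
    r=128,               # lora_r
    lora_alpha=128,      # lora_alpha
    lora_dropout=0.125,  # lora_dropout
    target_modules=[
        "gate_proj", "down_proj", "up_proj",
        "q_proj", "v_proj", "k_proj", "o_proj",
    ],
    task_type="CAUSAL_LM",
)
```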

# Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10

This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) on the following datasets:

- PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
- PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
- PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
- PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
- PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite

It achieves the following results on the evaluation set:
- Loss: 1.9716
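
For intuition, this cross-entropy loss corresponds to a token-level perplexity of about 7.2, since perplexity is just the exponential of the loss:

```python
import math

eval_loss = 1.9716
print(round(math.exp(eval_loss), 2))  # 7.18
```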

## Model description

A LoRA fine-tune of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) on QwQ-distilled roleplay data, trained with axolotl using the config above. Judging by the name, it is intended as a small draft model for speculative decoding alongside a larger QwQ-based target model.

## Intended uses & limitations

More information needed
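
Since the card leaves usage unspecified, here is a minimal, untested sketch of using this adapter as a speculative-decoding draft via `transformers` assisted generation. The adapter repo id is a placeholder, and `Qwen/QwQ-32B-Preview` is only an assumed target; both draft and target must share a tokenizer for this to work:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hypothetical repo id for this adapter; replace with its actual location.
ADAPTER_ID = "PJMixers-Dev/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA"

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Load the base model and apply the LoRA adapter to get the draft model.
draft = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
draft = PeftModel.from_pretrained(draft, ADAPTER_ID)

# Assumed target model for speculative decoding (the card's name points at QwQ).
target = AutoModelForCausalLM.from_pretrained(
    "Qwen/QwQ-32B-Preview", torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Write a short in-character greeting."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(target.device)

# Passing assistant_model enables assisted (speculative) generation.
out = target.generate(inputs, assistant_model=draft, max_new_tokens=256)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```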

## Training and evaluation data

Each of the eight datasets listed above is split the same way in the config: the first 128 rows (`train[:128]`) are held out for evaluation, giving 8 × 128 = 1024 evaluation samples, and the remainder (`train[128:]`) is used for training. All datasets use the `customchatml-regex-last-only` prompt strategy.
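
The split annotations in the config use standard Hugging Face `datasets` slicing syntax; a minimal sketch of the same holdout outside of axolotl:

```python
from datasets import load_dataset

repo = "PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite"

# First 128 rows held out for evaluation, remainder used for training,
# mirroring the `train[:128]` / `train[128:]` splits in the axolotl config.
eval_split = load_dataset(repo, split="train[:128]")
train_split = load_dataset(repo, split="train[128:]")

print(len(eval_split))  # 128
```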

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- optimizer: CAME (`came_pytorch`, as set in the axolotl config)
- lr_scheduler_type: rex
- weight_decay: 0.1
- max_grad_norm: 1.0
- num_epochs: 1.0
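
The CAME optimizer comes from the standalone `came_pytorch` package rather than `transformers`. A minimal sketch of constructing it with this run's settings, assuming the package's standard `torch.optim`-style interface:

```python
# pip install came-pytorch
import torch
from came_pytorch import CAME

# Toy parameters; in training this would be the PEFT-wrapped model's parameters.
params = [torch.nn.Parameter(torch.zeros(8, 8))]

# lr and weight_decay are taken from the axolotl config above; all other
# CAME arguments are left at the package defaults (an assumption).
optimizer = CAME(params, lr=1e-5, weight_decay=0.1)
```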

### Training results

| Training Loss | Epoch | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 3.4865 | 0.0010 | 1 | 3.2134 |
| 2.481 | 0.0102 | 10 | 2.5552 |
| 2.2964 | 0.0205 | 20 | 2.4036 |
| 2.3048 | 0.0307 | 30 | 2.3367 |
| 2.2577 | 0.0409 | 40 | 2.2934 |
| 2.2298 | 0.0512 | 50 | 2.2601 |
| 2.1367 | 0.0614 | 60 | 2.2385 |
| 2.1512 | 0.0716 | 70 | 2.2166 |
| 2.1703 | 0.0819 | 80 | 2.2022 |
| 2.1263 | 0.0921 | 90 | 2.1883 |
| 2.2121 | 0.1024 | 100 | 2.1750 |
| 2.1741 | 0.1126 | 110 | 2.1633 |
| 2.1621 | 0.1228 | 120 | 2.1547 |
| 2.0664 | 0.1331 | 130 | 2.1456 |
| 2.1005 | 0.1433 | 140 | 2.1374 |
| 2.0822 | 0.1535 | 150 | 2.1315 |
| 2.0856 | 0.1638 | 160 | 2.1252 |
| 2.1386 | 0.1740 | 170 | 2.1182 |
| 2.0756 | 0.1842 | 180 | 2.1134 |
| 2.0492 | 0.1945 | 190 | 2.1066 |
| 1.9882 | 0.2047 | 200 | 2.1024 |
| 2.036 | 0.2149 | 210 | 2.0970 |
| 2.1313 | 0.2252 | 220 | 2.0940 |
| 2.0356 | 0.2354 | 230 | 2.0897 |
| 2.0278 | 0.2456 | 240 | 2.0869 |
| 2.0754 | 0.2559 | 250 | 2.0825 |
| 2.0582 | 0.2661 | 260 | 2.0784 |
| 2.0588 | 0.2764 | 270 | 2.0758 |
| 1.9757 | 0.2866 | 280 | 2.0723 |
| 2.0619 | 0.2968 | 290 | 2.0700 |
| 1.956 | 0.3071 | 300 | 2.0684 |
| 2.065 | 0.3173 | 310 | 2.0642 |
| 1.982 | 0.3275 | 320 | 2.0604 |
| 2.0424 | 0.3378 | 330 | 2.0577 |
| 2.0635 | 0.3480 | 340 | 2.0553 |
| 1.9895 | 0.3582 | 350 | 2.0518 |
| 2.0296 | 0.3685 | 360 | 2.0496 |
| 2.0231 | 0.3787 | 370 | 2.0472 |
| 1.9422 | 0.3889 | 380 | 2.0459 |
| 2.0214 | 0.3992 | 390 | 2.0427 |
| 2.0107 | 0.4094 | 400 | 2.0401 |
| 2.0307 | 0.4197 | 410 | 2.0371 |
| 1.9874 | 0.4299 | 420 | 2.0356 |
| 2.0249 | 0.4401 | 430 | 2.0331 |
| 2.0947 | 0.4504 | 440 | 2.0314 |
| 1.9644 | 0.4606 | 450 | 2.0291 |
| 2.0633 | 0.4708 | 460 | 2.0271 |
| 2.0438 | 0.4811 | 470 | 2.0255 |
| 2.0227 | 0.4913 | 480 | 2.0239 |
| 2.0023 | 0.5015 | 490 | 2.0208 |
| 2.0231 | 0.5118 | 500 | 2.0193 |
| 1.9659 | 0.5220 | 510 | 2.0179 |
| 1.9382 | 0.5322 | 520 | 2.0171 |
| 1.9959 | 0.5425 | 530 | 2.0157 |
| 1.9835 | 0.5527 | 540 | 2.0139 |
| 1.942 | 0.5629 | 550 | 2.0124 |
| 2.0036 | 0.5732 | 560 | 2.0109 |
| 2.023 | 0.5834 | 570 | 2.0100 |
| 1.9686 | 0.5937 | 580 | 2.0078 |
| 1.9867 | 0.6039 | 590 | 2.0070 |
| 1.9662 | 0.6141 | 600 | 2.0060 |
| 1.968 | 0.6244 | 610 | 2.0045 |
| 1.9435 | 0.6346 | 620 | 2.0035 |
| 1.9245 | 0.6448 | 630 | 2.0024 |
| 1.9573 | 0.6551 | 640 | 2.0007 |
| 1.9466 | 0.6653 | 650 | 1.9994 |
| 2.0202 | 0.6755 | 660 | 1.9976 |
| 1.891 | 0.6858 | 670 | 1.9965 |
| 2.0134 | 0.6960 | 680 | 1.9980 |
| 1.9276 | 0.7062 | 690 | 1.9958 |
| 1.9266 | 0.7165 | 700 | 1.9949 |
| 1.8661 | 0.7267 | 710 | 1.9932 |
| 1.9446 | 0.7369 | 720 | 1.9923 |
| 1.8605 | 0.7472 | 730 | 1.9908 |
| 1.9426 | 0.7574 | 740 | 1.9906 |
| 1.9806 | 0.7677 | 750 | 1.9893 |
| 1.9268 | 0.7779 | 760 | 1.9880 |
| 1.987 | 0.7881 | 770 | 1.9870 |
| 1.9182 | 0.7984 | 780 | 1.9866 |
| 2.0103 | 0.8086 | 790 | 1.9853 |
| 1.9153 | 0.8188 | 800 | 1.9839 |
| 2.0043 | 0.8291 | 810 | 1.9830 |
| 1.9791 | 0.8393 | 820 | 1.9819 |
| 1.912 | 0.8495 | 830 | 1.9811 |
| 1.9288 | 0.8598 | 840 | 1.9808 |
| 1.9613 | 0.8700 | 850 | 1.9796 |
| 1.9767 | 0.8802 | 860 | 1.9783 |
| 1.9097 | 0.8905 | 870 | 1.9783 |
| 1.9727 | 0.9007 | 880 | 1.9773 |
| 1.9432 | 0.9110 | 890 | 1.9763 |
| 1.9109 | 0.9212 | 900 | 1.9754 |
| 1.9184 | 0.9314 | 910 | 1.9749 |
| 1.9179 | 0.9417 | 920 | 1.9744 |
| 1.9812 | 0.9519 | 930 | 1.9735 |
| 1.9695 | 0.9621 | 940 | 1.9727 |
| 1.9474 | 0.9724 | 950 | 1.9727 |
| 1.8376 | 0.9826 | 960 | 1.9721 |
| 1.8961 | 0.9928 | 970 | 1.9716 |

### Framework versions

- PEFT 0.14.0
- Transformers 4.50.0.dev0
- Pytorch 2.7.0.dev20250224+rocm6.3
- Datasets 3.3.1
- Tokenizers 0.21.0