---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-0.5B-Instruct
tags:
- generated_from_trainer
datasets:
- >-
  PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
- PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
- >-
  PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
- PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
- >-
  PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
model-index:
- name: Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
  results: []
---
<details><summary>See axolotl config</summary>

axolotl version: `0.6.0`
```yaml
# Weights and Biases logging config
wandb_project: Qwen2.5-QwQ-RP-Draft-0.5B
wandb_entity:
wandb_watch:
wandb_name: Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
wandb_log_model:
# Model checkpointing config
output_dir: ./Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
resume_from_checkpoint:
save_steps: 10
save_safetensors: true
save_total_limit: 3
save_only_model: false
# Model architecture config
base_model: Qwen/Qwen2.5-0.5B-Instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Mixed precision training config
bf16: true
fp16: false
tf32: false
# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false
# Sequence config
sequence_len: 8192
min_sample_len: 256
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
train_on_inputs: false
group_by_length: false
# LoRA adapter config
adapter: lora
lora_model_dir:
lora_r: 128
lora_alpha: 128
lora_dropout: 0.125
peft_layers_to_transform:
peft_use_dora:
peft_use_rslora:
peft_layer_replication:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
lora_modules_to_save:
# Fix uninitialized tokens (such as <|start_header_id|> on the base L3 models)
fix_untrained_tokens:
# Dataset config
# RP: https://github.com/xzuyn/axolotl/blob/prompt_formats/src/axolotl/prompt_strategies/customchatml-regex-last-only.py
datasets:
- path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
split: train[128:] # Everything except the first 128
type: customchatml-regex-last-only
test_datasets:
- path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
- path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
split: train[:128] # Only the first 128
type: customchatml-regex-last-only
val_set_size: 0
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/Qwen2.5-QwQ-Draft-0.5B-customchatml-regex-newer
shuffle_merged_datasets: true
dataset_processes:
# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 16
eval_batch_size: 16
warmup_steps: 0
optimizer: came_pytorch
optim_args:
optim_target_modules:
lr_scheduler: rex
learning_rate: 1e-5
cosine_min_lr_ratio:
loraplus_lr_ratio:
loraplus_lr_embedding:
weight_decay: 0.1
max_grad_norm: 1
logging_steps: 1
# Model optimization
gradient_checkpointing: unsloth
sdp_attention: true
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false
# DeepSpeed
deepspeed:
# Garbage Collection
gc_steps: 1
# Debug config
debug: true
seed: 42
# Token config
special_tokens:
eos_token: "<|endoftext|>"
pad_token: "<|endoftext|>"
tokens:
```

</details>

# Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
This model is a fine-tuned version of Qwen/Qwen2.5-0.5B-Instruct on the PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite, the PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite, the PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite, the PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite, the PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite, the PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite, the PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite and the PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite datasets. It achieves the following results on the evaluation set:
- Loss: 1.9716
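
Below is a minimal inference sketch for loading this LoRA adapter on top of the base model with Transformers and PEFT. The adapter repo id is a placeholder (the exact repository name is not stated in this card) and should be replaced with the repository that hosts the adapter.

```python
# Minimal inference sketch (not part of the original card). `adapter_id` is a
# placeholder -- point it at the repository that hosts this LoRA adapter.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "Qwen/Qwen2.5-0.5B-Instruct"
adapter_id = "PJMixers-Dev/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, adapter_id)  # attach the LoRA weights

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
    return_tensors="pt",
)
outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```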
## Model description
More information needed
## Intended uses & limitations
More information needed
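
The "Draft" in the model name suggests it is meant as a small draft model for speculative (assisted) decoding alongside a larger QwQ model; that reading is an assumption, as are both repo ids in the sketch below.

```python
# Hedged sketch of assisted generation with a larger QwQ target model.
# Both repo ids are assumptions; the merged draft checkpoint in particular
# is not named anywhere in this card.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

target_id = "Qwen/QwQ-32B"                                 # assumed target model
draft_id = "PJMixers-Dev/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B"   # assumed merged draft model

tokenizer = AutoTokenizer.from_pretrained(target_id)
target = AutoModelForCausalLM.from_pretrained(target_id, torch_dtype=torch.bfloat16, device_map="auto")
draft = AutoModelForCausalLM.from_pretrained(draft_id, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Write a short in-character reply."}],
    add_generation_prompt=True,
    return_tensors="pt",
).to(target.device)

# assistant_model enables speculative decoding: the draft proposes tokens and
# the target verifies them, so outputs still follow the target's distribution.
outputs = target.generate(inputs, assistant_model=draft, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```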
## Training and evaluation data
More information needed
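
Per the axolotl config above, the first 128 examples of each dataset are held out for evaluation and the remainder is used for training. A small sketch of the same carve-out using the `datasets` split-slicing syntax:

```python
# Reproduces the train/eval carve-out from the config: first 128 rows held out
# for evaluation, the rest used for training. One dataset shown as an example.
from datasets import load_dataset

repo = "PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite"

eval_split = load_dataset(repo, split="train[:128]")    # only the first 128
train_split = load_dataset(repo, split="train[128:]")   # everything except the first 128

print(len(eval_split))  # 128
```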
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- optimizer: ADAMW_HF with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: cosine
- num_epochs: 1.0
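
The LoRA adapter itself uses r=128, alpha=128, and dropout 0.125 on all attention and MLP projections (see the config above). A rough PEFT equivalent is sketched below; it is an illustration, not the exact objects axolotl constructs internally.

```python
# Sketch of a PEFT LoraConfig mirroring the adapter settings in the axolotl
# config above (r=128, alpha=128, dropout=0.125, all linear projections).
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=128,
    lora_alpha=128,
    lora_dropout=0.125,
    target_modules=[
        "gate_proj", "down_proj", "up_proj",
        "q_proj", "v_proj", "k_proj", "o_proj",
    ],
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints trainable vs. total parameter counts
```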
### Training results
| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
3.4865 | 0.0010 | 1 | 3.2134 |
2.481 | 0.0102 | 10 | 2.5552 |
2.2964 | 0.0205 | 20 | 2.4036 |
2.3048 | 0.0307 | 30 | 2.3367 |
2.2577 | 0.0409 | 40 | 2.2934 |
2.2298 | 0.0512 | 50 | 2.2601 |
2.1367 | 0.0614 | 60 | 2.2385 |
2.1512 | 0.0716 | 70 | 2.2166 |
2.1703 | 0.0819 | 80 | 2.2022 |
2.1263 | 0.0921 | 90 | 2.1883 |
2.2121 | 0.1024 | 100 | 2.1750 |
2.1741 | 0.1126 | 110 | 2.1633 |
2.1621 | 0.1228 | 120 | 2.1547 |
2.0664 | 0.1331 | 130 | 2.1456 |
2.1005 | 0.1433 | 140 | 2.1374 |
2.0822 | 0.1535 | 150 | 2.1315 |
2.0856 | 0.1638 | 160 | 2.1252 |
2.1386 | 0.1740 | 170 | 2.1182 |
2.0756 | 0.1842 | 180 | 2.1134 |
2.0492 | 0.1945 | 190 | 2.1066 |
1.9882 | 0.2047 | 200 | 2.1024 |
2.036 | 0.2149 | 210 | 2.0970 |
2.1313 | 0.2252 | 220 | 2.0940 |
2.0356 | 0.2354 | 230 | 2.0897 |
2.0278 | 0.2456 | 240 | 2.0869 |
2.0754 | 0.2559 | 250 | 2.0825 |
2.0582 | 0.2661 | 260 | 2.0784 |
2.0588 | 0.2764 | 270 | 2.0758 |
1.9757 | 0.2866 | 280 | 2.0723 |
2.0619 | 0.2968 | 290 | 2.0700 |
1.956 | 0.3071 | 300 | 2.0684 |
2.065 | 0.3173 | 310 | 2.0642 |
1.982 | 0.3275 | 320 | 2.0604 |
2.0424 | 0.3378 | 330 | 2.0577 |
2.0635 | 0.3480 | 340 | 2.0553 |
1.9895 | 0.3582 | 350 | 2.0518 |
2.0296 | 0.3685 | 360 | 2.0496 |
2.0231 | 0.3787 | 370 | 2.0472 |
1.9422 | 0.3889 | 380 | 2.0459 |
2.0214 | 0.3992 | 390 | 2.0427 |
2.0107 | 0.4094 | 400 | 2.0401 |
2.0307 | 0.4197 | 410 | 2.0371 |
1.9874 | 0.4299 | 420 | 2.0356 |
2.0249 | 0.4401 | 430 | 2.0331 |
2.0947 | 0.4504 | 440 | 2.0314 |
1.9644 | 0.4606 | 450 | 2.0291 |
2.0633 | 0.4708 | 460 | 2.0271 |
2.0438 | 0.4811 | 470 | 2.0255 |
2.0227 | 0.4913 | 480 | 2.0239 |
2.0023 | 0.5015 | 490 | 2.0208 |
2.0231 | 0.5118 | 500 | 2.0193 |
1.9659 | 0.5220 | 510 | 2.0179 |
1.9382 | 0.5322 | 520 | 2.0171 |
1.9959 | 0.5425 | 530 | 2.0157 |
1.9835 | 0.5527 | 540 | 2.0139 |
1.942 | 0.5629 | 550 | 2.0124 |
2.0036 | 0.5732 | 560 | 2.0109 |
2.023 | 0.5834 | 570 | 2.0100 |
1.9686 | 0.5937 | 580 | 2.0078 |
1.9867 | 0.6039 | 590 | 2.0070 |
1.9662 | 0.6141 | 600 | 2.0060 |
1.968 | 0.6244 | 610 | 2.0045 |
1.9435 | 0.6346 | 620 | 2.0035 |
1.9245 | 0.6448 | 630 | 2.0024 |
1.9573 | 0.6551 | 640 | 2.0007 |
1.9466 | 0.6653 | 650 | 1.9994 |
2.0202 | 0.6755 | 660 | 1.9976 |
1.891 | 0.6858 | 670 | 1.9965 |
2.0134 | 0.6960 | 680 | 1.9980 |
1.9276 | 0.7062 | 690 | 1.9958 |
1.9266 | 0.7165 | 700 | 1.9949 |
1.8661 | 0.7267 | 710 | 1.9932 |
1.9446 | 0.7369 | 720 | 1.9923 |
1.8605 | 0.7472 | 730 | 1.9908 |
1.9426 | 0.7574 | 740 | 1.9906 |
1.9806 | 0.7677 | 750 | 1.9893 |
1.9268 | 0.7779 | 760 | 1.9880 |
1.987 | 0.7881 | 770 | 1.9870 |
1.9182 | 0.7984 | 780 | 1.9866 |
2.0103 | 0.8086 | 790 | 1.9853 |
1.9153 | 0.8188 | 800 | 1.9839 |
2.0043 | 0.8291 | 810 | 1.9830 |
1.9791 | 0.8393 | 820 | 1.9819 |
1.912 | 0.8495 | 830 | 1.9811 |
1.9288 | 0.8598 | 840 | 1.9808 |
1.9613 | 0.8700 | 850 | 1.9796 |
1.9767 | 0.8802 | 860 | 1.9783 |
1.9097 | 0.8905 | 870 | 1.9783 |
1.9727 | 0.9007 | 880 | 1.9773 |
1.9432 | 0.9110 | 890 | 1.9763 |
1.9109 | 0.9212 | 900 | 1.9754 |
1.9184 | 0.9314 | 910 | 1.9749 |
1.9179 | 0.9417 | 920 | 1.9744 |
1.9812 | 0.9519 | 930 | 1.9735 |
1.9695 | 0.9621 | 940 | 1.9727 |
1.9474 | 0.9724 | 950 | 1.9727 |
1.8376 | 0.9826 | 960 | 1.9721 |
1.8961 | 0.9928 | 970 | 1.9716 |
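
For context, and assuming the reported loss is the mean per-token cross-entropy, the final validation loss of 1.9716 corresponds to a perplexity of roughly exp(1.9716) ≈ 7.2:

```python
import math

final_eval_loss = 1.9716
print(math.exp(final_eval_loss))  # ≈ 7.18
```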
### Framework versions
- PEFT 0.14.0
- Transformers 4.50.0.dev0
- Pytorch 2.7.0.dev20250224+rocm6.3
- Datasets 3.3.1
- Tokenizers 0.21.0