---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-0.5B-Instruct
tags:
  - generated_from_trainer
datasets:
  - >-
    PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
  - PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
  - >-
    PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
  - PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
  - >-
    PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
model-index:
  - name: Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
    results: []
---


Built with Axolotl

See axolotl config

axolotl version: 0.6.0

# Weights and Biases logging config
wandb_project: Qwen2.5-QwQ-RP-Draft-0.5B
wandb_entity:
wandb_watch:
wandb_name: Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
wandb_log_model:

# Model checkpointing config
output_dir: ./Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
resume_from_checkpoint:
save_steps: 10
save_safetensors: true
save_total_limit: 3
save_only_model: false

# Model architecture config
base_model: Qwen/Qwen2.5-0.5B-Instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 8192
min_sample_len: 256
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
train_on_inputs: false
group_by_length: false

# LoRA adapter config
adapter: lora
lora_model_dir:
lora_r: 128
lora_alpha: 128
lora_dropout: 0.125
peft_layers_to_transform:
peft_use_dora:
peft_use_rslora:
peft_layer_replication:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:

# Fix uninitialized tokens (such as <|start_header_id|> on the base L3 models)
fix_untrained_tokens:

# Dataset config
# RP: https://github.com/xzuyn/axolotl/blob/prompt_formats/src/axolotl/prompt_strategies/customchatml-regex-last-only.py
datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
test_datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
val_set_size: 0
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/Qwen2.5-QwQ-Draft-0.5B-customchatml-regex-newer
shuffle_merged_datasets: true
dataset_processes:

# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 16
eval_batch_size: 16
warmup_steps: 0
optimizer: came_pytorch
optim_args:
optim_target_modules:
lr_scheduler: rex
learning_rate: 1e-5
cosine_min_lr_ratio:
loraplus_lr_ratio:
loraplus_lr_embedding:
weight_decay: 0.1
max_grad_norm: 1
logging_steps: 1

# Model optimization
gradient_checkpointing: unsloth
sdp_attention: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

# DeepSpeed
deepspeed:

# Garbage Collection
gc_steps: 1

# Debug config
debug: true
seed: 42

# Token config
special_tokens:
  eos_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
tokens:

# Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10

This model is a fine-tuned version of Qwen/Qwen2.5-0.5B-Instruct on the following datasets:

- PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
- PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
- PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
- PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
- PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite

It achieves the following results on the evaluation set:

- Loss: 1.9716

## Model description

More information needed
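
Since the card is otherwise sparse, here is a minimal loading sketch. It assumes the published weights load directly through `transformers` (as `library_name: transformers` suggests); the repository id below is a placeholder, not the real one.

```python
# Minimal sketch: load and chat with the draft model via transformers.
# NOTE: "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B" is a placeholder repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)  # trained in bf16

messages = [{"role": "user", "content": "Introduce yourself in character."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```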

## Intended uses & limitations

More information needed
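
The "Draft" in the name suggests this 0.5B model is meant as a draft model for speculative (assisted) decoding alongside a larger QwQ-based target; that is an inference from the name rather than a statement on this card. A sketch using transformers assisted generation, with assumed/placeholder repository ids:

```python
# Sketch: assisted (speculative) decoding with this model as the draft.
# Both repo ids are assumptions; swap in the actual target and draft repositories.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

target_id = "Qwen/QwQ-32B-Preview"                          # assumed QwQ target sharing the Qwen2.5 tokenizer
draft_id = "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B"  # placeholder for this model

tokenizer = AutoTokenizer.from_pretrained(target_id)
target = AutoModelForCausalLM.from_pretrained(target_id, torch_dtype=torch.bfloat16, device_map="auto")
draft = AutoModelForCausalLM.from_pretrained(draft_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Write a short in-character reply."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(target.device)

# assistant_model enables assisted generation: the draft proposes tokens, the target verifies them.
output_ids = target.generate(input_ids, assistant_model=draft, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Assisted decoding only changes speed, not the target's outputs: proposals the target rejects are resampled from the target itself.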

## Training and evaluation data

More information needed
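
As the axolotl config above shows, each dataset is sliced so that the first 128 rows are held out for evaluation (`train[:128]`) and the remainder is used for training (`train[128:]`). A quick sketch of the same slicing with the `datasets` library, using one of the listed repositories:

```python
# Sketch: the split strings in the config use Hugging Face `datasets` slicing syntax.
from datasets import load_dataset

repo = "PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite"

eval_rows = load_dataset(repo, split="train[:128]")    # only the first 128 rows (held-out eval)
train_rows = load_dataset(repo, split="train[128:]")   # everything except the first 128 rows

print(len(eval_rows), len(train_rows))
```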

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (a rough per-step token estimate is sketched after the list):

- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- optimizer: ADAMW_HF with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: cosine
- num_epochs: 1.0
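
With sample packing enabled and `pad_to_sequence_len: true`, each packed sample is close to `sequence_len` tokens, so the settings above imply roughly the following tokens per optimizer step (a back-of-the-envelope sketch, not an exact count):

```python
# Back-of-the-envelope: tokens per optimizer step implied by the config above.
# Assumes packing fills each 8192-token sequence; real counts vary slightly.
sequence_len = 8192
micro_batch_size = 16
gradient_accumulation_steps = 1

tokens_per_step = sequence_len * micro_batch_size * gradient_accumulation_steps
print(tokens_per_step)  # 131072 tokens (~131k) per optimizer step
```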

### Training results

| Training Loss | Epoch | Step | Validation Loss |
|:-------------:|:-----:|:----:|:---------------:|
| 3.4865 | 0.0010 | 1 | 3.2134 |
| 2.481 | 0.0102 | 10 | 2.5552 |
| 2.2964 | 0.0205 | 20 | 2.4036 |
| 2.3048 | 0.0307 | 30 | 2.3367 |
| 2.2577 | 0.0409 | 40 | 2.2934 |
| 2.2298 | 0.0512 | 50 | 2.2601 |
| 2.1367 | 0.0614 | 60 | 2.2385 |
| 2.1512 | 0.0716 | 70 | 2.2166 |
| 2.1703 | 0.0819 | 80 | 2.2022 |
| 2.1263 | 0.0921 | 90 | 2.1883 |
| 2.2121 | 0.1024 | 100 | 2.1750 |
| 2.1741 | 0.1126 | 110 | 2.1633 |
| 2.1621 | 0.1228 | 120 | 2.1547 |
| 2.0664 | 0.1331 | 130 | 2.1456 |
| 2.1005 | 0.1433 | 140 | 2.1374 |
| 2.0822 | 0.1535 | 150 | 2.1315 |
| 2.0856 | 0.1638 | 160 | 2.1252 |
| 2.1386 | 0.1740 | 170 | 2.1182 |
| 2.0756 | 0.1842 | 180 | 2.1134 |
| 2.0492 | 0.1945 | 190 | 2.1066 |
| 1.9882 | 0.2047 | 200 | 2.1024 |
| 2.036 | 0.2149 | 210 | 2.0970 |
| 2.1313 | 0.2252 | 220 | 2.0940 |
| 2.0356 | 0.2354 | 230 | 2.0897 |
| 2.0278 | 0.2456 | 240 | 2.0869 |
| 2.0754 | 0.2559 | 250 | 2.0825 |
| 2.0582 | 0.2661 | 260 | 2.0784 |
| 2.0588 | 0.2764 | 270 | 2.0758 |
| 1.9757 | 0.2866 | 280 | 2.0723 |
| 2.0619 | 0.2968 | 290 | 2.0700 |
| 1.956 | 0.3071 | 300 | 2.0684 |
| 2.065 | 0.3173 | 310 | 2.0642 |
| 1.982 | 0.3275 | 320 | 2.0604 |
| 2.0424 | 0.3378 | 330 | 2.0577 |
| 2.0635 | 0.3480 | 340 | 2.0553 |
| 1.9895 | 0.3582 | 350 | 2.0518 |
| 2.0296 | 0.3685 | 360 | 2.0496 |
| 2.0231 | 0.3787 | 370 | 2.0472 |
| 1.9422 | 0.3889 | 380 | 2.0459 |
| 2.0214 | 0.3992 | 390 | 2.0427 |
| 2.0107 | 0.4094 | 400 | 2.0401 |
| 2.0307 | 0.4197 | 410 | 2.0371 |
| 1.9874 | 0.4299 | 420 | 2.0356 |
| 2.0249 | 0.4401 | 430 | 2.0331 |
| 2.0947 | 0.4504 | 440 | 2.0314 |
| 1.9644 | 0.4606 | 450 | 2.0291 |
| 2.0633 | 0.4708 | 460 | 2.0271 |
| 2.0438 | 0.4811 | 470 | 2.0255 |
| 2.0227 | 0.4913 | 480 | 2.0239 |
| 2.0023 | 0.5015 | 490 | 2.0208 |
| 2.0231 | 0.5118 | 500 | 2.0193 |
| 1.9659 | 0.5220 | 510 | 2.0179 |
| 1.9382 | 0.5322 | 520 | 2.0171 |
| 1.9959 | 0.5425 | 530 | 2.0157 |
| 1.9835 | 0.5527 | 540 | 2.0139 |
| 1.942 | 0.5629 | 550 | 2.0124 |
| 2.0036 | 0.5732 | 560 | 2.0109 |
| 2.023 | 0.5834 | 570 | 2.0100 |
| 1.9686 | 0.5937 | 580 | 2.0078 |
| 1.9867 | 0.6039 | 590 | 2.0070 |
| 1.9662 | 0.6141 | 600 | 2.0060 |
| 1.968 | 0.6244 | 610 | 2.0045 |
| 1.9435 | 0.6346 | 620 | 2.0035 |
| 1.9245 | 0.6448 | 630 | 2.0024 |
| 1.9573 | 0.6551 | 640 | 2.0007 |
| 1.9466 | 0.6653 | 650 | 1.9994 |
| 2.0202 | 0.6755 | 660 | 1.9976 |
| 1.891 | 0.6858 | 670 | 1.9965 |
| 2.0134 | 0.6960 | 680 | 1.9980 |
| 1.9276 | 0.7062 | 690 | 1.9958 |
| 1.9266 | 0.7165 | 700 | 1.9949 |
| 1.8661 | 0.7267 | 710 | 1.9932 |
| 1.9446 | 0.7369 | 720 | 1.9923 |
| 1.8605 | 0.7472 | 730 | 1.9908 |
| 1.9426 | 0.7574 | 740 | 1.9906 |
| 1.9806 | 0.7677 | 750 | 1.9893 |
| 1.9268 | 0.7779 | 760 | 1.9880 |
| 1.987 | 0.7881 | 770 | 1.9870 |
| 1.9182 | 0.7984 | 780 | 1.9866 |
| 2.0103 | 0.8086 | 790 | 1.9853 |
| 1.9153 | 0.8188 | 800 | 1.9839 |
| 2.0043 | 0.8291 | 810 | 1.9830 |
| 1.9791 | 0.8393 | 820 | 1.9819 |
| 1.912 | 0.8495 | 830 | 1.9811 |
| 1.9288 | 0.8598 | 840 | 1.9808 |
| 1.9613 | 0.8700 | 850 | 1.9796 |
| 1.9767 | 0.8802 | 860 | 1.9783 |
| 1.9097 | 0.8905 | 870 | 1.9783 |
| 1.9727 | 0.9007 | 880 | 1.9773 |
| 1.9432 | 0.9110 | 890 | 1.9763 |
| 1.9109 | 0.9212 | 900 | 1.9754 |
| 1.9184 | 0.9314 | 910 | 1.9749 |
| 1.9179 | 0.9417 | 920 | 1.9744 |
| 1.9812 | 0.9519 | 930 | 1.9735 |
| 1.9695 | 0.9621 | 940 | 1.9727 |
| 1.9474 | 0.9724 | 950 | 1.9727 |
| 1.8376 | 0.9826 | 960 | 1.9721 |
| 1.8961 | 0.9928 | 970 | 1.9716 |

### Framework versions

- PEFT 0.14.0
- Transformers 4.50.0.dev0
- Pytorch 2.7.0.dev20250224+rocm6.3
- Datasets 3.3.1
- Tokenizers 0.21.0