---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-0.5B-Instruct
tags:
  - generated_from_trainer
datasets:
  - >-
    PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
  - PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
  - PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
  - >-
    PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
  - PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
  - >-
    PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
model-index:
  - name: Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
    results: []
---


Built with Axolotl

See axolotl config

axolotl version: 0.6.0

# Weights and Biases logging config
wandb_project: Qwen2.5-QwQ-RP-Draft-0.5B
wandb_entity:
wandb_watch:
wandb_name: Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
wandb_log_model:

# Model checkpointing config
output_dir: ./Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10
resume_from_checkpoint:
save_steps: 10
save_safetensors: true
save_total_limit: 3
save_only_model: false

# Model architecture config
base_model: Qwen/Qwen2.5-0.5B-Instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 8192
min_sample_len: 256
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
train_on_inputs: false
group_by_length: false

# LoRA adapter config
adapter: lora
lora_model_dir:
lora_r: 128
lora_alpha: 128
lora_dropout: 0.125
peft_layers_to_transform:
peft_use_dora:
peft_use_rslora:
peft_layer_replication:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:

# Fix uninitialized tokens (such as <|start_header_id|> on the base L3 models)
fix_untrained_tokens:

# Dataset config
# RP: https://github.com/xzuyn/axolotl/blob/prompt_formats/src/axolotl/prompt_strategies/customchatml-regex-last-only.py
datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[128:]  # Everything except the first 128
    type: customchatml-regex-last-only
test_datasets:
  - path: PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite
    split: train[:128]  # Only the first 128
    type: customchatml-regex-last-only
val_set_size: 0
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/Qwen2.5-QwQ-Draft-0.5B-customchatml-regex-newer
shuffle_merged_datasets: true
dataset_processes:

# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 16
eval_batch_size: 16
warmup_steps: 0
optimizer: came_pytorch
optim_args:
optim_target_modules:
lr_scheduler: rex
learning_rate: 1e-5
cosine_min_lr_ratio:
loraplus_lr_ratio:
loraplus_lr_embedding:
weight_decay: 0.1
max_grad_norm: 1
logging_steps: 1

# Model optimization
gradient_checkpointing: unsloth
sdp_attention: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

# DeepSpeed
deepspeed:

# Garbage Collection
gc_steps: 1

# Debug config
debug: true
seed: 42

# Token config
special_tokens:
  eos_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
tokens:

# Outputs/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B-LoRA-run10

This model is a fine-tuned version of Qwen/Qwen2.5-0.5B-Instruct on the following datasets:

- PJMixers-Dev/allura-org_gryphe-sonnet-3.5-charcards-names-added-qwq-all-aphrodite
- PJMixers-Dev/anthracite-org_c2_logs_32k_llama3_qwen2_v1.3-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_aicg-logs-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_jannie-log-augmented-system-qwq-all-aphrodite
- PJMixers-Dev/grimulkan_PIPPA-augmented-dedup-system-qwq-all-aphrodite
- PJMixers-Dev/lemonilia_LimaRP-Only-NonSus-Simple-CustomShareGPT-qwq-all-aphrodite
- PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite
- PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT-qwq-all-aphrodite

It achieves the following results on the evaluation set:

- Loss: 1.9716

## Model description

More information needed
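
Since the card is otherwise sparse, here is a minimal loading sketch. It assumes the published weights load directly through `transformers` (as `library_name: transformers` suggests); the repository id below is a placeholder, not the real one.

```python
# Minimal sketch: load and chat with the draft model via transformers.
# NOTE: "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B" is a placeholder repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)  # trained in bf16

messages = [{"role": "user", "content": "Introduce yourself in character."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```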

## Intended uses & limitations

More information needed
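
The "Draft" in the name suggests this 0.5B model is meant as a draft model for speculative (assisted) decoding alongside a larger QwQ-based target; that is an inference from the name rather than a statement on this card. A sketch using transformers assisted generation, with assumed/placeholder repository ids:

```python
# Sketch: assisted (speculative) decoding with this model as the draft.
# Both repo ids are assumptions; swap in the actual target and draft repositories.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

target_id = "Qwen/QwQ-32B-Preview"                          # assumed QwQ target sharing the Qwen2.5 tokenizer
draft_id = "your-namespace/Qwen2.5-QwQ-RP-Draft-v0.1-0.5B"  # placeholder for this model

tokenizer = AutoTokenizer.from_pretrained(target_id)
target = AutoModelForCausalLM.from_pretrained(target_id, torch_dtype=torch.bfloat16, device_map="auto")
draft = AutoModelForCausalLM.from_pretrained(draft_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Write a short in-character reply."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(target.device)

# assistant_model enables assisted generation: the draft proposes tokens, the target verifies them.
output_ids = target.generate(input_ids, assistant_model=draft, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Assisted decoding only changes speed, not the target's outputs: proposals the target rejects are resampled from the target itself.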

## Training and evaluation data

More information needed
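
As the axolotl config above shows, each dataset is sliced so that the first 128 rows are held out for evaluation (`train[:128]`) and the remainder is used for training (`train[128:]`). A quick sketch of the same slicing with the `datasets` library, using one of the listed repositories:

```python
# Sketch: the split strings in the config use Hugging Face `datasets` slicing syntax.
from datasets import load_dataset

repo = "PJMixers-Dev/MinervaAI_Aesir-Preview-Anon-qwq-all-aphrodite"

eval_rows = load_dataset(repo, split="train[:128]")    # only the first 128 rows (held-out eval)
train_rows = load_dataset(repo, split="train[128:]")   # everything except the first 128 rows

print(len(eval_rows), len(train_rows))
```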

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (a rough per-step token estimate is sketched after the list):

- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- optimizer: ADAMW_HF with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: cosine
- num_epochs: 1.0
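
With sample packing enabled and `pad_to_sequence_len: true`, each packed sample is close to `sequence_len` tokens, so the settings above imply roughly the following tokens per optimizer step (a back-of-the-envelope sketch, not an exact count):

```python
# Back-of-the-envelope: tokens per optimizer step implied by the config above.
# Assumes packing fills each 8192-token sequence; real counts vary slightly.
sequence_len = 8192
micro_batch_size = 16
gradient_accumulation_steps = 1

tokens_per_step = sequence_len * micro_batch_size * gradient_accumulation_steps
print(tokens_per_step)  # 131072 tokens (~131k) per optimizer step
```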

### Training results

| Training Loss | Epoch | Step | Validation Loss |
|:-------------:|:-----:|:----:|:---------------:|
| 3.4865 | 0.0010 | 1 | 3.2134 |
| 2.481 | 0.0102 | 10 | 2.5552 |
| 2.2964 | 0.0205 | 20 | 2.4036 |
| 2.3048 | 0.0307 | 30 | 2.3367 |
| 2.2577 | 0.0409 | 40 | 2.2934 |
| 2.2298 | 0.0512 | 50 | 2.2601 |
| 2.1367 | 0.0614 | 60 | 2.2385 |
| 2.1512 | 0.0716 | 70 | 2.2166 |
| 2.1703 | 0.0819 | 80 | 2.2022 |
| 2.1263 | 0.0921 | 90 | 2.1883 |
| 2.2121 | 0.1024 | 100 | 2.1750 |
| 2.1741 | 0.1126 | 110 | 2.1633 |
| 2.1621 | 0.1228 | 120 | 2.1547 |
| 2.0664 | 0.1331 | 130 | 2.1456 |
| 2.1005 | 0.1433 | 140 | 2.1374 |
| 2.0822 | 0.1535 | 150 | 2.1315 |
| 2.0856 | 0.1638 | 160 | 2.1252 |
| 2.1386 | 0.1740 | 170 | 2.1182 |
| 2.0756 | 0.1842 | 180 | 2.1134 |
| 2.0492 | 0.1945 | 190 | 2.1066 |
| 1.9882 | 0.2047 | 200 | 2.1024 |
| 2.036 | 0.2149 | 210 | 2.0970 |
| 2.1313 | 0.2252 | 220 | 2.0940 |
| 2.0356 | 0.2354 | 230 | 2.0897 |
| 2.0278 | 0.2456 | 240 | 2.0869 |
| 2.0754 | 0.2559 | 250 | 2.0825 |
| 2.0582 | 0.2661 | 260 | 2.0784 |
| 2.0588 | 0.2764 | 270 | 2.0758 |
| 1.9757 | 0.2866 | 280 | 2.0723 |
| 2.0619 | 0.2968 | 290 | 2.0700 |
| 1.956 | 0.3071 | 300 | 2.0684 |
| 2.065 | 0.3173 | 310 | 2.0642 |
| 1.982 | 0.3275 | 320 | 2.0604 |
| 2.0424 | 0.3378 | 330 | 2.0577 |
| 2.0635 | 0.3480 | 340 | 2.0553 |
| 1.9895 | 0.3582 | 350 | 2.0518 |
| 2.0296 | 0.3685 | 360 | 2.0496 |
| 2.0231 | 0.3787 | 370 | 2.0472 |
| 1.9422 | 0.3889 | 380 | 2.0459 |
| 2.0214 | 0.3992 | 390 | 2.0427 |
| 2.0107 | 0.4094 | 400 | 2.0401 |
| 2.0307 | 0.4197 | 410 | 2.0371 |
| 1.9874 | 0.4299 | 420 | 2.0356 |
| 2.0249 | 0.4401 | 430 | 2.0331 |
| 2.0947 | 0.4504 | 440 | 2.0314 |
| 1.9644 | 0.4606 | 450 | 2.0291 |
| 2.0633 | 0.4708 | 460 | 2.0271 |
| 2.0438 | 0.4811 | 470 | 2.0255 |
| 2.0227 | 0.4913 | 480 | 2.0239 |
| 2.0023 | 0.5015 | 490 | 2.0208 |
| 2.0231 | 0.5118 | 500 | 2.0193 |
| 1.9659 | 0.5220 | 510 | 2.0179 |
| 1.9382 | 0.5322 | 520 | 2.0171 |
| 1.9959 | 0.5425 | 530 | 2.0157 |
| 1.9835 | 0.5527 | 540 | 2.0139 |
| 1.942 | 0.5629 | 550 | 2.0124 |
| 2.0036 | 0.5732 | 560 | 2.0109 |
| 2.023 | 0.5834 | 570 | 2.0100 |
| 1.9686 | 0.5937 | 580 | 2.0078 |
| 1.9867 | 0.6039 | 590 | 2.0070 |
| 1.9662 | 0.6141 | 600 | 2.0060 |
| 1.968 | 0.6244 | 610 | 2.0045 |
| 1.9435 | 0.6346 | 620 | 2.0035 |
| 1.9245 | 0.6448 | 630 | 2.0024 |
| 1.9573 | 0.6551 | 640 | 2.0007 |
| 1.9466 | 0.6653 | 650 | 1.9994 |
| 2.0202 | 0.6755 | 660 | 1.9976 |
| 1.891 | 0.6858 | 670 | 1.9965 |
| 2.0134 | 0.6960 | 680 | 1.9980 |
| 1.9276 | 0.7062 | 690 | 1.9958 |
| 1.9266 | 0.7165 | 700 | 1.9949 |
| 1.8661 | 0.7267 | 710 | 1.9932 |
| 1.9446 | 0.7369 | 720 | 1.9923 |
| 1.8605 | 0.7472 | 730 | 1.9908 |
| 1.9426 | 0.7574 | 740 | 1.9906 |
| 1.9806 | 0.7677 | 750 | 1.9893 |
| 1.9268 | 0.7779 | 760 | 1.9880 |
| 1.987 | 0.7881 | 770 | 1.9870 |
| 1.9182 | 0.7984 | 780 | 1.9866 |
| 2.0103 | 0.8086 | 790 | 1.9853 |
| 1.9153 | 0.8188 | 800 | 1.9839 |
| 2.0043 | 0.8291 | 810 | 1.9830 |
| 1.9791 | 0.8393 | 820 | 1.9819 |
| 1.912 | 0.8495 | 830 | 1.9811 |
| 1.9288 | 0.8598 | 840 | 1.9808 |
| 1.9613 | 0.8700 | 850 | 1.9796 |
| 1.9767 | 0.8802 | 860 | 1.9783 |
| 1.9097 | 0.8905 | 870 | 1.9783 |
| 1.9727 | 0.9007 | 880 | 1.9773 |
| 1.9432 | 0.9110 | 890 | 1.9763 |
| 1.9109 | 0.9212 | 900 | 1.9754 |
| 1.9184 | 0.9314 | 910 | 1.9749 |
| 1.9179 | 0.9417 | 920 | 1.9744 |
| 1.9812 | 0.9519 | 930 | 1.9735 |
| 1.9695 | 0.9621 | 940 | 1.9727 |
| 1.9474 | 0.9724 | 950 | 1.9727 |
| 1.8376 | 0.9826 | 960 | 1.9721 |
| 1.8961 | 0.9928 | 970 | 1.9716 |

### Framework versions

- PEFT 0.14.0
- Transformers 4.50.0.dev0
- Pytorch 2.7.0.dev20250224+rocm6.3
- Datasets 3.3.1
- Tokenizers 0.21.0