Tags: Text Generation · Transformers · Safetensors · qwen2 · Generated from Trainer · axolotl · conversational · Inference Endpoints · text-generation-inference
Crystalcareai committed
Commit 3e843c0
1 Parent(s): 8a50107

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,186 @@
---
license: apache-2.0
base_model: Qwen/Qwen2-0.5B
tags:
- generated_from_trainer
model-index:
- name: qwen-2.9.3-qwen2-0.5b
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.1`
```yaml
base_model: Qwen/Qwen2-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# load_in_4bit: true

chat_template: chatml
datasets:
  - path: /workspace/datasets/dolphin201-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/SystemChat_filtered_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/SystemChat_multilingual_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  # - path: /workspace/datasets/SystemChat-2.0-Arabic/SystemChatArabic_sharegpt.jsonl
  #   type: sharegpt
  #   conversation: chatml
  # - path: /workspace/datasets/dolphin-coder-translate-sharegpt2.jsonl
  #   type: sharegpt
  #   conversation: chatml
  # - path: /workspace/datasets/dolphin-coder-codegen-sharegpt2.jsonl
  #   type: sharegpt
  #   conversation: chatml
  # - path: /workspace/datasets/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
  #   type: sharegpt
  #   conversation: chatml
  # - path: /workspace/datasets/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
  #   type: sharegpt
  #   conversation: chatml
  - path: /workspace/datasets/not_samantha_norefusals.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/Orca-Math-resort-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/agent_instruct_react_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/toolbench_instruct_j1s1_3k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/toolbench_negative_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/toolbench_react_10p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/toolbench_tflan_cot_30p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/openhermes200k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml

dataset_prepared_path: last_run_prepared
val_set_size: 0.03
output_dir: ./qwen-2.9.3-qwen2-0.5b

sequence_len: 16384
sample_packing: true
pad_to_sequence_len: true

# adapter: qlora
# lora_r: 16
# lora_alpha: 32
# lora_dropout: 0.05
# lora_target_modules:
#   - q_proj
#   - k_proj
#   - v_proj
#   - o_proj
#   - gate_proj
#   - up_proj
#   - down_proj

wandb_project: 2.9.3-qwen-2.9.3-qwen2-0.5b
# wandb_entity: oaaic
# wandb_watch:
# wandb_name:
# wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 1e-4
# max_grad_norm: 1.0

train_on_inputs: false
group_by_length: false
bf16: true
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
logging_steps: 1
flash_attention: true
deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json
warmup_steps: 10
# evals_per_epoch: 2
saves_per_epoch: 2
save_total_limit: 2
weight_decay: 0.1
special_tokens:
  eos_token: <|im_end|>

```

</details><br>
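
Every dataset above is consumed as `type: sharegpt` and rendered with the ChatML template. For reference, here is a minimal sketch of a single ShareGPT-style JSONL record; the field names follow the common ShareGPT convention, and the example content is illustrative (the actual dataset files are not part of this repo):

```python
import json

# One ShareGPT-style record as consumed by axolotl's `type: sharegpt`
# loader: a "conversations" list of turns with "from"/"value" keys.
record = {
    "conversations": [
        {"from": "system", "value": "You are a helpful assistant."},
        {"from": "human", "value": "What does sample packing do?"},
        {"from": "gpt", "value": "It concatenates short examples into one sequence to reduce padding."},
    ]
}
print(json.dumps(record))  # one line per record in the .jsonl file
```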

# qwen-2.9.3-qwen2-0.5b

This model is a fine-tuned version of [Qwen/Qwen2-0.5B](https://huggingface.co/Qwen/Qwen2-0.5B) on the datasets listed in the axolotl config above.
It achieves the following results on the evaluation set:
- Loss: 0.9401

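Since the tokenizer ships a ChatML chat template (see `tokenizer_config.json` below), the model can be queried through the standard `transformers` API. A minimal sketch, assuming the checkpoint is available locally or under a Hugging Face repo id; the path below is a placeholder (the training `output_dir` above is `./qwen-2.9.3-qwen2-0.5b`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder path: substitute the actual repo id or local checkpoint dir.
model_id = "./qwen-2.9.3-qwen2-0.5b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Why is the sky blue?"},
]
# The ChatML template wraps each turn in <|im_start|>/<|im_end|> markers.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

output = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
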
## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (the batch-size arithmetic is checked in the short sketch after this list):
- learning_rate: 0.0001
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- gradient_accumulation_steps: 4
- total_train_batch_size: 32
- total_eval_batch_size: 8
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 10
- num_epochs: 3

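A quick check of how the effective batch sizes above follow from the per-device settings:

```python
# Effective batch sizes implied by the hyperparameters above.
micro_batch_size = 1   # train_batch_size per device
grad_accum = 4         # gradient_accumulation_steps
num_devices = 8        # multi-GPU training

assert micro_batch_size * grad_accum * num_devices == 32  # total_train_batch_size
assert micro_batch_size * num_devices == 8  # total_eval_batch_size (no accumulation at eval)
```
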
### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 1.0495        | 1.0147 | 1453 | 1.0261          |
| 0.9052        | 2.0161 | 2908 | 0.9491          |
| 0.8097        | 2.9693 | 4296 | 0.9401          |

### Framework versions

- Transformers 4.41.1
- Pytorch 2.1.2+cu121
- Datasets 2.19.1
- Tokenizers 0.19.1
added_tokens.json ADDED
@@ -0,0 +1,5 @@
{
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644
}
config.json ADDED
@@ -0,0 +1,27 @@
{
  "_name_or_path": "Qwen/Qwen2-0.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 131072,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.1",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}
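
These shapes pin down the model's size. A back-of-the-envelope tally, as a sketch assuming the standard Qwen2 layer layout (biases on the q/k/v projections only, two RMSNorms per layer, embeddings tied with the LM head per `tie_word_embeddings: true`):

```python
# Rough parameter count from the config values above.
hidden, inter, layers, vocab = 896, 4864, 24, 151936
heads, kv_heads = 14, 2

head_dim = hidden // heads    # 64
kv_dim = kv_heads * head_dim  # 128 -> grouped-query attention

attn = (
    hidden * hidden + hidden          # q_proj weight + bias
    + 2 * (hidden * kv_dim + kv_dim)  # k_proj and v_proj weights + biases
    + hidden * hidden                 # o_proj (no bias)
)
mlp = 3 * hidden * inter  # gate_proj, up_proj, down_proj
norms = 2 * hidden        # input / post-attention RMSNorms

total = layers * (attn + mlp + norms) + vocab * hidden + hidden  # + final norm
print(f"{total:,} parameters (~{total / 1e6:.0f}M)")  # 494,032,768 (~494M)
```
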
generation_config.json ADDED
@@ -0,0 +1,7 @@
{
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": 151643,
  "max_new_tokens": 2048,
  "transformers_version": "4.41.1"
}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9adccc59fd39ac70af1c269702fa727fc14840d6c6ddc60668e95b9478947294
size 988097824
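
The pointer's `size` squares with the ~494M-parameter tally sketched after `config.json` above, at two bytes per bf16 weight:

```python
# bf16 stores 2 bytes per parameter; the small remainder is plausibly the
# safetensors header plus per-tensor metadata.
params = 494_032_768   # tally from the config.json sketch above
payload = 988_097_824  # size recorded in this LFS pointer
print(payload - 2 * params)  # 32,288 bytes (~32 KB) of overhead
```
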
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "bos_token": null,
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "model_max_length": 32768,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
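
The `chat_template` above is plain ChatML. A quick sketch of what it renders, plus a check that the control tokens resolve to the IDs declared in `added_tokens.json` (the path is a placeholder, as in the earlier sketches):

```python
from transformers import AutoTokenizer

# Placeholder path: substitute the actual repo id or local checkpoint dir.
tok = AutoTokenizer.from_pretrained("./qwen-2.9.3-qwen2-0.5b")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi!"},
]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant

# The ChatML control tokens map to the IDs from added_tokens.json:
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```
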
vocab.json ADDED
The diff for this file is too large to render. See raw diff