MaziyarPanahi committed
Commit
926fdf1
1 Parent(s): 119d8c7

aa2bb9297e1d964e97cf5d0f9bd530ed9653ca5fb8d456d16b818527fc1ff715

README.md ADDED
@@ -0,0 +1,149 @@
+ ---
+ license: other
+ library_name: peft
+ tags:
+ - generated_from_trainer
+ - fine-tuned
+ - wikihow
+ - cosmopedia
+ - qwen
+ - moe
+ base_model: Qwen/Qwen1.5-MoE-A2.7B
+ model-index:
+ - name: models/Qwen1.5-MoE-A2.7B-Wikihow
+   results: []
+ ---
+
+ # models/Qwen1.5-MoE-A2.7B-Wikihow
+
+ This model is a fine-tuned version of [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) on the `wikihow` subset of the [HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 2
+ - eval_batch_size: 2
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 32
+ - total_eval_batch_size: 8
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - num_epochs: 1
+
+ ### Training results
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.0`
+ ```yaml
+ base_model: Qwen/Qwen1.5-MoE-A2.7B
+ trust_remote_code: true
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ # hub_model_id: MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow
+ # hf_use_auth_token: true
+
+ chat_template: chatml
+
+ datasets:
+   - path: HuggingFaceTB/cosmopedia
+     name: wikihow
+     type:
+       system_prompt: ""
+       field_instruction: prompt
+       field_output: text
+       format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
+       no_input_format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
+
+ dataset_prepared_path:
+ val_set_size: 0.0
+ output_dir: ./models/Qwen1.5-MoE-A2.7B-Wikihow
+
+ sequence_len: 2048
+ sample_packing: false
+ pad_to_sequence_len: false
+
+ adapter: lora
+ lora_model_dir:
+ lora_r: 32
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project:
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 2
+ num_epochs: 1
+ optimizer: paged_adamw_8bit
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: true
+
+ gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+   use_reentrant: false
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ warmup_steps: 10
+ evals_per_epoch: 4
+ saves_per_epoch: 1
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+ fsdp:
+ fsdp_config:
+ special_tokens:
+ ```
+
+ </details><br>
+
+ ### Framework versions
+
+ - PEFT 0.10.0
+ - Transformers 4.40.0.dev0
+ - Pytorch 2.2.0+cu121
+ - Datasets 2.18.0
+ - Tokenizers 0.15.2
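
Below is a minimal inference sketch for the resulting LoRA adapter. It is not part of the committed card and assumes the adapter is published under the hub id commented out in the axolotl config above (`MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow`); point `adapter_id` at wherever the adapter weights actually live (for example the local `./models/Qwen1.5-MoE-A2.7B-Wikihow` output directory).

```python
# Minimal usage sketch (assumptions: adapter repo id / path, enough GPU memory for bf16).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen1.5-MoE-A2.7B"
adapter_id = "MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow"  # assumed location of the LoRA weights

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, adapter_id)  # attach the fine-tuned LoRA adapter
model.eval()

# The adapter was trained on ChatML-formatted prompts, so build the input the same way.
messages = [{"role": "user", "content": "How do I repot a houseplant?"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

with torch.no_grad():
    output = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```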
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B",
+   "architectures": [
+     "Qwen2MoeForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "decoder_sparse_step": 1,
+   "eos_token_id": 151643,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5632,
+   "max_position_embeddings": 8192,
+   "max_window_layers": 21,
+   "model_type": "qwen2_moe",
+   "moe_intermediate_size": 1408,
+   "norm_topk_prob": false,
+   "num_attention_heads": 16,
+   "num_experts": 60,
+   "num_experts_per_tok": 4,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 16,
+   "output_router_logits": false,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "router_aux_loss_coef": 0.001,
+   "shared_expert_intermediate_size": 5632,
+   "sliding_window": 32768,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.40.0.dev0",
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
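
The config above describes the sparse-MoE layout of the base model (60 routed experts, 4 active per token, plus a shared expert). A short sketch, assuming a `transformers` build with `qwen2_moe` support (≥ 4.40), of reading those fields programmatically:

```python
# Sketch: inspect the MoE routing parameters defined in config.json above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", trust_remote_code=True)
print(config.num_experts)                      # 60 routed experts per MoE layer
print(config.num_experts_per_tok)              # 4 experts activated per token
print(config.shared_expert_intermediate_size)  # width of the always-on shared expert
```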
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "transformers_version": "4.40.0.dev0"
+ }
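
These defaults (sampling enabled, with both `<|im_end|>` and `<|endoftext|>` accepted as end-of-sequence) are picked up automatically by `generate()`. A brief sketch of loading and overriding them, with the repo id assumed as in the usage sketch above:

```python
# Sketch: read the committed generation defaults and override one of them.
from transformers import GenerationConfig

# Repo id is an assumption (see the commented-out hub_model_id in the axolotl config).
gen_config = GenerationConfig.from_pretrained("MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow")
print(gen_config.do_sample)     # True
print(gen_config.eos_token_id)  # [151645, 151643]
gen_config.temperature = 0.7    # example override; not part of the committed defaults
```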
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
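
The `chat_template` above renders conversations into ChatML, matching the prompt format used in the axolotl config. A small sketch (repo id assumed, as in the earlier examples) of applying it to a single user turn:

```python
# Sketch: render one user message with the committed ChatML chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow")  # assumed repo id
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "How do I sharpen a kitchen knife?"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)
# <|im_start|>user
# How do I sharpen a kitchen knife?<|im_end|>
# <|im_start|>assistant
```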
vocab.json ADDED
The diff for this file is too large to render. See raw diff