ayousanz commited on
Commit
33ca2b9
·
verified ·
1 Parent(s): 0f807f2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +222 -0
README.md CHANGED
@@ -1,3 +1,225 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ # mistralアーキテクチャを使った日本語LLM(0.3B)
6
+
7
+ # 学習環境
8
+
9
+ * A5000 × 7
10
+
11
+ # 学習パラメータ
12
+
13
+ ## hf_config.json
14
+
15
+ ```json
16
+ {
17
+ "model_type": "gpt2",
18
+ "config_name": "gpt2-medium",
19
+ "tokenizer_name": "/home/ubuntu/slm/spm_tokenizer_neologdn_bytefallback_nofast",
20
+ "train_file": "../wiki.txt",
21
+ "validation_split_percentage": 5,
22
+ "output_dir": "checkpoints-mistral-300M-FA2-3",
23
+ "do_train": true,
24
+ "do_eval": true,
25
+ "prediction_loss_only": true,
26
+ "remove_unused_columns": false,
27
+ "learning_rate": 3.0e-4,
28
+ "weight_decay": 0.1,
29
+ "adam_beta2": 0.95,
30
+ "num_train_epochs": 10,
31
+ "logging_dir": "checkpoints-mistral-300M-FA2-3/logs",
32
+ "logging_strategy": "steps",
33
+ "logging_steps": 10,
34
+ "evaluation_strategy": "steps",
35
+ "save_strategy": "steps",
36
+ "eval_steps": 500,
37
+ "save_steps": 500,
38
+ "load_best_model_at_end": true,
39
+ "save_total_limit": 10,
40
+ "warmup_steps": 4,
41
+ "lr_scheduler_type": "cosine",
42
+ "per_device_train_batch_size": 8,
43
+ "per_device_eval_batch_size": 8,
44
+ "block_size": 1024,
45
+ "adam_epsilon": 1.0e-4,
46
+ "fp16": true,
47
+ "gradient_accumulation_steps": 16,
48
+ "push_to_hub": false,
49
+ "dataloader_num_workers": 8,
50
+ "optim": "adamw_bnb_8bit",
51
+ "torch_compile": true
52
+ }
53
+ ```
54
+
55
+ ## モデルパラメータ
56
+
57
+ ```json
58
+ {
59
+ "architectures": [
60
+ "MistralForCausalLM"
61
+ ],
62
+ "bos_token_id": 0,
63
+ "eos_token_id": 0,
64
+ "hidden_act": "silu",
65
+ "hidden_size": 1024,
66
+ "initializer_range": 0.02,
67
+ "intermediate_size": 2400,
68
+ "max_position_embeddings": 4096,
69
+ "model_type": "mistral",
70
+ "num_attention_heads": 16,
71
+ "num_hidden_layers": 24,
72
+ "num_key_value_heads": 8,
73
+ "rms_norm_eps": 1e-05,
74
+ "rope_theta": 10000.0,
75
+ "sliding_window": 1024,
76
+ "tie_word_embeddings": false,
77
+ "torch_dtype": "float16",
78
+ "transformers_version": "4.35.2",
79
+ "use_cache": true,
80
+ "vocab_size": 50257
81
+ }
82
+ ```
83
+
84
+ ## deepspeedのパラメータ
85
+
86
+ ```json
87
+ {
88
+ "fp16": {
89
+ "enabled": "auto",
90
+ "loss_scale": 0.0,
91
+ "loss_scale_window": 1000,
92
+ "initial_scale_power": 16,
93
+ "hysteresis": 1,
94
+ "min_loss_scale": 0
95
+ },
96
+
97
+ "bf16": {
98
+ "enabled": "auto"
99
+ },
100
+
101
+
102
+ "optimizer": {
103
+ "type": "AdamW",
104
+ "params": {
105
+ "lr": "auto",
106
+ "betas": "auto",
107
+ "eps": "auto",
108
+ "weight_decay": "auto"
109
+ }
110
+ },
111
+
112
+ "scheduler": {
113
+ "type": "WarmupDecayLR",
114
+ "params": {
115
+ "warmup_min_lr": "auto",
116
+ "warmup_max_lr": "auto",
117
+ "warmup_num_steps": "auto",
118
+ "total_num_steps": "auto"
119
+ }
120
+ },
121
+
122
+ "zero_optimization": {
123
+ "stage": 0,
124
+ "allgather_partitions": true,
125
+ "allgather_bucket_size": 2e6,
126
+ "overlap_comm": true,
127
+ "reduce_scatter": true,
128
+ "reduce_bucket_size": 2e6,
129
+ "contiguous_gradients": true,
130
+ "round_robin_gradients":true
131
+ },
132
+
133
+ "dump_state": true,
134
+ "comms_logger": {
135
+ "enabled": true,
136
+ "verbose": false,
137
+ "prof_all": true,
138
+ "debug": false
139
+ },
140
+
141
+ "gradient_accumulation_steps": "auto",
142
+ "gradient_clipping": "auto",
143
+ "train_batch_size": "auto",
144
+ "train_micro_batch_size_per_gpu": "auto"
145
+
146
+
147
+ }
148
+
149
+ ```
150
+
151
+ # モデルの推論コード
152
+
153
+ ```python
154
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
155
+ import torch
156
+
157
+ MODEL_NAME = "./pretrain/checkpoints-mistral-300M-FA2-3/checkpoint-12000/"
158
+ torch.set_float32_matmul_precision('high')
159
+
160
+ DEVICE = "cuda"
161
+ if torch.cuda.is_available():
162
+ print("cuda")
163
+ DEVICE = "cuda"
164
+ else:
165
+ print("cpu")
166
+ DEVICE = "cpu"
167
+ # DEVICE = "cpu"
168
+
169
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,use_fast=False)
170
+ model = AutoModelForCausalLM.from_pretrained(
171
+ MODEL_NAME,
172
+ trust_remote_code=True,
173
+ ).to(DEVICE)
174
+
175
+ # streamer = TextStreamer(tokenizer)
176
+
177
+ prompt = "大規模言語モデルとは、"
178
+
179
+
180
+ inputs = tokenizer(prompt, add_special_tokens=False,return_tensors="pt").to(model.device)
181
+ with torch.no_grad():
182
+
183
+ outputs = model.generate(
184
+ inputs["input_ids"],
185
+ max_new_tokens=100,
186
+ do_sample=True,
187
+ early_stopping=False,
188
+ top_p=0.95,
189
+ top_k=50,
190
+ temperature=0.9,
191
+ # streamer=streamer,
192
+ no_repeat_ngram_size=2,
193
+ num_beams=3
194
+ )
195
+
196
+ print(outputs.tolist()[0])
197
+ outputs_txt = tokenizer.decode(outputs[0])
198
+ print(outputs_txt)
199
+
200
+
201
+ prompt = "まどマギで一番可愛いキャラは、"
202
+
203
+
204
+ inputs = tokenizer(prompt, add_special_tokens=False,return_tensors="pt").to(model.device)
205
+ with torch.no_grad():
206
+
207
+ outputs = model.generate(
208
+ inputs["input_ids"],
209
+ max_new_tokens=100,
210
+ do_sample=True,
211
+ early_stopping=False,
212
+ top_p=0.95,
213
+ top_k=50,
214
+ temperature=0.9,
215
+ # streamer=streamer,
216
+ no_repeat_ngram_size=2,
217
+ num_beams=3
218
+ )
219
+
220
+ print(outputs.tolist()[0])
221
+ outputs_txt = tokenizer.decode(outputs[0])
222
+ print(outputs_txt)
223
+
224
+
225
+ ```