bornjre committed on
Commit
2bc7d0b
1 Parent(s): 936db68

main cpp file upload

Files changed (1)
  1. btlm_model_wip.cpp +410 -0
btlm_model_wip.cpp ADDED
@@ -0,0 +1,410 @@
+ #include "ggml/ggml.h"
+
+ #include "common-ggml.h"
+ #include "common.h"
+
+ #include <cassert>
+ #include <cinttypes>
+ #include <cmath>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+ #include <fstream>
+ #include <iostream>
+ #include <map>
+ #include <string>
+ #include <thread> // for std::thread::hardware_concurrency() in btlm_params
+ #include <vector>
+
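+ // common.h and common-ggml.h are assumed to be the shared helpers from the
+ // ggml examples tree (vocab and quantization-type utilities) and on the
+ // include path.
+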
+ struct btlm_vocab {
+   using id    = int32_t;
+   using token = std::string;
+
+   std::map<token, id> token_to_id;
+   std::map<id, token> id_to_token;
+   std::vector<std::string> special_tokens;
+ };
+
+ struct btlm_params {
+   int32_t seed      = -1; // RNG seed
+   int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
+   int32_t n_predict = 200; // new tokens to predict
+   int32_t n_batch   = 8;   // batch size for prompt processing
+
+   // sampling parameters
+   int32_t top_k          = 40;
+   float   top_p          = 0.9f;
+   float   temp           = 0.9f;
+   int32_t repeat_last_n  = 64;
+   float   repeat_penalty = 1.00f;
+
+   std::string model =
+       "/home/madman/Desktop/ml_play/ml_models/btlm-3b.ggml.bin"; // model path
+   std::string prompt = "Capital of Nepal is";
+   std::string token_test = "";
+ };
+
+ struct btlm_hparams {
+   int32_t n_vocab; // vocabulary size
+   int32_t n_ctx;   // context length
+   int32_t n_embd;  // embedding dimension
+   int32_t n_head;  // number of attention heads
+   int32_t n_layer; // number of transformer layers
+   int32_t n_inner; // inner (MLP) dimension
+   int32_t ftype;   // weight format / quantization type
+ };
+
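+ // On-disk layout expected by btlm_model_load() below:
+ //   uint32 magic (GGML_FILE_MAGIC)
+ //   7 x int32 hparams (n_vocab, n_ctx, n_embd, n_head, n_layer, n_inner, ftype)
+ //   n_vocab x { uint32 len; char token[len]; }
+ //   per tensor: { int32 n_dims, name_len, ttype; int32 ne[n_dims];
+ //                 char name[name_len]; tensor data }
+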
+ struct btlm_layer {
+   // normalization
+   struct ggml_tensor *ln_1_g;
+   struct ggml_tensor *ln_1_b;
+
+   struct ggml_tensor *ln_2_g;
+   struct ggml_tensor *ln_2_b;
+
+   // attention
+   struct ggml_tensor *c_attn_attn_w;
+   struct ggml_tensor *c_attn_attn_b;
+   struct ggml_tensor *c_attn_attn_scb;
+
+   struct ggml_tensor *c_attn_proj_w;
+   struct ggml_tensor *c_attn_proj_b;
+   struct ggml_tensor *c_attn_proj_scb;
+
+   // mlp
+   struct ggml_tensor *c_mlp_fc_w;
+   struct ggml_tensor *c_mlp_fc_b;
+   struct ggml_tensor *c_mlp_fc_scb;
+
+   struct ggml_tensor *c_mlp_fc2_w;
+   struct ggml_tensor *c_mlp_fc2_b;
+   struct ggml_tensor *c_mlp_fc2_scb;
+
+   struct ggml_tensor *c_mlp_proj_w;
+   struct ggml_tensor *c_mlp_proj_b;
+   struct ggml_tensor *c_mlp_proj_scb;
+ };
+
+ struct btlm_model {
+   btlm_hparams hparams;
+
+   // normalization
+   struct ggml_tensor *ln_f_g;
+   struct ggml_tensor *ln_f_b;
+
+   struct ggml_tensor *wte;          // token embedding
+   struct ggml_tensor *alibi_slopes; // per-head ALiBi slopes
+   struct ggml_tensor *lm_head;      // language model head
+
+   std::vector<btlm_layer> layers;
+
+   // key + value memory
+   struct ggml_tensor *memory_k;
+   struct ggml_tensor *memory_v;
+
+   struct ggml_context *ctx;
+   std::map<std::string, struct ggml_tensor *> tensors;
+ };
+
+ // load the model's weights from a file
+ bool btlm_model_load(const std::string &fname, btlm_model &model,
+                      btlm_vocab &vocab) {
+   printf("%s: loading model from '%s'\n", __func__, fname.c_str());
+
+   auto fin = std::ifstream(fname, std::ios::binary);
+   if (!fin) {
+     fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+     return false;
+   }
+
+   // verify magic
+   {
+     uint32_t magic;
+     fin.read((char *)&magic, sizeof(magic));
+     if (magic != GGML_FILE_MAGIC) {
+       fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__,
+               fname.c_str());
+       return false;
+     }
+   }
+
+   // load hparams
+   {
+     auto &hparams = model.hparams;
+
+     fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
+     fin.read((char *)&hparams.n_ctx, sizeof(hparams.n_ctx));
+     fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd));
+     fin.read((char *)&hparams.n_head, sizeof(hparams.n_head));
+     fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer));
+     fin.read((char *)&hparams.n_inner, sizeof(hparams.n_inner));
+     fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
+
+     const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+     printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+     printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+     printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+     printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+     printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+     printf("%s: n_inner = %d\n", __func__, hparams.n_inner);
+     printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+     printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+     hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+   }
+
+   // for the big tensors, we have the option to store the data in 16-bit floats
+   // or quantized in order to save memory and also to speed up the computation
+   ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
+   if (wtype == GGML_TYPE_COUNT) {
+     fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+             __func__, fname.c_str(), model.hparams.ftype);
+     return false;
+   }
+
+   auto &ctx = model.ctx;
+   size_t ctx_size = 0;
+
+   {
+     // fixme => actually calculate this instead of a fixed 8000 MB guess;
+     // a 64-bit literal is needed so the product does not wrap in 32-bit math
+     ctx_size = 8000ull * 1024 * 1024;
+
+     printf("%s: ggml tensor size = %d bytes\n", __func__,
+            (int)sizeof(ggml_tensor));
+     printf("%s: ggml ctx size = %6.2f MB\n", __func__,
+            ctx_size / (1024.0 * 1024.0));
+     printf("%s: ggml ctx size = %zu\n", __func__, ctx_size);
+   }
+
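+   // A sketch of how ctx_size could be computed from the hparams instead of
+   // the fixed guess above (shapes mirror the alloc block below; the names
+   // `need` and `per_layer_bytes` are illustrative, not part of the file):
+   //
+   //   size_t need = 0;
+   //   need += 2 * (size_t)n_embd * ggml_type_size(GGML_TYPE_F16);            // ln_f g/b
+   //   need += 2 * (size_t)n_embd * n_vocab * ggml_type_size(GGML_TYPE_F16);  // wte + lm_head
+   //   need += (size_t)n_layer * per_layer_bytes;                             // sum of layer tensors
+   //   need += n_tensors * ggml_tensor_overhead();                            // tensor metadata
+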
+   // create the ggml context
+   {
+     struct ggml_init_params params = {
+         /*.mem_size   =*/ctx_size,
+         /*.mem_buffer =*/NULL,
+         /*.no_alloc   =*/false,
+     };
+
+     model.ctx = ggml_init(params);
+     if (!model.ctx) {
+       fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+       return false;
+     }
+   }
+
+   // load vocab
+   {
+     int32_t n_vocab = model.hparams.n_vocab;
+
+     std::string word;
+     std::vector<char> buf(128);
+
+     for (int i = 0; i < n_vocab; i++) {
+       uint32_t len;
+       fin.read((char *)&len, sizeof(len));
+
+       buf.resize(len);
+       fin.read((char *)buf.data(), len);
+       word.assign(buf.data(), len);
+
+       // printf("%s \n", word.c_str());
+
+       vocab.token_to_id[word] = i;
+       vocab.id_to_token[i] = word;
+     }
+   }
+
+   {
+     // alloc memory for the model tensors
+
+     const auto &hparams = model.hparams;
+
+     const int n_embd  = hparams.n_embd;
+     const int n_layer = hparams.n_layer;
+     const int n_head  = hparams.n_head;
+     const int n_inner = hparams.n_inner;
+     // const int n_ctx = hparams.n_ctx;
+     const int n_vocab = hparams.n_vocab;
+
+     model.layers.resize(n_layer);
+
+     model.ln_f_g  = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+     model.ln_f_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+     model.wte     = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
+     model.lm_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
+     // one ALiBi slope per attention head
+     model.alibi_slopes = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_head);
+
+     // map by name
+     model.tensors["model/ln_f/g"] = model.ln_f_g;
+     model.tensors["model/ln_f/b"] = model.ln_f_b;
+     model.tensors["model/wte"] = model.wte;
+     model.tensors["model/lm_head"] = model.lm_head;
+     model.tensors["model/relative_pe/slopes"] = model.alibi_slopes;
+
+     for (int i = 0; i < n_layer; ++i) {
+       auto &layer = model.layers[i];
+
+       layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+       layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+
+       layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+       layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+
+       layer.c_attn_attn_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3 * n_embd, n_embd);
+       layer.c_attn_attn_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
+       layer.c_attn_attn_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
+
+       layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_embd);
+       layer.c_attn_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+       layer.c_attn_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+
+       // MLP width comes from the model header (n_inner) rather than the
+       // hard-coded, mutually inconsistent constants used before
+       layer.c_mlp_fc_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_inner, n_embd);
+       layer.c_mlp_fc_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_inner);
+       layer.c_mlp_fc_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_inner);
+
+       layer.c_mlp_fc2_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_inner);
+       layer.c_mlp_fc2_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_inner);
+       layer.c_mlp_fc2_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_inner);
+
+       layer.c_mlp_proj_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_inner);
+       layer.c_mlp_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+       layer.c_mlp_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
+
+       // map by name
+       model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
+       model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
+
+       model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
+       model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
+
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/scb"] = layer.c_attn_attn_scb;
+
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
+       model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/scb"] = layer.c_attn_proj_scb;
+
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/scb"] = layer.c_mlp_fc_scb;
+
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/w"] = layer.c_mlp_fc2_w;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/b"] = layer.c_mlp_fc2_b;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/scb"] = layer.c_mlp_fc2_scb;
+
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
+       model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/scb"] = layer.c_mlp_proj_scb;
+     }
+   }
+
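+   // Note: the ALiBi slopes are loaded from the file ("model/relative_pe/slopes")
+   // rather than computed; for reference, the standard ALiBi choice is a slope
+   // of 2^(-8*(h+1)/n_head) for head h when n_head is a power of two.
+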
+   // load weights
+   {
+     size_t total_size = 0;
+
+     bool has_lm_head = false;
+
+     while (true) {
+       int32_t n_dims;
+       int32_t length;
+       int32_t ttype;
+
+       fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+       fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+       fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
+
+       if (fin.eof()) {
+         break;
+       }
+
+       int32_t nelements = 1;
+       int32_t ne[2] = {1, 1};
+       for (int i = 0; i < n_dims; ++i) {
+         fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+         nelements *= ne[i];
+       }
+
+       std::string name(length, 0);
+       fin.read(&name[0], length);
+
+       printf("processing tensor '%s' in model file\n", name.data());
+
+       if (model.tensors.find(name.data()) == model.tensors.end()) {
+         fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__,
+                 name.data());
+         return false;
+       }
+
+       auto tensor = model.tensors[name.data()];
+       if (ggml_nelements(tensor) != nelements) {
+         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n",
+                 __func__, name.data());
+         return false;
+       }
+
+       if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+         fprintf(stderr,
+                 "%s: tensor '%s' has wrong shape in model file: got [%d, %d], "
+                 "expected [%d, %d]\n",
+                 __func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1],
+                 ne[0], ne[1]);
+         return false;
+       }
+
+       // for debugging
+       if (1) {
+         printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n",
+                name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)),
+                ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
+       }
+
+       const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+       // for quantized types, ggml_type_size() is the size of one block of
+       // ggml_blck_size() elements, hence the division below
+       if ((nelements * bpe) / ggml_blck_size(tensor->type) !=
+           ggml_nbytes(tensor)) {
+         fprintf(stderr,
+                 "%s: tensor '%s' has wrong size in model file: got %zu, "
+                 "expected %zu\n",
+                 __func__, name.data(), ggml_nbytes(tensor), nelements * bpe);
+         return false;
+       }
+
+       fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+
+       total_size += ggml_nbytes(tensor);
+     }
+
+     printf("%s: model size = %8.2f MB\n", __func__,
+            total_size / 1024.0 / 1024.0);
+   }
+
+   fin.close();
+
+   return true;
+ }
+
+ int main(int argc, char **argv) {
+   btlm_params params;
+   btlm_model model;
+   btlm_vocab vocab;
+
+   if (!btlm_model_load(params.model, model, vocab)) {
+     fprintf(stderr, "%s: failed to load model from '%s'\n", __func__,
+             params.model.c_str());
+     return 1;
+   }
+
+   return 0;
+ }
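+
+ // One possible way to build by hand (paths are assumptions; this file is
+ // expected to sit in the ggml examples tree next to common.cpp and
+ // common-ggml.cpp):
+ //   g++ -O3 -std=c++11 btlm_model_wip.cpp common.cpp common-ggml.cpp \
+ //       ggml.c -I.. -lpthread -o btlm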