English
LLM
BELLE
bigmoyan committed on
Commit
3f70f85
·
1 Parent(s): 51eca0d

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ lyraBelle/libth_transformer.so filter=lfs diff=lfs merge=lfs -text
36
+ model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
demo.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
"""Minimal LyraBelle demo: load the model once and answer a single prompt."""
from lyraBelle import LyraBelle

# Inference precision string handed to LyraBelle.
data_type = "fp16"
# The prompt; LyraBelle.generate accepts either one string or a list of strings.
prompts = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
# Directory the tokenizer is loaded from; the checkpoint file below lives in it.
model_dir = "./model"
model_name = "1-gpu-fp16.h5"
# Value passed as output_length to generate().
max_output_length = 512

# The trailing 0 is int8_mode.
model = LyraBelle(model_dir, model_name, data_type, 0)

# Sampling settings forwarded verbatim to generate().
sampling_options = dict(
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=True,
)
output_texts = model.generate(
    prompts, output_length=max_output_length, **sampling_options)
print(output_texts)
gemm_config.in ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time
2
+ 64 64 32 128 1 ### 1 12288 4096 4096 6 0 20 0 1 0 0 11 1.444813
3
+ 64 64 32 128 1 ### 2048 64 64 128 112 -1 -1 -1 -1 -1 -1 -1 0.083370
4
+ 64 64 32 128 1 ### 2048 128 64 64 100 -1 -1 -1 -1 -1 -1 -1 0.070630
5
+ 64 64 32 128 1 ### 1 4096 4096 4096 6 0 24 1 0 0 0 9 0.502825
6
+ 64 64 32 128 1 ### 1 16384 4096 4096 6 0 20 0 1 0 0 11 1.898404
7
+ 64 64 32 128 1 ### 1 4096 4096 16384 21 0 24 1 0 0 0 12 1.909555
8
+ 64 1 32 128 1 ### 1 12288 64 4096 6 0 18 0 1 0 0 16 0.080251
9
+ 64 1 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026583
10
+ 64 1 32 128 1 ### 1 16384 64 4096 6 0 18 0 1 0 0 15 0.110223
11
+ 64 1 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109978
12
+ 64 1 32 128 1 ### 1 250880 64 4096 112 -1 -1 -1 -1 -1 -1 -1 1.602350
13
+ 32 64 32 128 1 ### 1 12288 2048 4096 6 0 20 0 1 0 0 11 0.750490
14
+ 32 64 32 128 1 ### 1024 64 64 128 109 -1 -1 -1 -1 -1 -1 -1 0.047020
15
+ 32 64 32 128 1 ### 1024 128 64 64 108 -1 -1 -1 -1 -1 -1 -1 0.037950
16
+ 32 64 32 128 1 ### 1 4096 2048 4096 6 0 20 0 0 0 0 11 0.256123
17
+ 32 64 32 128 1 ### 1 16384 2048 4096 6 0 20 0 1 0 0 11 0.959887
18
+ 32 64 32 128 1 ### 1 4096 2048 16384 6 0 20 0 1 0 0 11 0.979282
19
+ 32 1 32 128 1 ### 1 12288 32 4096 6 0 18 0 0 0 0 16 0.078582
20
+ 32 1 32 128 1 ### 1 4096 32 4096 31 0 15 1 0 0 0 18 0.024535
21
+ 32 1 32 128 1 ### 1 16384 32 4096 6 0 18 0 0 0 0 12 0.105523
22
+ 32 1 32 128 1 ### 1 4096 32 16384 109 -1 -1 -1 -1 -1 -1 -1 0.105160
23
+ 32 1 32 128 1 ### 1 250880 32 4096 114 -1 -1 -1 -1 -1 -1 -1 1.479260
24
+ 16 64 32 128 1 ### 1 12288 1024 4096 6 0 20 2 1 1 3072 11 0.398694
25
+ 16 64 32 128 1 ### 512 64 64 128 105 -1 -1 -1 -1 -1 -1 -1 0.015370
26
+ 16 64 32 128 1 ### 512 128 64 64 114 -1 -1 -1 -1 -1 -1 -1 0.014250
27
+ 16 64 32 128 1 ### 1 4096 1024 4096 21 0 20 2 0 1 1024 11 0.144855
28
+ 16 64 32 128 1 ### 1 16384 1024 4096 6 0 20 0 1 0 0 11 0.505098
29
+ 16 64 32 128 1 ### 1 4096 1024 16384 111 -1 -1 -1 -1 -1 -1 -1 0.545680
30
+ 16 1 32 128 1 ### 1 12288 16 4096 6 0 18 1 1 0 0 16 0.077865
31
+ 16 1 32 128 1 ### 1 4096 16 4096 31 0 15 1 1 0 0 18 0.024023
32
+ 16 1 32 128 1 ### 1 16384 16 4096 6 0 21 1 0 0 0 15 0.104765
33
+ 16 1 32 128 1 ### 1 4096 16 16384 6 0 15 1 1 0 0 17 0.105298
34
+ 16 1 32 128 1 ### 1 250880 16 4096 109 -1 -1 -1 -1 -1 -1 -1 1.450620
35
+ 8 64 32 128 1 ### 1 12288 512 4096 115 -1 -1 -1 -1 -1 -1 -1 0.204910
36
+ 8 64 32 128 1 ### 256 64 64 128 105 -1 -1 -1 -1 -1 -1 -1 0.010500
37
+ 8 64 32 128 1 ### 256 128 64 64 109 -1 -1 -1 -1 -1 -1 -1 0.010250
38
+ 8 64 32 128 1 ### 1 4096 512 4096 6 0 20 4 1 1 512 11 0.081009
39
+ 8 64 32 128 1 ### 1 16384 512 4096 107 -1 -1 -1 -1 -1 -1 -1 0.257450
40
+ 8 64 32 128 1 ### 1 4096 512 16384 6 0 20 5 1 1 512 11 0.256573
41
+ 8 1 32 128 1 ### 1 12288 8 4096 6 0 18 1 1 0 0 16 0.077445
42
+ 8 1 32 128 1 ### 1 4096 8 4096 31 0 15 1 1 0 0 18 0.023245
43
+ 8 1 32 128 1 ### 1 16384 8 4096 110 -1 -1 -1 -1 -1 -1 -1 0.104450
44
+ 8 1 32 128 1 ### 1 4096 8 16384 6 0 15 1 1 0 0 17 0.104192
45
+ 8 1 32 128 1 ### 1 250880 8 4096 108 -1 -1 -1 -1 -1 -1 -1 1.429910
46
+ 1 64 32 128 1 ### 1 12288 64 4096 109 -1 -1 -1 -1 -1 -1 -1 0.080110
47
+ 1 64 32 128 1 ### 32 64 64 128 103 -1 -1 -1 -1 -1 -1 -1 0.005320
48
+ 1 64 32 128 1 ### 32 128 64 64 109 -1 -1 -1 -1 -1 -1 -1 0.005470
49
+ 1 64 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026429
50
+ 1 64 32 128 1 ### 1 16384 64 4096 107 -1 -1 -1 -1 -1 -1 -1 0.110100
51
+ 1 64 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109885
52
+ 1 1 32 128 1 ### 1 12288 1 4096 6 0 18 1 1 0 0 16 0.076769
53
+ 1 1 32 128 1 ### 1 4096 1 4096 6 0 15 1 1 0 0 18 0.023040
54
+ 1 1 32 128 1 ### 1 16384 1 4096 105 -1 -1 -1 -1 -1 -1 -1 0.103720
55
+ 1 1 32 128 1 ### 1 4096 1 16384 6 0 18 3 0 4 24576 16 0.102124
56
+ 1 1 32 128 1 ### 1 250880 1 4096 102 -1 -1 -1 -1 -1 -1 -1 1.402680
57
+ 64 128 32 128 1 ### 1 12288 8192 4096 6 0 20 0 1 0 0 11 2.837852
58
+ 64 128 32 128 1 ### 2048 128 128 128 111 -1 -1 -1 -1 -1 -1 -1 0.202480
59
+ 64 128 32 128 1 ### 2048 128 128 128 103 -1 -1 -1 -1 -1 -1 -1 0.156770
60
+ 64 128 32 128 1 ### 1 4096 8192 4096 6 0 20 0 1 0 0 11 0.955003
61
+ 64 128 32 128 1 ### 1 16384 8192 4096 6 0 20 0 1 0 0 11 3.772959
62
+ 64 128 32 128 1 ### 1 4096 8192 16384 6 0 20 0 1 0 0 11 3.703818
63
+ 64 1 32 128 1 ### 1 12288 64 4096 6 0 18 0 0 0 0 16 0.080015
64
+ 64 1 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026460
65
+ 64 1 32 128 1 ### 1 16384 64 4096 105 -1 -1 -1 -1 -1 -1 -1 0.110300
66
+ 64 1 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109691
67
+ 64 1 32 128 1 ### 1 250880 64 4096 100 -1 -1 -1 -1 -1 -1 -1 1.603500
68
+ 32 128 32 128 1 ### 1 12288 4096 4096 6 0 20 0 1 0 0 11 1.444751
69
+ 32 128 32 128 1 ### 1024 128 128 128 112 -1 -1 -1 -1 -1 -1 -1 0.105780
70
+ 32 128 32 128 1 ### 1024 128 128 128 113 -1 -1 -1 -1 -1 -1 -1 0.084340
71
+ 32 128 32 128 1 ### 1 4096 4096 4096 6 0 24 1 0 0 0 9 0.502835
72
+ 32 128 32 128 1 ### 1 16384 4096 4096 6 0 20 0 1 0 0 11 1.898291
73
+ 32 128 32 128 1 ### 1 4096 4096 16384 21 0 24 1 0 0 0 12 1.910139
74
+ 32 1 32 128 1 ### 1 12288 32 4096 107 -1 -1 -1 -1 -1 -1 -1 0.078600
75
+ 32 1 32 128 1 ### 1 4096 32 4096 31 0 15 1 0 0 0 18 0.024586
76
+ 32 1 32 128 1 ### 1 16384 32 4096 6 0 18 0 1 0 0 12 0.105708
77
+ 32 1 32 128 1 ### 1 4096 32 16384 105 -1 -1 -1 -1 -1 -1 -1 0.105120
78
+ 32 1 32 128 1 ### 1 250880 32 4096 106 -1 -1 -1 -1 -1 -1 -1 1.480140
79
+ 16 128 32 128 1 ### 1 12288 2048 4096 6 0 20 0 1 0 0 11 0.750612
80
+ 16 128 32 128 1 ### 512 128 128 128 108 -1 -1 -1 -1 -1 -1 -1 0.057030
81
+ 16 128 32 128 1 ### 512 128 128 128 114 -1 -1 -1 -1 -1 -1 -1 0.048080
82
+ 16 128 32 128 1 ### 1 4096 2048 4096 6 0 20 0 0 0 0 11 0.256000
83
+ 16 128 32 128 1 ### 1 16384 2048 4096 6 0 20 0 1 0 0 11 0.957215
84
+ 16 128 32 128 1 ### 1 4096 2048 16384 6 0 20 0 1 0 0 11 0.978862
85
+ 16 1 32 128 1 ### 1 12288 16 4096 6 0 18 1 1 0 0 16 0.077793
86
+ 16 1 32 128 1 ### 1 4096 16 4096 31 0 15 1 1 0 0 18 0.023849
87
+ 16 1 32 128 1 ### 1 16384 16 4096 6 0 21 1 0 0 0 15 0.104858
88
+ 16 1 32 128 1 ### 1 4096 16 16384 6 0 15 1 1 0 0 17 0.105001
89
+ 16 1 32 128 1 ### 1 250880 16 4096 108 -1 -1 -1 -1 -1 -1 -1 1.450760
90
+ 8 128 32 128 1 ### 1 12288 1024 4096 6 0 20 2 1 1 3072 11 0.398592
91
+ 8 128 32 128 1 ### 256 128 128 128 107 -1 -1 -1 -1 -1 -1 -1 0.018050
92
+ 8 128 32 128 1 ### 256 128 128 128 104 -1 -1 -1 -1 -1 -1 -1 0.015680
93
+ 8 128 32 128 1 ### 1 4096 1024 4096 21 0 20 2 0 1 1024 11 0.144763
94
+ 8 128 32 128 1 ### 1 16384 1024 4096 6 0 20 0 1 0 0 11 0.505160
95
+ 8 128 32 128 1 ### 1 4096 1024 16384 115 -1 -1 -1 -1 -1 -1 -1 0.545580
96
+ 8 1 32 128 1 ### 1 12288 8 4096 6 0 18 1 1 0 0 16 0.077445
97
+ 8 1 32 128 1 ### 1 4096 8 4096 31 0 15 1 1 0 0 18 0.023245
98
+ 8 1 32 128 1 ### 1 16384 8 4096 110 -1 -1 -1 -1 -1 -1 -1 0.104360
99
+ 8 1 32 128 1 ### 1 4096 8 16384 6 0 15 1 1 0 0 17 0.104305
100
+ 8 1 32 128 1 ### 1 250880 8 4096 100 -1 -1 -1 -1 -1 -1 -1 1.430000
101
+ 1 128 32 128 1 ### 1 12288 128 4096 6 0 18 0 1 0 0 15 0.085402
102
+ 1 128 32 128 1 ### 32 128 128 128 108 -1 -1 -1 -1 -1 -1 -1 0.007070
103
+ 1 128 32 128 1 ### 32 128 128 128 114 -1 -1 -1 -1 -1 -1 -1 0.007350
104
+ 1 128 32 128 1 ### 1 4096 128 4096 104 -1 -1 -1 -1 -1 -1 -1 0.033170
105
+ 1 128 32 128 1 ### 1 16384 128 4096 6 0 24 0 0 0 0 15 0.115405
106
+ 1 128 32 128 1 ### 1 4096 128 16384 104 -1 -1 -1 -1 -1 -1 -1 0.118900
107
+ 1 1 32 128 1 ### 1 12288 1 4096 6 0 18 1 1 0 0 16 0.076872
108
+ 1 1 32 128 1 ### 1 4096 1 4096 6 0 15 1 1 0 0 18 0.023235
109
+ 1 1 32 128 1 ### 1 16384 1 4096 107 -1 -1 -1 -1 -1 -1 -1 0.103860
110
+ 1 1 32 128 1 ### 1 4096 1 16384 6 0 18 3 0 4 24576 16 0.102523
111
+ 1 1 32 128 1 ### 1 250880 1 4096 103 -1 -1 -1 -1 -1 -1 -1 1.402790
lyraBelle/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .lyraBelle import LyraBelle
lyraBelle/config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from typing import Optional
3
+
4
+
5
@dataclasses.dataclass
class BelleParam:
    """Default BELLE hyper-parameters used when no ``config.ini`` is available.

    Raises:
        ValueError: from ``__post_init__`` when ``shared_contexts_ratio`` is
            outside the closed interval [0.0, 1.0].
    """

    num_heads: int = 32
    size_per_head: int = 128
    inter_size: int = 16384
    num_layers: int = 30
    vocab_size: int = 250880
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    weights_data_type: str = "fp16"

    def __post_init__(self):
        """Validate field values right after dataclass initialisation."""
        # The chained comparison is also False for NaN, so NaN is rejected.
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            # The message names the real field (it used to say
            # 'shared_context_ratio', which does not exist).
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        """Return the parameters as a plain ``dict`` keyed by field name."""
        return dataclasses.asdict(self)
28
+
29
+
30
# Module-wide default parameter set, built once at import time.
BELLE_PARAM = BelleParam()

import os

# Absolute directory of this module; the shared library path is built from it.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Full path of 'libth_transformer.so' located next to this file.
LIB_SO_PATH = os.path.join(current_dir, 'libth_transformer.so')
lyraBelle/libth_transformer.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17485c356e0f201d2f3193e6c31ec26d3b4e0b3f605968e1915a7adcd2b05b43
3
+ size 200050816
lyraBelle/lyraBelle.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import pathlib
5
+ import typing
6
+
7
+ import torch
8
+ import transformers
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ from .config import BELLE_PARAM, LIB_SO_PATH
12
+ from .model import BelleModel
13
+ import os
14
+
15
+
16
class LyraBelle:
    """Text-generation front end that pairs a tokenizer with a ``BelleModel``.

    The constructor loads both from ``model_path``; ``generate`` wraps each
    prompt in the ``Human: ... Assistant:`` template, runs the model and
    decodes the newly generated tokens back to text.
    """

    def __init__(self, model_path, model_name, dtype='fp16', int8_mode=0) -> None:
        """Load the tokenizer and the model.

        Args:
            model_path: Directory the tokenizer is loaded from; an optional
                ``config.ini`` is looked up here and the checkpoint is read
                from ``model_path/model_name``.
            model_name: File name of the checkpoint inside ``model_path``.
            dtype: Inference data type string; ``None`` falls back to the
                weight data type from the config (or the built-in defaults).
            int8_mode: Forwarded to the model only when ``dtype == 'int8'``;
                forced to 0 otherwise.
        """
        self.model_path = model_path
        self.model_name = model_name
        self.dtype = dtype
        # int8_mode is only honoured when int8 inference was requested.
        if dtype != 'int8':
            int8_mode = 0
        self.int8_mode = int8_mode

        print(f'Loading model and tokenizer from {self.model_path}')
        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        """Build the tokenizer and the ``BelleModel`` and load its checkpoint.

        Model hyper-parameters come from ``<model_path>/config.ini`` (section
        ``belle``) when that file exists, otherwise from ``BELLE_PARAM``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'belle'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                layer_num=cfg.getint(model_name, "num_layer"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = BELLE_PARAM.weights_data_type
            # Fall back to the tokenizer's ids only when the default is
            # really absent; a plain `or` would also discard a valid id 0.
            start_id = BELLE_PARAM.start_id
            if start_id is None:
                start_id = tokenizer.bos_token_id
            end_id = BELLE_PARAM.end_id
            if end_id is None:
                end_id = tokenizer.eos_token_id
            model_args = dict(head_num=BELLE_PARAM.num_heads,
                              size_per_head=BELLE_PARAM.size_per_head,
                              vocab_size=BELLE_PARAM.vocab_size,
                              start_id=start_id,
                              end_id=end_id,
                              layer_num=BELLE_PARAM.num_layers,
                              tensor_para_size=BELLE_PARAM.tensor_para_size,
                              weights_data_type=BELLE_PARAM.weights_data_type,
                              inference_data_type=inference_data_type)

        # update common parameters
        model_args.update(dict(
            lib_path=LIB_SO_PATH,
            pipeline_para_size=BELLE_PARAM.pipeline_para_size,
            shared_contexts_ratio=BELLE_PARAM.shared_contexts_ratio,
            int8_mode=self.int8_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized BELLE model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        # 'tensor_para_size' used to be listed twice, which left
        # 'pipeline_para_size' unchecked.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'pipeline_para_size', 'weights_data_type']
        none_params = [p for p in checklist if model_args[p] is None]
        if none_params:
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id is not matched with the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id is not matched with neither pad '
                  'token id nor eos token id of the pretrained tokenizer.')

        model = BelleModel(**model_args)
        if not model.load(ckpt_path=os.path.join(self.model_path, self.model_name)):
            print('[FT][WARNING] Skip model loading since no checkpoints are found')

        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        """Generate a completion for each prompt.

        Args:
            prompts: One prompt string or a list of prompt strings.
            output_length: Passed to the model as ``output_len``.
            beam_width: Passed through to the model.
            top_k, top_p, beam_search_diversity_rate, temperature,
            len_penalty, repetition_penalty: Scalars that are multiplied by a
                ones tensor of length ``batch_size`` before being passed on,
                so they must not be ``None``.
            presence_penalty, min_length, bad_words_list: Passed through
                unchanged.
            do_sample: When true, a random per-sample seed tensor is drawn
                and passed as ``random_seed``; otherwise ``None`` is passed.
            return_output_length, return_cum_log_probs: Passed through to the
                model.

        Returns:
            A list with one decoded string per prompt, containing only the
            tokens after each prompt's own input tokens. An empty prompt list
            yields an empty list.
        """
        if isinstance(prompts, str):
            prompts = [prompts, ]

        # Nothing to batch: pad_sequence cannot handle an empty list.
        if not prompts:
            return []

        inputs = ['Human: ' + prompt.strip() +
                  '\n\nAssistant:' for prompt in prompts]
        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # we must encode the raw prompt text one by one in order to compute the length of the original text.
        # squeeze(0) drops only the batch axis, so a one-token prompt stays
        # 1-D and len() below keeps working.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze(0) for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # after got the length of each input text tokens. we can batchfy the input list to a tensor. padding the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k*ones_int,
                             top_p=top_p*ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate*ones_float,
                             temperature=temperature*ones_float,
                             len_penalty=len_penalty*ones_float,
                             repetition_penalty=repetition_penalty*ones_float,
                             presence_penalty=presence_penalty,
                             min_length=min_length,
                             random_seed=random_seed,
                             bad_words_list=bad_words_list,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        # NOTE(review): only the return_cum_log_probs case is unpacked here;
        # confirm what the model returns when return_output_length is set.
        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
lyraBelle/model.py ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pathlib
5
+ import typing
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.distributed as dist
12
+ import torch.nn as nn
13
+
14
+ str_type_map = {"fp32": torch.float32,
15
+ "fp16": torch.float16, "bf16": torch.bfloat16}
16
+
17
+
18
+ class BaseBelleWeights:
19
+ def __init__(self, head_num, size_per_head, layer_num, vocab_size, max_seq_len, tensor_para_size, pipeline_para_size,
20
+ weights_data_type: typing.Union[str, np.dtype],
21
+ inference_data_type: str,
22
+ has_adapters: bool = False,
23
+ adapter_inter_size: int = 0,
24
+ gpt_with_moe: bool = False,
25
+ has_positional_encoding: bool = True,
26
+ has_pre_decoder_layernorm: bool = False,
27
+ has_post_decoder_layernorm: bool = True,
28
+ int8_mode: int = 0,
29
+ inter_size: int = 0):
30
+ assert(head_num % tensor_para_size == 0)
31
+
32
+ if int8_mode == 1:
33
+ torch_infer_dtype = str_type_map[inference_data_type]
34
+ assert torch_infer_dtype == torch.float16 or torch_infer_dtype == torch.bfloat16, "Weight only quant only supported for infer type fp16 or bf16."
35
+ quant = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix
36
+ self.weight_transpose_calibrate_quantize = lambda x: quant(
37
+ x, torch.int8)
38
+ else:
39
+ assert int8_mode == 0, "Invalid int8 mode for BELLE. Must be 0 or 1"
40
+
41
+ self.head_num = head_num
42
+ self.size_per_head = size_per_head
43
+ self.layer_num = layer_num
44
+ self.vocab_size = vocab_size
45
+ self.max_seq_len = max_seq_len
46
+ self.tensor_para_size = tensor_para_size
47
+ self.pipeline_para_size = pipeline_para_size
48
+ self.layers_per_device = layer_num // pipeline_para_size
49
+
50
+ self.has_adapters = has_adapters
51
+ self.adapter_inter_size = adapter_inter_size
52
+ self.gpt_with_moe = gpt_with_moe
53
+ self.has_positional_encoding = has_positional_encoding
54
+ self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
55
+ self.has_post_decoder_layernorm = has_post_decoder_layernorm
56
+
57
+ local_head_num = head_num // tensor_para_size
58
+ global_head_num = head_num
59
+ local_hidden_units = local_head_num * size_per_head
60
+ global_hidden_units = global_head_num * size_per_head
61
+ local_inter_size = local_hidden_units * 4
62
+ if inter_size != 0:
63
+ assert inter_size % tensor_para_size == 0, f"inter_size({inter_size}) \% tensor_para_size({tensor_para_size}) must be 0"
64
+ local_inter_size = inter_size // tensor_para_size
65
+ local_adapter_inter_size = self.adapter_inter_size // tensor_para_size
66
+
67
+ self.local_head_num = local_head_num
68
+ self.global_head_num = global_head_num
69
+ self.local_hidden_units = local_hidden_units
70
+ self.global_hidden_units = global_hidden_units
71
+ self.local_inter_size = local_inter_size
72
+
73
+ self.int8_mode = int8_mode
74
+ self.share_embed = False
75
+
76
+ if isinstance(weights_data_type, str):
77
+ try:
78
+ weights_data_type = {
79
+ "fp16": np.float16,
80
+ "fp32": np.float32,
81
+ "float16": np.float16,
82
+ "float32": np.float32,
83
+ }[weights_data_type]
84
+ except KeyError:
85
+ raise ValueError(
86
+ f"Don't know how to interpret weights_data_type: {weights_data_type}")
87
+
88
+ assert weights_data_type in [np.float32, np.float16]
89
+ self.weights_data_type = weights_data_type
90
+ self.inference_data_type = inference_data_type
91
+
92
+ self.w = []
93
+ self.int8_w = []
94
+ self.scale = []
95
+ # Transformer blocks
96
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
97
+ self.inference_data_type])] * layer_num) # self_layernorm_gamma
98
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
99
+ self.inference_data_type])] * layer_num) # self_layernorm_beta
100
+ self.w.extend([torch.zeros(global_hidden_units, local_hidden_units * 3,
101
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # self_kernel
102
+ self.w.extend([torch.zeros(local_hidden_units * 3, dtype=str_type_map[self.inference_data_type])]
103
+ * layer_num) # self_bias
104
+ self.w.extend([torch.zeros(local_hidden_units, global_hidden_units, dtype=str_type_map[
105
+ self.inference_data_type])] * layer_num) # self_output_kernel
106
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
107
+ self.inference_data_type])] * layer_num) # self_output_bias
108
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
109
+ self.inference_data_type])] * layer_num) # ffn_layernorm_gamma
110
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
111
+ self.inference_data_type])] * layer_num) # ffn_layernorm_beta
112
+ self.w.extend([torch.zeros(global_hidden_units, local_inter_size,
113
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # ffn_kernel1
114
+ self.w.extend([torch.zeros(local_inter_size, dtype=str_type_map[
115
+ self.inference_data_type])] * layer_num) # ffn_bias1
116
+ self.w.extend([torch.zeros(local_inter_size, global_hidden_units,
117
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # ffn_kernel2
118
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
119
+ self.inference_data_type])] * layer_num) # ffn_bias2
120
+
121
+ optional_adapter_offset = 0
122
+ # After Transformer blocks
123
+ if self.has_pre_decoder_layernorm:
124
+ self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
125
+ self.inference_data_type])) # embedding layernorm gamma
126
+ self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
127
+ self.inference_data_type])) # embedding layernorm beta
128
+ optional_adapter_offset += 2
129
+ if self.has_post_decoder_layernorm:
130
+ self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
131
+ self.inference_data_type])) # final layernorm gamma
132
+ self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
133
+ self.inference_data_type])) # final layernorm beta
134
+ optional_adapter_offset += 2
135
+ if self.has_positional_encoding:
136
+ self.w.append(torch.zeros(max_seq_len, global_hidden_units, dtype=str_type_map[
137
+ self.inference_data_type])) # position_encoding_table
138
+ optional_adapter_offset += 1
139
+
140
+ self.pre_embed_idx = len(self.w)
141
+ self.w.append(torch.zeros(vocab_size, global_hidden_units,
142
+ dtype=str_type_map[self.inference_data_type])) # embedding_table
143
+ self.post_embed_idx = len(self.w)
144
+ self.w.append(torch.zeros(vocab_size, global_hidden_units, dtype=str_type_map[
145
+ self.inference_data_type])) # post embedding_kernel
146
+ self.adapter_offset = 2 + optional_adapter_offset
147
+
148
+ self.w.extend([torch.empty(
149
+ 0, dtype=str_type_map[self.inference_data_type])] * layer_num) # gating_weight
150
+ self.adapter_offset += layer_num
151
+
152
+ # adapters
153
+ if self.has_adapters:
154
+ self.w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
155
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # adaptor1_kernel1
156
+ self.w.extend([torch.zeros(local_adapter_inter_size, dtype=str_type_map[
157
+ self.inference_data_type])] * layer_num) # adaptor1_bias1
158
+ self.w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
159
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # adaptor1_kernel2
160
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
161
+ self.inference_data_type])] * layer_num) # adaptor1_bias2
162
+ self.w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
163
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # adaptor2_kernel1
164
+ self.w.extend([torch.zeros(local_adapter_inter_size, dtype=str_type_map[
165
+ self.inference_data_type])] * layer_num) # adaptor2_bias1
166
+ self.w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
167
+ dtype=str_type_map[self.inference_data_type])] * layer_num) # adaptor2_kernel2
168
+ self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
169
+ self.inference_data_type])] * layer_num) # adaptor2_bias2
170
+
171
+ # Initialization
172
+ self._map(lambda w: torch.nn.init.normal_(w, mean=0., std=1.))
173
+
174
+ if (self.int8_mode != 0):
175
+ self.int8_w.extend([torch.zeros(global_hidden_units, local_hidden_units *
176
+ 3, dtype=torch.int8)] * layer_num) # self_int8_kernel
177
+ self.scale.extend([torch.zeros(
178
+ local_hidden_units * 3, dtype=torch.float)] * layer_num) # self_scale
179
+ self.int8_w.extend([torch.zeros(local_hidden_units, global_hidden_units, dtype=torch.int8)]
180
+ * layer_num) # self_output_int8_kernel
181
+ # self_output_scale
182
+ self.scale.extend(
183
+ [torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num)
184
+ self.int8_w.extend([torch.zeros(global_hidden_units, local_inter_size,
185
+ dtype=torch.int8)] * layer_num) # ffn_int8_kernel1
186
+ self.scale.extend(
187
+ [torch.zeros(local_inter_size, dtype=torch.float)] * layer_num) # ffn_scale1
188
+ self.int8_w.extend([torch.zeros(local_inter_size, global_hidden_units,
189
+ dtype=torch.int8)] * layer_num) # ffn_int8_kernel2
190
+ self.scale.extend(
191
+ [torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num) # ffn_scale2
192
+ if self.has_adapters:
193
+ self.int8_w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
194
+ dtype=torch.int8)] * layer_num) # adaptor1_int8_kernel1
195
+ self.scale.extend([torch.zeros(local_adapter_inter_size, dtype=torch.float)]
196
+ * layer_num) # adaptor1_scale1
197
+ self.int8_w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
198
+ dtype=torch.int8)] * layer_num) # adaptor1_int8_kernel2
199
+ self.scale.extend([torch.zeros(
200
+ global_hidden_units, dtype=torch.float)] * layer_num) # adaptor1_scale2
201
+ self.int8_w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
202
+ dtype=torch.int8)] * layer_num) # adaptor2_int8_kernel1
203
+ self.scale.extend([torch.zeros(local_adapter_inter_size, dtype=torch.float)]
204
+ * layer_num) # adaptor2_scale1
205
+ self.int8_w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
206
+ dtype=torch.int8)] * layer_num) # adaptor2_int8_kernel2
207
+ self.scale.extend([torch.zeros(
208
+ global_hidden_units, dtype=torch.float)] * layer_num) # adaptor2_scale2
209
+
210
+ def __getitem__(self, idx):
211
+ return self.w[idx]
212
+
213
+ def __setitem__(self, idx, val):
214
+ self.w[idx] = val
215
+
216
+ def __len__(self):
217
+ return len(self.w)
218
+
219
+ def _map(self, func):
220
+ assert(self.pre_embed_idx < self.post_embed_idx,
221
+ "Pre decoder embedding index should be lower than post decoder embedding index.")
222
+ for i in range(len(self.w)):
223
+ if isinstance(self.w[i], list):
224
+ for j in range(len(self.w[i])):
225
+ self.w[i][j] = func(self.w[i][j])
226
+ else:
227
+ if self.share_embed and i == self.post_embed_idx:
228
+ # If sharing the pre and post embedding, any mapping to
229
+ # the pre decoder weight will give the same output to the
230
+ # post decoder weight, so we just copy here.
231
+ self.w[self.post_embed_idx] = self.w[self.pre_embed_idx]
232
+ else:
233
+ self.w[i] = func(self.w[i])
234
+
235
+ def _map_int8(self, func):
236
+ for i in range(len(self.int8_w)):
237
+ if isinstance(self.int8_w[i], list):
238
+ for j in range(len(self.int8_w[i])):
239
+ self.int8_w[i][j] = func(self.int8_w[i][j])
240
+
241
+ else:
242
+ self.int8_w[i] = func(self.int8_w[i])
243
+ for i in range(len(self.scale)):
244
+ if isinstance(self.scale[i], list):
245
+ for j in range(len(self.scale[i])):
246
+ self.scale[i][j] = func(self.scale[i][j])
247
+
248
+ else:
249
+ self.scale[i] = func(self.scale[i])
250
+
251
+ def _map_int8_scales(self, func):
252
+ for i in range(len(self.scale)):
253
+ if isinstance(self.scale[i], list):
254
+ for j in range(len(self.scale[i])):
255
+ self.scale[i][j] = func(self.scale[i][j])
256
+
257
+ else:
258
+ self.scale[i] = func(self.scale[i])
259
+
260
def load(self, ckpt_path, tp_rank, pipeline_para_rank):
    """Load model weights from an HDF5 checkpoint into ``self.w``.

    Every tensor is read from ``<key>/weights`` in the checkpoint and cast
    to ``str_type_map[self.inference_data_type]``.  When ``self.int8_mode``
    is non-zero the GEMM kernels are additionally quantized into
    ``self.int8_w`` / ``self.scale``.

    Args:
        ckpt_path: Path of the HDF5 checkpoint file.
        tp_rank: Tensor-parallel rank; selects the ``.{tp_rank}`` shard of
            the weights that are stored split.  ``None`` reads the
            unsuffixed key instead.
        pipeline_para_rank: Pipeline-parallel rank.  Only layers ``i`` with
            ``layers_per_device * rank <= i < layers_per_device * (rank + 1)``
            are read; other layers get empty placeholder tensors.

    Returns:
        ``True`` once all weights have been loaded.

    Raises:
        FileNotFoundError: If ``ckpt_path`` does not exist.
        RuntimeError: If a loaded tensor cannot be reshaped to the shape of
            the pre-allocated entry in ``self.w``.
    """
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Failed to find {ckpt_path}")

    # Imported lazily (as the original code did) so that importing this
    # module does not require h5py.
    import h5py

    target_dtype = str_type_map[self.inference_data_type]
    first_layer = self.layers_per_device * pipeline_para_rank
    last_layer = self.layers_per_device * (pipeline_para_rank + 1)

    def is_load(i):
        """Return True when layer ``i`` belongs to this pipeline rank."""
        return first_layer <= i < last_layer

    def empty_tensor():
        """Return the zero-element placeholder used for absent weights."""
        return torch.empty(0).to(target_dtype)

    def read_tensor(h5f, key, wanted):
        """Read ``h5f[key]['weights']`` as a tensor, or a placeholder."""
        if not wanted:
            # Skip the dataset read entirely for weights this rank discards.
            return empty_tensor()
        return torch.from_numpy(h5f[key]["weights"][:]).to(target_dtype)

    def read_per_layer(h5f, layername, weight_type, sharded):
        """Read one weight kind for every layer, in layer order."""
        suffix = f".{tp_rank}" if sharded and tp_rank is not None else ""
        return [
            read_tensor(
                h5f,
                f"model.layers.{i}.{layername}.{weight_type}{suffix}",
                is_load(i),
            )
            for i in range(self.layer_num)
        ]

    # (layer name, weight kind, stored per tensor-parallel rank?).  The
    # order defines the layout of ``self.w``: entry k of this table
    # occupies indices [k * layer_num, (k + 1) * layer_num).
    per_layer_weights = (
        ("input_layernorm", "weight", False),
        ("input_layernorm", "bias", False),
        ("attention.query_key_value", "weight", True),
        ("attention.query_key_value", "bias", True),
        ("attention.dense", "weight", True),
        ("attention.dense", "bias", False),
        ("post_attention_layernorm", "weight", False),
        ("post_attention_layernorm", "bias", False),
        # if moe, load "mlp.moe.experts.dense_h_to_4h"
        ("mlp.dense_h_to_4h", "weight", True),
        ("mlp.dense_h_to_4h", "bias", True),
        # if moe, load "mlp.moe.experts.dense_4h_to_h"
        ("mlp.dense_4h_to_h", "weight", True),
        ("mlp.dense_4h_to_h", "bias", False),
    )

    w = []
    # The context manager guarantees the checkpoint is closed even when a
    # key is missing or a conversion fails.
    with h5py.File(ckpt_path, "r") as ckpt_f:
        for layername, weight_type, sharded in per_layer_weights:
            w.extend(read_per_layer(ckpt_f, layername, weight_type, sharded))

        if self.has_pre_decoder_layernorm:
            w.append(read_tensor(ckpt_f, "model.pre_decoder_layernorm.weight", True))
            w.append(read_tensor(ckpt_f, "model.pre_decoder_layernorm.bias", True))

        if self.has_post_decoder_layernorm:
            w.append(read_tensor(ckpt_f, "model.final_layernorm.weight", True))
            w.append(read_tensor(ckpt_f, "model.final_layernorm.bias", True))

        if self.has_positional_encoding:
            wpe = read_tensor(ckpt_f, "model.wpe", True).reshape(
                -1, self.global_hidden_units)
            assert self.max_seq_len <= wpe.size(0), (
                f"max_seq_len ({self.max_seq_len}) must not exceed "
                f"the value of maximum sequence length during training ({wpe.size(0)})."
            )
            w.append(wpe)

        w.append(read_tensor(ckpt_f, "model.wte", True))

        if "model.lm_head.weight" in ckpt_f:
            self.share_embed = False
            w.append(read_tensor(ckpt_f, "model.lm_head.weight", True))
        else:
            # No separate output embedding stored: share the input embedding.
            self.share_embed = True
            w.append(empty_tensor())

        # MoE gate weights: one entry per layer, empty when the layer has
        # no gate.  NOTE(review): assumes gate tensors use the same
        # "<key>/weights" dataset layout as every other entry -- confirm
        # against the checkpoint converter.
        for i in range(self.layer_num):
            gate_key = f"model.layers.{i}.mlp.moe.gate.wg.weight"
            w.append(read_tensor(ckpt_f, gate_key, gate_key in ckpt_f))

    # NOTE: adapter weights are not read from HDF5 checkpoints by this
    # method; nothing is appended for them here.
    assert len(self.w) == len(w), (
        f"checkpoint provides {len(w)} weight tensors but {len(self.w)} were expected"
    )

    # Reshape
    try:
        for i in range(len(w)):
            if w[i].nelement() == self.w[i].nelement():
                self.w[i] = w[i].reshape(self.w[i].shape)
            else:
                # Element counts differ (e.g. empty placeholders): keep as read.
                self.w[i] = w[i]
    except RuntimeError as err:
        raise RuntimeError(
            f"head_num, size_per_head, vocab_size, and max_seq_len must be the same as the ones during training "
            f"(idx: {i} expected shape: {self.w[i].shape} got shape: {w[i].shape})."
        ) from err

    # transpose calibrate quantize the kernel
    layer_num = self.layer_num
    if self.int8_mode != 0:
        quantize = self.weight_transpose_calibrate_quantize
        for i in range(layer_num):
            # Blocks 2, 4, 8 and 10 of ``self.w`` are the QKV, attention
            # dense, h_to_4h and 4h_to_h kernels (see per_layer_weights).
            for slot, block in enumerate((2, 4, 8, 10)):
                src = block * layer_num + i
                dst = slot * layer_num + i
                self.int8_w[dst], self.scale[dst] = quantize(self.w[src])
                # We clear the original weights since they are no longer needed
                if self.int8_mode == 1:
                    self.w[src] = empty_tensor()

            if self.has_adapters:
                for slot, block in enumerate((12, 14, 16, 18), start=4):
                    src = block * layer_num + i + self.adapter_offset
                    dst = slot * layer_num + i
                    self.int8_w[dst], self.scale[dst] = quantize(self.w[src])
                    # Similar to above:
                    if self.int8_mode == 1:
                        self.w[src] = empty_tensor()
    return True
461
+
462
+
463
class BaseBelleModel(nn.Module):
    """Wrapper around the ``FasterTransformer.GptOp`` custom torch class.

    The constructor stores the model hyper-parameters, loads the shared
    library that registers the custom op, allocates a ``BaseBelleWeights``
    container, and derives the tensor/pipeline-parallel ranks from
    ``torch.distributed``.  The op itself is built by :meth:`cuda`.
    """

    def __init__(self,
                 head_num, size_per_head,
                 vocab_size, start_id, end_id, layer_num,
                 max_seq_len: int,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 lib_path: typing.Union[str, pathlib.Path],
                 inference_data_type: str,
                 inter_size: int = 0,
                 # gpt_variant_params
                 layernorm_eps: float = 1e-6,
                 layernorm_type: typing.Literal['pre_layernorm',
                                                'post_layernorm'] = "pre_layernorm",
                 activation_type: str = "Gelu",
                 gpt_with_moe: bool = False,
                 expert_num: int = 0,
                 moe_k: int = 0,
                 moe_layer_index: typing.Optional[typing.List] = None,
                 has_positional_encoding: bool = True,
                 has_pre_decoder_layernorm: bool = False,
                 has_post_decoder_layernorm: bool = True,
                 has_adapters: bool = False,
                 adapter_inter_size: int = 0,
                 use_attention_linear_bias: bool = False,
                 int8_mode: int = 0,
                 weights_data_type: typing.Union[str, np.dtype] = np.float32,
                 shared_contexts_ratio: float = 1.0):
        """Store hyper-parameters, load the op library and set up ranks.

        Args:
            inter_size: Feed-forward inner size; ``0`` selects
                ``4 * head_num * size_per_head``.
            moe_layer_index: Indices of MoE layers; ``None`` means no MoE
                layers (an empty list).
            lib_path: Path of the shared library passed to
                ``torch.classes.load_library``.

        The remaining arguments are stored on the instance and forwarded
        unchanged to the weight container and/or the custom op.

        Raises:
            AssertionError: If CUDA is unavailable, if ``head_num`` /
                ``layer_num`` are not multiples of the tensor / pipeline
                parallel sizes, or if the world size differs from
                ``tensor_para_size * pipeline_para_size``.
        """
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.vocab_size = vocab_size
        self.start_id = start_id
        self.end_id = end_id
        self.layer_num = layer_num
        self.inter_size = inter_size if inter_size != 0 else 4 * \
            self.head_num * self.size_per_head

        # gpt_variant_params
        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.gpt_with_moe = gpt_with_moe
        self.expert_num = expert_num
        self.moe_k = moe_k
        # A fresh list per instance: a shared mutable default would leak
        # state between models.
        self.moe_layer_index = [] if moe_layer_index is None else moe_layer_index
        self.has_positional_encoding = has_positional_encoding
        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
        self.has_post_decoder_layernorm = has_post_decoder_layernorm
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size
        self.use_attention_linear_bias = use_attention_linear_bias

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.use_sparse_gemm = False
        self.build_model = False
        self.int8_mode = int8_mode
        self.weights_data_type = weights_data_type
        self.shared_contexts_ratio = shared_contexts_ratio

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into Pytorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare weights
        # NOTE(review): the raw ``inter_size`` argument (possibly 0) is
        # forwarded here rather than ``self.inter_size`` -- confirm that
        # BaseBelleWeights applies the same default.
        self.weights = BaseBelleWeights(head_num, size_per_head, layer_num, vocab_size,
                                        max_seq_len, tensor_para_size, pipeline_para_size,
                                        weights_data_type=weights_data_type,
                                        inference_data_type=inference_data_type,
                                        gpt_with_moe=self.gpt_with_moe,
                                        has_positional_encoding=self.has_positional_encoding,
                                        has_pre_decoder_layernorm=self.has_pre_decoder_layernorm,
                                        has_post_decoder_layernorm=self.has_post_decoder_layernorm,
                                        has_adapters=self.has_adapters,
                                        adapter_inter_size=self.adapter_inter_size,
                                        int8_mode=int8_mode,
                                        inter_size=inter_size)

        # Prepare for tensor/pipeline parallel.  Only initialise the
        # process group when nobody has done so yet; genuine initialisation
        # failures propagate instead of being hidden.
        if not dist.is_initialized():
            dist.init_process_group(backend='mpi')
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * \
            pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

    def load(self, ckpt_path):
        """Load weights from ``ckpt_path`` and (re)build the op on the GPU.

        Returns:
            The value returned by ``self.weights.load``.
        """
        is_load = self.weights.load(ckpt_path, tp_rank=self.tensor_para_rank,
                                    pipeline_para_rank=self.pipeline_para_rank)
        self.cuda()
        torch.cuda.empty_cache()  # clean cache for model weight preprocessing
        return is_load

    def sparse(self):
        """Enable the sparse-GEMM flag used when the op is next built."""
        if not self.use_sparse_gemm:
            self.use_sparse_gemm = True

    def cuda(self):
        """Move the weights to ``self.device`` and rebuild the ``GptOp``."""
        self.weights._map(lambda w: w.cuda(self.device))
        if self.int8_mode != 0:
            self.weights._map_int8(lambda w: w.cuda(self.device))

        if self.build_model:
            # Drop the previous op before constructing a new one.
            del self.model
            self.build_model = False

        self.model = torch.classes.FasterTransformer.GptOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.expert_num,
            self.moe_k,
            self.moe_layer_index,
            self.vocab_size, self.start_id, self.end_id,
            self.use_sparse_gemm,
            # gpt_variant_params
            self.layernorm_eps,
            self.layernorm_type,
            self.activation_type,
            self.has_positional_encoding,
            self.has_pre_decoder_layernorm,
            self.has_post_decoder_layernorm,
            self.has_adapters,
            self.adapter_inter_size,
            self.use_attention_linear_bias,
            self.weights.w)
        self.build_model = True

    def forward(self,
                start_ids: torch.IntTensor,
                start_lengths: torch.IntTensor,
                output_len: int,
                beam_width: int = 1,
                top_k: typing.Optional[torch.IntTensor] = None,
                top_p: typing.Optional[torch.FloatTensor] = None,
                beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = None,
                temperature: typing.Optional[torch.FloatTensor] = None,
                len_penalty: typing.Optional[torch.FloatTensor] = None,
                repetition_penalty: typing.Optional[torch.FloatTensor] = None,
                presence_penalty: typing.Optional[torch.FloatTensor] = None,
                min_length: typing.Optional[torch.IntTensor] = None,
                random_seed: typing.Optional[torch.LongTensor] = None,
                bad_words_list: typing.Optional[torch.IntTensor] = None,
                return_output_length: bool = False,
                return_cum_log_probs: int = 0):
        """Run generation through the custom op.

        Returns:
            ``output_ids`` alone when ``return_output_length`` is false.
            Otherwise ``(output_ids, output_lengths)``, extended with
            ``output_cum_log_probs`` when ``return_cum_log_probs > 0``.

        Raises:
            AssertionError: If ``start_ids`` has zero length along dim 1.
        """
        if not self.build_model:
            # for the cases we don't load model
            self.cuda()
            torch.cuda.empty_cache()  # clean cache for model weight preprocessing
        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        start_ids = start_ids.cuda(self.device)
        start_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(start_ids,
                                     start_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     presence_penalty,  # optional, can be None
                                     min_length,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     bad_words_list,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None
        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func"""
        self.input_tensor = input_tensor
669
+
670
+
671
class BaseParallelBelleModel(BaseBelleModel):
    """Variant of ``BaseBelleModel`` backed by ``FasterTransformer.ParallelGptOp``."""

    def cuda(self):
        """Move the weights to ``self.device`` and rebuild the parallel op.

        Unlike the base class, the op additionally receives the tensor and
        pipeline parallel sizes, the int8 mode, the int8 weights with their
        scales, and the shared-contexts ratio.
        """
        def to_device(tensor):
            return tensor.cuda(self.device)

        self.weights._map(to_device)
        if self.int8_mode != 0:
            self.weights._map_int8(to_device)

        if self.build_model:
            # Release the previously built op before creating a new one.
            del self.model
            self.build_model = False

        op_args = (
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.expert_num,
            self.moe_k,
            self.moe_layer_index,
            self.vocab_size, self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
            # GPT variant parameters
            self.layernorm_eps,
            self.layernorm_type,
            self.activation_type,
            self.has_positional_encoding,
            self.has_pre_decoder_layernorm,
            self.has_post_decoder_layernorm,
            self.has_adapters,
            self.adapter_inter_size,
            self.use_attention_linear_bias,
            self.weights.w,
            self.weights.int8_w,
            self.weights.scale,
            self.shared_contexts_ratio,
        )
        self.model = torch.classes.FasterTransformer.ParallelGptOp(*op_args)
        self.build_model = True
704
+
705
+
706
class BelleWeight(BaseBelleWeights):
    """``BaseBelleWeights`` preset with the Belle variant flags fixed.

    The preset passes ``0`` as the fifth positional argument, disables
    adapters and positional encoding, and enables both the pre- and
    post-decoder layernorm.
    """

    def __init__(self, head_num, size_per_head, layer_num, vocab_size,
                 tensor_para_size, pipeline_para_size, weights_data_type, inference_data_type,
                 int8_mode=0):
        """Forward the caller's sizes and dtypes together with the fixed flags."""
        fixed_flags = dict(
            has_adapters=False,
            adapter_inter_size=0,
            has_positional_encoding=False,
            has_pre_decoder_layernorm=True,
            has_post_decoder_layernorm=True,
            int8_mode=int8_mode,
        )
        super().__init__(
            head_num, size_per_head, layer_num, vocab_size, 0,
            tensor_para_size, pipeline_para_size, weights_data_type,
            inference_data_type,
            **fixed_flags)
721
+
722
+
723
class BelleModel(BaseParallelBelleModel):
    """``BaseParallelBelleModel`` preset with the Belle variant flags fixed.

    Uses pre-layernorm with Gelu activation, no positional encoding, both
    pre- and post-decoder layernorm, no adapters, and attention linear bias.
    """

    def __init__(self,
                 head_num, size_per_head,
                 vocab_size, start_id, end_id, layer_num,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 lib_path: str | Path,
                 inference_data_type: str,
                 weights_data_type: str | np.dtype = np.float32,
                 layernorm_eps: float = 1e-5,
                 shared_contexts_ratio: float = 1.0,
                 int8_mode: int = 0):
        """Forward the caller's settings plus the fixed variant flags.

        ``0`` is passed for the base class's ``max_seq_len`` argument.
        """
        # gpt_variant_params
        variant_flags = dict(
            layernorm_type="pre_layernorm",
            activation_type="Gelu",
            has_positional_encoding=False,
            has_pre_decoder_layernorm=True,
            has_post_decoder_layernorm=True,
            has_adapters=False,
            adapter_inter_size=0,
            use_attention_linear_bias=True,
        )
        super().__init__(
            head_num, size_per_head, vocab_size, start_id, end_id, layer_num,
            0, tensor_para_size, pipeline_para_size,
            lib_path=lib_path,
            inference_data_type=inference_data_type,
            layernorm_eps=layernorm_eps,
            int8_mode=int8_mode,
            weights_data_type=weights_data_type,
            shared_contexts_ratio=shared_contexts_ratio,
            **variant_flags)

    def set_input_tensor(self, input_tensor: Optional[torch.Tensor]):
        """Store the tensor that replaces forward()'s own input.

        With pipeline parallelism a stage receives its input from the
        previous stage through communication, so forward_step_func cannot
        supply it; internal code calls this method to provide it instead.
        """
        self.input_tensor = input_tensor
model/1-gpu-fp16.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:606ee9330476cbcc465466b7a3e5ecb5945879ee42197c779b76127c4e87a037
3
+ size 14153067254
model/config.ini ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [belle]
2
+ model_name=
3
+ num_layer=30
4
+ head_num=32
5
+ inter_size=16384
6
+ size_per_head=128
7
+ vocab_size=250880
8
+ tensor_para_size=1
9
+ weight_data_type=fp16
10
+ model_variant=bloom-pre
11
+ layernorm_eps=1e-05
12
+ layernorm_type=pre_layernorm
13
+ activation_type=Gelu
14
+ has_positional_encoding=False
15
+ has_pre_decoder_layernorm=True
16
+ has_post_decoder_layernorm=True
17
+ use_attention_linear_bias=True
18
+ start_id=1
19
+ end_id=2
20
+
model/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3
3
+ size 14500438
model/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class": "BloomTokenizerFast"}
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ numpy
3
+ safetensors
4
+ setuptools
5
+ torch
6
+ transformers