makaveli10 committed on
Commit
201054b
1 Parent(s): 6c3262d

add mistral LLM

assets/1221-135766-0002.wav ADDED
Binary file (154 kB).
 
assets/mel_filters.npz ADDED
Binary file (4.27 kB).
 
services/llm_service.py ADDED
@@ -0,0 +1,239 @@
+ import json
+ import time
+ from pathlib import Path
+ from typing import Optional
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, T5Tokenizer
+
+ import tensorrt_llm
+ from tensorrt_llm.logger import logger
+ from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner
+
+ # ModelRunnerCpp is only available when the C++ runtime bindings are present.
+ if PYTHON_BINDINGS:
+     from tensorrt_llm.runtime import ModelRunnerCpp
+
+
+ def read_model_name(engine_dir: str):
+     engine_version = tensorrt_llm.builder.get_engine_version(engine_dir)
+
+     with open(Path(engine_dir) / "config.json", 'r') as f:
+         config = json.load(f)
+
+     if engine_version is None:
+         return config['builder_config']['name']
+
+     return config['pretrained_config']['architecture']
+
+
+ def throttle_generator(generator, stream_interval):
+     for i, out in enumerate(generator):
+         if not i % stream_interval:
+             yield out
+
+     if i % stream_interval:
+         yield out
+
+
+ def load_tokenizer(tokenizer_dir: Optional[str] = None,
+                    vocab_file: Optional[str] = None,
+                    model_name: str = 'gpt',
+                    tokenizer_type: Optional[str] = None):
+     if vocab_file is None:
+         use_fast = True
+         if tokenizer_type is not None and tokenizer_type == "llama":
+             use_fast = False
+         # Should set both padding_side and truncation_side to be 'left'
+         tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                   legacy=False,
+                                                   padding_side='left',
+                                                   truncation_side='left',
+                                                   trust_remote_code=True,
+                                                   tokenizer_type=tokenizer_type,
+                                                   use_fast=use_fast)
+     else:
+         # For gpt-next, directly load from tokenizer.model
+         assert model_name == 'gpt'
+         tokenizer = T5Tokenizer(vocab_file=vocab_file,
+                                 padding_side='left',
+                                 truncation_side='left')
+
+     if model_name == 'qwen':
+         with open(Path(tokenizer_dir) / "generation_config.json") as f:
+             gen_config = json.load(f)
+         chat_format = gen_config['chat_format']
+         if chat_format == 'raw':
+             pad_id = gen_config['pad_token_id']
+             end_id = gen_config['eos_token_id']
+         elif chat_format == 'chatml':
+             pad_id = tokenizer.im_end_id
+             end_id = tokenizer.im_end_id
+         else:
+             raise Exception(f"unknown chat format: {chat_format}")
+     elif model_name == 'glm_10b':
+         pad_id = tokenizer.pad_token_id
+         end_id = tokenizer.eop_token_id
+     else:
+         if tokenizer.pad_token_id is None:
+             tokenizer.pad_token_id = tokenizer.eos_token_id
+         pad_id = tokenizer.pad_token_id
+         end_id = tokenizer.eos_token_id
+
+     return tokenizer, pad_id, end_id
+
+
+ class MistralTensorRTLLM:
+     def __init__(self):
+         pass
+
+     def initialize_model(self, engine_dir, tokenizer_dir):
+         self.log_level = 'error'
+         self.runtime_rank = tensorrt_llm.mpi_rank()
+         logger.set_level(self.log_level)
+         model_name = read_model_name(engine_dir)
+         self.tokenizer, self.pad_id, self.end_id = load_tokenizer(
+             tokenizer_dir=tokenizer_dir,
+             vocab_file=None,
+             model_name=model_name,
+             tokenizer_type=None,
+         )
+         self.prompt_template = None
+         self.runner_cls = ModelRunner
+         self.runner_kwargs = dict(engine_dir=engine_dir,
+                                   lora_dir=None,
+                                   rank=self.runtime_rank,
+                                   debug_mode=False,
+                                   lora_ckpt_source='hf')
+
+     def parse_input(
+         self,
+         input_text=None,
+         add_special_tokens=True,
+         max_input_length=923,
+         pad_id=None,
+     ):
+         if self.pad_id is None:
+             self.pad_id = self.tokenizer.pad_token_id
+
+         batch_input_ids = []
+         for curr_text in input_text:
+             if self.prompt_template is not None:
+                 curr_text = self.prompt_template.format(input_text=curr_text)
+             input_ids = self.tokenizer.encode(
+                 curr_text,
+                 add_special_tokens=add_special_tokens,
+                 truncation=True,
+                 max_length=max_input_length
+             )
+             batch_input_ids.append(input_ids)
+
+         batch_input_ids = [
+             torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in batch_input_ids
+         ]
+         return batch_input_ids
+
+     def decode_tokens(
+         self,
+         output_ids,
+         input_lengths,
+         sequence_lengths,
+     ):
+         batch_size, num_beams, _ = output_ids.size()
+         for batch_idx in range(batch_size):
+             inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist()
+             input_text = self.tokenizer.decode(inputs)
+             output = []
+             for beam in range(num_beams):
+                 output_begin = input_lengths[batch_idx]
+                 output_end = sequence_lengths[batch_idx][beam]
+                 outputs = output_ids[batch_idx][beam][output_begin:output_end].tolist()
+                 output_text = self.tokenizer.decode(outputs)
+                 output.append(output_text)
+         return output
+
+     def __call__(
+         self,
+         input_text,
+         max_output_len=100,
+         max_attention_window_size=4096,
+         num_beams=1,
+         streaming=True,
+         streaming_interval=4,
+     ):
+         start = time.time()
+         batch_input_ids = self.parse_input(
+             input_text=input_text,
+             add_special_tokens=True,
+             max_input_length=923,
+             pad_id=None,
+         )
+
+         input_lengths = [x.size(1) for x in batch_input_ids]
+         print(self.runner_kwargs)
+         runner = self.runner_cls.from_dir(**self.runner_kwargs)
+         with torch.no_grad():
+             outputs = runner.generate(
+                 batch_input_ids,
+                 max_new_tokens=max_output_len,
+                 max_attention_window_size=max_attention_window_size,
+                 end_id=self.end_id,
+                 pad_id=self.pad_id,
+                 temperature=1.0,
+                 top_k=1,
+                 top_p=0.0,
+                 num_beams=num_beams,
+                 length_penalty=1.0,
+                 repetition_penalty=1.0,
+                 stop_words_list=None,
+                 bad_words_list=None,
+                 lora_uids=None,
+                 prompt_table_path=None,
+                 prompt_tasks=None,
+                 streaming=streaming,
+                 output_sequence_lengths=True,
+                 return_dict=True)
+             torch.cuda.synchronize()
+         print(outputs)
+         if streaming:
+             for curr_outputs in throttle_generator(outputs, streaming_interval):
+                 output_ids = curr_outputs['output_ids']
+                 sequence_lengths = curr_outputs['sequence_lengths']
+                 output = self.decode_tokens(
+                     output_ids,
+                     input_lengths,
+                     sequence_lengths
+                 )
+                 print(time.time() - start)
+                 print(input_text[0] + " " + output[0])
+         else:
+             output_ids = outputs['output_ids']
+             sequence_lengths = outputs['sequence_lengths']
+             context_logits = None
+             generation_logits = None
+             if runner.gather_all_token_logits:
+                 context_logits = outputs['context_logits']
+                 generation_logits = outputs['generation_logits']
+             output = self.decode_tokens(
+                 output_ids,
+                 input_lengths,
+                 sequence_lengths,
+             )
+         return output
+
+
+ if __name__ == "__main__":
+     llm = MistralTensorRTLLM()
+     llm.initialize_model(
+         "/root/TensorRT-LLM/examples/llama/tmp/mistral/7B/trt_engines/fp16/1-gpu",
+         "teknium/OpenHermes-2.5-Mistral-7B",
+     )
+     print("initialized")
+     for i in range(1):
+         output = llm(
+             ["Born in north-east France, Soyer trained as a"], streaming=True
+         )
+         print(output)
+
+
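For reference, a minimal usage sketch of the non-streaming path (the engine and tokenizer paths are the same placeholders used in the `__main__` block above, and the import assumes this file is reachable as `services.llm_service`):

from services.llm_service import MistralTensorRTLLM

llm = MistralTensorRTLLM()
# Point these at a built TensorRT-LLM engine directory and the matching HF tokenizer.
llm.initialize_model(
    "/root/TensorRT-LLM/examples/llama/tmp/mistral/7B/trt_engines/fp16/1-gpu",
    "teknium/OpenHermes-2.5-Mistral-7B",
)
# With streaming=False, __call__ returns a list of decoded completions (one per beam).
completions = llm(["Born in north-east France, Soyer trained as a"], streaming=False)
print(completions[0])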