ardalan.mehrani committed on
Commit · 11ef54b
Parent(s): 5e0f625

handle batch video generation

Browse files:
- examples/{video_chat.py → chat.py} +37 -10
- examples/{image_chat_batch.py → chat_batch.py} +38 -19
- examples/image_chat.py +0 -31
- modeling_internvl_chat.py +27 -25
examples/{video_chat.py → chat.py}  RENAMED

@@ -1,11 +1,14 @@
 
 import torch
 from transformers import AutoModel, AutoTokenizer
-from utils import load_video
+from utils import load_image, load_video
 
 if __name__ == "__main__":
 
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    # dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    dir, rev = '../', 'main'
+
+    # path = 'OpenGVLab/InternVL2_5-2B'
     model = AutoModel.from_pretrained(dir,
                                       torch_dtype=torch.bfloat16,
                                       load_in_8bit=False,
@@ -16,12 +19,36 @@ if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
     generation_config = dict(max_new_tokens=1024, do_sample=False)
 
+    paths = [
+        'image1.jpg',
+        'image1.jpg',
+        'image2.jpg',
+        'red-panda.mp4',
+    ]
+
+    questions = [
+        'describe this image',
+        'describe this image',
+        'describe this image',
+        'describe this video'
+    ]
+
+    for fp, question in zip(paths, questions):
+        if fp.endswith('mp4'):
+            pixel_values, num_patches_list = load_video(fp, num_segments=8, max_num=1)
+            prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+
+        else:
+            pixel_values = load_image(fp, max_num=12).to(torch.bfloat16).cuda()
+            num_patches_list = [len(pixel_values)]
+            prefix = '<image>\n'
+
+        question = prefix + question
+        pixel_values = pixel_values.to(torch.bfloat16).cuda()
+        response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                       num_patches_list=num_patches_list, history=None, return_history=True)
+        print(f'User: {question}\nAssistant: {response}')
 
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                   num_patches_list=num_patches_list, history=None, return_history=True)
-    print(f'User: {question}\nAssistant: {response}')
+    question = 'How many animals ?'
+    response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+    print(f'User: {question}\nAssistant: {response}')
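
For reference, a minimal sketch of the frame-prefix convention the new chat.py relies on for videos; the patch counts below are illustrative, the real ones come from load_video:

# Illustration only: assume load_video sampled 8 frames with max_num=1 tile each.
num_patches_list = [1] * 8
prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
print(prefix)  # prints "Frame1: <image>" through "Frame8: <image>", one per line
# model.chat() later replaces each <image> placeholder with the tokens of that frame.
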
examples/{image_chat_batch.py → chat_batch.py}  RENAMED

@@ -1,11 +1,14 @@
 
 import torch
 from transformers import AutoModel, AutoTokenizer
-from utils import load_image
+from utils import load_image, load_video
 
 if __name__ == "__main__":
 
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    # dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    dir, rev = '../', 'main'
+
+    # path = 'OpenGVLab/InternVL2_5-2B'
     model = AutoModel.from_pretrained(dir,
                                       torch_dtype=torch.bfloat16,
                                       load_in_8bit=False,
@@ -15,24 +18,40 @@
                                       revision=rev).eval().cuda()
     tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
     generation_config = dict(max_new_tokens=1024, do_sample=False)
+
     paths = [
-        'image1.jpg',
-        'image1.jpg'
+        'image1.jpg',
+        'image1.jpg',
+        'image2.jpg',
+        'red-panda.mp4',
+    ]
+
     questions = [
+        'describe this image',
+        'describe this image',
+        'describe this image',
+        'describe this video'
+    ]
 
+    pixel_values, num_patches_list = [], []
+    for i, fp in enumerate(paths):
+
+        if fp.endswith('mp4'):
+            pxl_val, num_patches = load_video(fp, num_segments=8, max_num=1)
+            prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches))])
+        else:
+            pxl_val = load_image(fp, max_num=12).to(torch.bfloat16)
+            num_patches = [len(pxl_val)]
+            prefix = '<image>\n'
 
+        pixel_values.append(pxl_val)
+        num_patches_list.append(num_patches)
+        questions[i] = prefix + questions[i]
+
+    pixel_values = torch.cat(pixel_values).to(torch.bfloat16).cuda()
+    response = model.batch_chat(tokenizer, pixel_values, questions, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=False)
+
+    for q, r in zip(questions, response):
+        print(f'User: {q}\nAssistant: {r}')
+        print('\n')
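
The batch example builds one entry per sample, and a video contributes one patch count per sampled frame. A small sketch of the resulting structures, with illustrative patch counts (the real values come from load_image / load_video):

# Illustrative shapes for a batch of three images (5, 5 and 7 tiles) plus one 8-frame video.
num_patches_list = [[5], [5], [7], [1] * 8]
questions = [
    '<image>\ndescribe this image',
    '<image>\ndescribe this image',
    '<image>\ndescribe this image',
    ''.join(f'Frame{i+1}: <image>\n' for i in range(8)) + 'describe this video',
]
# pixel_values concatenates every tile of every sample along dim 0,
# so pixel_values.shape[0] == 5 + 5 + 7 + 8 == 25 in this sketch.
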
examples/image_chat.py  DELETED

@@ -1,31 +0,0 @@
-
-import torch
-from transformers import AutoModel, AutoTokenizer, AutoConfig
-from utils import load_image
-
-if __name__ == "__main__":
-
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
-    # path = 'OpenGVLab/InternVL2_5-2B'
-    model = AutoModel.from_pretrained(dir,
-                                      torch_dtype=torch.bfloat16,
-                                      load_in_8bit=False,
-                                      low_cpu_mem_usage=True,
-                                      use_flash_attn=True,
-                                      trust_remote_code=True,
-                                      revision=rev).eval().cuda()
-    tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
-    generation_config = dict(max_new_tokens=1024, do_sample=False)
-
-    fp, question = 'image1.jpg', 'Describe this image in great details'
-
-    pixel_values = load_image(fp, max_num=12).to(torch.bfloat16).cuda()
-    num_patches_list = [len(pixel_values)]
-    prefix = '<image>\n'
-
-    question = prefix + question
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                   num_patches_list=num_patches_list, history=None, return_history=True)
-    print(f'User: {question}\nAssistant: {response}')
-
modeling_internvl_chat.py  CHANGED

@@ -220,24 +220,15 @@ class InternVLChatModel(PreTrainedModel):
             print(f'dynamic ViT batch size: {image_bs}')
 
         queries = []
-        for idx, num_patches in enumerate(num_patches_list):
-            question = questions[idx]
-            if pixel_values is not None and '<image>' not in question:
-                question = '<image>\n' + question
-            template = get_conv_template(self.template)
-            template.system_message = self.system_message
-            template.append_message(template.roles[0], question)
-            template.append_message(template.roles[1], None)
-            query = template.get_prompt()
-
-            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            query = query.replace('<image>', image_tokens, 1)
+        for q, num_patches in zip(questions, num_patches_list):
+            query = self.build_query(q, [], num_patches, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN)
             queries.append(query)
 
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].to(self.device)
         attention_mask = model_inputs['attention_mask'].to(self.device)
+        template = get_conv_template(self.template)
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
@@ -264,25 +255,16 @@
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
 
-        template = get_conv_template(self.template)
-        template.system_message = self.system_message
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
-
         history = [] if history is None else history
-        for (old_question, old_answer) in history:
-            template.append_message(template.roles[0], old_question)
-            template.append_message(template.roles[1], old_answer)
-        template.append_message(template.roles[0], question)
-        template.append_message(template.roles[1], None)
-        query = template.get_prompt()
+        query = self.build_query(question, history, num_patches_list, IMG_START_TOKEN,
+                                 IMG_END_TOKEN, IMG_CONTEXT_TOKEN)
 
         if verbose and pixel_values is not None:
             image_bs = pixel_values.shape[0]
             print(f'dynamic ViT batch size: {image_bs}')
 
-        for num_patches in num_patches_list:
-            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            query = query.replace('<image>', image_tokens, 1)
+        template = get_conv_template(self.template)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
 
         model_inputs = tokenizer(query, return_tensors='pt')
         input_ids = model_inputs['input_ids'].to(self.device)
@@ -306,6 +288,26 @@
             print(query_to_print, response)
         return response
 
+    def build_query(self, question, history, num_patches_list=None, IMG_START_TOKEN='<img>',
+                    IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+
+        for (old_question, old_answer) in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+
+        return query
+
+
     @torch.no_grad()
     def generate(
         self,
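
Outside the model class, the placeholder expansion that build_query performs boils down to the loop below; num_image_token = 256 is an assumed value for illustration (the real one is derived from the model configuration):

IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN = '<img>', '</img>', '<IMG_CONTEXT>'
num_image_token = 256  # assumption for this sketch; InternVL computes it from the ViT patch setup

query = 'Frame1: <image>\nFrame2: <image>\ndescribe this video'
for num_patches in [1, 1]:  # one tile per frame in this sketch
    image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
    query = query.replace('<image>', image_tokens, 1)
# Each <image> placeholder is now an <img> ... </img> block containing
# num_image_token * num_patches copies of <IMG_CONTEXT>, which generate()
# later swaps for the corresponding visual embeddings.
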