ardalan.mehrani committed
Commit 11ef54b · 1 Parent(s): 5e0f625

Handle batch video generation

examples/{video_chat.py → chat.py} RENAMED
@@ -1,11 +1,14 @@
 
 import torch
 from transformers import AutoModel, AutoTokenizer
-from utils import load_video
+from utils import load_image, load_video
 
 if __name__ == "__main__":
 
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    # dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    dir, rev = '../', 'main'
+
+    # path = 'OpenGVLab/InternVL2_5-2B'
     model = AutoModel.from_pretrained(dir,
                                       torch_dtype=torch.bfloat16,
                                       load_in_8bit=False,
@@ -16,12 +19,36 @@ if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
     generation_config = dict(max_new_tokens=1024, do_sample=False)
 
-    fp, question = 'red-panda.mp4', 'Describe this video in great details'
+    paths = [
+        'image1.jpg',
+        'image1.jpg',
+        'image2.jpg',
+        'red-panda.mp4',
+    ]
+
+    questions = [
+        'describe this image',
+        'describe this image',
+        'describe this image',
+        'describe this video'
+    ]
 
-    pixel_values, num_patches_list = load_video(fp, num_segments=8, max_num=1)
-    prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
-    question = prefix + question
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                   num_patches_list=num_patches_list, history=None, return_history=True)
-    print(f'User: {question}\nAssistant: {response}')
+    for fp, question in zip(paths, questions):
+        if fp.endswith('mp4'):
+            pixel_values, num_patches_list = load_video(fp, num_segments=8, max_num=1)
+            prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+
+        else:
+            pixel_values = load_image(fp, max_num=12).to(torch.bfloat16).cuda()
+            num_patches_list = [len(pixel_values)]
+            prefix = '<image>\n'
+
+        question = prefix + question
+        pixel_values = pixel_values.to(torch.bfloat16).cuda()
+        response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                       num_patches_list=num_patches_list, history=None, return_history=True)
+        print(f'User: {question}\nAssistant: {response}')
+
+    question = 'How many animals ?'
+    response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+    print(f'User: {question}\nAssistant: {response}')
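Note on the updated chat.py: model.chat expects the prompt to carry one <image> placeholder for every entry in num_patches_list, which is why the video branch builds a 'Frame{i}: <image>' prefix per sampled frame while the image branch uses a single '<image>\n'. A minimal sketch of that pairing, with hypothetical patch counts (load_video with num_segments=8 and max_num=1 typically yields one patch per frame):

# Illustrative only: how the prompt prefix and num_patches_list line up for a video.
num_patches_list = [1] * 8                       # one entry per sampled frame
prefix = ''.join(f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list)))
question = prefix + 'describe this video'
# model.chat later expands each '<image>' into IMG_START / IMG_CONTEXT / IMG_END tokens,
# consuming num_image_token * num_patches context tokens per placeholder.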
examples/{image_chat_batch.py → chat_batch.py} RENAMED
@@ -1,11 +1,14 @@
 
 import torch
 from transformers import AutoModel, AutoTokenizer
-from utils import load_image
+from utils import load_image, load_video
 
 if __name__ == "__main__":
 
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    # dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
+    dir, rev = '../', 'main'
+
+    # path = 'OpenGVLab/InternVL2_5-2B'
     model = AutoModel.from_pretrained(dir,
                                       torch_dtype=torch.bfloat16,
                                       load_in_8bit=False,
@@ -15,24 +18,40 @@ if __name__ == "__main__":
                                       revision=rev).eval().cuda()
     tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
     generation_config = dict(max_new_tokens=1024, do_sample=False)
+
     paths = [
-        'image1.jpg',
-        'image1.jpg'
-    ]
-
+        'image1.jpg',
+        'image1.jpg',
+        'image2.jpg',
+        'red-panda.mp4',
+    ]
+
     questions = [
-        'Describe this image in great details',
-        'Describe this image in great details'
-    ]
+        'describe this image',
+        'describe this image',
+        'describe this image',
+        'describe this video'
+    ]
 
-    pixel_values, num_patches_list, l_questions = [], [], []
-    for path, q in zip(paths, questions):
-        pxl_val = load_image(path, max_num=12).to(torch.bfloat16).cuda()
-        pixel_values.append(pxl_val)
-        num_patches_list.append(len(pxl_val))
-        l_questions.append('<image>\n{}'.format(q))
-    pixel_values = torch.cat(pixel_values)
+    pixel_values, num_patches_list = [], []
+    for i, fp in enumerate(paths):
+
+        if fp.endswith('mp4'):
+            pxl_val, num_patches = load_video(fp, num_segments=8, max_num=1)
+            prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches))])
+        else:
+            pxl_val = load_image(fp, max_num=12).to(torch.bfloat16)
+            num_patches = [len(pxl_val)]
+            prefix = '<image>\n'
 
-    responses = model.batch_chat(tokenizer, pixel_values, num_patches_list=num_patches_list,questions=questions,generation_config=generation_config)
-    for question, response in zip(questions, responses):
-        print(f'User: {question}\nAssistant: {response}')
+        pixel_values.append(pxl_val)
+        num_patches_list.append(num_patches)
+        questions[i] = prefix + questions[i]
+
+    pixel_values = torch.cat(pixel_values).to(torch.bfloat16).cuda()
+    response = model.batch_chat(tokenizer, pixel_values, questions, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=False)
+
+    for q, r in zip(questions, response):
+        print(f'User: {q}\nAssistant: {r}')
+        print('\n')
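Note on the updated chat_batch.py: each sample now contributes its own patch-count list, so the num_patches_list handed to batch_chat is a list of lists (a single-element list per image, an eight-element list per video), while the per-sample tensors are concatenated into one pixel_values batch. A rough sketch of the resulting structures, with made-up patch counts (the real counts depend on each image's aspect ratio and size):

# Hypothetical result of the preprocessing loop above:
# paths            = ['image1.jpg', 'image1.jpg', 'image2.jpg', 'red-panda.mp4']
# num_patches_list = [[13],         [13],         [7],          [1]*8]
# pixel_values     = torch.cat([...])   # shape (13 + 13 + 7 + 8, 3, H, W), one row per patch
# questions[i] already carries the matching '<image>' prefix for each sample.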
examples/image_chat.py DELETED
@@ -1,31 +0,0 @@
-
-import torch
-from transformers import AutoModel, AutoTokenizer, AutoConfig
-from utils import load_image
-
-if __name__ == "__main__":
-
-    dir, rev = 'morpheushoc/InternVL2_5-2B', 'main'
-    # path = 'OpenGVLab/InternVL2_5-2B'
-    model = AutoModel.from_pretrained(dir,
-                                      torch_dtype=torch.bfloat16,
-                                      load_in_8bit=False,
-                                      low_cpu_mem_usage=True,
-                                      use_flash_attn=True,
-                                      trust_remote_code=True,
-                                      revision=rev).eval().cuda()
-    tokenizer = AutoTokenizer.from_pretrained(dir, trust_remote_code=True, use_fast=False)
-    generation_config = dict(max_new_tokens=1024, do_sample=False)
-
-    fp, question = 'image1.jpg', 'Describe this image in great details'
-
-    pixel_values = load_image(fp, max_num=12).to(torch.bfloat16).cuda()
-    num_patches_list = [len(pixel_values)]
-    prefix = '<image>\n'
-
-    question = prefix + question
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                   num_patches_list=num_patches_list, history=None, return_history=True)
-    print(f'User: {question}\nAssistant: {response}')
-
modeling_internvl_chat.py CHANGED
@@ -220,24 +220,15 @@ class InternVLChatModel(PreTrainedModel):
             print(f'dynamic ViT batch size: {image_bs}')
 
         queries = []
-        for idx, num_patches in enumerate(num_patches_list):
-            question = questions[idx]
-            if pixel_values is not None and '<image>' not in question:
-                question = '<image>\n' + question
-            template = get_conv_template(self.template)
-            template.system_message = self.system_message
-            template.append_message(template.roles[0], question)
-            template.append_message(template.roles[1], None)
-            query = template.get_prompt()
-
-            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            query = query.replace('<image>', image_tokens, 1)
+        for q, num_patches in zip(questions, num_patches_list):
+            query = self.build_query(q, [], num_patches, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN)
             queries.append(query)
 
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].to(self.device)
         attention_mask = model_inputs['attention_mask'].to(self.device)
+        template = get_conv_template(self.template)
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
@@ -264,25 +255,16 @@
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
 
-        template = get_conv_template(self.template)
-        template.system_message = self.system_message
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
-
         history = [] if history is None else history
-        for (old_question, old_answer) in history:
-            template.append_message(template.roles[0], old_question)
-            template.append_message(template.roles[1], old_answer)
-        template.append_message(template.roles[0], question)
-        template.append_message(template.roles[1], None)
-        query = template.get_prompt()
+        query = self.build_query(question, history, num_patches_list, IMG_START_TOKEN,
+                                 IMG_END_TOKEN, IMG_CONTEXT_TOKEN)
 
         if verbose and pixel_values is not None:
             image_bs = pixel_values.shape[0]
             print(f'dynamic ViT batch size: {image_bs}')
 
-        for num_patches in num_patches_list:
-            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            query = query.replace('<image>', image_tokens, 1)
+        template = get_conv_template(self.template)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
 
         model_inputs = tokenizer(query, return_tensors='pt')
         input_ids = model_inputs['input_ids'].to(self.device)
@@ -306,6 +288,26 @@
             print(query_to_print, response)
         return response
 
+    def build_query(self, question, history, num_patches_list=None, IMG_START_TOKEN='<img>',
+                    IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+
+        for (old_question, old_answer) in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+
+        return query
+
+
     @torch.no_grad()
     def generate(
             self,
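The net effect of the modeling_internvl_chat.py change is that chat and batch_chat now share a single build_query helper: it renders the conversation template (history plus the new question) and then expands each <image> placeholder into IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN. Because the template is now built inside the helper, both call sites re-create it afterwards only to read template.sep for the eos_token_id. A self-contained sketch of the expansion step outside the model class, with hypothetical token strings and counts:

# Illustrative only: mirrors the placeholder expansion inside build_query.
IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN = '<img>', '</img>', '<IMG_CONTEXT>'
num_image_token = 256                     # hypothetical tokens-per-patch value
query = 'Frame1: <image>\nFrame2: <image>\ndescribe this video'
for num_patches in [1, 1]:                # one entry per '<image>' placeholder
    image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
    query = query.replace('<image>', image_tokens, 1)
# Each placeholder now occupies num_image_token * num_patches <IMG_CONTEXT> slots,
# matching the visual features injected at those positions during generation.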