czczup committed on
Commit
cec24c8
·
verified ·
1 Parent(s): 6479cfb

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +121 -54
README.md CHANGED
@@ -112,10 +112,100 @@ We welcome MLLM benchmark developers to assess our InternVL1.5 and InternVL2 ser
112
 
113
  We provide example code to run InternVL2-4B using `transformers`.
114
 
115
- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.
116
 
117
  > Please use transformers==4.37.2 to ensure the model works normally.
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  ```python
120
  import numpy as np
121
  import torch
@@ -128,7 +218,6 @@ from transformers import AutoModel, AutoTokenizer
128
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
129
  IMAGENET_STD = (0.229, 0.224, 0.225)
130
 
131
-
132
  def build_transform(input_size):
133
  MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
134
  transform = T.Compose([
@@ -139,7 +228,6 @@ def build_transform(input_size):
139
  ])
140
  return transform
141
 
142
-
143
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
144
  best_ratio_diff = float('inf')
145
  best_ratio = (1, 1)
@@ -155,8 +243,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
155
  best_ratio = ratio
156
  return best_ratio
157
 
158
-
159
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
160
  orig_width, orig_height = image.size
161
  aspect_ratio = orig_width / orig_height
162
 
@@ -194,8 +281,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
194
  processed_images.append(thumbnail_img)
195
  return processed_images
196
 
197
-
198
- def load_image(image_file, input_size=448, max_num=6):
199
  image = Image.open(image_file).convert('RGB')
200
  transform = build_transform(input_size=input_size)
201
  images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -203,70 +289,60 @@ def load_image(image_file, input_size=448, max_num=6):
203
  pixel_values = torch.stack(pixel_values)
204
  return pixel_values
205
 
206
-
207
  path = 'OpenGVLab/InternVL2-4B'
208
  model = AutoModel.from_pretrained(
209
  path,
210
  torch_dtype=torch.bfloat16,
211
  low_cpu_mem_usage=True,
212
  trust_remote_code=True).eval().cuda()
 
213
 
214
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
215
  # set the max number of tiles in `max_num`
216
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
217
-
218
- generation_config = dict(
219
- num_beams=1,
220
- max_new_tokens=1024,
221
- do_sample=False,
222
- )
223
 
224
  # pure-text conversation (纯文本对话)
225
  question = 'Hello, who are you?'
226
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
227
- print(f'User: {question}')
228
- print(f'Assistant: {response}')
229
 
230
  question = 'Can you tell me a story?'
231
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
232
- print(f'User: {question}')
233
- print(f'Assistant: {response}')
234
 
235
  # single-image single-round conversation (单图单轮对话)
236
  question = '<image>\nPlease describe the image shortly.'
237
  response = model.chat(tokenizer, pixel_values, question, generation_config)
238
- print(f'User: {question}')
239
- print(f'Assistant: {response}')
240
 
241
  # single-image multi-round conversation (单图多轮对话)
242
  question = '<image>\nPlease describe the image in detail.'
243
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
244
- print(f'User: {question}')
245
- print(f'Assistant: {response}')
246
 
247
  question = 'Please write a poem according to the image.'
248
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
249
- print(f'User: {question}')
250
- print(f'Assistant: {response}')
251
 
252
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
253
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
254
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
255
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
256
 
257
  question = '<image>\nDescribe the two images in detail.'
258
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
259
  history=None, return_history=True)
 
260
 
261
  question = 'What are the similarities and differences between these two images.'
262
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
263
  history=history, return_history=True)
264
- print(f'User: {question}')
265
- print(f'Assistant: {response}')
266
 
267
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
268
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
269
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
270
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
271
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
272
 
@@ -274,19 +350,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
274
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
275
  num_patches_list=num_patches_list,
276
  history=None, return_history=True)
277
- print(f'User: {question}')
278
- print(f'Assistant: {response}')
279
 
280
  question = 'What are the similarities and differences between these two images.'
281
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
282
  num_patches_list=num_patches_list,
283
  history=history, return_history=True)
284
- print(f'User: {question}')
285
- print(f'Assistant: {response}')
286
 
287
  # batch inference, single image per sample (单图批处理)
288
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
289
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
290
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
291
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
292
 
@@ -296,8 +370,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
296
  questions=questions,
297
  generation_config=generation_config)
298
  for question, response in zip(questions, responses):
299
- print(f'User: {question}')
300
- print(f'Assistant: {response}')
301
 
302
  # video multi-round conversation (视频多轮对话)
303
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -332,29 +405,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
332
  pixel_values = torch.cat(pixel_values_list)
333
  return pixel_values, num_patches_list
334
 
335
-
336
  video_path = './examples/red-panda.mp4'
337
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
338
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
339
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
340
  video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
341
  question = video_prefix + 'What is the red panda doing?'
342
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
343
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
344
- num_patches_list=num_patches_list,
345
- history=None, return_history=True)
346
- print(f'User: {question}')
347
- print(f'Assistant: {response}')
348
 
349
  question = 'Describe this video in detail. Don\'t repeat.'
350
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
351
- num_patches_list=num_patches_list,
352
- history=history, return_history=True)
353
- print(f'User: {question}')
354
- print(f'Assistant: {response}')
355
  ```
356
 
357
- ### Streaming output
358
 
359
  Besides this method, you can also use the following code to get streamed output.
360
 
@@ -365,7 +432,7 @@ from threading import Thread
365
  # Initialize the streamer
366
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
367
  # Define the generation configuration
368
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
369
  # Start the model chat in a separate thread
370
  thread = Thread(target=model.chat, kwargs=dict(
371
  tokenizer=tokenizer, pixel_values=pixel_values, question=question,
@@ -676,7 +743,7 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模
676
 
677
  我们提供了一个示例代码,用于使用 `transformers` 运行 InternVL2-4B。
678
 
679
- 我们也欢迎你在我们的[在线demo](https://internvl.opengvlab.com/)中体验InternVL2的系列模型。目前,由于具备公网IP地址的GPU资源有限,我们目前只能部署最大到26B的模型。我们会在不久之后进行扩容,把更大的模型部署到在线demo上,敬请期待。
680
 
681
  > 请使用 transformers==4.37.2 以确保模型正常运行。
682
 
 
112
 
113
  We provide example code to run InternVL2-4B using `transformers`.
114
 
115
+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
116
 
117
  > Please use transformers==4.37.2 to ensure the model works normally.
118
 
119
+ ### Model Loading
120
+
121
+ #### 16-bit (bf16 / fp16)
122
+
123
+ ```python
124
+ import torch
125
+ from transformers import AutoTokenizer, AutoModel
126
+ path = "OpenGVLab/InternVL2-4B"
127
+ model = AutoModel.from_pretrained(
128
+ path,
129
+ torch_dtype=torch.bfloat16,
130
+ low_cpu_mem_usage=True,
131
+ trust_remote_code=True).eval().cuda()
132
+ ```
133
+
134
+ #### BNB 8-bit Quantization
135
+
136
+ ```python
137
+ import torch
138
+ from transformers import AutoTokenizer, AutoModel
139
+ path = "OpenGVLab/InternVL2-4B"
140
+ model = AutoModel.from_pretrained(
141
+ path,
142
+ torch_dtype=torch.bfloat16,
143
+ load_in_8bit=True,
144
+ low_cpu_mem_usage=True,
145
+ trust_remote_code=True).eval()
146
+ ```
147
+
148
+ #### BNB 4-bit Quantization
149
+
150
+ ```python
151
+ import torch
152
+ from transformers import AutoTokenizer, AutoModel
153
+ path = "OpenGVLab/InternVL2-4B"
154
+ model = AutoModel.from_pretrained(
155
+ path,
156
+ torch_dtype=torch.bfloat16,
157
+ load_in_4bit=True,
158
+ low_cpu_mem_usage=True,
159
+ trust_remote_code=True).eval()
160
+ ```
161
+
162
+ #### Multiple GPUs
163
+
164
+ The code below builds a custom device map that splits the model across GPUs. It is written this way to avoid the errors that occur during multi-GPU inference when tensors end up on different devices: by placing the first and last layers of the large language model (LLM) on the same device, we prevent such errors.
165
+
166
+ ```python
167
+ import math
168
+ import torch
169
+ from transformers import AutoTokenizer, AutoModel
170
+
171
+ def split_model(model_name):
172
+ device_map = {}
173
+ world_size = torch.cuda.device_count()
174
+ num_layers = {
175
+ 'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
176
+ 'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
177
+ # Since the first GPU will be used for ViT, treat it as half a GPU.
178
+ num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
179
+ num_layers_per_gpu = [num_layers_per_gpu] * world_size
180
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
181
+ layer_cnt = 0
182
+ for i, num_layer in enumerate(num_layers_per_gpu):
183
+ for j in range(num_layer):
184
+ device_map[f'language_model.model.layers.{layer_cnt}'] = i
185
+ layer_cnt += 1
186
+ device_map['vision_model'] = 0
187
+ device_map['mlp1'] = 0
188
+ device_map['language_model.model.tok_embeddings'] = 0
189
+ device_map['language_model.model.embed_tokens'] = 0
190
+ device_map['language_model.output'] = 0
191
+ device_map['language_model.model.norm'] = 0
192
+ device_map['language_model.lm_head'] = 0
193
+ device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
194
+
195
+ return device_map
196
+
197
+ path = "OpenGVLab/InternVL2-4B"
198
+ device_map = split_model('InternVL2-4B')
199
+ model = AutoModel.from_pretrained(
200
+ path,
201
+ torch_dtype=torch.bfloat16,
202
+ low_cpu_mem_usage=True,
203
+ trust_remote_code=True,
204
+ device_map=device_map).eval()
205
+ ```
206
+
207
+ ### Inference with Transformers
208
+
209
  ```python
210
  import numpy as np
211
  import torch
 
218
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
219
  IMAGENET_STD = (0.229, 0.224, 0.225)
220
 
 
221
  def build_transform(input_size):
222
  MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
223
  transform = T.Compose([
 
228
  ])
229
  return transform
230
 
 
231
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
232
  best_ratio_diff = float('inf')
233
  best_ratio = (1, 1)
 
243
  best_ratio = ratio
244
  return best_ratio
245
 
246
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
 
247
  orig_width, orig_height = image.size
248
  aspect_ratio = orig_width / orig_height
249
 
 
281
  processed_images.append(thumbnail_img)
282
  return processed_images
283
 
284
+ def load_image(image_file, input_size=448, max_num=12):
 
285
  image = Image.open(image_file).convert('RGB')
286
  transform = build_transform(input_size=input_size)
287
  images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
 
289
  pixel_values = torch.stack(pixel_values)
290
  return pixel_values
291
 
292
+ # If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
293
  path = 'OpenGVLab/InternVL2-4B'
294
  model = AutoModel.from_pretrained(
295
  path,
296
  torch_dtype=torch.bfloat16,
297
  low_cpu_mem_usage=True,
298
  trust_remote_code=True).eval().cuda()
299
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
300
 
 
301
  # set the max number of tiles in `max_num`
302
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
303
+ generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 
 
 
 
304
 
305
  # pure-text conversation (纯文本对话)
306
  question = 'Hello, who are you?'
307
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
308
+ print(f'User: {question}\nAssistant: {response}')
 
309
 
310
  question = 'Can you tell me a story?'
311
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
312
+ print(f'User: {question}\nAssistant: {response}')
 
313
 
314
  # single-image single-round conversation (单图单轮对话)
315
  question = '<image>\nPlease describe the image shortly.'
316
  response = model.chat(tokenizer, pixel_values, question, generation_config)
317
+ print(f'User: {question}\nAssistant: {response}')
 
318
 
319
  # single-image multi-round conversation (单图多轮对话)
320
  question = '<image>\nPlease describe the image in detail.'
321
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
322
+ print(f'User: {question}\nAssistant: {response}')
 
323
 
324
  question = 'Please write a poem according to the image.'
325
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
326
+ print(f'User: {question}\nAssistant: {response}')
 
327
 
328
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
329
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
330
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
331
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
332
 
333
  question = '<image>\nDescribe the two images in detail.'
334
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
335
  history=None, return_history=True)
336
+ print(f'User: {question}\nAssistant: {response}')
337
 
338
  question = 'What are the similarities and differences between these two images.'
339
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
340
  history=history, return_history=True)
341
+ print(f'User: {question}\nAssistant: {response}')
 
342
 
343
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
344
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
345
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
346
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
347
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
348
 
 
350
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
351
  num_patches_list=num_patches_list,
352
  history=None, return_history=True)
353
+ print(f'User: {question}\nAssistant: {response}')
 
354
 
355
  question = 'What are the similarities and differences between these two images.'
356
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
357
  num_patches_list=num_patches_list,
358
  history=history, return_history=True)
359
+ print(f'User: {question}\nAssistant: {response}')
 
360
 
361
  # batch inference, single image per sample (单图批处理)
362
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
363
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
364
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
365
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
366
 
 
370
  questions=questions,
371
  generation_config=generation_config)
372
  for question, response in zip(questions, responses):
373
+ print(f'User: {question}\nAssistant: {response}')
 
374
 
375
  # video multi-round conversation (视频多轮对话)
376
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
 
405
  pixel_values = torch.cat(pixel_values_list)
406
  return pixel_values, num_patches_list
407
 
 
408
  video_path = './examples/red-panda.mp4'
 
409
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
410
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
411
  video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
412
  question = video_prefix + 'What is the red panda doing?'
413
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
414
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
415
+ num_patches_list=num_patches_list, history=None, return_history=True)
416
+ print(f'User: {question}\nAssistant: {response}')
 
 
417
 
418
  question = 'Describe this video in detail. Don\'t repeat.'
419
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
420
+ num_patches_list=num_patches_list, history=history, return_history=True)
421
+ print(f'User: {question}\nAssistant: {response}')
 
 
422
  ```
423
 
424
+ #### Streaming output
425
 
426
  Besides this method, you can also use the following code to get streamed output.
427
 
 
432
  # Initialize the streamer
433
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
434
  # Define the generation configuration
435
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
436
  # Start the model chat in a separate thread
437
  thread = Thread(target=model.chat, kwargs=dict(
438
  tokenizer=tokenizer, pixel_values=pixel_values, question=question,
 
743
 
744
  我们提供了一个示例代码,用于使用 `transformers` 运行 InternVL2-4B。
745
 
746
+ 我们也欢迎你在我们的[在线demo](https://internvl.opengvlab.com/)中体验InternVL2的系列模型。
747
 
748
  > 请使用 transformers==4.37.2 以确保模型正常运行。
749