czczup committed on
Commit
3dae19b
·
verified ·
1 Parent(s): 159f0ea

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +115 -60
README.md CHANGED
@@ -112,10 +112,90 @@ We welcome MLLM benchmark developers to assess our InternVL1.5 and InternVL2 ser
112
 
113
  We provide example code to run InternVL2-Llama3-76B using `transformers`.
114
 
115
- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.
116
 
117
  > Please use transformers==4.37.2 to ensure the model works normally.
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  ```python
120
  import math
121
  import numpy as np
@@ -129,7 +209,6 @@ from transformers import AutoModel, AutoTokenizer
129
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
130
  IMAGENET_STD = (0.229, 0.224, 0.225)
131
 
132
-
133
  def build_transform(input_size):
134
  MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
135
  transform = T.Compose([
@@ -140,7 +219,6 @@ def build_transform(input_size):
140
  ])
141
  return transform
142
 
143
-
144
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
145
  best_ratio_diff = float('inf')
146
  best_ratio = (1, 1)
@@ -156,8 +234,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
156
  best_ratio = ratio
157
  return best_ratio
158
 
159
-
160
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
161
  orig_width, orig_height = image.size
162
  aspect_ratio = orig_width / orig_height
163
 
@@ -195,8 +272,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
195
  processed_images.append(thumbnail_img)
196
  return processed_images
197
 
198
-
199
- def load_image(image_file, input_size=448, max_num=6):
200
  image = Image.open(image_file).convert('RGB')
201
  transform = build_transform(input_size=input_size)
202
  images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -204,12 +280,12 @@ def load_image(image_file, input_size=448, max_num=6):
204
  pixel_values = torch.stack(pixel_values)
205
  return pixel_values
206
 
207
-
208
  def split_model(model_name):
209
  device_map = {}
210
  world_size = torch.cuda.device_count()
211
- num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
212
- 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
 
213
  # Since the first GPU will be used for ViT, treat it as half a GPU.
214
  num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
215
  num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -230,12 +306,10 @@ def split_model(model_name):
230
 
231
  return device_map
232
 
233
-
234
- path = 'OpenGVLab/InternVL2-Llama3-76B'
235
- device_map = split_model('InternVL2-Llama3-76B')
236
- print(device_map)
237
  # If you set `load_in_8bit=True`, you will need two 80GB GPUs.
238
  # If you set `load_in_8bit=False`, you will need at least three 80GB GPUs.
 
 
239
  model = AutoModel.from_pretrained(
240
  path,
241
  torch_dtype=torch.bfloat16,
@@ -243,63 +317,53 @@ model = AutoModel.from_pretrained(
243
  low_cpu_mem_usage=True,
244
  trust_remote_code=True,
245
  device_map=device_map).eval()
 
246
 
247
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
248
  # set the max number of tiles in `max_num`
249
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
250
-
251
- generation_config = dict(
252
- num_beams=1,
253
- max_new_tokens=1024,
254
- do_sample=False,
255
- )
256
 
257
  # pure-text conversation (纯文本对话)
258
  question = 'Hello, who are you?'
259
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
260
- print(f'User: {question}')
261
- print(f'Assistant: {response}')
262
 
263
  question = 'Can you tell me a story?'
264
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
265
- print(f'User: {question}')
266
- print(f'Assistant: {response}')
267
 
268
  # single-image single-round conversation (单图单轮对话)
269
  question = '<image>\nPlease describe the image shortly.'
270
  response = model.chat(tokenizer, pixel_values, question, generation_config)
271
- print(f'User: {question}')
272
- print(f'Assistant: {response}')
273
 
274
  # single-image multi-round conversation (单图多轮对话)
275
  question = '<image>\nPlease describe the image in detail.'
276
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
277
- print(f'User: {question}')
278
- print(f'Assistant: {response}')
279
 
280
  question = 'Please write a poem according to the image.'
281
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
282
- print(f'User: {question}')
283
- print(f'Assistant: {response}')
284
 
285
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
286
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
287
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
288
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
289
 
290
  question = '<image>\nDescribe the two images in detail.'
291
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
292
  history=None, return_history=True)
 
293
 
294
  question = 'What are the similarities and differences between these two images.'
295
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
296
  history=history, return_history=True)
297
- print(f'User: {question}')
298
- print(f'Assistant: {response}')
299
 
300
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
301
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
302
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
303
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
304
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
305
 
@@ -307,19 +371,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
307
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
308
  num_patches_list=num_patches_list,
309
  history=None, return_history=True)
310
- print(f'User: {question}')
311
- print(f'Assistant: {response}')
312
 
313
  question = 'What are the similarities and differences between these two images.'
314
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
315
  num_patches_list=num_patches_list,
316
  history=history, return_history=True)
317
- print(f'User: {question}')
318
- print(f'Assistant: {response}')
319
 
320
  # batch inference, single image per sample (单图批处理)
321
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
322
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
323
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
324
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
325
 
@@ -329,8 +391,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
329
  questions=questions,
330
  generation_config=generation_config)
331
  for question, response in zip(questions, responses):
332
- print(f'User: {question}')
333
- print(f'Assistant: {response}')
334
 
335
  # video multi-round conversation (视频多轮对话)
336
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -365,29 +426,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
365
  pixel_values = torch.cat(pixel_values_list)
366
  return pixel_values, num_patches_list
367
 
368
-
369
  video_path = './examples/red-panda.mp4'
370
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
371
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
372
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
373
  video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
374
  question = video_prefix + 'What is the red panda doing?'
375
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
376
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
377
- num_patches_list=num_patches_list,
378
- history=None, return_history=True)
379
- print(f'User: {question}')
380
- print(f'Assistant: {response}')
381
 
382
  question = 'Describe this video in detail. Don\'t repeat.'
383
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
384
- num_patches_list=num_patches_list,
385
- history=history, return_history=True)
386
- print(f'User: {question}')
387
- print(f'Assistant: {response}')
388
  ```
389
 
390
- ### Streaming output
391
 
392
  Besides this method, you can also use the following code to get streamed output.
393
 
@@ -398,7 +453,7 @@ from threading import Thread
398
  # Initialize the streamer
399
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
400
  # Define the generation configuration
401
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
402
  # Start the model chat in a separate thread
403
  thread = Thread(target=model.chat, kwargs=dict(
404
  tokenizer=tokenizer, pixel_values=pixel_values, question=question,
@@ -556,7 +611,7 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模
556
 
557
  我们提供了一个示例代码,用于使用 `transformers` 运行 InternVL2-Llama3-76B。
558
 
559
- 我们也欢迎你在我们的[在线demo](https://internvl.opengvlab.com/)中体验InternVL2的系列模型。目前,由于具备公网IP地址的GPU资源有限,我们目前只能部署最大到26B的模型。我们会在不久之后进行扩容,把更大的模型部署到在线demo上,敬请期待。
560
 
561
  > 请使用 transformers==4.37.2 以确保模型正常运行。
562
 
 
112
 
113
  We provide example code to run InternVL2-Llama3-76B using `transformers`.
114
 
115
+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
116
 
117
  > Please use transformers==4.37.2 to ensure the model works normally.
118
 
119
+ ### Model Loading
120
+
121
+ #### 16-bit (bf16 / fp16)
122
+
123
+ ```python
124
+ import torch
125
+ from transformers import AutoTokenizer, AutoModel
126
+ path = "OpenGVLab/InternVL2-Llama3-76B"
127
+ model = AutoModel.from_pretrained(
128
+ path,
129
+ torch_dtype=torch.bfloat16,
130
+ low_cpu_mem_usage=True,
131
+ trust_remote_code=True).eval().cuda()
132
+ ```
133
+
134
+ #### BNB 8-bit Quantization
135
+
136
+ ```python
137
+ import torch
138
+ from transformers import AutoTokenizer, AutoModel
139
+ path = "OpenGVLab/InternVL2-Llama3-76B"
140
+ model = AutoModel.from_pretrained(
141
+ path,
142
+ torch_dtype=torch.bfloat16,
143
+ load_in_8bit=True,
144
+ low_cpu_mem_usage=True,
145
+ trust_remote_code=True).eval()
146
+ ```
147
+
148
+ #### BNB 4-bit Quantization
149
+
150
+ > **⚠️ Warning:** Due to significant quantization errors with BNB 4-bit quantization on InternViT-6B, the model may produce nonsensical outputs and fail to understand images. Therefore, please avoid using BNB 4-bit quantization.
151
+
152
+ #### Multiple GPUs
153
+
154
+ The code below builds an explicit `device_map` instead of relying on automatic placement. This avoids errors during multi-GPU inference caused by tensors not being on the same device: by ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
155
+
156
+ ```python
157
+ import math
158
+ import torch
159
+ from transformers import AutoTokenizer, AutoModel
160
+
161
def split_model(model_name, world_size=None):
    """Build a ``device_map`` that spreads an InternVL2 model across GPUs.

    GPU 0 receives the vision model, the ``mlp1`` projector, the LLM's
    embedding/norm/output modules, and a reduced share of decoder layers.
    The remaining decoder layers are distributed over the other GPUs. The
    last decoder layer is pinned to GPU 0 so that the first and last LLM
    layers live on the same device, which avoids cross-device tensor errors
    during multi-GPU inference.

    Args:
        model_name: One of the supported InternVL2 model names (the keys of
            the layer-count table below).
        world_size: Number of GPUs to plan for. Defaults to
            ``torch.cuda.device_count()`` when ``None``.

    Returns:
        dict mapping module names to integer GPU indices, suitable for the
        ``device_map`` argument of ``AutoModel.from_pretrained``.

    Raises:
        KeyError: If ``model_name`` is not in the layer-count table.
        ValueError: If no GPU is available (``world_size`` < 1).
    """
    num_layers = {
        'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
        'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]

    if world_size is None:
        world_size = torch.cuda.device_count()
    if world_size < 1:
        # Without this guard the quota list below is empty and indexing it
        # fails with an uninformative IndexError.
        raise ValueError(
            f'split_model requires at least one GPU, got world_size={world_size}')

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)

    device_map = {}
    layer_cnt = 0
    for gpu_idx, quota in enumerate(num_layers_per_gpu):
        for _ in range(quota):
            # The quotas are rounded up, so their sum can exceed num_layers;
            # stop before emitting keys for layers that do not exist.
            if layer_cnt >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_idx
            layer_cnt += 1

    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    # NOTE(review): both embedding/output naming schemes are listed,
    # presumably to cover the different LLM backbones in the series — confirm.
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    # Keep the last decoder layer on the same device as the first one.
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map
186
+
187
+ path = "OpenGVLab/InternVL2-Llama3-76B"
188
+ device_map = split_model('InternVL2-Llama3-76B')
189
+ model = AutoModel.from_pretrained(
190
+ path,
191
+ torch_dtype=torch.bfloat16,
192
+ low_cpu_mem_usage=True,
193
+ trust_remote_code=True,
194
+ device_map=device_map).eval()
195
+ ```
196
+
197
+ ### Inference with Transformers
198
+
199
  ```python
200
  import math
201
  import numpy as np
 
209
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
210
  IMAGENET_STD = (0.229, 0.224, 0.225)
211
 
 
212
  def build_transform(input_size):
213
  MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
214
  transform = T.Compose([
 
219
  ])
220
  return transform
221
 
 
222
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
223
  best_ratio_diff = float('inf')
224
  best_ratio = (1, 1)
 
234
  best_ratio = ratio
235
  return best_ratio
236
 
237
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
 
238
  orig_width, orig_height = image.size
239
  aspect_ratio = orig_width / orig_height
240
 
 
272
  processed_images.append(thumbnail_img)
273
  return processed_images
274
 
275
+ def load_image(image_file, input_size=448, max_num=12):
 
276
  image = Image.open(image_file).convert('RGB')
277
  transform = build_transform(input_size=input_size)
278
  images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
 
280
  pixel_values = torch.stack(pixel_values)
281
  return pixel_values
282
 
 
283
  def split_model(model_name):
284
  device_map = {}
285
  world_size = torch.cuda.device_count()
286
+ num_layers = {
287
+ 'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
288
+ 'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
289
  # Since the first GPU will be used for ViT, treat it as half a GPU.
290
  num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
291
  num_layers_per_gpu = [num_layers_per_gpu] * world_size
 
306
 
307
  return device_map
308
 
 
 
 
 
309
  # If you set `load_in_8bit=True`, you will need two 80GB GPUs.
310
  # If you set `load_in_8bit=False`, you will need at least three 80GB GPUs.
311
+ path = 'OpenGVLab/InternVL2-Llama3-76B'
312
+ device_map = split_model('InternVL2-Llama3-76B')
313
  model = AutoModel.from_pretrained(
314
  path,
315
  torch_dtype=torch.bfloat16,
 
317
  low_cpu_mem_usage=True,
318
  trust_remote_code=True,
319
  device_map=device_map).eval()
320
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
321
 
 
322
  # set the max number of tiles in `max_num`
323
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
324
+ generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 
 
 
 
325
 
326
  # pure-text conversation (纯文本对话)
327
  question = 'Hello, who are you?'
328
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
329
+ print(f'User: {question}\nAssistant: {response}')
 
330
 
331
  question = 'Can you tell me a story?'
332
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
333
+ print(f'User: {question}\nAssistant: {response}')
 
334
 
335
  # single-image single-round conversation (单图单轮对话)
336
  question = '<image>\nPlease describe the image shortly.'
337
  response = model.chat(tokenizer, pixel_values, question, generation_config)
338
+ print(f'User: {question}\nAssistant: {response}')
 
339
 
340
  # single-image multi-round conversation (单图多轮对话)
341
  question = '<image>\nPlease describe the image in detail.'
342
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
343
+ print(f'User: {question}\nAssistant: {response}')
 
344
 
345
  question = 'Please write a poem according to the image.'
346
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
347
+ print(f'User: {question}\nAssistant: {response}')
 
348
 
349
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
350
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
351
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
352
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
353
 
354
  question = '<image>\nDescribe the two images in detail.'
355
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
356
  history=None, return_history=True)
357
+ print(f'User: {question}\nAssistant: {response}')
358
 
359
  question = 'What are the similarities and differences between these two images.'
360
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
361
  history=history, return_history=True)
362
+ print(f'User: {question}\nAssistant: {response}')
 
363
 
364
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
365
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
366
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
367
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
368
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
369
 
 
371
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
372
  num_patches_list=num_patches_list,
373
  history=None, return_history=True)
374
+ print(f'User: {question}\nAssistant: {response}')
 
375
 
376
  question = 'What are the similarities and differences between these two images.'
377
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
378
  num_patches_list=num_patches_list,
379
  history=history, return_history=True)
380
+ print(f'User: {question}\nAssistant: {response}')
 
381
 
382
  # batch inference, single image per sample (单图批处理)
383
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
384
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
385
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
386
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
387
 
 
391
  questions=questions,
392
  generation_config=generation_config)
393
  for question, response in zip(questions, responses):
394
+ print(f'User: {question}\nAssistant: {response}')
 
395
 
396
  # video multi-round conversation (视频多轮对话)
397
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
 
426
  pixel_values = torch.cat(pixel_values_list)
427
  return pixel_values, num_patches_list
428
 
 
429
  video_path = './examples/red-panda.mp4'
 
430
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
431
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
432
  video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
433
  question = video_prefix + 'What is the red panda doing?'
434
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
435
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
436
+ num_patches_list=num_patches_list, history=None, return_history=True)
437
+ print(f'User: {question}\nAssistant: {response}')
 
 
438
 
439
  question = 'Describe this video in detail. Don\'t repeat.'
440
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
441
+ num_patches_list=num_patches_list, history=history, return_history=True)
442
+ print(f'User: {question}\nAssistant: {response}')
 
 
443
  ```
444
 
445
+ #### Streaming output
446
 
447
  Besides this method, you can also use the following code to get streamed output.
448
 
 
453
  # Initialize the streamer
454
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
455
  # Define the generation configuration
456
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
457
  # Start the model chat in a separate thread
458
  thread = Thread(target=model.chat, kwargs=dict(
459
  tokenizer=tokenizer, pixel_values=pixel_values, question=question,
 
611
 
612
  我们提供了一个示例代码,用于使用 `transformers` 运行 InternVL2-Llama3-76B。
613
 
614
+ 我们也欢迎你在我们的[在线demo](https://internvl.opengvlab.com/)中体验InternVL2的系列模型。
615
 
616
  > 请使用 transformers==4.37.2 以确保模型正常运行。
617