HODACHI committed
Commit 23afa1d · verified · 1 Parent(s): fa90ebf

Update README.md

Files changed (1)
  1. README.md +10 -7
README.md CHANGED
@@ -99,6 +99,9 @@ model = AutoModel.from_pretrained(
  
  The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
  
+ このようにコードを書く理由は、テンソルが同じデバイス上にないためにマルチGPU推論中に発生するエラーを避けるためです。
+ ラージ・ランゲージ・モデル(LLM)の最初のレイヤーと最後のレイヤーが同じデバイス上にあるようにすることで、このようなエラーを防ぐことができます。
+ 
  ```python
  import math
  import torch
@@ -239,7 +242,7 @@ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast
  pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  generation_config = dict(max_new_tokens=1024, do_sample=False)
  
- # pure-text conversation (纯文本对话)
+ # pure-text conversation (テキストのみの対話)
  question = 'Hello, who are you?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
@@ -248,12 +251,12 @@ question = 'Can you tell me a story?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
  
- # single-image single-round conversation (单图单轮对话)
+ # single-image single-round conversation (単一画像、単一ラウンド対話)
  question = '<image>\nPlease describe the image shortly.'
  response = model.chat(tokenizer, pixel_values, question, generation_config)
  print(f'User: {question}\nAssistant: {response}')
  
- # single-image multi-round conversation (单图多轮对话)
+ # single-image multi-round conversation (単一画像、多ラウンド対話)
  question = '<image>\nPlease describe the image in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
@@ -262,7 +265,7 @@ question = 'Please write a poem according to the image.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
  
- # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ # multi-image multi-round conversation, combined images (複数画像、複数ラウンド対話、画像のステッチング)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -277,7 +280,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
  history=history, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
  
- # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
+ # multi-image multi-round conversation, separate images (別々の画像による多画像多ラウンド対話)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -295,7 +298,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
  history=history, return_history=True)
  print(f'User: {question}\nAssistant: {response}')
  
- # batch inference, single image per sample (单图批处理)
+ # batch inference, single image per sample (単一画像バッチ処理)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
@@ -309,7 +312,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
  for question, response in zip(questions, responses):
      print(f'User: {question}\nAssistant: {response}')
  
- # video multi-round conversation (视频多轮对话)
+ # video multi-round conversation (ビデオ・マルチラウンド・ダイアログ)
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
      if bound:
          start, end = bound[0], bound[1]
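
The paragraph changed in the first hunk describes the device-placement idea behind the README's multi-GPU setup. As a rough, illustrative sketch of that idea (not taken from this commit), the snippet below builds a `device_map` that spreads the LLM's decoder layers across the available GPUs while pinning the vision tower and the first and last decoder layers to GPU 0, so the tensors entering and leaving the language model stay on one device. The module names (`vision_model`, `mlp1`, `language_model.model.layers.*`) and the layer count are assumptions based on an InternVL-style checkpoint layout; adjust them to the actual model.

```python
import math
import torch

def split_model(num_layers: int) -> dict:
    """Assign decoder layers to GPUs, keeping the vision tower and the
    first and last decoder layers on GPU 0 (hypothetical module names)."""
    world_size = max(torch.cuda.device_count(), 1)
    device_map = {}
    if world_size == 1:
        shares = [num_layers]
    else:
        # GPU 0 also hosts the vision encoder, so give it roughly half a share.
        per_gpu = math.ceil(num_layers / (world_size - 0.5))
        shares = [per_gpu] * world_size
        shares[0] = math.ceil(per_gpu * 0.5)
    layer = 0
    for gpu, share in enumerate(shares):
        for _ in range(share):
            if layer >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer}'] = gpu
            layer += 1
    # Keep the non-decoder pieces (vision tower, projector, embeddings,
    # final norm, LM head) together on GPU 0.
    for name in ('vision_model', 'mlp1',
                 'language_model.model.embed_tokens',
                 'language_model.model.norm',
                 'language_model.lm_head'):
        device_map[name] = 0
    # The point made in the README paragraph: the first and last decoder
    # layers share a device, so the LLM's inputs and outputs live on GPU 0.
    device_map['language_model.model.layers.0'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map

# Hypothetical usage; the path and layer count are placeholders:
# model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
#                                   trust_remote_code=True, low_cpu_mem_usage=True,
#                                   device_map=split_model(num_layers=48)).eval()
```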