Update README.md
README.md
@@ -99,6 +99,9 @@ model = AutoModel.from_pretrained(
 
 The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
 
+The reason for writing the code this way is to avoid errors that occur during multi-GPU inference when tensors are not on the same device.
+By ensuring that the first and last layers of the large language model (LLM) are on the same device, such errors can be prevented.
+
 ```python
 import math
 import torch
@@ -239,7 +242,7 @@ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast
 pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 generation_config = dict(max_new_tokens=1024, do_sample=False)
 
-# pure-text conversation (
+# pure-text conversation (text-only conversation)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
@@ -248,12 +251,12 @@ question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
 
-# single-image single-round conversation (
+# single-image single-round conversation (single image, single-round conversation)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
 print(f'User: {question}\nAssistant: {response}')
 
-# single-image multi-round conversation (
+# single-image multi-round conversation (single image, multi-round conversation)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
@@ -262,7 +265,7 @@ question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
 
-# multi-image multi-round conversation, combined images (
+# multi-image multi-round conversation, combined images (multiple images, multi-round conversation, image stitching)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -277,7 +280,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
                                history=history, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
 
-# multi-image multi-round conversation, separate images (
+# multi-image multi-round conversation, separate images (multi-image, multi-round conversation with separate images)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -295,7 +298,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
                                history=history, return_history=True)
 print(f'User: {question}\nAssistant: {response}')
 
-# batch inference, single image per sample (
+# batch inference, single image per sample (single-image batch processing)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
@@ -309,7 +312,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
 for question, response in zip(questions, responses):
     print(f'User: {question}\nAssistant: {response}')
 
-# video multi-round conversation (
+# video multi-round conversation (video, multi-round dialogue)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
     if bound:
         start, end = bound[0], bound[1]
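As context for the multi-GPU note in the first hunk, the sketch below shows one way such a `device_map` could be assembled so that the vision tower, the token embeddings, and the LLM's first and last decoder layers all land on GPU 0. It is a minimal illustration, not the repository's actual helper: the module names (`vision_model`, `mlp1`, `language_model.model.layers.*`) and the layer count of 48 are assumptions that would need to be checked against the real checkpoint.

```python
import math
import torch

def split_model(num_layers, world_size):
    """Sketch: spread the LLM's decoder layers across GPUs while pinning the
    vision tower, the embeddings, and the first/last decoder layers to GPU 0,
    so the tensors entering and leaving the LLM stay on one device."""
    device_map = {}
    # GPU 0 also hosts the ViT, so give it roughly half a share of LLM layers.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    counts = [per_gpu] * world_size
    counts[0] = math.ceil(per_gpu * 0.5)
    layer = 0
    for gpu, count in enumerate(counts):
        for _ in range(count):
            if layer >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer}'] = gpu
            layer += 1
    # Modules that touch the LLM's input and output are all pinned to GPU 0.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0  # assumed name for the vision-to-LLM projector
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map

# Hypothetical usage: the resulting dict would be passed as `device_map=` to
# AutoModel.from_pretrained(...); 48 layers is only an illustrative count.
device_map = split_model(num_layers=48, world_size=max(torch.cuda.device_count(), 1))
```

The part that matters for the note above is the final group of assignments: everything that feeds the LLM its inputs or consumes its last hidden states is placed on the same GPU as the last decoder layer, so no cross-device tensor mismatch can occur at the model's ends.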