metadata
library_name: transformers
tags: []
Malaysian Qwen1.5-0.5B + siglip-base-patch16-384
WandB: https://wandb.ai/huseinzol05/vision-qwen0.5?workspace=user-huseinzol05
how-to
from typing import List, Optional

import requests
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor

from modeling_vision import MM_LLMs, MM_LLMs_Config
# Checkpoint that provides both the multimodal model weights and the tokenizer.
checkpoint = 'mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision'

# Load the model in bfloat16, requesting flash attention, then move it to the GPU.
model = MM_LLMs.from_pretrained(
    checkpoint,
    flash_attention=True,
    dtype=torch.bfloat16,
    torch_dtype=torch.bfloat16,
)
_ = model.cuda()

# The image preprocessor comes from the SigLIP checkpoint; the tokenizer from
# the same repository as the model.
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Use the tokenizer's EOS token id as the EOS id of the wrapped LLM's
# generation config.
model.llm.generation_config.eos_token_id = tokenizer.eos_token_id
def prepare_dataset(messages, images: Optional[List[str]] = None):
    """Build the model inputs for one chat prompt, optionally with images.

    Args:
        messages: Chat turns handed to ``tokenizer.apply_chat_template``.
        images: Paths of the image files to attach. ``None`` or an empty
            list produces a text-only input.

    Returns:
        The tokenizer output for the rendered prompt, extended with three
        entries: ``images`` (pixel values from ``image_processor``),
        ``image_index`` (one ``0`` per image) and ``image_starts`` (the id of
        the ``<image>`` token, repeated once per image). All three are
        ``None`` when no images are given.
    """
    if images:
        pil_images = []
        for path in images:
            # Close the file handle as soon as the RGB copy has been made.
            with Image.open(path) as handle:
                pil_images.append(handle.convert('RGB'))
        image_output = image_processor(
            images=pil_images, return_tensors='pt'
        )['pixel_values']
    else:
        image_output = None

    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    outputs = tokenizer(
        prompt,
        return_tensors='pt',
        return_overflowing_tokens=False,
        return_length=False,
    )

    outputs['images'] = image_output
    if image_output is None:
        # Previously this path raised TypeError on len(None).
        # NOTE(review): None is used for the image bookkeeping entries on
        # text-only prompts; confirm the model accepts that.
        outputs['image_index'] = None
        outputs['image_starts'] = None
    else:
        image_count = len(image_output)
        image_token_id = tokenizer.convert_tokens_to_ids('<image>')
        # presumably the batch row each image belongs to (always row 0 here)
        # — confirm against modeling_vision
        outputs['image_index'] = torch.tensor([0] * image_count)
        outputs['image_starts'] = torch.tensor([image_token_id] * image_count)
    return outputs
# Download the two sample images used by the examples below.
# Each response is fetched and checked before the file is opened, so a failed
# request neither leaves an empty file behind nor writes an HTTP error page
# into a .jpg.
for filename, url in (
    (
        'Persian-cat-breed.jpg',
        'https://cdn.beautifulnara.net/wp-content/uploads/2017/12/10201620/Persian-cat-breed.jpg',
    ),
    (
        'nasi-goreng-1-23.jpg',
        'https://www.jocooks.com/wp-content/uploads/2023/09/nasi-goreng-1-23.jpg',
    ),
):
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(filename, 'wb') as fopen:
        fopen.write(response.content)
# Example 1: one image, one question. The prompt contains a single
# '<image> </image>' placeholder pair for the attached picture.
messages = [{'role': 'user', 'content': '<image> </image> ini gambar apa'}]
outputs = prepare_dataset(messages, images=['Persian-cat-breed.jpg'])

# Cast the pixel values to the model's dtype, then move every entry that is
# not None onto the GPU.
outputs['images'] = outputs['images'].type(model.dtype)
for key, value in list(outputs.items()):
    if value is None:
        continue
    outputs[key] = value.cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs)
    # 'input_ids' is removed from the prepared inputs before generation; the
    # popped value itself is overwritten below.
    r = model_inputs.pop('input_ids', None)

# Sampling settings layered on top of the prepared model inputs.
generate_kwargs = {
    **model_inputs,
    'max_new_tokens': 300,
    'top_p': 0.95,
    'top_k': 50,
    'temperature': 0.1,
    'do_sample': True,
    'num_beams': 1,
}
r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))
<|endoftext|><|im_start|>assistant
Ini adalah gambar kucing putih yang duduk di atas sofa hitam.<|im_end|>
# Example 2: two images in one question. The prompt contains one
# '<image> </image>' placeholder pair per attached picture.
messages = [
    {
        'role': 'user',
        'content': '<image> </image> <image> </image> apa kaitan 2 gambar ni',
    },
]
outputs = prepare_dataset(
    messages,
    images=['Persian-cat-breed.jpg', 'nasi-goreng-1-23.jpg'],
)

# Cast the pixel values to the model's dtype, then move every entry that is
# not None onto the GPU.
outputs['images'] = outputs['images'].type(model.dtype)
for key, value in list(outputs.items()):
    if value is None:
        continue
    outputs[key] = value.cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs)
    # 'input_ids' is removed from the prepared inputs before generation; the
    # popped value itself is overwritten below.
    r = model_inputs.pop('input_ids', None)

# Sampling settings layered on top of the prepared model inputs.
generate_kwargs = {
    **model_inputs,
    'max_new_tokens': 300,
    'top_p': 0.95,
    'top_k': 50,
    'temperature': 0.1,
    'do_sample': True,
    'num_beams': 1,
}
r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))
<|endoftext|><|im_start|>assistant
Tiada hubungan langsung antara gambar 1 dan gambar 2. Gambar 1 ialah imej kucing putih dengan bulu putih, manakala gambar 2 ialah gambar mangkuk makan tengah hari kacang hitam dan lobak merah yang dicincang, dengan garpu diletakkan di sebelahnya. Kedua-duanya tidak berkaitan dari segi kandungan.<|im_end|>