Sample Script Here
#6
by ctranslate2-4you
I hate it when repo owners don't give detailed examples, so here you go, people. The vision capabilities were actually pretty good:
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
import torch
from PIL import Image
import warnings
def process_image(image_path):
    model_id = r"[PATH TO A LOCAL DIRECTORY ON YOUR COMPUTER OR THE REPOSITORY ID]"  # a repo ID doesn't need to be a raw string, obviously
    # 4-bit NF4 quantization config
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # instantiate model
    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
        trust_remote_code=True
    )
    # instantiate processor
    processor = LlavaNextProcessor.from_pretrained(model_id, tokenizer_class='PreTrainedTokenizerFast', trust_remote_code=True)
    image = Image.open(image_path)
    instruction = "Describe this image in as much detail as possible, but be succinct and don't repeat yourself."
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(text=prompt, images=image, return_tensors="pt", padding=True).to("cuda:0")
    output = model.generate(**inputs, max_new_tokens=512)
    # drop the prompt tokens so only the model's reply is decoded
    prompt_length = inputs['input_ids'].shape[1]
    model_response = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    print(f"\n{model_response}\n")
if __name__ == "__main__":
    input_image_path = r"[PATH TO A LOCAL FILE ON YOUR COMPUTER]"
    process_image(input_image_path)
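
If it helps anyone, here's a rough sketch of how you could point it at a whole folder of images instead of a single file. The process_folder name, the extension list, and the folder placeholder are my own additions, not anything from the repo, and note that process_image above reloads the model on every call, so for a lot of images you'd want to hoist the model/processor loading out of the function first:

from pathlib import Path

def process_folder(folder_path):
    # hypothetical helper: walk a folder and run process_image on each common image type
    extensions = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    for image_file in sorted(Path(folder_path).iterdir()):
        if image_file.suffix.lower() in extensions:
            print(f"--- {image_file.name} ---")
            process_image(str(image_file))

# process_folder(r"[PATH TO A FOLDER OF IMAGES ON YOUR COMPUTER]")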