Running llava-1.5-7b-hf on CPU
!pip install -U transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip uninstall -y tensorflow && pip install tensorflow-cpu
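Optionally, verify that the CPU-only torch build is active before loading the model:
# The CPU wheel reports no CUDA support
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # expected: False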
from transformers import pipeline, AutoProcessor
from PIL import Image
import requests
import torch
model_id = "llava-hf/llava-1.5-7b-hf"
DEVICE = torch.device("cpu")
# Build the image-to-text pipeline and pin it to the CPU
pipe = pipeline("image-to-text", model=model_id, device=DEVICE)
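As an optional sanity check, the pipeline exposes the device and dtype it resolved to; on CPU the default is float32, which is what you want here (half precision is slow or unsupported on many CPUs):
# Optional check: the pipeline should report the CPU device and float32 weights
print(pipe.device)       # expected: cpu
print(pipe.model.dtype)  # expected: torch.float32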
image = Image.open("/content/Wallpaper.jpg")
#url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
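A quick check that the file decoded correctly; the exact dimensions are not critical, since the LLaVA processor resizes images itself:
# Confirm the image loaded before spending CPU time on inference
print(image.size, image.mode)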
# Define a chat history and use apply_chat_template to get a correctly
# formatted prompt. Each value in "content" has to be a list of dicts
# with types ("text", "image").
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe what is in the picture?"},
            {"type": "image"},
        ],
    },
]
processor = AutoProcessor.from_pretrained(model_id)
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
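To see exactly what apply_chat_template produced, print the prompt; for llava-1.5 it should be a USER:/ASSISTANT: transcript containing an <image> placeholder, matching the shape of the generated_text in the output below.
# Inspect the templated prompt, e.g.
# "USER: <image>\nDescribe what is in the picture? ASSISTANT:"
print(prompt)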
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})  # run via the pipeline, capped at 200 new tokens
print(outputs)
Loading checkpoint shards: 100% 3/3 [00:04<00:00, 1.47s/it]
[{'generated_text': 'USER: \nDescribe what is in the picture? ASSISTANT: The image features a large, colorful mountain with a sun at the top. The mountain is surrounded by a variety of smaller mountains, and there are several arrows pointing towards different parts of the mountain. The arrows are scattered throughout the scene, indicating various directions and features of the mountain.'}]
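Since generated_text contains the whole USER/ASSISTANT transcript (as shown above), a small post-processing sketch, assuming that output shape, recovers just the assistant's reply:
# Keep only the text after the final "ASSISTANT:" marker
reply = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
print(reply)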