Running llava-1.5-7b-hf on CPU
!pip install -U transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip uninstall -y tensorflow && pip install tensorflow-cpu
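Optionally, verify that the CPU-only torch build is active before loading the model:
# The CPU wheel reports no CUDA support
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # expected: False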
from transformers import pipeline, AutoProcessor
from PIL import Image
import requests
import torch
model_id = "llava-hf/llava-1.5-7b-hf"
DEVICE = torch.device("cpu")
# Build the image-to-text pipeline and pin it to the CPU
pipe = pipeline("image-to-text", model=model_id, device=DEVICE)
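As an optional sanity check, the pipeline exposes the device and dtype it resolved to; on CPU the default is float32, which is what you want here (half precision is slow or unsupported on many CPUs):
# Optional check: the pipeline should report the CPU device and float32 weights
print(pipe.device)       # expected: cpu
print(pipe.model.dtype)  # expected: torch.float32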
image = Image.open("/content/Wallpaper.jpg")
#url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
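A quick check that the file decoded correctly; the exact dimensions are not critical, since the LLaVA processor resizes images itself:
# Confirm the image loaded before spending CPU time on inference
print(image.size, image.mode)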
# Define a chat history and use apply_chat_template to get a correctly
# formatted prompt. Each value in "content" has to be a list of dicts
# with types ("text", "image").
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe what is in the picture?"},
            {"type": "image"},
        ],
    },
]
processor = AutoProcessor.from_pretrained(model_id)
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
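To see exactly what apply_chat_template produced, print the prompt; for llava-1.5 it should be a USER:/ASSISTANT: transcript containing an <image> placeholder, matching the shape of the generated_text in the output below.
# Inspect the templated prompt, e.g.
# "USER: <image>\nDescribe what is in the picture? ASSISTANT:"
print(prompt)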
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})  # run via the pipeline, capped at 200 new tokens
print(outputs)
Loading checkpoint shards: 100% 3/3 [00:04<00:00, 1.47s/it]
[{'generated_text': 'USER: \nDescribe what is in the picture? ASSISTANT: The image features a large, colorful mountain with a sun at the top. The mountain is surrounded by a variety of smaller mountains, and there are several arrows pointing towards different parts of the mountain. The arrows are scattered throughout the scene, indicating various directions and features of the mountain.'}]
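Since generated_text contains the whole USER/ASSISTANT transcript (as shown above), a small post-processing sketch, assuming that output shape, recovers just the assistant's reply:
# Keep only the text after the final "ASSISTANT:" marker
reply = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
print(reply)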