ZoomLocation / app.py
greencatted's picture
Use Llama Vision Instruct
0bef8ce
raw
history blame
1.04 kB
import streamlit as st
from PIL import Image
import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor
# Hugging Face Hub identifier of the vision-language checkpoint used by the app.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"


@st.cache_resource
def _load_model_and_processor(checkpoint):
    """Load the Mllama model and its processor for *checkpoint*.

    Streamlit re-executes this script from the top on every widget
    interaction. Wrapping the load in ``st.cache_resource`` keeps one shared
    instance alive across reruns and sessions instead of re-instantiating
    the model each time.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    loaded_model = MllamaForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    loaded_processor = AutoProcessor.from_pretrained(checkpoint)
    return loaded_model, loaded_processor


# Module-level names are kept so the rest of the script can use them unchanged.
model, processor = _load_model_and_processor(model_id)
# The camera widget stays disabled until the user explicitly opts in.
enable = st.checkbox("Enable camera")
picture = st.camera_input("Take a picture", disabled=not enable)

# `picture` is falsy until a photo has been captured; only then run inference.
if picture:
    image = Image.open(picture)

    # Single-turn chat: one image placeholder followed by the text instruction.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Provide your best guess as to where this person is holding his online meeting. Just state your guess of location in your response."}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # add_special_tokens=False: the rendered chat template is passed as-is,
    # so the processor is told not to add special tokens a second time.
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=30)

    # Keep the raw transcript (prompt + reply, special tokens included) in the
    # server log, exactly as before.
    print(processor.decode(output[0]))

    # The generated sequence starts with the prompt tokens; slice them off and
    # drop special tokens so only the model's answer is shown to the user.
    # Previously the result was only printed to stdout and never appeared in
    # the Streamlit page.
    prompt_length = inputs["input_ids"].shape[-1]
    answer = processor.decode(output[0][prompt_length:], skip_special_tokens=True)
    st.write(answer.strip())