import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
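
# Streamlit demo for Qwen2-VL-7B-Instruct: upload an image, enter a text prompt,
# and generate a response. The model runs on CPU in float32, so inference is slow.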
# Load the model and processor once and cache them across Streamlit reruns
@st.cache_resource
def load_model():
    # Load Qwen2-VL-7B on CPU
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    return model, processor

model, processor = load_model()
# Streamlit Interface
st.title("Qwen2-VL-7B Multimodal Demo")
st.write("Upload an image and provide a text prompt to see the model's response.")
# Image uploader
image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
# Text input field
text = st.text_input("Enter a text description or query")
# If both image and text are provided
if image and text:
    # Load image with PIL
    img = Image.open(image)
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare inputs for Qwen2-VL
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Prepare for inference: build the chat-formatted prompt and extract the vision inputs
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")

    # Keep tensors on CPU (the model was loaded with device_map="cpu")
    inputs = inputs.to("cpu")
    # Run the model and generate output
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Trim the prompt tokens so only the newly generated answer is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    generated_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Display the response
    st.write("Model's response:", generated_text)