import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
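
# Streamlit demo: upload an image, enter a text prompt, and Qwen2-VL-7B-Instruct
# generates a response. Everything runs on CPU in float32, so no GPU is required
# but inference can be slow.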
# Load the model and processor
@st.cache_resource
def load_model():
    # Load Qwen2-VL-7B on CPU
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    return model, processor
model, processor = load_model()
# Streamlit Interface
st.title("Qwen2-VL-7B Multimodal Demo")
st.write("Upload an image and provide a text prompt to see the model's response.")
# Image uploader
image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
# Text input field
text = st.text_input("Enter a text description or query")
# If both image and text are provided
if image and text:
    # Load image with PIL
    img = Image.open(image)
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare inputs for Qwen2-VL
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Prepare for inference
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")

    # Move tensors to CPU
    inputs = inputs.to("cpu")
    # Run the model and generate output
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Keep only the newly generated tokens (drop the echoed prompt) before decoding
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    generated_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    # Display the response
    st.write("Model's response:", generated_text)