import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch

# Load the model and processor
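# st.cache_resource keeps the loaded model and processor in memory across Streamlit reruns,
# so the weights are loaded only once per session rather than on every widget interaction.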
@st.cache_resource
def load_model():
    # Load Qwen2-VL-7B on CPU
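    # float32 on CPU needs roughly 4 bytes per parameter (about 28 GB of RAM for the 7B weights);
    # on a GPU you would typically pass torch_dtype=torch.bfloat16 and device_map="auto" instead.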
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    return model, processor

model, processor = load_model()

# Streamlit Interface
st.title("Qwen2-VL-7B Multimodal Demo")
st.write("Upload an image and provide a text prompt to see the model's response.")

# Image uploader
image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

# Text input field
text = st.text_input("Enter a text description or query")

# If both image and text are provided
if image and text:
    # Load image with PIL
    img = Image.open(image)
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare inputs for Qwen2-VL
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Prepare for inference
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
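    # process_vision_info pulls the image entries out of the chat messages and returns them
    # in the form the processor expects (the second return value holds video inputs, unused here)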
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")

    # Move tensors to CPU
    inputs = inputs.to("cpu")

    # Run the model and generate output
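    # torch.no_grad() disables gradient tracking to save memory during inference;
    # max_new_tokens caps the length of the generated reply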
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Trim the prompt tokens from each sequence so only the newly generated text is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    generated_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Display the response
    st.write("Model's response:", generated_text)