import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
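
# Streamlit demo for Qwen2-VL-7B-Instruct: upload an image, enter a text prompt,
# and generate a response. The model runs on CPU in float32, so inference is slow.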
# Load the model and processor once and cache them across Streamlit reruns
@st.cache_resource
def load_model():
    # Load Qwen2-VL-7B on CPU
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    return model, processor

model, processor = load_model()
# Streamlit Interface
st.title("Qwen2-VL-7B Multimodal Demo")
st.write("Upload an image and provide a text prompt to see the model's response.")
# Image uploader
image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
# Text input field
text = st.text_input("Enter a text description or query")
# If both image and text are provided
if image and text:
    # Load image with PIL
    img = Image.open(image)
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare inputs for Qwen2-VL
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Prepare for inference: build the chat-formatted prompt and extract the vision inputs
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")

    # Keep tensors on CPU (the model was loaded with device_map="cpu")
    inputs = inputs.to("cpu")
    # Run the model and generate output
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Trim the prompt tokens so only the newly generated answer is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    generated_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Display the response
    st.write("Model's response:", generated_text)