import streamlit as st
import torch
from PIL import Image
import gc
import tempfile
import os
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from byaldi import RAGMultiModalModel

# Function to load Byaldi model
@st.cache_resource
def load_byaldi_model():
    model = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2", device="cpu")
    return model


# Function to load Qwen2-VL model
@st.cache_resource
def load_qwen_model():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    return model, processor
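
# Note: Qwen2-VL-7B in float32 on CPU needs roughly 30 GB of RAM for the weights
# alone (7B params x 4 bytes); torch_dtype=torch.bfloat16 is a common way to
# roughly halve that footprint if memory is tight.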

# Function to free memory between model runs (the app runs on CPU, so guard the CUDA call)
def clear_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Streamlit Interface
st.title("OCR and Visual Language Model Demo")
st.write("Upload an image for OCR extraction and then ask a question about the image.")

# Image uploader
image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
if image:
    img = Image.open(image)
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # OCR Extraction with Byaldi
    st.write("Extracting text from image...")
    byaldi_model = load_byaldi_model()

    # Save the image to a temporary file; convert to RGB so a PNG with an
    # alpha channel can still be written out as JPEG
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
        img.convert("RGB").save(temp_file, format="JPEG")
        temp_file_path = temp_file.name

    # Create a temporary index for the uploaded image
    with st.spinner("Processing image..."):
        byaldi_model.index(temp_file_path, index_name="temp_index", overwrite=True)
        # Query the index to retrieve the page and its metadata
        ocr_results = byaldi_model.search("Extract all text from the image", k=1)

    # Extract the OCR text from the top result's metadata, if present
    if ocr_results:
        extracted_text = ocr_results[0].metadata.get("ocr_text", "No text extracted")
    else:
        extracted_text = "No text extracted"
st.write("Extracted Text:")
st.write(extracted_text)
# Clear Byaldi model from memory
del byaldi_model
clear_memory()
# Remove the temporary file
os.unlink(temp_file_path)
# Text input field for question
question = st.text_input("Ask a question about the image and extracted text")

    if question:
        st.write("Processing with Qwen2-VL...")
        qwen_model, qwen_processor = load_qwen_model()

        # Prepare inputs for Qwen2-VL
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": f"Extracted text: {extracted_text}\n\nQuestion: {question}"},
                ],
            }
        ]

        # Prepare for inference
        text_input = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, _ = process_vision_info(messages)
        inputs = qwen_processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")

        # Tensors stay on CPU, matching device_map="cpu" above
        inputs = inputs.to("cpu")
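
        # (inputs now holds input_ids, attention_mask, and the image tensors,
        # pixel_values plus grid info, that Qwen2-VL's vision encoder consumes)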

        # Run the model and generate output
        with torch.no_grad():
            generated_ids = qwen_model.generate(**inputs, max_new_tokens=128)

        # Trim the prompt tokens so only the newly generated answer is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = qwen_processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

        # Display the response
        st.write("Model's response:", generated_text)

        # Clear Qwen model from memory
        del qwen_model, qwen_processor
        clear_memory()