# streamlit_app.py
#
# Streamlit front end for question answering over scanned document images
# using a Florence-2 checkpoint fine-tuned on DocVQA.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so anything expensive (the model load) must be cached.

import streamlit as st
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import torch

MODEL_ID = "sahilnishad/Florence-2-FT-DocVQA"

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def load_model_and_processor():
    """Load the model and its processor once per server process.

    Wrapped in ``st.cache_resource`` so that script reruns triggered by
    widget interactions reuse the already-loaded objects instead of
    reloading the checkpoint each time.

    Returns:
        A ``(model, processor)`` tuple; the model has been moved to
        ``device``.
    """
    # NOTE(review): Florence-2 checkpoints have commonly shipped custom
    # modelling code. If loading fails asking for ``trust_remote_code``,
    # confirm the checkpoint is trusted before enabling that flag.
    loaded_model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
    loaded_processor = AutoProcessor.from_pretrained(MODEL_ID)
    return loaded_model, loaded_processor


# Keep the module-level names so get_answer (and any importer) still finds them.
model, processor = load_model_and_processor()


def get_answer(task_prompt: str, question: str, image: Image.Image) -> str:
    """Run the model on ``image`` and return its answer to ``question``.

    Args:
        task_prompt: Text prepended verbatim to ``question`` to form the
            prompt (may be empty).
        question: The user's question about the document.
        image: The document image; converted to RGB when it is in any
            other mode.

    Returns:
        The first decoded generated sequence, with special tokens removed.
    """
    prompt = task_prompt + question
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
        )

    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


# Streamlit UI
st.title("Scanned Document Question Answering with Florence-2")
st.write("Upload scanned document image and ask a question")

# File uploader for the document image
uploaded_file = st.file_uploader(
    "Choose a document image...", type=["jpg", "jpeg", "png"]
)

# Text input for the question
question = st.text_input("Enter your question:")

# Run the model and display the answer once both inputs are present
if uploaded_file is not None and question:
    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; force decoding here so that a corrupt or
        # truncated upload is reported below rather than mid-inference.
        image.load()
    except OSError:
        # Pillow's UnidentifiedImageError is an OSError subclass.
        st.error("The uploaded file could not be read as an image.")
        st.stop()

    st.image(image, caption="Uploaded Document", use_column_width=True)

    with st.spinner("Generating answer..."):
        answer = get_answer("", question, image)

    st.write("**Answer:**", answer)