Spaces:

sahilnishad
/

scanned-doc-chat

Sleeping

File size: 1,706 Bytes

# streamlit_app.py

import streamlit as st
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import torch

# Load the model and processor.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the heavyweight objects are created inside a cached loader:
# they are built once per server process and reused across reruns/sessions.
MODEL_ID = "sahilnishad/Florence-2-FT-DocVQA"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_model_and_processor():
    """Load the DocVQA checkpoint and its processor once and return both.

    Returns:
        A ``(model, processor)`` tuple; the model has already been moved to
        the module-level ``device``.
    """
    # NOTE(review): Florence-2 checkpoints commonly ship custom modeling code
    # and need ``trust_remote_code=True`` -- confirm whether this checkpoint
    # loads without it before changing the call.
    loaded_model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
    loaded_processor = AutoProcessor.from_pretrained(MODEL_ID)
    return loaded_model, loaded_processor


model, processor = _load_model_and_processor()

# Function to run inference
def get_answer(task_prompt, question, image):
    """Generate the model's text answer for a question about an image.

    Args:
        task_prompt: Prefix placed directly in front of the question.
        question: Question text appended to ``task_prompt``.
        image: PIL image; converted to RGB when its mode is anything else.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens skipped.
    """
    full_prompt = task_prompt + question
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    # Tokenize the prompt and preprocess the image, then move the resulting
    # tensors to the same device as the model.
    encoded = processor(
        text=full_prompt,
        images=rgb_image,
        return_tensors="pt",
    ).to(device)

    generation_kwargs = {
        "input_ids": encoded["input_ids"],
        "pixel_values": encoded["pixel_values"],
        "max_new_tokens": 1024,
        "num_beams": 3,
    }
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output_ids = model.generate(**generation_kwargs)

    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

# Streamlit UI
st.title("Scanned Document Question Answering with Florence-2")
st.write("Upload scanned document image and ask a question")

# File uploader for the document image
uploaded_file = st.file_uploader("Choose a document image...", type=["jpg", "jpeg", "png"])

# Text input for the question. Surrounding whitespace is dropped so that a
# whitespace-only entry is treated as "no question" instead of triggering
# an inference run.
question = st.text_input("Enter your question:").strip()

# Run the model and display the answer
if uploaded_file is not None and question:
    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; force decoding here so a corrupt or truncated
        # upload is reported below instead of failing later mid-render.
        image.load()
    except OSError:
        # Pillow's UnidentifiedImageError is an OSError subclass, so this
        # covers both unrecognized and truncated files.
        st.error("Could not read the uploaded file as an image. Please upload a valid JPG or PNG.")
    else:
        st.image(image, caption="Uploaded Document", use_column_width=True)

        with st.spinner("Generating answer..."):
            answer = get_answer("<DocVQA>", question, image)

        st.write("**Answer:**", answer)