"""Gradio demo: document visual question answering (DocVQA) with UDOP."""

import re

import gradio as gr
import torch
from transformers import UdopProcessor, UdopForConditionalGeneration

# Checkpoint identifier handed to both `from_pretrained` calls below.
repo_id = "microsoft/udop-large"

# Loaded once at import time so every request reuses the same objects.
processor = UdopProcessor.from_pretrained(repo_id)
model = UdopForConditionalGeneration.from_pretrained(repo_id)


def process_document(image, question: str) -> str:
    """Answer ``question`` about the document shown in ``image``.

    Args:
        image: Document image as delivered by the Gradio ``"image"`` input
            (presumably a NumPy array -- confirm against the Gradio version
            in use).
        question: Question text typed by the user.

    Returns:
        The first decoded sequence generated by the model, with special
        tokens removed.
    """
    # A single processor call builds every model input. The original also ran
    # the processor a second time just to read `pixel_values`, which was never
    # used, so that redundant preprocessing pass has been dropped.
    encoding = processor(images=image, text=question, return_tensors="pt")

    # Answers are expected to be short; cap generation at 20 new tokens.
    outputs = model.generate(**encoding, max_new_tokens=20)

    # `batch_decode` returns one string per sequence; one request yields one.
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


description = (
    "Unofficial Gradio Demo for UDOP for DocVQA (document visual question answering). "
    "To use it, simply upload your image and type a question and click 'submit', "
    "or click one of the examples to load them. Read more at the links below."
)

# NOTE(review): this footer looks like it originally contained link markup
# around the paper title and "Github Repo" -- the targets are not recoverable
# from this file; restore them if the links are wanted.
article = (
    "\n\n"
    "Unifying Vision, Text, and Layout for Universal Document Processing | Github Repo"
    "\n\n"
)

demo = gr.Interface(
    fn=process_document,
    inputs=["image", "text"],
    outputs="text",
    title="Demo: UDOP for DocVQA",
    description=description,
    article=article,
    # NOTE(review): `enable_queue` appears to be deprecated in newer Gradio
    # releases in favour of `demo.queue()` -- confirm against the pinned
    # Gradio version before changing.
    enable_queue=True,
    # examples=[["example_1.png", "When is the coffee break?"], ["example_2.jpeg", "What's the population of Stoddard?"]],
    cache_examples=False,
)

demo.launch()