# NOTE(review): this file previously carried two complete commented-out
# prototype versions (an image-only gr.Interface demo and a PDF-only
# gr.Blocks demo) whose mangled multi-line `article` strings made the file
# unparseable. The dead prototypes were removed; recover them from version
# control if needed.
"""Gradio demo for Model-V3: document visual question answering (DocVQA).

Loads a Donut-style ``VisionEncoderDecoderModel`` and exposes a ``gr.Blocks``
UI that accepts either a PDF (with a page selector) or a single image, plus a
free-text question, and returns the model's answer parsed to JSON.
"""
import re

import fitz  # PyMuPDF
import gradio as gr
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

MODEL_ID = "pacman2223/univ-docu-model-v3"

processor = DonutProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def pdf_to_images(pdf_file):
    """Render every page of an uploaded PDF into a list of PIL RGB images.

    Args:
        pdf_file: a gradio ``File`` value (exposes the temp path via ``.name``),
            or ``None`` when nothing was uploaded.

    Returns:
        list[PIL.Image.Image] on success; ``None`` when no file was supplied or
        PyMuPDF fails to open/render the document (the error is printed,
        keeping the original best-effort behaviour).
    """
    if pdf_file is None:
        return None
    try:
        doc = fitz.open(pdf_file.name)
        try:
            images = []
            for page in doc:
                pix = page.get_pixmap()
                images.append(
                    Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                )
            return images
        finally:
            # was leaked before: fitz documents hold an OS file handle
            doc.close()
    except Exception as e:
        print(f"Error converting PDF: {e}")
        return None


def process_document(file, page_number, question, input_type):
    """Answer ``question`` about one PDF page or image with the Donut model.

    Args:
        file: uploaded gradio ``File`` value (PDF or image), or ``None``.
        page_number: 1-based page index (used only when ``input_type == "PDF"``).
        question: the user's free-text question.
        input_type: ``"PDF"`` or ``"Image"`` — which branch to take.

    Returns:
        dict parsed from the model output on success, or a human-readable
        error string (the JSON output component renders either).
    """
    if file is None:
        return "Please upload a file."
    if input_type == "PDF":
        images = pdf_to_images(file)
        if images is None:
            return "Failed to process the PDF file."
        page_number = int(page_number)  # gr.Slider may deliver a float
        if page_number < 1 or page_number > len(images):
            return f"Invalid page number. The PDF has {len(images)} pages."
        image = images[page_number - 1]
    else:  # Image
        image = Image.open(file.name)

    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values
    # prepare decoder inputs — the fine-tune expects the raw question as the
    # prompt (the old "{user_input}".replace(...) round-trip was a no-op)
    decoder_input_ids = processor.tokenizer(
        question, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # generate answer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # postprocess: strip special tokens, drop the leading task token, parse JSON
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)


def update_page_preview(file, page_number, input_type):
    """Return the PIL image to show in the preview pane, or ``None``."""
    if file is None:
        return None
    if input_type == "PDF":
        images = pdf_to_images(file)
        page_number = int(page_number)
        if images is None or page_number < 1 or page_number > len(images):
            return None
        return images[page_number - 1]
    # Image input: preview the uploaded file itself
    return Image.open(file.name)


description = (
    "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` "
    "fine-tuned on DocVQA (document visual question answering). To use it, "
    "upload a PDF or image file, select a page number (for PDF), type a "
    "question, and click 'submit'."
)

# NOTE(review): the original `article` literal was mangled in this file (an
# unterminated multi-line string). Reconstructed minimally from the surviving
# content; it is not referenced by the Blocks layout below — confirm the
# intended content (it likely carried an HTML link to the model card).
article = """

Model-V3

"""

with gr.Blocks() as demo:
    gr.Markdown("# Demo: Model-V3 for Document Analysis")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(["PDF", "Image"], label="Input Type", value="PDF")
            file_input = gr.File(label="Upload File")
            page_slider = gr.Slider(
                minimum=1, maximum=1, value=1, step=1, label="Page Number (PDF only)"
            )
        with gr.Column(scale=2):
            page_preview = gr.Image(label="Page/Image Preview")

    question_input = gr.Textbox(label="Question")
    submit_button = gr.Button("Submit")
    output = gr.JSON(label="Output")

    def update_interface(file, input_type):
        """After an upload: size/show the page slider and refresh the preview."""
        hidden_slider = gr.Slider(
            visible=False, minimum=1, maximum=1, value=1, step=1,
            label="Page Number (PDF only)",
        )
        if file is None:
            return hidden_slider, None
        if input_type == "PDF":
            images = pdf_to_images(file)
            if images is None:
                return hidden_slider, None
            return (
                gr.Slider(
                    visible=True, minimum=1, maximum=len(images), value=1, step=1,
                    label="Page Number (PDF only)",
                ),
                images[0],  # show the first page by default
            )
        # Image input: no paging — hide the slider, preview the image
        return hidden_slider, Image.open(file.name)

    def _restrict_file_types(choice):
        """Limit the file picker to PDFs or images to match the selected mode."""
        types = [".pdf"] if choice == "PDF" else ["image/*"]
        return gr.File(label="Upload File", file_types=types)

    input_type.change(
        _restrict_file_types, inputs=[input_type], outputs=[file_input]
    )
    file_input.change(
        update_interface,
        inputs=[file_input, input_type],
        outputs=[page_slider, page_preview],
    )
    page_slider.change(
        update_page_preview,
        inputs=[file_input, page_slider, input_type],
        outputs=[page_preview],
    )
    submit_button.click(
        process_document,
        inputs=[file_input, page_slider, question_input, input_type],
        outputs=[output],
    )

if __name__ == "__main__":
    demo.launch()