import re
import gradio as gr
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
#processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
#model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
#processor = DonutProcessor.from_pretrained("Iqra56/ENGLISHDONUT")
#model = VisionEncoderDecoderModel.from_pretrained("Iqra56/ENGLISHDONUT")
processor = DonutProcessor.from_pretrained("Iqra56/DONUTWOKEYS")
model = VisionEncoderDecoderModel.from_pretrained("Iqra56/DONUTWOKEYS")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
def process_document(image):
    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # prepare decoder inputs
    task_prompt = "<s>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    # generate answer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # postprocess
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)
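
# ------------------------------------------------------------------
# Minimal standalone sketch (not part of the original Space): run the
# parser on a local file without the Gradio UI. "receipt.png" is a
# hypothetical path used purely for illustration.
#
#   from PIL import Image
#   result = process_document(Image.open("receipt.png").convert("RGB"))
#   print(result)  # nested dict produced by processor.token2json
# ------------------------------------------------------------------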
description = "Gradio Demo for Donut, an instance of `VisionEncoderDecoderModel` fine-tuned on CORD (document parsing). To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
demo = gr.Interface(
    fn=process_document,
    inputs="image",
    outputs="json",
    title="Demo: Donut 🍩 for Document Parsing",
    description=description,
    article=article,
    enable_queue=True,  # accepted by Gradio 3.x; removed in later releases, where demo.queue() is used instead
    examples=[["Binder1_Page_48_Image_0001.png"], ["SKMBT_75122072616550_Page_50_Image_0001.png"]],
    cache_examples=False,
)
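
# Assumption (not in the original file): when running this script locally rather
# than on Hugging Face Spaces, a temporary public link can be requested with
#   demo.launch(share=True)
# On Spaces, the plain demo.launch() below is the expected entry point.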
demo.launch()