pacman2223 committed on
Commit
9da9cd5
1 Parent(s): 38e9395

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -44
app.py CHANGED
@@ -57,6 +57,129 @@
57
  # demo.launch()
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  import re
61
  import gradio as gr
62
  import torch
@@ -87,18 +210,19 @@ def pdf_to_images(pdf_file):
87
  print(f"Error converting PDF: {e}")
88
  return None
89
 
90
- def process_document(pdf_file, page_number, question):
91
- if pdf_file is None:
92
- return "Please upload a PDF file."
93
-
94
- images = pdf_to_images(pdf_file)
95
- if images is None:
96
- return "Failed to process the PDF file."
97
 
98
- if page_number < 1 or page_number > len(images):
99
- return f"Invalid page number. The PDF has {len(images)} pages."
100
-
101
- image = images[page_number - 1]
 
 
 
 
 
102
 
103
  # prepare encoder inputs
104
  pixel_values = processor(image, return_tensors="pt").pixel_values
@@ -129,23 +253,18 @@ def process_document(pdf_file, page_number, question):
129
 
130
  return processor.token2json(sequence)
131
 
132
- def update_page_preview(pdf_file, page_number):
133
- if pdf_file is None:
134
  return None
135
- images = pdf_to_images(pdf_file)
136
- if images is None or page_number < 1 or page_number > len(images):
137
- return None
138
- return images[page_number - 1]
 
 
 
139
 
140
- # def update_page_slider(pdf_file):
141
- # if pdf_file is None:
142
- # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
143
- # images = pdf_to_images(pdf_file)
144
- # if images is None:
145
- # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
146
- # return gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number")
147
-
148
- description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF file, select a page number, type a question, and click 'submit'."
149
  article = "<p style='text-align: center'>Model-V3</p>"
150
 
151
  with gr.Blocks() as demo:
@@ -154,27 +273,34 @@ with gr.Blocks() as demo:
154
 
155
  with gr.Row():
156
  with gr.Column(scale=1):
157
- pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
158
- page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
 
159
  with gr.Column(scale=2):
160
- page_preview = gr.Image(label="Page Preview")
161
-
162
  question_input = gr.Textbox(label="Question")
163
  submit_button = gr.Button("Submit")
164
  output = gr.JSON(label="Output")
165
-
166
- def update_interface(pdf_file):
167
- if pdf_file is None:
168
- return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None
169
- images = pdf_to_images(pdf_file)
170
- if images is None:
171
- return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None
172
- return (
173
- gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number"),
174
- images[0] # Show the first page by default
175
- )
176
- pdf_input.change(update_interface, inputs=[pdf_input], outputs=[page_slider, page_preview])
177
- page_slider.change(update_page_preview, inputs=[pdf_input, page_slider], outputs=[page_preview])
178
- submit_button.click(process_document, inputs=[pdf_input, page_slider, question_input], outputs=[output])
 
 
 
 
 
 
179
 
180
  demo.launch()
 
57
  # demo.launch()
58
 
59
 
60
+ # import re
61
+ # import gradio as gr
62
+ # import torch
63
+ # from transformers import DonutProcessor, VisionEncoderDecoderModel
64
+ # import fitz # PyMuPDF
65
+ # from PIL import Image
66
+ # import io
67
+
68
+ # processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3")
69
+ # model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3")
70
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
71
+ # model.to(device)
72
+
73
+ # def pdf_to_images(pdf_file):
74
+ # if pdf_file is None:
75
+ # return None
76
+ # pdf_path = pdf_file.name # Get the file path
77
+
78
+ # images = []
79
+ # try:
80
+ # doc = fitz.open(pdf_path)
81
+ # for page in doc:
82
+ # pix = page.get_pixmap()
83
+ # img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
84
+ # images.append(img)
85
+ # return images
86
+ # except Exception as e:
87
+ # print(f"Error converting PDF: {e}")
88
+ # return None
89
+
90
+ # def process_document(pdf_file, page_number, question):
91
+ # if pdf_file is None:
92
+ # return "Please upload a PDF file."
93
+
94
+ # images = pdf_to_images(pdf_file)
95
+ # if images is None:
96
+ # return "Failed to process the PDF file."
97
+
98
+ # if page_number < 1 or page_number > len(images):
99
+ # return f"Invalid page number. The PDF has {len(images)} pages."
100
+
101
+ # image = images[page_number - 1]
102
+
103
+ # # prepare encoder inputs
104
+ # pixel_values = processor(image, return_tensors="pt").pixel_values
105
+
106
+ # # prepare decoder inputs
107
+ # task_prompt = "{user_input}"
108
+ # prompt = task_prompt.replace("{user_input}", question)
109
+ # decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
110
+
111
+ # # generate answer
112
+ # outputs = model.generate(
113
+ # pixel_values.to(device),
114
+ # decoder_input_ids=decoder_input_ids.to(device),
115
+ # max_length=model.decoder.config.max_position_embeddings,
116
+ # early_stopping=True,
117
+ # pad_token_id=processor.tokenizer.pad_token_id,
118
+ # eos_token_id=processor.tokenizer.eos_token_id,
119
+ # use_cache=True,
120
+ # num_beams=1,
121
+ # bad_words_ids=[[processor.tokenizer.unk_token_id]],
122
+ # return_dict_in_generate=True,
123
+ # )
124
+
125
+ # # postprocess
126
+ # sequence = processor.batch_decode(outputs.sequences)[0]
127
+ # sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
128
+ # sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
129
+
130
+ # return processor.token2json(sequence)
131
+
132
+ # def update_page_preview(pdf_file, page_number):
133
+ # if pdf_file is None:
134
+ # return None
135
+ # images = pdf_to_images(pdf_file)
136
+ # if images is None or page_number < 1 or page_number > len(images):
137
+ # return None
138
+ # return images[page_number - 1]
139
+
140
+ # # def update_page_slider(pdf_file):
141
+ # # if pdf_file is None:
142
+ # # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
143
+ # # images = pdf_to_images(pdf_file)
144
+ # # if images is None:
145
+ # # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
146
+ # # return gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number")
147
+
148
+ # description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF file, select a page number, type a question, and click 'submit'."
149
+ # article = "<p style='text-align: center'>Model-V3</p>"
150
+
151
+ # with gr.Blocks() as demo:
152
+ # gr.Markdown("# Demo: Model-V3 for Document Analysis")
153
+ # gr.Markdown(description)
154
+
155
+ # with gr.Row():
156
+ # with gr.Column(scale=1):
157
+ # pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
158
+ # page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number")
159
+ # with gr.Column(scale=2):
160
+ # page_preview = gr.Image(label="Page Preview")
161
+
162
+ # question_input = gr.Textbox(label="Question")
163
+ # submit_button = gr.Button("Submit")
164
+ # output = gr.JSON(label="Output")
165
+
166
+ # def update_interface(pdf_file):
167
+ # if pdf_file is None:
168
+ # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None
169
+ # images = pdf_to_images(pdf_file)
170
+ # if images is None:
171
+ # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None
172
+ # return (
173
+ # gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number"),
174
+ # images[0] # Show the first page by default
175
+ # )
176
+ # pdf_input.change(update_interface, inputs=[pdf_input], outputs=[page_slider, page_preview])
177
+ # page_slider.change(update_page_preview, inputs=[pdf_input, page_slider], outputs=[page_preview])
178
+ # submit_button.click(process_document, inputs=[pdf_input, page_slider, question_input], outputs=[output])
179
+
180
+ # demo.launch()
181
+
182
+
183
  import re
184
  import gradio as gr
185
  import torch
 
210
  print(f"Error converting PDF: {e}")
211
  return None
212
 
213
+ def process_document(file, page_number, question, input_type):
214
+ if file is None:
215
+ return "Please upload a file."
 
 
 
 
216
 
217
+ if input_type == "PDF":
218
+ images = pdf_to_images(file)
219
+ if images is None:
220
+ return "Failed to process the PDF file."
221
+ if page_number < 1 or page_number > len(images):
222
+ return f"Invalid page number. The PDF has {len(images)} pages."
223
+ image = images[page_number - 1]
224
+ else: # Image
225
+ image = Image.open(file.name)
226
 
227
  # prepare encoder inputs
228
  pixel_values = processor(image, return_tensors="pt").pixel_values
 
253
 
254
  return processor.token2json(sequence)
255
 
256
def update_page_preview(file, page_number, input_type):
    """Return the image to display in the preview pane for the current upload.

    Args:
        file: Uploaded file object exposing a ``.name`` path, or ``None``
            when nothing has been uploaded.
        page_number: 1-based page chosen on the slider. Only consulted when
            ``input_type`` is ``"PDF"``.
        input_type: ``"PDF"`` to preview one page of the document; any other
            value treats the upload as a single image.

    Returns:
        The selected PDF page or the opened image, or ``None`` when there is
        no file, the file cannot be read, or the page number is out of range.
    """
    if file is None:
        return None

    if input_type == "PDF":
        images = pdf_to_images(file)
        # pdf_to_images returns None on failure; an empty list has no page
        # to show either, so both cases yield no preview.
        if not images:
            return None
        # NOTE(review): the slider may deliver its value as a float; a float
        # cannot index a list, so normalise it before use.
        page_index = int(page_number) - 1
        if not 0 <= page_index < len(images):
            return None
        return images[page_index]

    # Image upload: mirror pdf_to_images' best-effort behaviour so an
    # unreadable or non-image file clears the preview instead of raising
    # inside the UI callback.
    try:
        return Image.open(file.name)
    except OSError as e:
        print(f"Error opening image: {e}")
        return None
266
 
267
+ description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF or image file, select a page number (for PDF), type a question, and click 'submit'."
 
 
 
 
 
 
 
 
268
  article = "<p style='text-align: center'>Model-V3</p>"
269
 
270
  with gr.Blocks() as demo:
 
273
 
274
  with gr.Row():
275
  with gr.Column(scale=1):
276
+ input_type = gr.Radio(["PDF", "Image"], label="Input Type", value="PDF")
277
+ file_input = gr.File(label="Upload File")
278
+ page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number (PDF only)")
279
  with gr.Column(scale=2):
280
+ page_preview = gr.Image(label="Page/Image Preview")
281
+
282
  question_input = gr.Textbox(label="Question")
283
  submit_button = gr.Button("Submit")
284
  output = gr.JSON(label="Output")
285
+
286
def update_interface(file, input_type):
    """Recompute the page slider and preview after the uploaded file changes.

    Args:
        file: Uploaded file object exposing a ``.name`` path, or ``None``
            when the upload was cleared.
        input_type: ``"PDF"`` to treat the upload as a multi-page document;
            any other value treats it as a single image.

    Returns:
        A ``(slider, preview)`` tuple. The slider is visible and spans
        ``1..page_count`` only for a PDF that produced at least one page;
        otherwise it is hidden and reset to a single page. ``preview`` is the
        first PDF page, the opened image, or ``None`` when nothing can be
        shown.
    """

    def page_slider(visible=False, maximum=1):
        # Single place that builds the slider so every branch stays in sync.
        return gr.Slider(
            visible=visible,
            minimum=1,
            maximum=maximum,
            value=1,
            step=1,
            label="Page Number (PDF only)",
        )

    if file is None:
        return page_slider(), None

    if input_type == "PDF":
        images = pdf_to_images(file)
        # None signals a conversion failure; an empty list would make
        # images[0] raise and give the slider maximum < minimum, so treat
        # both as "nothing to show".
        if not images:
            return page_slider(), None
        # Show the first page by default.
        return page_slider(visible=True, maximum=len(images)), images[0]

    # Image upload: an unreadable file clears the preview instead of raising
    # inside the UI callback, matching pdf_to_images' error handling.
    try:
        preview = Image.open(file.name)
    except OSError as e:
        print(f"Error opening image: {e}")
        preview = None
    return page_slider(), preview
300
+
301
+ input_type.change(lambda x: gr.File(label="Upload File", file_types=[".pdf"] if x == "PDF" else ["image/*"]), inputs=[input_type], outputs=[file_input])
302
+ file_input.change(update_interface, inputs=[file_input, input_type], outputs=[page_slider, page_preview])
303
+ page_slider.change(update_page_preview, inputs=[file_input, page_slider, input_type], outputs=[page_preview])
304
+ submit_button.click(process_document, inputs=[file_input, page_slider, question_input, input_type], outputs=[output])
305
 
306
  demo.launch()