linjieccc committed on
Commit
69f6386
·
1 Parent(s): fc53b5e
Files changed (11) hide show
  1. .gitattributes +5 -0
  2. README.md +4 -4
  3. app.py +448 -0
  4. business_card.png +3 -0
  5. custom.jpeg +3 -0
  6. footer.html +4 -0
  7. header.html +28 -0
  8. invoice.jpeg +3 -0
  9. license.jpeg +3 -0
  10. requirements.txt +5 -0
  11. resume.png +3 -0
.gitattributes CHANGED
@@ -32,3 +32,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ .psd filter=lfs diff=lfs merge=lfs -text
36
+ .jpeg filter=lfs diff=lfs merge=lfs -text
37
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
38
+ *.psd filter=lfs diff=lfs merge=lfs -text
39
+ *.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Uie Test
3
- emoji: 📚
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.14.0
8
  app_file: app.py
 
1
  ---
2
+ title: UIE Test
3
+ emoji: 📄
4
+ colorFrom: gray
5
+ colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.14.0
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding: UTF-8 -*-
2
+ # Copyright 2022 The Impira Team and the HuggingFace Team.
3
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import json
19
+ import base64
20
+ from io import BytesIO
21
+ from PIL import Image
22
+ import traceback
23
+
24
+ import requests
25
+ import numpy as np
26
+ import gradio as gr
27
+ import cv2
28
+
29
+ from paddlenlp import Taskflow
30
+ from paddlenlp.utils.doc_parser import DocParser
31
+
32
# Shared helpers created once at import time: the document parser used to read
# and annotate images, and the information-extraction Taskflow whose schema is
# replaced on every request (see run_taskflow).
doc_parser = DocParser()
task_instance = Taskflow("information_extraction", model="uie-x-base", schema="")

# Schema prompts for the bundled Chinese sample documents. Each prompt is used
# both as an example row and as a lookup key in ``example_files``, so the text
# is defined once to keep the two in sync.
_INVOICE_SCHEMA = "名称;纳税人识别号;开票日期"
_CUSTOMS_SCHEMA = "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号"
_RESUME_SCHEMA = "职位;年龄;学校|时间;学校|专业"

# Rows shown in the examples table: [image file, schema prompt].
examples = [
    ["invoice.jpeg", _INVOICE_SCHEMA],
    ["custom.jpeg", _CUSTOMS_SCHEMA],
    ["resume.png", _RESUME_SCHEMA],
]

# Maps a schema prompt to the sample file it belongs to; load_example_document
# indexes this dict with the selected example's schema text.
example_files = {
    "Name;Title;Web Link;Email;Address": "business_card.png",
    "Name;DOB;ISS;EXP": "license.jpeg",
    _RESUME_SCHEMA: "resume.png",
    _CUSTOMS_SCHEMA: "custom.jpeg",
    _INVOICE_SCHEMA: "invoice.jpeg",
}

# OCR language code for each sample file.
lang_map = {
    "resume.png": "ch",
    "custom.jpeg": "ch",
    "business_card.png": "en",
    "invoice.jpeg": "ch",
    "license.jpeg": "en",
}
65
+
66
def dbc2sbc(s):
    """Convert full-width (DBC) characters in ``s`` to half-width (SBC) ones.

    Full-width ASCII variants (U+FF01..U+FF5E) are shifted down by 0xFEE0 onto
    printable ASCII, and the ideographic space (U+3000) becomes an ordinary
    space. Every other character is copied through unchanged.

    Args:
        s: Text that may contain full-width letters, digits or punctuation.

    Returns:
        A new string of the same length with full-width characters normalized.
    """
    converted = []
    for char in s:
        code = ord(char)
        if code == 0x3000:
            # The ideographic space does not follow the 0xFEE0 offset rule.
            # Emit the ASCII space directly: 0x20 lies outside the 0x21..0x7E
            # window checked below, so it would otherwise be left unconverted.
            converted.append(" ")
            continue
        code -= 0xFEE0
        # Only codes landing on printable ASCII were full-width variants;
        # anything else (including plain ASCII and CJK text) is kept as-is.
        converted.append(chr(code) if 0x21 <= code <= 0x7E else char)
    return "".join(converted)
79
+
80
+
81
def process_path(path):
    """Load the document at ``path`` and build the UI updates for its preview.

    The returned tuple has exactly one entry per output wired to this handler,
    in this order: ``document`` state, ``image`` gallery, ``img_clear_button``,
    ``output`` JSON panel, ``url_error`` textbox.

    Args:
        path: Location handed to ``doc_parser.read_image``; a falsy value
            (empty textbox) resets the preview without reporting an error.

    Returns:
        A 5-tuple of values / ``gr.update`` objects. On success the document
        path is stored and the preview gallery is shown. On failure everything
        is hidden and, if reading raised, the error text is shown in
        ``url_error``.
    """
    error = None
    if path:
        try:
            images_list = [doc_parser.read_image(path)]
        except Exception as e:
            # UI boundary: log the traceback and surface the message to the
            # user instead of letting the event handler crash.
            traceback.print_exc()
            error = str(e)
        else:
            return (
                path,
                gr.update(visible=True, value=images_list),
                gr.update(visible=True),
                gr.update(visible=False, value=None),
                gr.update(visible=False, value=None),
            )

    # The error update must sit in the fifth slot (url_error). Previously it
    # was placed after a hidden update in that slot, beyond the declared
    # outputs, so the message never reached the textbox.
    if error is not None:
        error_update = gr.update(visible=True, value=error)
    else:
        error_update = gr.update(visible=False, value=None)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        error_update,
    )
106
+
107
+
108
def process_upload(file):
    """Handle a change of the upload widget.

    Args:
        file: The uploaded file object (its ``name`` attribute is passed to
            ``process_path``), or a falsy value when the upload was cleared.

    Returns:
        The result of ``process_path`` for an uploaded file. For a cleared
        upload, a 5-tuple that resets the wired outputs in order:
        ``document``, ``image``, ``img_clear_button``, ``output``,
        ``url_error``.
    """
    if file:
        return process_path(file.name)
    # One value per declared output; the former trailing ``None`` had no
    # matching output component.
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        gr.update(visible=False, value=None),
    )
120
+
121
+
122
def BGR2RGB(img):
    """Return a copy of ``img`` with channels 0 and 2 of the last axis swapped.

    The input array is left untouched; any further channels are copied as-is.
    """
    swapped = img.copy()
    # Read both source channels from the untouched input in one indexed copy.
    swapped[:, :, [0, 2]] = img[:, :, [2, 0]]
    return swapped
127
+
128
+
129
def np2base64(image_np):
    """Encode an image array as a base64 string of its JPEG bytes.

    The first and third channels are swapped with ``BGR2RGB`` before the array
    is JPEG-encoded with ``cv2.imencode``.
    """
    swapped = BGR2RGB(image_np)
    _, encoded = cv2.imencode('.jpg', swapped)
    # Base64 output is pure ASCII, so decoding yields the bare text.
    return base64.b64encode(encoded).decode("ascii")
134
+
135
+
136
def get_schema(schema_str):
    """Parse a schema prompt into the list structure given to the Taskflow.

    Items are separated by ';'. A plain item becomes a string entry. An item
    of the form ``subject|relation[|relation...]`` becomes a
    ``{subject: [relations]}`` entry, and later items with the same subject
    extend that entry's relation list instead of adding a new dict.

    Returns:
        ``(schema_list, schema_lang)`` where ``schema_lang`` is ``"ch"`` if
        the prompt contains any character in U+4E00..U+9FFF, else ``"en"``.
    """
    has_chinese = any("\u4e00" <= ch <= "\u9fff" for ch in schema_str)
    schema_lang = "ch" if has_chinese else "en"

    schema_list = []
    for item in schema_str.split(";"):
        subject, *relations = item.split("|")
        if not relations:
            schema_list.append(subject)
            continue
        # Reuse the first existing dict entry that already holds this subject.
        existing = next(
            (
                entry
                for entry in schema_list
                if isinstance(entry, dict) and subject in entry
            ),
            None,
        )
        if existing is None:
            schema_list.append({subject: relations})
        else:
            existing[subject].extend(relations)
    return schema_list, schema_lang
163
+
164
+
165
def run_taskflow(document, schema, argument):
    """Run the shared extraction Taskflow on ``document`` with ``schema``.

    ``argument`` is accepted but currently not applied: the ``set_argument``
    call below is disabled.
    """
    task_instance.set_schema(schema)
    # task_instance.set_argument(argument)
    payload = {'doc': document}
    return task_instance(payload)
169
+
170
+
171
def process_doc(document, schema, ocr_lang, layout_analysis):
    """Extract information from ``document`` and build the result UI updates.

    Args:
        document: Document path held in the UI state, or ``None`` when nothing
            is loaded.
        schema: Schema prompt; a falsy value falls back to a default prompt.
        ocr_lang: OCR language choice, forwarded in the argument dict.
        layout_analysis: Layout-analysis choice, forwarded in the argument dict.

    Returns:
        ``(None, None)`` when no document is loaded; otherwise a pair of
        ``gr.update`` objects carrying the annotated image list and the
        prediction.
    """
    if document is None:
        return None, None
    prompt = schema or '时间;组织机构;人物'

    # Normalize full-width punctuation before splitting the prompt.
    parsed_schema, schema_lang = get_schema(dbc2sbc(prompt))
    argument = {
        "ocr_lang": ocr_lang,
        "schema_lang": schema_lang,
        "layout_analysis": layout_analysis,
    }
    results = run_taskflow(document, parsed_schema, argument)
    prediction = results[0]

    annotated = doc_parser.write_image_with_results(
        document,
        result=prediction,
        return_image=True,
    )

    gallery_update = gr.update(visible=True, value=[annotated])
    answer_update = gr.update(visible=True, value=prediction)
    return gallery_update, answer_update
195
+
196
+
197
def load_example_document(img, schema, ocr_lang, layout_analysis):
    """Load the sample document matching the selected example and process it.

    The sample file is looked up in ``example_files`` by ``schema``. The
    incoming ``ocr_lang`` and ``layout_analysis`` values are replaced by ones
    derived from the file's ``lang_map`` entry.

    Returns:
        A 5-tuple ``(document, schema, preview, clear-button update, answer)``;
        all entries are reset when ``img`` is ``None``.
    """
    if img is None:
        return None, None, None, gr.update(visible=False), None

    document = example_files[schema]
    # A lang_map entry with a "-" suffix turns layout analysis on.
    lang_parts = lang_map[document].split("-")
    ocr_lang = lang_parts[0]
    layout_analysis = len(lang_parts) != 1
    preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
    return document, schema, preview, gr.update(visible=True), answer
207
+
208
+
209
def read_content(file_path: str) -> str:
    """Return the entire text of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()
216
+
217
+
218
+ CSS = """
219
+ #prompt input {
220
+ font-size: 16px;
221
+ }
222
+ #url-textbox {
223
+ padding: 0 !important;
224
+ }
225
+ #short-upload-box .w-full {
226
+ min-height: 10rem !important;
227
+ }
228
+ /* I think something like this can be used to re-shape
229
+ * the table
230
+ */
231
+ /*
232
+ .gr-samples-table tr {
233
+ display: inline;
234
+ }
235
+ .gr-samples-table .p-2 {
236
+ width: 100px;
237
+ }
238
+ */
239
+ #select-a-file {
240
+ width: 100%;
241
+ }
242
+ #file-clear {
243
+ padding-top: 2px !important;
244
+ padding-bottom: 2px !important;
245
+ padding-left: 8px !important;
246
+ padding-right: 8px !important;
247
+ margin-top: 10px;
248
+ }
249
+ .gradio-container .gr-button-primary {
250
+ background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
251
+ border: 1px solid #B0DCCC;
252
+ border-radius: 8px;
253
+ color: #1B8700;
254
+ }
255
+ .gradio-container.dark button#submit-button {
256
+ background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
257
+ border: 1px solid #B0DCCC;
258
+ border-radius: 8px;
259
+ color: #1B8700
260
+ }
261
+ table.gr-samples-table tr td {
262
+ border: none;
263
+ outline: none;
264
+ }
265
+ table.gr-samples-table tr td:first-of-type {
266
+ width: 0%;
267
+ }
268
+ div#short-upload-box div.absolute {
269
+ display: none !important;
270
+ }
271
+ gradio-app > div > div > div > div.w-full > div, .gradio-app > div > div > div > div.w-full > div {
272
+ gap: 0px 2%;
273
+ }
274
+ gradio-app div div div div.w-full, .gradio-app div div div div.w-full {
275
+ gap: 0px;
276
+ }
277
+ gradio-app h2, .gradio-app h2 {
278
+ padding-top: 10px;
279
+ }
280
+ #answer {
281
+ overflow-y: scroll;
282
+ color: white;
283
+ background: #666;
284
+ border-color: #666;
285
+ font-size: 20px;
286
+ font-weight: bold;
287
+ }
288
+ #answer span {
289
+ color: white;
290
+ }
291
+ #answer textarea {
292
+ color:white;
293
+ background: #777;
294
+ border-color: #777;
295
+ font-size: 18px;
296
+ }
297
+ #url-error input {
298
+ color: red;
299
+ }
300
+ """
301
+
302
+ with gr.Blocks(css=CSS) as demo:
303
+ gr.HTML(read_content("header.html"))
304
+ gr.Markdown(
305
+ "**UIE-X 🧾 🎓** is a universal information extraction engine which supports both document and text inputs. It is powered by BAIDU and released on PaddleNLP. "
306
+ "Our extraction target(schema) can be set in natural language without limitation, and it also supports most extraction tasks. "
307
+ "The model performs well on zero-shot and few-shot settings. Moreover, on PaddleNLP, we provide a comprehensive and easy-to-use fine-tuning customization workflow."
308
+ "For more details, please visit the [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction)"
309
+ )
310
+
311
+ document = gr.Variable()
312
+ is_text = gr.Variable()
313
+ example_schema = gr.Textbox(visible=False)
314
+ example_image = gr.Image(visible=False)
315
+ with gr.Row(equal_height=True):
316
+ with gr.Column():
317
+ with gr.Row():
318
+ gr.Markdown("## 1. 选择文件 / Select a file 📄", elem_id="select-a-file")
319
+ img_clear_button = gr.Button(
320
+ "Clear", variant="secondary", elem_id="file-clear", visible=False
321
+ )
322
+ image = gr.Gallery(visible=False)
323
+ with gr.Row(equal_height=True):
324
+ with gr.Column():
325
+ with gr.Row():
326
+ url = gr.Textbox(
327
+ show_label=False,
328
+ placeholder="URL",
329
+ lines=1,
330
+ max_lines=1,
331
+ elem_id="url-textbox",
332
+ )
333
+ submit = gr.Button("Get")
334
+ url_error = gr.Textbox(
335
+ visible=False,
336
+ elem_id="url-error",
337
+ max_lines=1,
338
+ interactive=False,
339
+ label="Error",
340
+ )
341
+ gr.Markdown("— or —")
342
+ upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
343
+ gr.Examples(
344
+ examples=examples,
345
+ inputs=[example_image, example_schema],
346
+ )
347
+
348
+ with gr.Column():
349
+ gr.Markdown("## 2. 信息抽取 / Information extraction ℹ️ ")
350
+ gr.Markdown("### 👉 设置schema")
351
+ gr.Markdown("实体抽取:实体类别之间以';'分割,例如 **人物;组织机构**")
352
+ gr.Markdown("关系抽取:需配置主体和关系类别,中间以'|'分割,例如 **人物|出生时间;人物|邮箱**")
353
+ gr.Markdown("### 👉 Set a schema")
354
+ gr.Markdown("Entity extraction: entity label should be separated by ';', e.g. **Person;Organization**")
355
+ gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**")
356
+ gr.Markdown("### 💪 模型定制 / Model customization")
357
+ gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document)的流程进一步增强模型在特定场景的效果")
358
+ gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document)")
359
+
360
+ schema = gr.Textbox(
361
+ label="Schema",
362
+ placeholder="e.g. Name|Company;Name|Position;Email;Phone Number",
363
+ lines=1,
364
+ max_lines=1,
365
+ )
366
+
367
+ ocr_lang = gr.Radio(
368
+ choices=["ch", "en"],
369
+ value="en",
370
+ label="OCR语言 / OCR Language (Please choose ch for Chinese images.)",
371
+ )
372
+
373
+ layout_analysis = gr.Radio(
374
+ choices=["yes", "no"],
375
+ value="no",
376
+ label="版面分析 / Layout analysis (Better extraction for multi-line text)",
377
+ )
378
+
379
+ with gr.Row():
380
+ clear_button = gr.Button("Clear", variant="secondary")
381
+ submit_button = gr.Button(
382
+ "Submit", variant="primary", elem_id="submit-button"
383
+ )
384
+ with gr.Column():
385
+ output = gr.JSON(label="Output", visible=False)
386
+
387
+ for cb in [img_clear_button, clear_button]:
388
+ cb.click(
389
+ lambda _: (
390
+ gr.update(visible=False, value=None),
391
+ None,
392
+ gr.update(visible=False, value=None),
393
+ gr.update(visible=False),
394
+ None,
395
+ None,
396
+ None,
397
+ gr.update(visible=False, value=None),
398
+ None,
399
+ ),
400
+ inputs=clear_button,
401
+ outputs=[
402
+ image,
403
+ document,
404
+ output,
405
+ img_clear_button,
406
+ example_image,
407
+ upload,
408
+ url,
409
+ url_error,
410
+ schema,
411
+ ],
412
+ )
413
+
414
+ upload.change(
415
+ fn=process_upload,
416
+ inputs=[upload],
417
+ outputs=[document, image, img_clear_button, output, url_error],
418
+ )
419
+ submit.click(
420
+ fn=process_path,
421
+ inputs=[url],
422
+ outputs=[document, image, img_clear_button, output, url_error],
423
+ )
424
+
425
+ schema.submit(
426
+ fn=process_doc,
427
+ inputs=[document, schema, ocr_lang, layout_analysis],
428
+ outputs=[image, output],
429
+ )
430
+
431
+ submit_button.click(
432
+ fn=process_doc,
433
+ inputs=[document, schema, ocr_lang, layout_analysis],
434
+ outputs=[image, output],
435
+ )
436
+
437
+ example_image.change(
438
+ fn=load_example_document,
439
+ inputs=[example_image, example_schema, ocr_lang, layout_analysis],
440
+ outputs=[document, schema, image, img_clear_button, output],
441
+ )
442
+
443
+ gr.Markdown("[![Stargazers repo roster for @PaddlePaddle/PaddleNLP](https://reporoster.com/stars/PaddlePaddle/PaddleNLP)](https://github.com/PaddlePaddle/PaddleNLP)")
444
+ gr.HTML(read_content("footer.html"))
445
+
446
+
447
+ if __name__ == "__main__":
448
+ demo.launch(enable_queue=False)
business_card.png ADDED

Git LFS Details

  • SHA256: 68aa93a2b4122a517fac752507a4c65218fbaccbf16385afec02dbac0ecdbbdc
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
custom.jpeg ADDED

Git LFS Details

  • SHA256: b0d83ab6cac4747e00192474a2e8636285bddfcab884c4083ad30c6284f13b10
  • Pointer size: 131 Bytes
  • Size of remote file: 520 kB
footer.html ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <div class="footer">
2
+ <p>Model by <a href="https://github.com/PaddlePaddle/PaddleNLP" style="text-decoration: underline;" target="_blank">PaddleNLP</a> - Gradio Demo by 🤗 Hugging Face
3
+ </p>
4
+ </div>
header.html ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="text-align: center; max-width: 650px; margin: 0 auto;">
2
+ <div
3
+ style="
4
+ display: inline-flex;
5
+ gap: 0.8rem;
6
+ font-size: 1.75rem;
7
+ margin-bottom: 10px;
8
+ margin-left: 220px;
9
+ justify-content: center;
10
+ "
11
+ >
12
+ <a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://user-images.githubusercontent.com/1371212/175816733-8ec25eb0-9af3-4380-9218-27c154518258.png" alt="PaddleNLP" width="60%"></a>
13
+ </div>
14
+ <div
15
+ style="
16
+ display: inline-flex;
17
+ align-items: center;
18
+ gap: 0.8rem;
19
+ font-size: 1.75rem;
20
+ margin-bottom: 10px;
21
+ justify-content: center;
22
+ ">
23
+ <a href="https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout"><h1 style="font-weight: 900; align-items: center; margin-bottom: 7px;">
24
+ UIE-X
25
+ </h1></a>
26
+ </div>
27
+ <a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://user-images.githubusercontent.com/40840292/207606629-22da5e22-68e5-47a6-aafd-87d3d7447793.png" width="100%"></a>
28
+ </div>
invoice.jpeg ADDED

Git LFS Details

  • SHA256: a3afad8c016954d8f5b1e79cc9209ca54318c860e0228a812d3e75805cd50f4b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.83 MB
license.jpeg ADDED

Git LFS Details

  • SHA256: 3fd243446a474f8c7de06b92da796e6a36d0604b4c83d7c30c027a8d3525a766
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ opencv-python
3
+ paddlenlp
4
+ paddlepaddle
5
+ requests
resume.png ADDED

Git LFS Details

  • SHA256: 7be8498397a59f6aedf3cbee96041aea96b5d8f1aa667cf1d3ac5e93a7716734
  • Pointer size: 131 Bytes
  • Size of remote file: 191 kB