wendys-llc committed on
Commit
c47dc14
1 Parent(s): 0763565
.gitattributes CHANGED
@@ -17,6 +17,7 @@
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
 
20
  *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +34,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pdf filter=lfs diff=lfs merge=lfs -text
21
  *.pickle filter=lfs diff=lfs merge=lfs -text
22
  *.pkl filter=lfs diff=lfs merge=lfs -text
23
  *.pt filter=lfs diff=lfs merge=lfs -text
 
34
  *.zip filter=lfs diff=lfs merge=lfs -text
35
  *.zst filter=lfs diff=lfs merge=lfs -text
36
  *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ examples/background-checks.pdf filter=lfs diff=lfs merge=lfs -text
38
+ examples/museums.pdf filter=lfs diff=lfs merge=lfs -text
39
+ examples/players.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .DS_Store
app.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ import textwrap
4
+ import pprint
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+
9
+
10
def _coerce_setting(value):
    """Normalise one raw UI value for pdfplumber; return None to drop it.

    Text boxes deliver strings, while parsed JSON and example rows may deliver
    lists or numbers. Only strings are converted: blank text is dropped,
    integer-looking text becomes ``int``, decimal text becomes ``float`` and
    anything else (e.g. a strategy name) is returned stripped.
    """
    if value is None or value == []:
        return None
    if not isinstance(value, str):
        # Already structured (list from JSON, number from an example row).
        return value

    text = value.strip()
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return float(text)
    except ValueError:
        return text


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Run pdfplumber table detection on one page and describe the result.

    Args:
        file_obj: The uploaded PDF, either a path string or an object whose
            ``name`` attribute is the path.
        page_num: Zero-based index of the page to inspect.
        table_num: Zero-based index of the table to return from that page.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Blank
            values fall back to the page edges; negative ``crop_x1`` /
            ``crop_bottom`` are measured back from the right / bottom edge.
        vertical_strategy, horizontal_strategy: Strategy names passed through
            to pdfplumber.
        explicit_vertical_lines, explicit_horizontal_lines: JSON text (for
            example ``"[10, 200]"``) decoded before being passed through.
        snap_*, join_*, text_*, intersection_*, edge_min_length,
        min_words_vertical, min_words_horizontal: Optional settings; blank
            values are omitted so pdfplumber applies its own defaults.
        keep_blank_chars: Accepted so the UI can pass it, but not forwarded
            to pdfplumber (it was already disabled in the settings dict).

    Returns:
        A three-item list ``[notes, visual, table]``: a Markdown summary with
        a reproducible code snippet, the annotated debug image, and the rows
        of table ``table_num`` (``None`` when that table does not exist).

    Raises:
        json.JSONDecodeError: If an explicit-lines field is not valid JSON.
        IndexError: If ``page_num`` is outside the document.
    """
    raw_settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "explicit_vertical_lines": (
            json.loads(explicit_vertical_lines) if explicit_vertical_lines else None
        ),
        "explicit_horizontal_lines": (
            json.loads(explicit_horizontal_lines)
            if explicit_horizontal_lines
            else None
        ),
        "snap_tolerance": snap_tolerance,
        "snap_x_tolerance": snap_x_tolerance,
        "snap_y_tolerance": snap_y_tolerance,
        "join_tolerance": join_tolerance,
        "join_x_tolerance": join_x_tolerance,
        "join_y_tolerance": join_y_tolerance,
        "text_tolerance": text_tolerance,
        "text_x_tolerance": text_x_tolerance,
        "text_y_tolerance": text_y_tolerance,
        "intersection_tolerance": intersection_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "intersection_y_tolerance": intersection_y_tolerance,
        "edge_min_length": edge_min_length,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
    }

    # Keep only the settings the user actually supplied, converted to the
    # types pdfplumber expects. `is not None` so a literal 0 survives.
    table_settings = {}
    for key, raw in raw_settings.items():
        value = _coerce_setting(raw)
        if value is not None:
            table_settings[key] = value

    table_num = int(table_num)
    page_num = int(page_num)

    # Accept both a bare path and an object carrying the path in `.name`.
    pdf_path = os.fspath(getattr(file_obj, "name", file_obj))
    base_filename = os.path.basename(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        page = pdf.pages[page_num]
        page_width = int(page.width)
        page_height = int(page.height)

        crop_x0 = int(crop_x0) if crop_x0 else 0
        crop_top = int(crop_top) if crop_top else 0
        crop_x1 = int(crop_x1) if crop_x1 else page_width
        crop_bottom = int(crop_bottom) if crop_bottom else page_height

        # Negative right/bottom values count back from the far edge.
        if crop_bottom < 0:
            crop_bottom += page_height
        if crop_x1 < 0:
            crop_x1 += page_width

        crop_box = (crop_x0, crop_top, crop_x1, crop_bottom)
        # Only crop when the box differs from the full page.
        is_cropped = crop_box != (0, 0, page_width, page_height)
        if is_cropped:
            page = page.crop(crop_box)

        tables = page.extract_tables(table_settings)
        # Return the table the user asked for, not always the first one.
        table = tables[table_num] if 0 <= table_num < len(tables) else None
        visual = page.to_image().debug_tablefinder(table_settings).annotated

    summary = [
        f"- **Filename:** {base_filename}",
        f"- **Pages:** {page_count}",
        f"- **Page num {page_num}:**",
        f"    - **Full dimensions:** {page_width} x {page_height}",
        f"    - **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}",
        f"- **Tables found:** {len(tables)}",
    ]

    snippet = [
        "import pdfplumber",
        f'pdf = pdfplumber.open("{base_filename}")',
        f"page = pdf.pages[{page_num}]",
    ]
    if is_cropped:
        snippet.append(
            f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
        )
    snippet += [
        "",
        f"table_settings = {pprint.pformat(table_settings)}",
        "tables = page.extract_tables(table_settings)",
        f"table = tables[{table_num}]",
    ]

    notes = "\n".join([*summary, "", "```python", *snippet, "```"])

    return [notes, visual, table]
144
+
145
+
146
def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Call ``table_debugger`` with only the fields the example rows provide.

    Every option not listed here is left at ``table_debugger``'s default.
    Returns whatever ``table_debugger`` returns.
    """
    example_options = {
        "page_num": page_num,
        "table_num": table_num,
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "snap_y_tolerance": snap_y_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "crop_bottom": crop_bottom,
    }
    return table_debugger(file_obj, **example_options)
166
+
167
+
168
# --- Output components -------------------------------------------------------
notes = gr.Markdown()
output_image = gr.Image()
data_table = gr.Dataframe(height=250, render=False)

# --- Crop box inputs ---------------------------------------------------------
# Created with render=False so they can be placed explicitly in the layout.
crop_top = gr.Text(label="top", placeholder="top", container=False, render=False)
crop_x0 = gr.Text(label="x0", placeholder="left", container=False, render=False)
crop_x1 = gr.Text(
    label="x1", placeholder="right (from page left)", container=False, render=False
)
crop_bottom = gr.Text(
    label="bottom", placeholder="bottom (from page top)", container=False, render=False
)

# --- Table-finding strategy --------------------------------------------------
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)

# --- Tolerances --------------------------------------------------------------
# Placeholders show a suggested value; the boxes themselves start empty.
snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
# Label fixed: this is the y-axis control (it previously read "(x)").
join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)
edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False)

# --- Document selection ------------------------------------------------------
# File extensions are given with a leading dot (".pdf"), the form Gradio
# documents for restricting uploads by extension.
file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
page_num = gr.Number(
    label="Page number", value=0, info="It's an index: first is 0!", render=False
)
table_num = gr.Number(
    label="Table number", value=0, info="It's an index: first is 0!", render=False
)
235
+
236
# Bundled sample PDFs live in an "examples" folder next to this file.
example_dir = Path(__file__).parent / "examples"

# Each row: [pdf path, page index, table index, vertical strategy,
#            horizontal strategy, snap_y_tolerance, intersection_x_tolerance,
#            crop_bottom]
examples = [
    [str(example_dir / "players.pdf"), 0, 0, "text", "text", None, None, None],
    [str(example_dir / "museums.pdf"), 2, 0, "lines", "lines", None, None, None],
    [str(example_dir / "background-checks.pdf"), 0, 0, "text", "text", 5, 15, 487],
]
261
+
262
# Page layout. Components were created earlier and are placed here by calling
# .render() on each; the order and nesting of these calls defines the page.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # pdfplumber table extraction playground

    [pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**
    """
    )

    with gr.Row():
        # First column: file upload, main options, crop box, Run button, notes.
        with gr.Column(scale=2):
            file.render()
            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()

                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()

            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()

            btn = gr.Button(value="Run")
            # The inputs are passed positionally, so this list must stay in
            # the same order as table_debugger's parameters.
            btn.click(
                table_debugger,
                inputs=[
                    file,
                    page_num,
                    table_num,
                    crop_x0,
                    crop_top,
                    crop_x1,
                    crop_bottom,
                    vertical_strategy,
                    horizontal_strategy,
                    explicit_vertical_lines,
                    explicit_horizontal_lines,
                    snap_tolerance,
                    snap_x_tolerance,
                    snap_y_tolerance,
                    join_tolerance,
                    join_x_tolerance,
                    join_y_tolerance,
                    text_tolerance,
                    text_x_tolerance,
                    text_y_tolerance,
                    intersection_tolerance,
                    intersection_x_tolerance,
                    intersection_y_tolerance,
                    edge_min_length,
                    min_words_vertical,
                    min_words_horizontal,
                    keep_blank_chars,
                ],
                outputs=[notes, output_image, data_table],
            )

            notes.render()

        # Second column: extracted table and the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()

    # Example rows fill only this subset of inputs and run demo_subset, whose
    # parameters are in the same order as this list.
    gr.Examples(
        examples=examples,
        inputs=[
            file,
            page_num,
            table_num,
            vertical_strategy,
            horizontal_strategy,
            snap_y_tolerance,
            intersection_x_tolerance,
            crop_bottom,
        ],
        outputs=[notes, output_image, data_table],
        fn=demo_subset,
        run_on_click=True,
    )

    gr.Markdown("## Additional options")
    with gr.Row():
        # Tolerance settings: each general value followed by its x/y pair.
        with gr.Column():
            with gr.Group():
                snap_tolerance.render()
                with gr.Row():
                    snap_x_tolerance.render()
                    snap_y_tolerance.render()
                join_tolerance.render()
                with gr.Row():
                    join_x_tolerance.render()
                    join_y_tolerance.render()
                text_tolerance.render()
                with gr.Row():
                    text_x_tolerance.render()
                    text_y_tolerance.render()
                intersection_tolerance.render()
                with gr.Row():
                    intersection_x_tolerance.render()
                    intersection_y_tolerance.render()

        # Explicit lines, edge length, word-count minimums, blank-char toggle.
        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)
examples/background-checks.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f0ae6e017b0e97de211da8fd97316e55bd927af5740d6ce95e1b7b1539f9407
3
+ size 90468
examples/museums.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6501bfc83d7a1d88ae035fbc32a5fc14ccd9ccde714e0baf86f6a40647a25435
3
+ size 1554077
examples/players.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6731f3c8a74940bf625cd009743a6a42ec19c998de639ba14b250cd3e9b46f00
3
+ size 21310
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pdfplumber
2
+ gradio