Spaces:
Sleeping
Sleeping
ankur-bohra
commited on
Commit
·
8152a82
1
Parent(s):
d2ac459
Add interface
Browse files- app.py +359 -13
- categories/__init__.py +5 -5
- categories/random_/__init__.py +8 -2
- main.py +10 -5
- requirements.txt +79 -301
app.py
CHANGED
@@ -1,20 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
2 |
from categories import Category
|
|
|
3 |
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
_input = gr.File(type="binary", file_count="single", file_types=["image", "pdf"], label="Upload a receipt as a document or as an image")
|
9 |
-
category_output = gr.Dropdown(Category.__members__.values(), value=Category.RANDOM, label="Identified category")
|
10 |
-
information_output = gr.Json(label="Extracted information")
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
|
|
1 |
+
import base64
|
2 |
+
import os
|
3 |
+
from io import BytesIO
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
from langchain.schema.output_parser import OutputParserException
|
7 |
import gradio as gr
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
import categories
|
11 |
from categories import Category
|
12 |
+
from main import process_image, process_pdf
|
13 |
|
14 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
15 |
+
PDF_IFRAME = """
|
16 |
+
<div style="border-radius: 10px; width: 100%; overflow: hidden;">
|
17 |
+
<iframe
|
18 |
+
src="data:application/pdf;base64,{0}"
|
19 |
+
width="100%"
|
20 |
+
height="400"
|
21 |
+
type="application/pdf">
|
22 |
+
</iframe>
|
23 |
+
</div>"""
|
24 |
|
25 |
+
hf_writer_normal = gr.HuggingFaceDatasetSaver(
|
26 |
+
HF_TOKEN, "automatic-reimbursement-tool-demo", separate_dirs=False
|
27 |
+
)
|
28 |
+
hf_writer_incorrect = gr.HuggingFaceDatasetSaver(
|
29 |
+
HF_TOKEN, "automatic-reimbursement-tool-demo-incorrect", separate_dirs=False
|
30 |
+
)
|
31 |
+
# with open("examples/example1.pdf", "rb") as pdf_file:
|
32 |
+
# base64_pdf = base64.b64encode(pdf_file.read())
|
33 |
|
|
|
|
|
|
|
34 |
|
35 |
+
# example_paths = []
|
36 |
+
# current_file_path = None
|
37 |
+
|
38 |
+
# def ignore_examples(function):
|
39 |
+
# def new_function(*args, **kwargs):
|
40 |
+
# global example_paths, current_file_path
|
41 |
+
# if current_file_path not in example_paths:
|
42 |
+
# return function(*args, **kwargs)
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
def display_file(input_file):
    """Choose which preview component shows the uploaded receipt.

    Args:
        input_file: Value of the upload component, or a falsy value when the
            upload was cleared. Only its ``name`` attribute (a file path) is
            read.

    Returns:
        A ``(pdf_preview_update, image_preview_update)`` pair of Gradio
        updates. Both previews are hidden when nothing is uploaded; otherwise
        exactly one of them is made visible.
    """
    # Only the commented-out example-tracking helpers read this global; the
    # assignment is kept so they keep working if re-enabled.
    global current_file_path
    current_file_path = input_file.name if input_file else None

    if not input_file:
        return gr.HTML.update(visible=False), gr.Image.update(visible=False)

    # Compare case-insensitively so "RECEIPT.PDF" is not treated as an image.
    if input_file.name.lower().endswith(".pdf"):
        # Use a dedicated name for the handle instead of rebinding the
        # `input_file` parameter.
        with open(input_file.name, "rb") as pdf_handle:
            pdf_base64 = base64.b64encode(pdf_handle.read()).decode()
        return (
            gr.HTML.update(PDF_IFRAME.format(pdf_base64), visible=True),
            gr.Image.update(visible=False),
        )

    return (
        gr.HTML.update(visible=False),
        gr.Image.update(input_file.name, visible=True),
    )
|
62 |
+
|
63 |
+
|
64 |
+
def show_intermediate_outputs(show_intermediate):
    """Return an Accordion update whose visibility follows the checkbox.

    Args:
        show_intermediate: Checkbox value; any truthy value shows the
            accordion, any falsy value hides it.
    """
    accordion_visible = bool(show_intermediate)
    return gr.Accordion.update(visible=accordion_visible)
|
69 |
+
|
70 |
+
|
71 |
+
def show_share_contact(share_result):
    """Return a Textbox update that sets ``visible`` to ``share_result``."""
    contact_update = gr.Textbox.update(visible=share_result)
    return contact_update
|
73 |
+
|
74 |
+
|
75 |
+
def clear_inputs():
    """Return a File update that resets the upload component to no file."""
    cleared_upload = gr.File.update(value=None)
    return cleared_upload
|
77 |
+
|
78 |
+
|
79 |
+
def submit(input_file, old_text):
    """Extract the raw text of the uploaded receipt.

    Args:
        input_file: Value of the upload component; only its ``name``
            attribute (a file path) is read.
        old_text: Current content of the extracted-text box. Unused, but kept
            because the click handler passes it as an input.

    Returns:
        Whatever ``process_pdf`` / ``process_image`` return when called with
        ``extract_only=True``.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not input_file:
        # gr.Error is an exception class: merely constructing it shows
        # nothing. It has to be raised for Gradio to surface the message.
        raise gr.Error("Please upload a file to continue!")

    receipt_path = Path(input_file.name)
    # Compare case-insensitively so "RECEIPT.PDF" takes the PDF pipeline.
    if input_file.name.lower().endswith(".pdf"):
        return process_pdf(receipt_path, extract_only=True)
    return process_image(receipt_path, extract_only=True)
|
90 |
+
|
91 |
+
|
92 |
+
def categorize_extracted_text(extracted_text):
    """Return the result of ``categories.categorize_text`` for the given text."""
    return categories.categorize_text(extracted_text)
|
96 |
+
|
97 |
+
|
98 |
+
def extract_from_category(category, extracted_text):
    """Run the chain of the selected category over the extracted text.

    Args:
        category: Name of a ``Category`` member, or a falsy value when no
            category is selected.
        extracted_text: Text passed to the chain as its ``text`` input.

    Returns:
        A 4-tuple of Gradio updates for the chatbot, the JSON view and the
        two flag buttons. With no category, the chatbot and JSON view are
        cleared and both buttons are disabled; otherwise the prompt/answer
        pair and the parsed information are shown and both buttons enabled.

    Raises:
        KeyError: If ``category`` is not the name of a ``Category`` member.
    """
    if not category:
        return (
            gr.Chatbot.update(None),
            gr.JSON.update(None),
            gr.Button.update(interactive=False),
            gr.Button.update(interactive=False),
        )

    chain = categories.category_modules[Category[category]].chain
    output_parser = chain.output_parser
    # Computed once and reused for both the displayed prompt and the request.
    format_instructions = output_parser.get_format_instructions()

    formatted_prompt = chain.prompt.format_prompt(
        text=extracted_text,
        format_instructions=format_instructions,
    )
    result = chain.generate(
        input_list=[
            {
                "text": extracted_text,
                "format_instructions": format_instructions,
            }
        ]
    )

    # NOTE(review): the second prompt message is labelled "System" and the
    # first "Human" -- confirm this matches the order used by the category
    # prompt templates.
    messages = formatted_prompt.messages
    question = ""
    if len(messages) > 1:
        question += f"**System:**\n{messages[1].content}"
    question += f"\n\n**Human:**\n{messages[0].content}"

    answer = result.generations[0][0].text
    try:
        parsed = output_parser.parse_with_prompt(answer, formatted_prompt)
        information = parsed.json() if parsed else {}
    except OutputParserException as e:
        # Show the parse failure in the JSON view instead of failing the event.
        information = {
            "error": "Unable to parse chatbot output",
            "details": str(e),
            "output": e.llm_output,
        }

    return (
        gr.Chatbot.update([[question, answer]]),
        gr.JSON.update(information),
        gr.Button.update(interactive=True),
        gr.Button.update(interactive=True),
    )
|
141 |
+
|
142 |
+
|
143 |
+
def dynamic_auto_flag(flag_method):
    """Wrap ``flag_method`` so it only runs when the user opted in.

    Args:
        flag_method: Callable to invoke with the remaining arguments.

    Returns:
        A callable taking ``share_result`` as its first argument. When
        ``share_result`` is truthy it forwards all other arguments to
        ``flag_method``. It always returns ``None``; the result of
        ``flag_method`` is discarded.
    """

    def modified_flag_method(share_result, *args, **kwargs):
        # Guard clause: skip flagging entirely when sharing is switched off.
        if not share_result:
            return None
        flag_method(*args, **kwargs)
        return None

    return modified_flag_method
|
149 |
+
|
150 |
+
|
151 |
+
# def save_example_and_submit(input_file):
|
152 |
+
# example_paths.append(input_file.name)
|
153 |
+
# submit(input_file, "")
|
154 |
+
|
155 |
+
|
156 |
+
# Demo page: input column (upload, previews, options), output column
# (category, intermediate outputs, extracted information, flag buttons) and
# the event wiring between them.
# NOTE(review): the nesting of the layout below was reconstructed from the
# component semantics -- confirm against the original indentation.
with gr.Blocks(title="Automatic Reimbursement Tool Demo") as page:
    gr.Markdown("<center><h1>Automatic Reimbursement Tool Demo</h1></center>")
    gr.Markdown("<h2>Description</h2>")
    gr.Markdown(
        "The reimbursement filing process can be time-consuming and cumbersome, causing "
        "frustration for faculty members and finance departments. Our project aims to "
        "automate the information extraction involved in the process by feeding "
        "extracted text to language models such as ChatGPT. This demo showcases the "
        "categorization and extraction parts of the pipeline. Categorization is done "
        "to identify the relevant details associated with the text, after which "
        "extraction is done for those details using a language model."
    )
    gr.Markdown("<h2>Try it out!</h2>")
    with gr.Box() as demo:
        with gr.Row():
            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Input</center></div>'
                )
                pdf_preview = gr.HTML(label="Preview", show_label=True, visible=False)
                image_preview = gr.Image(
                    label="Preview", show_label=True, visible=False, height=350
                )
                input_file = gr.File(
                    label="Input receipt",
                    show_label=True,
                    type="file",
                    file_count="single",
                    file_types=["image", ".pdf"],
                )
                input_file.change(
                    display_file, input_file, [pdf_preview, image_preview]
                )

                with gr.Row():
                    clear = gr.Button("Clear", variant="secondary")
                    submit_button = gr.Button("Submit", variant="primary")

                show_intermediate = gr.Checkbox(
                    False,
                    label="Show intermediate outputs",
                    info="There are several intermediate steps in the process such as preprocessing, OCR, chatbot interaction. You can choose to show their results here.",
                )
                share_result = gr.Checkbox(
                    True,
                    label="Share results",
                    info="Sharing your result with us will help us immensely in improving this tool.",
                    interactive=True,
                )
                contact = gr.Textbox(
                    type="email",
                    label="Contact",
                    interactive=True,
                    placeholder="Enter your email address",
                    info="Optionally, enter your email address to allow us to contact you regarding your result.",
                    visible=True,
                )
                share_result.change(show_share_contact, share_result, [contact])

            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Output</center></div>'
                )
                category = gr.Dropdown(
                    value=None,
                    # A real list rather than a dict_keys view.
                    choices=list(Category.__members__),
                    label=f"Recognized category ({', '.join(Category.__members__.keys())})",
                    show_label=True,
                    interactive=False,
                )
                intermediate_outputs = gr.Accordion(
                    "Intermediate outputs", open=True, visible=False
                )
                with intermediate_outputs:
                    extracted_text = gr.Textbox(
                        label="Extracted text",
                        show_label=True,
                        max_lines=5,
                        show_copy_button=True,
                        lines=5,
                        interactive=False,
                    )
                    chatbot = gr.Chatbot(
                        None,
                        label="Chatbot interaction",
                        show_label=True,
                        interactive=False,
                        height=240,
                    )
                information = gr.JSON(label="Extracted information")
                with gr.Row():
                    flag_incorrect_button = gr.Button(
                        "Flag as incorrect", variant="stop", interactive=True
                    )
                    flag_irrelevant_button = gr.Button(
                        "Flag as irrelevant", variant="stop", interactive=True
                    )

    show_intermediate.change(
        show_intermediate_outputs, show_intermediate, [intermediate_outputs]
    )

    clear.click(clear_inputs, None, [input_file])
    # Submit triggers a chain: extracted_text -> category -> chatbot/information.
    submit_button.click(
        submit,
        [input_file, extracted_text],
        [extracted_text],
    )
    # With a file present, reset the previous results; otherwise leave them.
    submit_button.click(
        lambda input_file, category, chatbot, information: (
            gr.Dropdown.update(None),
            gr.Chatbot.update(None),
            # `information` is a gr.JSON component, so use its own update.
            gr.JSON.update(None),
        ) if input_file else (category, chatbot, information),
        [input_file, category, chatbot, information],
        [category, chatbot, information],
    )
    extracted_text.change(
        categorize_extracted_text,
        [extracted_text],
        [category],
    )
    category.change(
        extract_from_category,
        [category, extracted_text],
        [chatbot, information, flag_incorrect_button, flag_irrelevant_button],
    )

    # Automatic flagging of every result, gated on the "Share results" box.
    hf_writer_normal.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged",
    )
    flag_method = gr.flagging.FlagMethod(
        hf_writer_normal, "", "", visual_feedback=True
    )
    information.change(
        dynamic_auto_flag(flag_method),
        inputs=[
            share_result,
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=None,
        preprocess=False,
        queue=False,
    )

    # Manual flagging: both buttons write through hf_writer_incorrect.
    hf_writer_incorrect.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged_incorrect",
    )
    flag_incorrect_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as incorrect",
        "Incorrect",
        visual_feedback=True,
    )
    flag_incorrect_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_incorrect_button,
        queue=False,
    )
    flag_incorrect_button.click(
        flag_incorrect_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_incorrect_button],
        preprocess=False,
        queue=False,
    )

    flag_irrelevant_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as irrelevant",
        "Irrelevant",
        visual_feedback=True,
    )
    flag_irrelevant_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_irrelevant_button,
        queue=False,
    )
    flag_irrelevant_button.click(
        flag_irrelevant_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_irrelevant_button],
        preprocess=False,
        queue=False,
    )


page.launch(show_api=True, show_error=True, debug=True)
|
categories/__init__.py
CHANGED
@@ -14,11 +14,11 @@ from . import accomodation, random_, travel_cab, travel_flight
|
|
14 |
|
15 |
|
16 |
class Category(Enum):
|
17 |
-
ACCOMODATION = "
|
18 |
-
TRAVEL_FLIGHT = "
|
19 |
-
TRAVEL_CAB = "
|
20 |
-
# VENDOR = "
|
21 |
-
RANDOM = "
|
22 |
|
23 |
|
24 |
category_modules = {
|
|
|
14 |
|
15 |
|
16 |
class Category(Enum):
|
17 |
+
ACCOMODATION = "ACCOMODATION"
|
18 |
+
TRAVEL_FLIGHT = "TRAVEL_FLIGHT"
|
19 |
+
TRAVEL_CAB = "TRAVEL_CAB"
|
20 |
+
# VENDOR = "VENDOR"
|
21 |
+
RANDOM = "RANDOM"
|
22 |
|
23 |
|
24 |
category_modules = {
|
categories/random_/__init__.py
CHANGED
@@ -124,5 +124,11 @@ Customers desirous of availing input GST credit are requested to create a Busine
|
|
124 |
Please note that this invoice is not a demand for payment
|
125 |
|
126 |
Page 1 of 1"""
|
127 |
-
result = chain.
|
128 |
-
print(result.json(indent=4))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
Please note that this invoice is not a demand for payment
|
125 |
|
126 |
Page 1 of 1"""
|
127 |
+
# result = chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions())
|
128 |
+
# print(result.json(indent=4))
|
129 |
+
result = chain.generate(input_list=[{"text": text, "format_instructions": fixing_parser.get_format_instructions()}])
|
130 |
+
print(result)
|
131 |
+
result = fixing_parser.parse_with_prompt(result.generations[0][0].text, chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions()))
|
132 |
+
print(result)
|
133 |
+
# result = chain.run(text=text, format_instructions=output_parser.get_format_instructions(), verbose=True)
|
134 |
+
# print(result)
|
main.py
CHANGED
@@ -16,11 +16,12 @@ def categorize_and_parse_text(text: str) -> BaseModel:
|
|
16 |
Returns: The category of the text.
|
17 |
"""
|
18 |
category = categories.categorize_text(text)
|
19 |
-
|
|
|
20 |
result = categories.run_category_chain(category, text)
|
21 |
return result
|
22 |
|
23 |
-
def process_pdf(filename: Path) -> BaseModel:
|
24 |
"""Processes the given PDF file and extracts information from it.
|
25 |
|
26 |
Args:
|
@@ -35,13 +36,14 @@ def process_pdf(filename: Path) -> BaseModel:
|
|
35 |
# If the encoded text is too short, a pdf scanner probably added a watermark
|
36 |
if len(text) < 20:
|
37 |
# Try to extract text from images
|
38 |
-
images = processing.
|
39 |
text = extract.extract_text_from_images_pyocr_tesseract(images)
|
40 |
-
|
|
|
41 |
result = categorize_and_parse_text(text)
|
42 |
return result
|
43 |
|
44 |
-
def process_image(filename: Path) -> BaseModel:
|
45 |
"""Processes the given image file and extracts information from it.
|
46 |
|
47 |
Args:
|
@@ -50,8 +52,11 @@ def process_image(filename: Path) -> BaseModel:
|
|
50 |
Returns: The extracted information.
|
51 |
"""
|
52 |
image = Image.open(filename)
|
|
|
53 |
text = extract.extract_text_from_image_pyocr_tesseract(image)
|
54 |
image.close()
|
|
|
|
|
55 |
result = categorize_and_parse_text(text)
|
56 |
return result
|
57 |
|
|
|
16 |
Returns: The category of the text.
|
17 |
"""
|
18 |
category = categories.categorize_text(text)
|
19 |
+
# if stop_on_category:
|
20 |
+
# return category, text
|
21 |
result = categories.run_category_chain(category, text)
|
22 |
return result
|
23 |
|
24 |
+
def process_pdf(filename: Path, extract_only=False) -> BaseModel:
|
25 |
"""Processes the given PDF file and extracts information from it.
|
26 |
|
27 |
Args:
|
|
|
36 |
# If the encoded text is too short, a pdf scanner probably added a watermark
|
37 |
if len(text) < 20:
|
38 |
# Try to extract text from images
|
39 |
+
images = processing.preprocess_pdf_pdf2image(pdf_bytes)
|
40 |
text = extract.extract_text_from_images_pyocr_tesseract(images)
|
41 |
+
if extract_only:
|
42 |
+
return text
|
43 |
result = categorize_and_parse_text(text)
|
44 |
return result
|
45 |
|
46 |
+
def process_image(filename: Path, extract_only=False) -> BaseModel:
|
47 |
"""Processes the given image file and extracts information from it.
|
48 |
|
49 |
Args:
|
|
|
52 |
Returns: The extracted information.
|
53 |
"""
|
54 |
image = Image.open(filename)
|
55 |
+
image = processing.preprocess_image(image)
|
56 |
text = extract.extract_text_from_image_pyocr_tesseract(image)
|
57 |
image.close()
|
58 |
+
if extract_only:
|
59 |
+
return text
|
60 |
result = categorize_and_parse_text(text)
|
61 |
return result
|
62 |
|
requirements.txt
CHANGED
@@ -1,345 +1,123 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
alabaster==0.7.12
|
8 |
-
anyio==3.5.0
|
9 |
-
appdirs==1.4.4
|
10 |
-
argon2-cffi==21.3.0
|
11 |
-
argon2-cffi-bindings==21.2.0
|
12 |
-
arrow==1.2.3
|
13 |
-
astroid==2.14.2
|
14 |
-
astropy==5.1
|
15 |
asttokens==2.2.1
|
16 |
async-timeout==4.0.2
|
17 |
-
|
18 |
-
attrs==22.1.0
|
19 |
-
Automat==20.2.0
|
20 |
-
autopep8==1.6.0
|
21 |
-
Babel==2.11.0
|
22 |
backcall==0.2.0
|
23 |
-
|
24 |
-
beautifulsoup4==4.12.2
|
25 |
-
binaryornot==0.4.4
|
26 |
-
bleach==4.1.0
|
27 |
-
bokeh==3.1.1
|
28 |
-
botocore==1.29.76
|
29 |
-
Bottleneck==1.3.5
|
30 |
-
brotlipy==0.7.0
|
31 |
certifi==2023.7.22
|
32 |
-
|
33 |
-
|
34 |
-
charset-normalizer==2.0.4
|
35 |
-
click==8.0.4
|
36 |
-
cloudpickle==2.2.1
|
37 |
colorama==0.4.6
|
38 |
-
colorcet==3.0.1
|
39 |
comm==0.1.3
|
40 |
-
|
41 |
-
contourpy==1.0.5
|
42 |
-
cookiecutter==1.7.3
|
43 |
-
cryptography==39.0.1
|
44 |
-
cssselect==1.1.0
|
45 |
cycler==0.11.0
|
46 |
-
cytoolz==0.12.0
|
47 |
-
daal4py==2023.1.1
|
48 |
-
dask==2023.6.0
|
49 |
dataclasses-json==0.5.13
|
50 |
-
datasets==2.
|
51 |
-
datashader==0.15.0
|
52 |
-
datashape==0.5.4
|
53 |
debugpy==1.6.7
|
54 |
decorator==5.1.1
|
55 |
-
|
56 |
-
|
57 |
-
dill==0.3.6
|
58 |
-
distributed==2023.6.0
|
59 |
-
docstring-to-markdown==0.11
|
60 |
-
docutils==0.18.1
|
61 |
-
entrypoints==0.4
|
62 |
-
et-xmlfile==1.1.0
|
63 |
-
exceptiongroup==1.0.4
|
64 |
executing==1.2.0
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
greenlet==2.0.
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
huggingface-hub==0.
|
78 |
-
hvplot==0.8.4
|
79 |
-
hyperlink==21.0.0
|
80 |
idna==3.4
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
incremental==21.3.0
|
87 |
-
inflection==0.5.1
|
88 |
-
iniconfig==1.1.1
|
89 |
-
intake==0.6.8
|
90 |
-
intervaltree==3.1.0
|
91 |
-
ipykernel==6.22.0
|
92 |
-
ipython==8.12.0
|
93 |
-
ipython-genutils==0.2.0
|
94 |
-
ipywidgets==8.0.4
|
95 |
-
iso4217==1.9.20220401
|
96 |
-
isort==5.9.3
|
97 |
-
itemadapter==0.3.0
|
98 |
-
itemloaders==1.0.4
|
99 |
-
itsdangerous==2.0.1
|
100 |
-
jaraco.classes==3.2.1
|
101 |
jedi==0.18.2
|
102 |
-
jellyfish==0.9.0
|
103 |
Jinja2==3.1.2
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
jsonschema==4.17.3
|
109 |
-
jupyter==1.0.0
|
110 |
-
jupyter_client==8.2.0
|
111 |
-
jupyter-console==6.6.3
|
112 |
-
jupyter_core==5.3.0
|
113 |
-
jupyter-events==0.6.3
|
114 |
-
jupyter-server==1.23.6
|
115 |
-
jupyter_server_fileid==0.9.0
|
116 |
-
jupyter_server_terminals==0.4.4
|
117 |
-
jupyter_server_ydoc==0.8.0
|
118 |
-
jupyter-ydoc==0.2.4
|
119 |
-
jupyterlab==3.6.3
|
120 |
-
jupyterlab-pygments==0.1.2
|
121 |
-
jupyterlab_server==2.22.0
|
122 |
-
jupyterlab-widgets==3.0.5
|
123 |
-
keyring==23.13.1
|
124 |
kiwisolver==1.4.4
|
125 |
-
langchain==0.0.
|
126 |
langsmith==0.0.15
|
127 |
-
|
128 |
-
lazy-object-proxy==1.6.0
|
129 |
-
linkify-it-py==2.0.0
|
130 |
-
llvmlite==0.40.0
|
131 |
-
lmdb==1.4.1
|
132 |
-
locket==1.0.0
|
133 |
-
lxml==4.9.2
|
134 |
-
lz4==4.3.2
|
135 |
-
Markdown==3.4.1
|
136 |
markdown-it-py==2.2.0
|
137 |
-
MarkupSafe==2.1.
|
138 |
marshmallow==3.20.1
|
139 |
-
matplotlib==3.7.
|
140 |
matplotlib-inline==0.1.6
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
mkl-fft==1.3.6
|
147 |
-
mkl-random==1.2.2
|
148 |
-
mkl-service==2.4.0
|
149 |
-
more-itertools==8.12.0
|
150 |
-
mpmath==1.2.1
|
151 |
-
msgpack==1.0.3
|
152 |
-
multidict==6.0.2
|
153 |
-
multipledispatch==0.6.0
|
154 |
-
multiprocess==0.70.14
|
155 |
-
munkres==1.1.4
|
156 |
-
mypy-extensions==0.4.3
|
157 |
-
nbclassic==0.5.5
|
158 |
-
nbclient==0.5.13
|
159 |
-
nbconvert==7.7.3
|
160 |
-
nbformat==5.7.0
|
161 |
nest-asyncio==1.5.6
|
162 |
-
networkx==2.8.4
|
163 |
-
nltk==3.7
|
164 |
-
notebook==6.5.4
|
165 |
-
notebook_shim==0.2.2
|
166 |
-
numba==0.57.0
|
167 |
numexpr==2.8.4
|
168 |
-
numpy==1.
|
169 |
-
numpydoc==1.5.0
|
170 |
openai==0.27.8
|
171 |
openapi-schema-pydantic==1.2.4
|
172 |
opencv-python-headless==4.8.0.74
|
173 |
-
|
174 |
-
packaging==23.
|
175 |
-
pandas==
|
176 |
-
pandocfilters==1.5.0
|
177 |
-
panel==1.1.0
|
178 |
-
param==1.13.0
|
179 |
-
paramiko==2.8.1
|
180 |
-
parsel==1.6.0
|
181 |
parso==0.8.3
|
182 |
-
partd==1.2.0
|
183 |
-
pathspec==0.10.3
|
184 |
-
patsy==0.5.3
|
185 |
pdf2image==1.16.3
|
186 |
-
pep8==1.7.1
|
187 |
-
pexpect==4.8.0
|
188 |
pickleshare==0.7.5
|
189 |
-
Pillow==
|
190 |
-
pip==23.1
|
191 |
-
platformdirs==3.
|
192 |
-
|
193 |
-
pluggy==1.0.0
|
194 |
-
ply==3.11
|
195 |
-
pooch==1.4.0
|
196 |
-
poyo==0.5.0
|
197 |
-
prometheus-client==0.14.1
|
198 |
-
prompt-toolkit==3.0.38
|
199 |
-
Protego==0.1.16
|
200 |
psutil==5.9.5
|
201 |
-
ptyprocess==0.7.0
|
202 |
pure-eval==0.2.2
|
203 |
-
|
204 |
-
pyarrow==11.0.0
|
205 |
-
pyasn1==0.4.8
|
206 |
-
pyasn1-modules==0.2.8
|
207 |
-
pycodestyle==2.10.0
|
208 |
-
pycparser==2.21
|
209 |
-
pyct==0.5.0
|
210 |
-
pycurl==7.45.2
|
211 |
pydantic==1.10.12
|
212 |
-
|
213 |
-
|
214 |
-
pyerfa==2.0.0
|
215 |
-
pyflakes==3.0.1
|
216 |
Pygments==2.15.1
|
217 |
-
pylint==2.16.2
|
218 |
-
pylint-venv==2.3.0
|
219 |
-
pyls-spyder==0.4.0
|
220 |
-
PyNaCl==1.5.0
|
221 |
pyocr==0.8.3
|
222 |
-
pyodbc==4.0.34
|
223 |
-
pyOpenSSL==23.0.0
|
224 |
pyparsing==3.0.9
|
225 |
pypdf==3.13.0
|
226 |
-
PyQt5==5.15.7
|
227 |
-
PyQt5-sip==12.11.0
|
228 |
-
PyQtWebEngine==5.15.4
|
229 |
-
pyrsistent==0.18.0
|
230 |
-
PySocks==1.7.1
|
231 |
-
pytest==7.3.1
|
232 |
python-dateutil==2.8.2
|
233 |
-
python-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
pywin32==305.1
|
244 |
-
pywin32-ctypes==0.2.0
|
245 |
-
pywinpty==2.0.10
|
246 |
-
PyYAML==6.0
|
247 |
-
pyzmq==25.0.2
|
248 |
-
QDarkStyle==3.0.2
|
249 |
-
qstylizer==0.2.2
|
250 |
-
QtAwesome==1.2.2
|
251 |
-
qtconsole==5.4.2
|
252 |
-
QtPy==2.2.0
|
253 |
-
queuelib==1.5.0
|
254 |
-
regex==2022.7.9
|
255 |
-
requests==2.29.0
|
256 |
-
requests-file==1.5.1
|
257 |
-
responses==0.13.3
|
258 |
-
rfc3339-validator==0.1.4
|
259 |
-
rfc3986-validator==0.1.1
|
260 |
-
rope==1.7.0
|
261 |
-
Rtree==1.0.1
|
262 |
-
s3fs==2023.4.0
|
263 |
-
sacremoses==0.0.43
|
264 |
-
scikit-image==0.20.0
|
265 |
-
scikit-learn==1.2.2
|
266 |
-
scikit-learn-intelex==20230426.121158
|
267 |
-
scipy==1.10.1
|
268 |
-
Scrapy==2.8.0
|
269 |
-
seaborn==0.12.2
|
270 |
-
Send2Trash==1.8.0
|
271 |
-
service-identity==18.1.0
|
272 |
-
setuptools==67.8.0
|
273 |
-
sip==6.6.2
|
274 |
six==1.16.0
|
275 |
-
|
276 |
-
|
277 |
-
snowballstemmer==2.2.0
|
278 |
-
sortedcontainers==2.4.0
|
279 |
-
soupsieve==2.4
|
280 |
-
Sphinx==5.0.2
|
281 |
-
sphinxcontrib-applehelp==1.0.2
|
282 |
-
sphinxcontrib-devhelp==1.0.2
|
283 |
-
sphinxcontrib-htmlhelp==2.0.0
|
284 |
-
sphinxcontrib-jsmath==1.0.1
|
285 |
-
sphinxcontrib-qthelp==1.0.3
|
286 |
-
sphinxcontrib-serializinghtml==1.1.5
|
287 |
-
spyder==5.4.3
|
288 |
-
spyder-kernels==2.4.3
|
289 |
-
SQLAlchemy==1.4.39
|
290 |
stack-data==0.6.2
|
291 |
-
|
292 |
-
sympy==1.11.1
|
293 |
-
tables==3.8.0
|
294 |
-
tabulate==0.8.10
|
295 |
-
TBB==0.2
|
296 |
-
tblib==1.7.0
|
297 |
tenacity==8.2.2
|
298 |
-
terminado==0.17.1
|
299 |
-
text-unidecode==1.3
|
300 |
-
textdistance==4.2.1
|
301 |
-
threadpoolctl==2.2.0
|
302 |
-
three-merge==0.1.1
|
303 |
-
tifffile==2021.7.2
|
304 |
-
tinycss2==1.2.1
|
305 |
-
tldextract==3.2.0
|
306 |
-
tokenizers==0.13.2
|
307 |
-
toml==0.10.2
|
308 |
-
tomli==2.0.1
|
309 |
-
tomlkit==0.11.1
|
310 |
toolz==0.12.0
|
311 |
-
|
312 |
-
tornado==6.3.1
|
313 |
tqdm==4.65.0
|
314 |
traitlets==5.9.0
|
315 |
-
|
316 |
-
Twisted==22.10.0
|
317 |
-
twisted-iocpsupport==1.0.2
|
318 |
-
typing_extensions==4.6.3
|
319 |
typing-inspect==0.9.0
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
w3lib==1.21.0
|
325 |
-
watchdog==2.1.6
|
326 |
wcwidth==0.2.6
|
327 |
-
|
328 |
-
websocket-client==0.58.0
|
329 |
-
Werkzeug==2.2.3
|
330 |
-
whatthepatch==1.0.2
|
331 |
wheel==0.38.4
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
xarray==2022.11.0
|
336 |
-
xlwings==0.29.1
|
337 |
-
xxhash==2.0.2
|
338 |
-
xyzservices==2022.9.0
|
339 |
-
y-py==0.5.9
|
340 |
-
yapf==0.31.0
|
341 |
-
yarl==1.8.1
|
342 |
-
ypy-websocket==0.8.2
|
343 |
-
zict==2.2.0
|
344 |
-
zipp==3.11.0
|
345 |
-
zope.interface==5.4.0
|
|
|
1 |
+
aiofiles==23.1.0
|
2 |
+
aiohttp==3.8.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.0.1
|
5 |
+
annotated-types==0.5.0
|
6 |
+
anyio==3.7.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
asttokens==2.2.1
|
8 |
async-timeout==4.0.2
|
9 |
+
attrs==23.1.0
|
|
|
|
|
|
|
|
|
10 |
backcall==0.2.0
|
11 |
+
backports.functools-lru-cache==1.6.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
certifi==2023.7.22
|
13 |
+
charset-normalizer==3.2.0
|
14 |
+
click==8.1.6
|
|
|
|
|
|
|
15 |
colorama==0.4.6
|
|
|
16 |
comm==0.1.3
|
17 |
+
contourpy==1.1.0
|
|
|
|
|
|
|
|
|
18 |
cycler==0.11.0
|
|
|
|
|
|
|
19 |
dataclasses-json==0.5.13
|
20 |
+
datasets==2.14.1
|
|
|
|
|
21 |
debugpy==1.6.7
|
22 |
decorator==5.1.1
|
23 |
+
dill==0.3.7
|
24 |
+
exceptiongroup==1.1.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
executing==1.2.0
|
26 |
+
fastapi==0.100.1
|
27 |
+
ffmpy==0.3.1
|
28 |
+
filelock==3.12.2
|
29 |
+
fonttools==4.41.1
|
30 |
+
frozenlist==1.4.0
|
31 |
+
fsspec==2023.6.0
|
32 |
+
gradio==3.39.0
|
33 |
+
gradio_client==0.3.0
|
34 |
+
greenlet==2.0.2
|
35 |
+
h11==0.14.0
|
36 |
+
httpcore==0.17.3
|
37 |
+
httpx==0.24.1
|
38 |
+
huggingface-hub==0.16.4
|
|
|
|
|
39 |
idna==3.4
|
40 |
+
importlib-metadata==6.8.0
|
41 |
+
importlib-resources==6.0.0
|
42 |
+
ipykernel==6.25.0
|
43 |
+
ipython==8.14.0
|
44 |
+
iso4217==1.11.20220401
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
jedi==0.18.2
|
|
|
46 |
Jinja2==3.1.2
|
47 |
+
jsonschema==4.18.4
|
48 |
+
jsonschema-specifications==2023.7.1
|
49 |
+
jupyter_client==8.3.0
|
50 |
+
jupyter_core==5.3.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
kiwisolver==1.4.4
|
52 |
+
langchain==0.0.247
|
53 |
langsmith==0.0.15
|
54 |
+
linkify-it-py==2.0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
markdown-it-py==2.2.0
|
56 |
+
MarkupSafe==2.1.3
|
57 |
marshmallow==3.20.1
|
58 |
+
matplotlib==3.7.2
|
59 |
matplotlib-inline==0.1.6
|
60 |
+
mdit-py-plugins==0.3.3
|
61 |
+
mdurl==0.1.2
|
62 |
+
multidict==6.0.4
|
63 |
+
multiprocess==0.70.15
|
64 |
+
mypy-extensions==1.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
nest-asyncio==1.5.6
|
|
|
|
|
|
|
|
|
|
|
66 |
numexpr==2.8.4
|
67 |
+
numpy==1.25.1
|
|
|
68 |
openai==0.27.8
|
69 |
openapi-schema-pydantic==1.2.4
|
70 |
opencv-python-headless==4.8.0.74
|
71 |
+
orjson==3.9.2
|
72 |
+
packaging==23.1
|
73 |
+
pandas==2.0.3
|
|
|
|
|
|
|
|
|
|
|
74 |
parso==0.8.3
|
|
|
|
|
|
|
75 |
pdf2image==1.16.3
|
|
|
|
|
76 |
pickleshare==0.7.5
|
77 |
+
Pillow==10.0.0
|
78 |
+
pip==23.2.1
|
79 |
+
platformdirs==3.9.1
|
80 |
+
prompt-toolkit==3.0.39
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
psutil==5.9.5
|
|
|
82 |
pure-eval==0.2.2
|
83 |
+
pyarrow==12.0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
pydantic==1.10.12
|
85 |
+
pydantic_core==2.4.0
|
86 |
+
pydub==0.25.1
|
|
|
|
|
87 |
Pygments==2.15.1
|
|
|
|
|
|
|
|
|
88 |
pyocr==0.8.3
|
|
|
|
|
89 |
pyparsing==3.0.9
|
90 |
pypdf==3.13.0
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
python-dateutil==2.8.2
|
92 |
+
python-multipart==0.0.6
|
93 |
+
pytz==2023.3
|
94 |
+
pywin32==304
|
95 |
+
PyYAML==6.0.1
|
96 |
+
pyzmq==25.1.0
|
97 |
+
referencing==0.30.0
|
98 |
+
requests==2.31.0
|
99 |
+
rpds-py==0.9.2
|
100 |
+
semantic-version==2.10.0
|
101 |
+
setuptools==68.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
six==1.16.0
|
103 |
+
sniffio==1.3.0
|
104 |
+
SQLAlchemy==2.0.19
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
stack-data==0.6.2
|
106 |
+
starlette==0.27.0
|
|
|
|
|
|
|
|
|
|
|
107 |
tenacity==8.2.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
toolz==0.12.0
|
109 |
+
tornado==6.3.2
|
|
|
110 |
tqdm==4.65.0
|
111 |
traitlets==5.9.0
|
112 |
+
typing_extensions==4.7.1
|
|
|
|
|
|
|
113 |
typing-inspect==0.9.0
|
114 |
+
tzdata==2023.3
|
115 |
+
uc-micro-py==1.0.2
|
116 |
+
urllib3==2.0.4
|
117 |
+
uvicorn==0.23.1
|
|
|
|
|
118 |
wcwidth==0.2.6
|
119 |
+
websockets==11.0.3
|
|
|
|
|
|
|
120 |
wheel==0.38.4
|
121 |
+
xxhash==3.3.0
|
122 |
+
yarl==1.9.2
|
123 |
+
zipp==3.16.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|