π general ease of use
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
@@ -72,7 +72,9 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
|
72 |
if isinstance(pdf_obj, list):
|
73 |
pdf_obj = pdf_obj[0]
|
74 |
file_path = Path(pdf_obj.name)
|
75 |
-
|
|
|
|
|
76 |
conversion_stats = convert_PDF_to_Text(
|
77 |
file_path,
|
78 |
ocr_model=ocr_model,
|
@@ -90,7 +92,11 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
|
90 |
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
91 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
92 |
|
93 |
-
|
|
|
|
|
|
|
|
|
94 |
|
95 |
|
96 |
if __name__ == "__main__":
|
@@ -125,7 +131,7 @@ if __name__ == "__main__":
|
|
125 |
with gr.Column():
|
126 |
|
127 |
gr.Markdown("## Load Inputs")
|
128 |
-
gr.Markdown("Upload your own file
|
129 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
130 |
|
131 |
|
@@ -135,13 +141,12 @@ if __name__ == "__main__":
|
|
135 |
type="file",
|
136 |
value= _here / "example_file.pdf",
|
137 |
)
|
138 |
-
# load_file_button = gr.Button("Load Uploaded File")
|
139 |
|
140 |
gr.Markdown("---")
|
141 |
|
142 |
with gr.Column():
|
143 |
gr.Markdown("## Convert PDF to Text")
|
144 |
-
convert_button = gr.Button("Convert PDF!")
|
145 |
out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
|
146 |
gr.Markdown("### Output")
|
147 |
OCR_text = gr.Textbox(
|
@@ -153,11 +158,8 @@ if __name__ == "__main__":
|
|
153 |
type="file",
|
154 |
interactive=False,
|
155 |
)
|
156 |
-
# load_file_button.click(
|
157 |
-
# fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
|
158 |
-
# )
|
159 |
|
160 |
convert_button.click(
|
161 |
-
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
162 |
)
|
163 |
demo.launch(enable_queue=True)
|
|
|
72 |
if isinstance(pdf_obj, list):
|
73 |
pdf_obj = pdf_obj[0]
|
74 |
file_path = Path(pdf_obj.name)
|
75 |
+
if not file_path.suffix == ".pdf":
|
76 |
+
logging.error(f"File {file_path} is not a PDF file")
|
77 |
+
return "File is not a PDF file", None, None
|
78 |
conversion_stats = convert_PDF_to_Text(
|
79 |
file_path,
|
80 |
ocr_model=ocr_model,
|
|
|
92 |
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
93 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
94 |
|
95 |
+
_output_name = f"RESULT_{file_path.stem}_OCR.txt"
|
96 |
+
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
97 |
+
f.write(converted_txt)
|
98 |
+
|
99 |
+
return converted_txt, html, _output_name
|
100 |
|
101 |
|
102 |
if __name__ == "__main__":
|
|
|
131 |
with gr.Column():
|
132 |
|
133 |
gr.Markdown("## Load Inputs")
|
134 |
+
gr.Markdown("Upload your own file & replace the default")
|
135 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
136 |
|
137 |
|
|
|
141 |
type="file",
|
142 |
value= _here / "example_file.pdf",
|
143 |
)
|
|
|
144 |
|
145 |
gr.Markdown("---")
|
146 |
|
147 |
with gr.Column():
|
148 |
gr.Markdown("## Convert PDF to Text")
|
149 |
+
convert_button = gr.Button("Convert PDF!", variant="primary")
|
150 |
out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
|
151 |
gr.Markdown("### Output")
|
152 |
OCR_text = gr.Textbox(
|
|
|
158 |
type="file",
|
159 |
interactive=False,
|
160 |
)
|
|
|
|
|
|
|
161 |
|
162 |
convert_button.click(
|
163 |
+
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder, text_file]
|
164 |
)
|
165 |
demo.launch(enable_queue=True)
|