⚡️ add warning for truncation
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- app.py +17 -15
- pdf2text.py +3 -1
app.py
CHANGED
@@ -55,7 +55,7 @@ def load_uploaded_file(file_obj, temp_dir: Path = None):
|
|
55 |
return None
|
56 |
|
57 |
|
58 |
-
def convert_PDF(pdf_obj, language: str = "en"):
|
59 |
"""
|
60 |
convert_PDF - convert a PDF file to text
|
61 |
|
@@ -76,15 +76,18 @@ def convert_PDF(pdf_obj, language: str = "en"):
|
|
76 |
conversion_stats = convert_PDF_to_Text(
|
77 |
file_path,
|
78 |
ocr_model=ocr_model,
|
79 |
-
max_pages=
|
80 |
)
|
81 |
converted_txt = conversion_stats["converted_text"]
|
82 |
num_pages = conversion_stats["num_pages"]
|
|
|
83 |
# if alt_lang: # TODO: fix this
|
84 |
|
85 |
rt = round((time.perf_counter() - st) / 60, 2)
|
86 |
print(f"Runtime: {rt} minutes")
|
87 |
html = ""
|
|
|
|
|
88 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
89 |
|
90 |
return converted_txt, html
|
@@ -125,20 +128,14 @@ if __name__ == "__main__":
|
|
125 |
gr.Markdown("Upload your own file:")
|
126 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
127 |
|
128 |
-
|
129 |
-
|
130 |
-
label="VM file path",
|
131 |
-
placeholder="When the file is uploaded, the path will appear here",
|
132 |
-
value=pdf_obj,
|
133 |
-
)
|
134 |
-
with gr.Row():
|
135 |
-
uploaded_file = gr.File(
|
136 |
label="Upload a PDF file",
|
137 |
file_count="single",
|
138 |
type="file",
|
139 |
value= _here / "example_file.pdf",
|
140 |
)
|
141 |
-
load_file_button = gr.Button("Load Uploaded File")
|
142 |
|
143 |
gr.Markdown("---")
|
144 |
|
@@ -150,10 +147,15 @@ if __name__ == "__main__":
|
|
150 |
OCR_text = gr.Textbox(
|
151 |
label="OCR Result", placeholder="The OCR text will appear here"
|
152 |
)
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
convert_button.click(
|
159 |
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
|
|
55 |
return None
|
56 |
|
57 |
|
58 |
+
def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
59 |
"""
|
60 |
convert_PDF - convert a PDF file to text
|
61 |
|
|
|
76 |
conversion_stats = convert_PDF_to_Text(
|
77 |
file_path,
|
78 |
ocr_model=ocr_model,
|
79 |
+
max_pages=max_pages,
|
80 |
)
|
81 |
converted_txt = conversion_stats["converted_text"]
|
82 |
num_pages = conversion_stats["num_pages"]
|
83 |
+
was_truncated = conversion_stats["truncated"]
|
84 |
# if alt_lang: # TODO: fix this
|
85 |
|
86 |
rt = round((time.perf_counter() - st) / 60, 2)
|
87 |
print(f"Runtime: {rt} minutes")
|
88 |
html = ""
|
89 |
+
if was_truncated:
|
90 |
+
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
91 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
92 |
|
93 |
return converted_txt, html
|
|
|
128 |
gr.Markdown("Upload your own file:")
|
129 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
130 |
|
131 |
+
|
132 |
+
uploaded_file = gr.File(
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
label="Upload a PDF file",
|
134 |
file_count="single",
|
135 |
type="file",
|
136 |
value= _here / "example_file.pdf",
|
137 |
)
|
138 |
+
# load_file_button = gr.Button("Load Uploaded File")
|
139 |
|
140 |
gr.Markdown("---")
|
141 |
|
|
|
147 |
OCR_text = gr.Textbox(
|
148 |
label="OCR Result", placeholder="The OCR text will appear here"
|
149 |
)
|
150 |
+
text_file = gr.File(
|
151 |
+
label="Download Text File",
|
152 |
+
file_count="single",
|
153 |
+
type="file",
|
154 |
+
interactive=False,
|
155 |
+
)
|
156 |
+
# load_file_button.click(
|
157 |
+
# fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
|
158 |
+
# )
|
159 |
|
160 |
convert_button.click(
|
161 |
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
pdf2text.py
CHANGED
@@ -591,12 +591,13 @@ def convert_PDF_to_Text(
|
|
591 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
592 |
logging.info(f"starting OCR on {PDF_file.name}")
|
593 |
doc = DocumentFile.from_pdf(PDF_file)
|
594 |
-
|
595 |
if len(doc) > max_pages:
|
596 |
logging.warning(
|
597 |
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
598 |
)
|
599 |
doc = doc[:max_pages]
|
|
|
600 |
|
601 |
# Analyze
|
602 |
logging.info(f"running OCR on {len(doc)} pages")
|
@@ -616,6 +617,7 @@ def convert_PDF_to_Text(
|
|
616 |
"runtime": round(fn_rt, 2),
|
617 |
"date": str(date.today()),
|
618 |
"converted_text": ocr_results,
|
|
|
619 |
"length": len(ocr_results),
|
620 |
}
|
621 |
|
|
|
591 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
592 |
logging.info(f"starting OCR on {PDF_file.name}")
|
593 |
doc = DocumentFile.from_pdf(PDF_file)
|
594 |
+
truncated = False
|
595 |
if len(doc) > max_pages:
|
596 |
logging.warning(
|
597 |
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
598 |
)
|
599 |
doc = doc[:max_pages]
|
600 |
+
truncated = True
|
601 |
|
602 |
# Analyze
|
603 |
logging.info(f"running OCR on {len(doc)} pages")
|
|
|
617 |
"runtime": round(fn_rt, 2),
|
618 |
"date": str(date.today()),
|
619 |
"converted_text": ocr_results,
|
620 |
+
"truncated": truncated,
|
621 |
"length": len(ocr_results),
|
622 |
}
|
623 |
|