format
Browse files
- app.py +6 -1
- pdf2text.py +7 -8
app.py
CHANGED
@@ -95,7 +95,12 @@ if __name__ == "__main__":
|
|
95 |
logging.info(f"Using GPU status: {use_GPU}")
|
96 |
logging.info("Loading OCR model")
|
97 |
with contextlib.redirect_stdout(None):
|
98 |
-
ocr_model = ocr_predictor(
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# define pdf bytes as None
|
101 |
pdf_obj = _here / "example_file.pdf"
|
|
|
95 |
logging.info(f"Using GPU status: {use_GPU}")
|
96 |
logging.info("Loading OCR model")
|
97 |
with contextlib.redirect_stdout(None):
|
98 |
+
ocr_model = ocr_predictor(
|
99 |
+
"db_resnet50",
|
100 |
+
"crnn_mobilenet_v3_large",
|
101 |
+
pretrained=True,
|
102 |
+
assume_straight_pages=True,
|
103 |
+
)
|
104 |
|
105 |
# define pdf bytes as None
|
106 |
pdf_obj = _here / "example_file.pdf"
|
pdf2text.py
CHANGED
@@ -32,6 +32,8 @@ from tqdm.auto import tqdm
|
|
32 |
|
33 |
from doctr.io import DocumentFile
|
34 |
from doctr.models import ocr_predictor
|
|
|
|
|
35 |
def fast_scandir(dirname):
|
36 |
# return all subfolders in a given filepath
|
37 |
|
@@ -421,7 +423,6 @@ def download_URL(url: str, file=None, dlpath=None, verbose=False):
|
|
421 |
"""
|
422 |
|
423 |
|
424 |
-
|
425 |
# need to run only once to load model into memory
|
426 |
|
427 |
custom_replace_list = {
|
@@ -554,6 +555,7 @@ def postprocess(text: str) -> str:
|
|
554 |
|
555 |
return eval_and_replace(proc)
|
556 |
|
|
|
557 |
def result2text(result) -> str:
|
558 |
"""Convert OCR result to text"""
|
559 |
|
@@ -568,11 +570,10 @@ def result2text(result) -> str:
|
|
568 |
text += word.value + " "
|
569 |
full_doc.append(text)
|
570 |
|
571 |
-
|
572 |
-
|
573 |
full_text = "\n".join(full_doc)
|
574 |
return full_text
|
575 |
|
|
|
576 |
import warnings
|
577 |
from datetime import date
|
578 |
from os.path import join
|
@@ -593,7 +594,9 @@ def convert_PDF_to_Text(
|
|
593 |
doc = DocumentFile.from_pdf(PDF_file)
|
594 |
|
595 |
if len(doc) > max_pages:
|
596 |
-
logging.warning(
|
|
|
|
|
597 |
doc = doc[:max_pages]
|
598 |
|
599 |
# Analyze
|
@@ -603,14 +606,10 @@ def convert_PDF_to_Text(
|
|
603 |
proc_text = format_ocr_out(raw_text)
|
604 |
output_text = postprocess(proc_text)
|
605 |
|
606 |
-
|
607 |
fn_rt = time.perf_counter() - st
|
608 |
|
609 |
-
|
610 |
-
|
611 |
logging.info("OCR complete")
|
612 |
|
613 |
-
|
614 |
results_dict = {
|
615 |
"num_pages": len(doc),
|
616 |
"runtime": round(fn_rt, 2),
|
|
|
32 |
|
33 |
from doctr.io import DocumentFile
|
34 |
from doctr.models import ocr_predictor
|
35 |
+
|
36 |
+
|
37 |
def fast_scandir(dirname):
|
38 |
# return all subfolders in a given filepath
|
39 |
|
|
|
423 |
"""
|
424 |
|
425 |
|
|
|
426 |
# need to run only once to load model into memory
|
427 |
|
428 |
custom_replace_list = {
|
|
|
555 |
|
556 |
return eval_and_replace(proc)
|
557 |
|
558 |
+
|
559 |
def result2text(result) -> str:
|
560 |
"""Convert OCR result to text"""
|
561 |
|
|
|
570 |
text += word.value + " "
|
571 |
full_doc.append(text)
|
572 |
|
|
|
|
|
573 |
full_text = "\n".join(full_doc)
|
574 |
return full_text
|
575 |
|
576 |
+
|
577 |
import warnings
|
578 |
from datetime import date
|
579 |
from os.path import join
|
|
|
594 |
doc = DocumentFile.from_pdf(PDF_file)
|
595 |
|
596 |
if len(doc) > max_pages:
|
597 |
+
logging.warning(
|
598 |
+
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
599 |
+
)
|
600 |
doc = doc[:max_pages]
|
601 |
|
602 |
# Analyze
|
|
|
606 |
proc_text = format_ocr_out(raw_text)
|
607 |
output_text = postprocess(proc_text)
|
608 |
|
|
|
609 |
fn_rt = time.perf_counter() - st
|
610 |
|
|
|
|
|
611 |
logging.info("OCR complete")
|
612 |
|
|
|
613 |
results_dict = {
|
614 |
"num_pages": len(doc),
|
615 |
"runtime": round(fn_rt, 2),
|