Spaces:
Build error
Build error
import os | |
import traceback | |
import argparse | |
from typing import List, Tuple, Set, Dict | |
import time | |
from PIL import Image | |
import numpy as np | |
from doctr.models import ocr_predictor | |
import logging | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import gradio | |
from utils import cropImages | |
from utils import draw_only_box,draw_box_with_text,getlogger,Annotation | |
from ocr_component1 import OCRComponent1 | |
from detectionAndOcrTable1 import DetectionAndOcrTable1 | |
from detectionAndOcrTable2 import DetectionAndOcrTable2 | |
from detectionAndOcrTable3 import DetectionAndOcrTable3 | |
from detectionAndOcrTable4 import DetectionAndOcrTable4 | |
from ocrTable1 import OcrTable1 | |
from ocrTable2 import OcrTable2 | |
from pdf2image import convert_from_path | |
def convertHTMLToCSV(html:str,output_path:str)->str:
    """
    Convert the first <table> of an HTML string into a CSV file.

    The first <tr> supplies the column names and every following <tr>
    becomes one record. The CSV is written by pandas with its default
    settings, so the integer row index is included as the first column.

    html : HTML markup containing at least one <table>
    output_path : where the CSV file is written
    return : output_path, matching the declared str return type
    raises : IndexError if the markup contains no <table>,
             ValueError if the first table has no <tr> rows
    """
    soup = BeautifulSoup(html,'html.parser')
    tables = soup.find_all("table")
    if not tables:
        raise IndexError("no <table> element found in the given HTML")
    rows = tables[0].find_all("tr")
    if not rows:
        raise ValueError("the first <table> contains no <tr> rows")

    def cell_texts(row):
        # Select the th/td children explicitly: iterating a <tr> directly
        # also visits the text nodes between cells, and skipping those must
        # not depend on a bare except.
        return [cell.get_text() for cell in row.find_all(["th", "td"], recursive=False)]

    list_header = cell_texts(rows[0])
    data = [cell_texts(row) for row in rows[1:]]
    # Storing the data into a pandas DataFrame, then into the CSV file
    dataFrame = pd.DataFrame(data = data, columns = list_header)
    dataFrame.to_csv(output_path)
    return output_path
def saveResults(image_list, results, labels, output_dir='output/', threshold=0.5):
    """
    Draw the detection results on each image and save it as "<index>.jpg".

    image_list : images to annotate; results[idx] is used for image idx
    results : per-image results, passed to draw_only_box
    labels : passed to draw_only_box
    output_dir : folder receiving the JPEG files (created if missing)
    threshold : passed to draw_only_box
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    for idx, im in enumerate(image_list):
        im = draw_only_box(im, results[idx], labels, threshold=threshold)
        out_path = os.path.join(output_dir, f"{idx}.jpg")
        im.save(out_path, quality=95)
        print("save result to: " + out_path)
def InputToImages(input_path:str,resolution=300)-> List[Image.Image]:
    """
    Open the image file at input_path.

    return : a list holding the opened Pillow image (an RGBA image is
             converted to RGB first), or an empty list when opening or
             converting fails; the traceback is printed in that case.
    The resolution argument is accepted but not used here.
    """
    loaded = []
    try:
        picture = Image.open(input_path)
        # Only RGBA input is converted; every other mode is kept as opened.
        loaded.append(picture.convert('RGB') if picture.mode == 'RGBA' else picture)
    except Exception:
        traceback.print_exc()
    return loaded
def drawTextDetRes(bxs :List[List[float]],img:Image.Image,output_path:str):
    """
    Draw text-detection boxes on img and save the result to output_path.

    Each entry of bxs is indexed as a sequence of points: the first point
    gives (xmin, ymin), the second point's x gives xmax and the last
    point's y gives ymax. Entries that do not satisfy xmin <= xmax and
    ymin <= ymax are left out.
    """
    boxes_xyxy = []
    for box in bxs:
        first_pt, second_pt, last_pt = box[0], box[1], box[-1]
        if first_pt[0] <= second_pt[0] and first_pt[1] <= last_pt[1]:
            boxes_xyxy.append([first_pt[0], first_pt[1], second_pt[0], last_pt[1]])
    annotated = draw_only_box(img, boxes_xyxy)
    annotated.save(output_path, quality=95)
def test_ocr_component1(test_file="TestingFiles/OCRTest1German.pdf", debug_folder = './res/table1/',englishFlag = False):
    """
    Run OCRComponent1 on every page of a PDF.

    For each page the boxes of the detected lines are drawn on the page
    image and saved as debug_folder + "<pdf name>_<page>_bbox_detection.png".
    The words of every line are printed and collected as the page text.

    test_file : path to the PDF to process
    debug_folder : prefix (normally a folder ending in "/") for the debug
                   images; the directory part is created if it is missing
    englishFlag : passed to OCRComponent1
    return : (ocr_results, all_text_in_pages), two dicts keyed by the
             zero-based page number. ocr_results holds what ocr.predict
             returned for the page; all_text_in_pages holds the page text
             with one line per detected line.
    """
    images = convert_from_path(test_file)
    ocr = OCRComponent1(englishFlag)

    # Derive the file stem once; splitext copes with extensions that are
    # not exactly three characters long.
    imgname = os.path.splitext(os.path.basename(test_file))[0]
    # Make sure the directory the debug images land in exists.
    out_dir = os.path.dirname(debug_folder+imgname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ocr_results = {}
    all_text_in_pages = {}
    for page_number,img in enumerate(images):
        line_annotations= ocr.predict(img = np.array(img))
        ocr_results[page_number] = line_annotations

        line_boxes_to_draw =[]
        page_lines = []
        for ann in line_annotations.values():
            line_boxes_to_draw.append(ann.box)
            # Every word is followed by a single space, including the last.
            line_words = "".join(wordann.text +" " for wordann in ann.words)
            print(line_words)
            page_lines.append(line_words +"\n")

        img_to_save1 = draw_only_box(img,line_boxes_to_draw)
        img_to_save1.save(debug_folder+imgname+"_"+str(page_number)+"_bbox_detection.png", quality=95)
        all_text_in_pages[page_number] = "".join(page_lines)
    return ocr_results, all_text_in_pages
def test_tableOcrOnly1(test_file :Image.Image , debug_folder = './res/table1/',denoise = False,englishFlag = False):
    """
    Recognise a cropped table image with OcrTable1 (hybrid Unitable + DocTR).

    According to the original author this component suits tables that
    contain a lot of text.

    test_file : Pillow image of the cropped table
    debug_folder : accepted but not used by this function
    denoise : passed to OcrTable1.predict
    englishFlag : passed to the OcrTable1 constructor
    return : whatever OcrTable1.predict returns for the single image
    """
    recognizer = OcrTable1(englishFlag)
    rgb_table = test_file.convert("RGB")
    return recognizer.predict([rgb_table],denoise = denoise)
def test_tableOcrOnly2(test_file:Image.Image, debug_folder = './res/table2/'):
    """
    Recognise a cropped table image with OcrTable2 (full Unitable).

    According to the original author this component suits tables that do
    not contain much text.

    test_file : Pillow image of the cropped table
    debug_folder : passed to OcrTable2.predict
    return : the value returned by OcrTable2.predict. It used to be
             discarded, so the Gradio output bound to this function never
             received anything.
             NOTE(review): presumably the table's HTML code, as for
             test_tableOcrOnly1 - confirm against OcrTable2.
    """
    table = OcrTable2()
    image = test_file.convert("RGB")
    return table.predict([image],debug_folder)
def test_table_component1(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/',denoise = False,englishFlag = True):
    """
    Detect and recognise the tables on every page of a PDF with
    DetectionAndOcrTable1.

    Each table code is also written to
    debug_folder + "<pdf name>_<page>_<table index>output.xls".

    test_file : path to the PDF to process
    debug_folder : prefix (normally a folder ending in "/") for debug
                   output; the directory part is created if it is missing
    denoise : passed to the predictor's predict call
    englishFlag : passed to the DetectionAndOcrTable1 constructor
    return : list with the table codes of all pages in page order; empty
             when the PDF has no pages or no tables
    """
    table_predictor = DetectionAndOcrTable1(englishFlag)
    images = convert_from_path(test_file)

    # The file stem does not change per page, so it is computed once.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    out_dir = os.path.dirname(debug_folder+filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Collect over all pages so the result is defined for an empty PDF and
    # no page's tables are dropped.
    all_table_codes = []
    for page_number,img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_'
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name,denoise = denoise)
        for index, table_code in enumerate(table_codes):
            with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w', encoding='utf-8') as file:
                file.write(table_code)
        all_table_codes.extend(table_codes)
    return all_table_codes
def test_table_component2(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/'):
    """
    Detect and recognise the tables on every page of a PDF with
    DetectionAndOcrTable2.

    According to the original author this component takes a whole PDF page,
    scans it for tables and returns them in HTML format using the full
    Unitable model. Each table code is also written to
    debug_folder + "<pdf name>_<page>_<table index>output.xls".

    test_file : path to the PDF to process
    debug_folder : prefix (normally a folder ending in "/") for debug
                   output; the directory part is created if it is missing
    return : list with the table codes of all pages in page order; empty
             when the PDF has no pages or no tables
    """
    table_predictor = DetectionAndOcrTable2()
    images = convert_from_path(test_file)

    # The file stem does not change per page, so it is computed once.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    out_dir = os.path.dirname(debug_folder+filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Collect over all pages so the result is defined for an empty PDF and
    # no page's tables are dropped.
    all_table_codes = []
    for page_number,img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_'
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w', encoding='utf-8') as file:
                file.write(table_code)
        all_table_codes.extend(table_codes)
    return all_table_codes
def test_table_component3(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/',denoise = False,englishFlag = True):
    """
    Detect and recognise the tables on every page of a PDF with
    DetectionAndOcrTable3.

    Each table code is also written to
    debug_folder + "<pdf name>_<page>_<table index>output.xls".

    test_file : path to the PDF to process
    debug_folder : prefix (normally a folder ending in "/") for debug
                   output; the directory part is created if it is missing
    denoise : accepted but not forwarded to the predictor.
              NOTE(review): test_table_component1 forwards it - confirm
              whether DetectionAndOcrTable3.predict supports it.
    englishFlag : passed to the DetectionAndOcrTable3 constructor
    return : list with the table codes of all pages in page order; empty
             when the PDF has no pages or no tables
    """
    table_predictor = DetectionAndOcrTable3(englishFlag)
    images = convert_from_path(test_file)

    # The file stem does not change per page, so it is computed once.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    out_dir = os.path.dirname(debug_folder+filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Collect over all pages so the result is defined for an empty PDF and
    # no page's tables are dropped.
    all_table_codes = []
    for page_number,img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_'
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w', encoding='utf-8') as file:
                file.write(table_code)
        all_table_codes.extend(table_codes)
    return all_table_codes
def test_table_component4(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/'):
    """
    Detect and recognise the tables on every page of a PDF with
    DetectionAndOcrTable4.

    Each table code is also written to
    debug_folder + "<pdf name>_<page>_<table index>output.xls".

    test_file : path to the PDF to process
    debug_folder : prefix (normally a folder ending in "/") for debug
                   output; the directory part is created if it is missing
    return : list with the table codes of all pages in page order; empty
             when the PDF has no pages or no tables
    """
    table_predictor = DetectionAndOcrTable4()
    images = convert_from_path(test_file)

    # The file stem does not change per page, so it is computed once.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    out_dir = os.path.dirname(debug_folder+filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Collect over all pages so the result is defined for an empty PDF and
    # no page's tables are dropped.
    all_table_codes = []
    for page_number,img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_'
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w', encoding='utf-8') as file:
                file.write(table_code)
        all_table_codes.extend(table_codes)
    return all_table_codes
# Disabled command-line entry point, kept as a bare string literal so it is
# never executed.
# NOTE(review): before re-enabling it, (1) argparse's type=bool turns every
# non-empty string (including "False") into True, and (2) the table1,
# pdftable1 and pdftable3 calls pass englishFlag before denoise although the
# functions above take denoise before englishFlag.
""" | |
parser = argparse.ArgumentParser(description='Process some strings.') | |
parser.add_argument('ocr', type=str, help='type in id of the component to test') | |
parser.add_argument('--test_file',type=str, help='path to the testing file') | |
parser.add_argument('--debug_folder',type=str, help='path to the folder you want to save your results in') | |
parser.add_argument('--englishFlag',type=bool, help='Whether your pdf is in english => could lead to better results ') | |
parser.add_argument('--denoise',type=bool, help='preprocessing for not clean scans ') | |
args = parser.parse_args() | |
start = time.time() | |
if args.ocr == "ocr1": | |
test_ocr_component1(args.test_file,args.debug_folder, args.englishFlag) | |
elif args.ocr == "table1": | |
test_tableOcrOnly1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
elif args.ocr == "table2": | |
test_tableOcrOnly2(args.test_file,args.debug_folder) | |
elif args.ocr =="pdftable1": | |
test_table_component1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
elif args.ocr =="pdftable2": | |
test_table_component2(args.test_file,args.debug_folder) | |
elif args.ocr =="pdftable3": | |
test_table_component3(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
elif args.ocr =="pdftable4": | |
test_table_component4(args.test_file,args.debug_folder) | |
""" | |
import gradio as gr | |
from gradio_pdf import PDF | |
# Gradio front end: one section per component, each with its own inputs,
# example row, run button and output, wired to the matching test_* function.
with gr.Blocks() as demo:
    # --- Plain OCR on a PDF -------------------------------------------------
    gr.Markdown("# OCR component")
    inputs_for_ocr = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="English Document?",value =False)]
    ocr_btn = gr.Button("Run ocr")
    gr.Examples(
        examples=[["TestingFiles/OCRTest1German.pdf",'./res/table1/',False]],
        inputs=inputs_for_ocr
    )
    # The second textbox needs label=...; a positional string would become
    # the box's initial value instead of its caption.
    outputs_for_ocr = [gr.Textbox(label="List of annotation objects"), gr.Textbox(label="Text in page")]
    ocr_btn.click(fn=test_ocr_component1,
                  inputs = inputs_for_ocr,
                  outputs = outputs_for_ocr,
                  api_name="OCR"
                  )

    # --- Table detection + OCR on a PDF -------------------------------------
    gr.Markdown("# Table OCR components that takes a pdf, extract table and return their html code ")
    gr.Markdown("## Component 1 uses table transformer and doctr +Unitable")
    inputs_for_pdftable1 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)]
    table1_btn = gr.Button("Run pdftable1")
    gr.Examples(
        examples=[["TestingFiles/OCRTest5English.pdf",'./res/table1/',False]],
        inputs=inputs_for_pdftable1
    )
    outputs_for_pdftable1 = [gr.Textbox(label="Table code")]
    table1_btn.click(fn=test_table_component1,
                     inputs = inputs_for_pdftable1,
                     outputs = outputs_for_pdftable1,
                     api_name="pdfTable1"
                     )

    gr.Markdown("## Component 2 uses table transformer and Unitable")
    inputs_for_pdftable2 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")]
    table2_btn = gr.Button("Run pdftable2")
    # Examples must fill this section's own inputs (they used to point at
    # component 1's widgets); the row is trimmed to the two inputs it has.
    gr.Examples(
        examples=[["TestingFiles/OCRTest5English.pdf",'./res/table1/']],
        inputs=inputs_for_pdftable2
    )
    outputs_for_pdftable2 = [gr.Textbox(label="Table code")]
    table2_btn.click(fn=test_table_component2,
                     inputs = inputs_for_pdftable2,
                     outputs = outputs_for_pdftable2,
                     api_name="pdfTable2"
                     )

    gr.Markdown("## Component 3 uses Yolo and Unitable+doctr")
    inputs_for_pdftable3 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)]
    table3_btn = gr.Button("Run pdftable3")
    # Examples must fill this section's own inputs, not component 1's.
    gr.Examples(
        examples=[["TestingFiles/TableOCRTestEnglish.pdf",'./res/table1/',False]],
        inputs=inputs_for_pdftable3
    )
    outputs_for_pdftable3 = [gr.Textbox(label="Table code")]
    table3_btn.click(fn=test_table_component3,
                     inputs = inputs_for_pdftable3,
                     outputs = outputs_for_pdftable3,
                     api_name="pdfTable3"
                     )

    gr.Markdown("## Component 4 uses Yolo and Unitable")
    inputs_for_pdftable4 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")]
    table4_btn = gr.Button("Run pdftable4")
    # Examples must fill this section's own inputs, not component 1's; the
    # row is trimmed to the two inputs it has.
    gr.Examples(
        examples=[["TestingFiles/TableOCRTestEasier.pdf",'./res/table1/']],
        inputs=inputs_for_pdftable4
    )
    outputs_for_pdftable4 = [gr.Textbox(label="Table code")]
    table4_btn.click(fn=test_table_component4,
                     inputs = inputs_for_pdftable4,
                     outputs = outputs_for_pdftable4,
                     api_name="pdfTable4"
                     )

    # --- Table OCR on an already cropped table image ------------------------
    gr.Markdown("# Table OCR component that takes image of an cropped tavle, extract table and return their html code ")
    inputs_for_table1 = [gr.Image(label="Image of cropped table",type='pil'), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)]
    onlytable1_btn = gr.Button("Run table1")
    gr.Examples(
        examples=[[Image.open("cropped_table.png"),'./res/table1/',False]],
        inputs=inputs_for_table1
    )
    outputs_for_table1 = [gr.HTML(label="Table code")]
    onlytable1_btn.click(fn=test_tableOcrOnly1,
                         inputs = inputs_for_table1,
                         outputs = outputs_for_table1,
                         api_name="table1"
                         )

    gr.Markdown("## Another Table OCR component that takes image of an cropped table, extract table and return their html code ")
    inputs_for_table2 = [gr.Image(label="Image of cropped table",type='pil'), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")]
    onlytable2_btn = gr.Button("Run table2")
    # Row trimmed to the two inputs this section has.
    gr.Examples(
        examples=[[Image.open("cropped_table.png"),'./res/table1/']],
        inputs=inputs_for_table2
    )
    outputs_for_table2 = [gr.HTML(label="Table code")]
    onlytable2_btn.click(fn=test_tableOcrOnly2,
                         inputs = inputs_for_table2,
                         outputs = outputs_for_table2,
                         api_name="table2"
                         )

demo.launch(share=True)