"""Gradio app that extracts a table from an uploaded PDF into an Excel file."""

from pathlib import Path

import gradio as gr
import fitz
import pandas as pd

# Column titles of the generated spreadsheet, in left-to-right order.
HEADERS = ['TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
           'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza']

# Half-open x ranges [low, high), one per HEADERS entry. A word is assigned
# to a column by comparing its x0 coordinate with these ranges. Some ranges
# overlap; columns are tried left to right within a row, so the leftmost
# remaining column that matches takes the word.
LIMITS = [(17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
          (311, 360), (330, 418), (400, 487), (450, 533), (500, 600)]

# Index of the only column whose cell may be made of several words.
_NAME_COL = 1

# Number of words the header line occupies (14 for the HEADERS above); the
# data rows start right after them.
_HEADER_WORD_COUNT = sum(len(title.split()) for title in HEADERS)

# Word that marks the end of the table on a page.
_END_MARKER = 'Relatório'


def _within_limits(x, idx_limit):
    """Return True when x lies in the half-open range LIMITS[idx_limit]."""
    low, high = LIMITS[idx_limit]
    return low <= x < high


def _find_data_start(words):
    """Return the index of the first word after the table header, or None.

    The header is located through the consecutive words 'TR' and '[min]'.
    Scanning for the pair (instead of only the first 'TR') keeps a stray
    'TR' earlier on the page from hiding the real header.
    """
    for pos in range(len(words) - 1):
        if words[pos][4] == 'TR' and words[pos + 1][4] == '[min]':
            return pos + _HEADER_WORD_COUNT
    return None


def _parse_rows(words, start):
    """Turn the words from index `start` on into a list of table rows.

    Each row is a list with one entry per HEADERS column; columns that
    have no word in the row hold None. Parsing stops at the end marker
    when it is the first word of a row, or when the words run out.
    """
    rows = []
    total = len(words)
    idx = start
    while idx < total:
        if words[idx][4] == _END_MARKER:
            break
        row_start = idx
        row = []
        for col in range(len(HEADERS)):
            if idx < total and _within_limits(words[idx][0], col):
                if col == _NAME_COL:
                    # The name may span several words: take every
                    # consecutive word that still falls in the name column.
                    parts = []
                    while idx < total and _within_limits(words[idx][0], col):
                        parts.append(words[idx][4])
                        idx += 1
                    row.append(" ".join(parts))
                else:
                    row.append(words[idx][4])
                    idx += 1
            else:
                # Either this column is empty in the row or the words ran
                # out; padding keeps every row at full width.
                row.append(None)
        if idx == row_start:
            # The word fits no column. Skip it, otherwise the loop would
            # never advance and would emit empty rows forever.
            idx += 1
            continue
        rows.append(row)
    return rows


def create_excel(doc, name_excel):
    """Extract the table from every page of `doc` and save it as Excel.

    Args:
        doc: Opened PDF document; it is iterated page by page and each page
            must provide ``get_text(option="words")``.
        name_excel: Path of the Excel file to write.
    """
    rows = []
    for page in doc:
        # Each entry is a word tuple; index 0 is read as the x coordinate
        # and index 4 as the text of the word.
        words = page.get_text(option="words")
        start = _find_data_start(words)
        if start is None:
            # No table header on this page.
            continue
        rows.extend(_parse_rows(words, start))

    df_table = pd.DataFrame(rows, columns=HEADERS)
    df_table.to_excel(name_excel, index=False)


def upload_file(filepath):
    """Convert the uploaded PDF and swap the upload button for a download one.

    Args:
        filepath: Path of the uploaded PDF file.

    Returns:
        Updates for the upload button (hidden) and the download button
        (visible, pointing at the generated Excel file).
    """
    excel_name = Path(filepath).stem + ".xlsx"
    # The context manager closes the PDF once the Excel file is written.
    with fitz.open(filepath) as doc:
        create_excel(doc, excel_name)
    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label=f"Descarga {excel_name}", value=excel_name, visible=True),
    ]


def download_file():
    """Restore the initial state: show the upload button, hide the download one."""
    return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]


with gr.Blocks() as demo:
    gr.Markdown("Primero sube tu archivo PDF, luego podrás descargar un archivo Excel (Sube un archivo por vez!)")
    with gr.Row():
        u = gr.UploadButton("Sube tu PDF", file_count="single")
        d = gr.DownloadButton("Descarga tu Excel", visible=False)

    u.upload(upload_file, u, [u, d])
    d.click(download_file, None, [u, d])


if __name__ == "__main__":
    demo.launch()