from pathlib import Path import gradio as gr import fitz import pandas as pd def create_excel(doc): HEADERS = ['TR [min]','Nome','Area','Fator Capacidade','Pratos Teóricos','Sinal-ruído (USP)','Resolução','Assimetría','Altura','Pureza'] LIMITS = [(17,50),(50,130),(130,184),(184,240),(240,311),(311,360),(330,418),(400,487),(450,533),(500,600)] # LIMITS will be used to correctly identify to which block the data pertains def within_limits(x,idx_limit): return (x >= LIMITS[idx_limit][0] and x < LIMITS[idx_limit][1]) # to create the dataframe data = {val:list() for val in HEADERS} # Extracting text from all pages all_text = [] for page_num in range(len(doc)): page = doc[page_num] blocks = page.get_text(option = "words") # visit each page idx = 0 while (idx < len(blocks)) and (blocks[idx][4] != 'TR'): idx = idx + 1 # check if the next is [min] if (idx + 1 < len(blocks)) and blocks[idx + 1][4] != "[min]": continue # #print(blocks[idx:(idx+14)]) idx = idx + 14 while (idx < len(blocks)): if(blocks[idx][4] == 'Relatório'): break idx_col = 0 while (idx_col < len(HEADERS)) and (idx < len(blocks)): if within_limits(blocks[idx][0],idx_col): if idx_col == 1: final_string = "" #blocks[idx][4] while (idx < len(blocks)) and within_limits(blocks[idx][0],idx_col): final_string = final_string + " " + blocks[idx][4] idx = idx + 1 data[HEADERS[idx_col]].append(final_string) idx = idx - 1 else: data[HEADERS[idx_col]].append(blocks[idx][4]) idx = idx + 1 else: data[HEADERS[idx_col]].append(None) idx_col = idx_col + 1 # SHOW THE RECOVERED DATA df_table = pd.DataFrame.from_dict(data) return (df_table.to_excel("tabla.xlsx", index=False)) def upload_file(filepath): name = Path(filepath).name # load pdf doc = fitz.open(filepath) df_table = create_excel(doc) # now create the excel file return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download tabla.xlsx", value=df_table, visible=True)] def download_file(): return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)] 
# User interface: a single row holding an upload button and an initially
# hidden download button.
with gr.Blocks() as demo:
    gr.Markdown("First upload a file and and then you'll be able download it (but only once!)")
    with gr.Row():
        u = gr.UploadButton(label="Upload a file", file_count="single")
        d = gr.DownloadButton(label="Download the file", visible=False)

    # Both handlers return updates for the two buttons, in [u, d] order:
    # upload_file runs when a file has been uploaded, download_file when the
    # download button is clicked.
    u.upload(fn=upload_file, inputs=u, outputs=[u, d])
    d.click(fn=download_file, inputs=None, outputs=[u, d])

if __name__ == "__main__":
    demo.launch()