import gradio as gr import subprocess import uuid import os import requests # sample PDF link #pdf_link = "https://arxiv.org/pdf/2308.13418.pdf" def get_pdf(pdf_link): # Generate a unique filename unique_filename = f"input/downloaded_paper_{uuid.uuid4().hex}.pdf" # Send a GET request to the PDF link response = requests.get(pdf_link) if response.status_code == 200: # Save the PDF content to a local file with open(unique_filename, 'wb') as pdf_file: pdf_file.write(response.content) print("PDF downloaded successfully.") else: print("Failed to download the PDF.") return unique_filename #.split('/')[-1][:-4] def nougat_ocr(file_name): #unique_filename = f"/content/output/downloaded_paper_{uuid.uuid4().hex}.pdf" # Command to run cli_command = [ 'nougat', #'--out', unique_filename, '--out', 'output', 'pdf', f'{file_name}', '--checkpoint', 'nougat' ] # Run the command and capture its output #completed_process = subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return #unique_filename def predict(pdf_file, pdf_link): if pdf_file is None: if pdf_link == '': print("No file is uploaded and No link is provided") return "No data provided. Upload a pdf file or provide a pdf link and try again!" else: print(f'pdf_link is - {pdf_link}') file_name = get_pdf(pdf_link) print(f'file_name is - {file_name}') else: file_name = pdf_file.name print(file_name) pdf_name = pdf_file.name.split('/')[-1].split('.')[0] print(pdf_name) # Call nougat nougat_ocr(file_name) #print("BACKKKK") # Open the file for reading file_name = file_name.split('/')[-1][:-4] with open(f'output/{file_name}.mmd', 'r') as file: content = file.read() return content def nougat_ocr1(file_name): print('******* inside nougat_ocr *******') # CLI Command to run cli_command = [ 'nougat', '--out', 'output', 'pdf', f'{file_name}', '--checkpoint', 'nougat' ] # Run the command and get .mmd file in an output folder subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return def predict1(pdf_file): print('******* inside predict *******') print(f"temporary file - {pdf_file.name}") pdf_name = pdf_file.name.split('/')[-1].split('.')[0] print(f"pdf file name - {pdf_name}") #! Get prediction for a PDF using nougat nougat_ocr(pdf_file.name) print("BAACCKKK") # Open the multimarkdown (.mmd) file for reading with open(f'output/{pdf_name}.mmd', 'r') as file: content = file.read() return content css = """ #mkd { height: 500px; overflow: auto; border: 1px solid #ccc; } """ with gr.Blocks(css=css) as demo: gr.HTML("