Spaces:

Geetansh01
/

ebookify-backend2

Sleeping

Geetansh

initial commit

6604d8f 29 days ago

1.44 kB

	import gradio as gr
	import pdf_to_image
	import image_to_text
	from ml_engine.model_functions import is_it_title

	def process_pdf(pdf):
	    """Split an uploaded PDF into title-delimited chunks of extracted text.

	    Every page image produced by ``pdf_to_image.pdfToImg2`` is converted to
	    text with ``image_to_text.img2string`` and scanned line by line. A line
	    that ``is_it_title`` flags closes the chunk collected so far and starts
	    a new one; empty lines are skipped.

	    Args:
	        pdf: The value supplied by the file input. Either an object whose
	            ``.name`` attribute holds the path of the uploaded file, a
	            plain path string, or ``None`` when nothing was uploaded.

	    Returns:
	        A list of strings, one per chunk, each consisting of its lines with
	        a trailing newline after every line. The list is empty when no file
	        was given or when no text was extracted.
	    """
	    # Nothing uploaded: return an empty result instead of failing on `.name`.
	    if pdf is None:
	        return []

	    # Upload objects expose the path as `.name`; a plain string is used as is.
	    pdf_path = getattr(pdf, "name", pdf)

	    pages = []
	    current_lines = []  # newline-terminated lines of the chunk being built

	    for img in pdf_to_image.pdfToImg2(pdf_path):
	        text = image_to_text.img2string(img)
	        for line in text.split("\n"):
	            if not line:
	                continue
	            # A title starts a new chunk, so flush what was collected so far.
	            # The title line itself becomes the first line of the new chunk.
	            if is_it_title(line) and current_lines:
	                pages.append("".join(current_lines))
	                current_lines = []
	            current_lines.append(line + "\n")

	    # Flush the last chunk, but never emit an empty string as a page.
	    if current_lines:
	        pages.append("".join(current_lines))
	    return pages  # Returning a list of strings

	# Gradio interface using latest syntax
	# Builds the UI: two Markdown headers, a file upload, a JSON viewer for the
	# result, and a button that triggers the processing.
	with gr.Blocks() as demo:
	    gr.Markdown("# PDF to Pages Processor")
	    gr.Markdown("Upload a PDF and get a list of extracted pages as output.")

	    # Disabled variant that would restrict uploads to `.pdf` files:
	    # pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
	    pdf_input = gr.File(label="Upload a PDF")
	    output = gr.JSON(label="Extracted Pages")

	    submit_button = gr.Button("Process PDF")

	    # Define interaction: on click, pass the uploaded file to `process_pdf`
	    # and show the returned list in the JSON component.
	    submit_button.click(fn=process_pdf, inputs=pdf_input, outputs=output)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()