import html
import io
import json
import os
import tempfile

import camelot
import gradio as gr
import pandas as pd
import tabula
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image
from PyPDF2 import PdfReader


# File suffix used for the downloadable artefact of each output format.
_DOWNLOAD_SUFFIXES = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _iter_images(element):
    """Yield every LTImage at or below *element*, descending into LTFigures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, images):
    """Decode one LTImage, save it as PNG in the working directory, record it.

    Appends ``{"filename": ...}`` to *images*; the file name is derived from
    the number of images saved so far. Raises whatever pdfminer or Pillow
    raise when the stream cannot be decoded or written.
    """
    # pdfminer's PDFStream exposes its bytes through get_data(); it has no
    # read() method, so the stream cannot be treated like a file object.
    image_data = lt_image.stream.get_data()
    image = Image.open(io.BytesIO(image_data))
    if image.mode == "CMYK":
        # PNG cannot store CMYK pixels; convert so the save does not fail.
        image = image.convert("RGB")
    image_filename = f"extracted_image_{len(images)}.png"
    image.save(image_filename)
    images.append({"filename": image_filename})


def _extract_tables(pdf_path):
    """Return the tables of the PDF as DataFrames (tabula first, then camelot).

    Returns an empty list when both extractors fail.
    """
    try:
        return tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
    except Exception as tabula_error:
        print(f"tabula-py failed: {tabula_error}. Trying camelot...")
    try:
        # camelot only reads page 1 unless told otherwise; ask for every page
        # so the fallback covers the same pages as tabula.
        return [table.df for table in camelot.read_pdf(pdf_path, pages='all')]
    except Exception as camelot_error:
        print(f"camelot also failed: {camelot_error}. No tables extracted.")
        return []


def _format_json(text, tables, images):
    """Serialize the extracted content as an indented JSON string."""
    json_data = {
        "text": text,
        "tables": [table.to_dict() for table in tables],
        "images": images,
    }
    return json.dumps(json_data, indent=4)  # indentation for readability


def _format_markdown(text, tables, images):
    """Render the extracted content as a Markdown string."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    for i, table in enumerate(tables):
        parts.append(f"## Table {i+1}\n")
        parts.append(table.to_markdown(index=False) + "\n\n")
    # Images are referenced by absolute path (working directory + file name).
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'![Image]({image_path})\n')
    return "".join(parts)


def _format_html(text, tables, images):
    """Render the extracted content as UTF-8 encoded HTML bytes."""
    # The text comes from an arbitrary PDF, so escape it before embedding.
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        parts.append(f"<h3>Table {i+1}</h3>\n")
        parts.append(table.to_html() + "<br>")
    # Images are referenced by absolute path (working directory + file name).
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'<img src="{html.escape(image_path)}" alt="Image"><br>\n')
    return "".join(parts).encode("utf-8")  # bytes, ready for download


# Maps each supported output format to the function that renders it.
_FORMATTERS = {
    "JSON": _format_json,
    "Markdown": _format_markdown,
    "HTML": _format_html,
}


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Args:
        pdf_file: Path to the uploaded PDF file, or an object whose ``name``
            attribute holds that path.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress.

    Returns:
        tuple: ``(text, download_data)``. ``download_data`` is a ``str`` for
        JSON and Markdown and UTF-8 ``bytes`` for HTML. Returns an empty
        string and None if there is an error or the format is unsupported.
    """
    formatter = _FORMATTERS.get(output_format)
    if formatter is None:
        # Covers both an unknown value and a dropdown left unselected (None).
        print(f"Unsupported output format: {output_format!r}")
        return "", None

    try:
        # Accept either a plain path or a file-like wrapper carrying .name.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        with open(pdf_path, 'rb') as file:
            pages = list(extract_pages(file))  # need len() for the progress bar

        text_parts = []
        images = []

        # Iterate through pages and extract text and images
        for i, page in enumerate(pages):
            progress(i / len(pages))  # Update progress bar
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    for lt_image in _iter_images(element):
                        # Best effort: one undecodable image must not stop
                        # the extraction of the remaining ones.
                        try:
                            _save_image(lt_image, images)
                        except Exception as image_error:
                            print(f"Error extracting image: {image_error}")

        text = "".join(text_parts)
        tables = _extract_tables(pdf_path)

        # Format extracted data based on user selection
        download_data = formatter(text, tables, images)
        return text, download_data

    except Exception as main_e:
        print(f"A main error occurred: {main_e}")
        return "", None  # Return empty string and None in case of error


def _write_download_file(download_data, output_format):
    """Write *download_data* to a temporary file and return the file's path."""
    suffix = _DOWNLOAD_SUFFIXES.get(output_format, ".txt")
    if isinstance(download_data, bytes):
        payload = download_data
    else:
        payload = download_data.encode("utf-8")
    # delete=False: the file must outlive this function so it can be served.
    with tempfile.NamedTemporaryFile("wb", suffix=suffix, delete=False) as handle:
        handle.write(payload)
        return handle.name


def _parse_pdf_for_ui(pdf_file, output_format, progress=gr.Progress()):
    """Run parse_pdf and hand the File output component a path, not content.

    Returns ``(text, path)`` where ``path`` is None when parsing failed or the
    download file could not be written.
    """
    text, download_data = parse_pdf(pdf_file, output_format, progress)
    if download_data is None:
        return text, None
    try:
        return text, _write_download_file(download_data, output_format)
    except OSError as write_error:
        print(f"Could not write download file: {write_error}")
        return text, None


iface = gr.Interface(
    fn=_parse_pdf_for_ui,
    # The progress tracker is supplied through the function's default
    # argument, so it is not listed among the inputs.
    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    iface.launch(share=False)