ChinmayBH commited on
Commit
503c3e1
1 Parent(s): 8199bf8

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +376 -0
  2. requirements.txt +53 -0
app.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import fitz
4
+ import pdfplumber
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from tempfile import NamedTemporaryFile
8
+ from PIL import Image
9
+ import io
10
+
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] | None = None,
    tolerance: float = 0.01,
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
    minimum_font_size: int
        Text spans with a font size smaller than this are skipped entirely.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'.
    header_font_sizes: list[float] | None
        List of font sizes to be considered as headers.
        Required when mode is 'headerwise'.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    list
        In 'headerwise' mode: list of {'header': str, 'content': list} dicts.
        In 'pagewise' mode: list of {'page': int, 'content': list} dicts.
        (The original annotation said ``dict``, but a list was always returned.)

    Raises
    ------
    ValueError
        If mode is 'headerwise' and no header font sizes were supplied
        (previously this surfaced as an opaque TypeError deep in the loop).
    """
    if mode == 'headerwise' and not header_font_sizes:
        raise ValueError(
            "header_font_sizes must be a non-empty list when mode='headerwise'")

    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_font_sizes
        )

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extract_text:
                # Extract text blocks with their positions and font sizes
                text_blocks = page.get_text("dict")["blocks"]
                lines = {}

                # Group text spans by their vertical position (top) to form lines
                for block in text_blocks:
                    if block["type"] == 0:  # Text block
                        for line in block["lines"]:
                            for span in line["spans"]:
                                font_size = span["size"]
                                top = span["bbox"][1]

                                # Skip spans below the configured minimum size
                                # (original comment hard-coded "10" here)
                                if font_size < minimum_font_size:
                                    continue

                                lines.setdefault(top, []).append(span)

                # Process each line to check if it's a header
                for top in sorted(lines.keys()):
                    line = lines[top]
                    line_text = " ".join([span['text'] for span in line])
                    line_font_size = line[0]['size']

                    elements.append({
                        'type': 'text',
                        'font_size': line_font_size,
                        'page': page_number + 1,
                        'content': line_text,
                        'x0': line[0]['bbox'][0],
                        'top': top
                    })

            if extract_images:
                # Extract images using PyMuPDF
                image_list = page.get_images(full=True)

                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_filename = os.path.join(
                        output_folder,
                        f"page_{page_number + 1}_img_{img_index + 1}.png"
                    )

                    with open(image_filename, "wb") as img_file:
                        img_file.write(image_bytes)

                    # Get the position of the image on the page
                    img_rect = page.get_image_bbox(img)
                    elements.append({
                        'type': 'image',
                        'page': page_number + 1,
                        'path': image_filename,
                        'x0': img_rect.x0,
                        'top': img_rect.y0
                    })

            # Sort elements by their vertical position (top) first,
            # and then by horizontal position (x0)
            elements.sort(key=lambda e: (e['top'], e['x0']))

            if mode == 'headerwise':
                # Process elements to extract headers and content
                for element in elements:
                    if element['type'] == 'text' and \
                            is_header_font_size(element['font_size']):
                        # A new header finalizes the current header's content
                        add_current_header_content()
                        current_header = element['content']
                    elif element['type'] == 'text':
                        if current_header_content and \
                                current_header_content[-1]['type'] == 'text':
                            # Merge consecutive text runs into a single entry
                            current_header_content[-1]['content'] \
                                += " " + element['content']
                        else:
                            current_header_content.append({
                                'type': 'text',
                                'content': element['content']
                            })
                    elif element['type'] == 'image':
                        current_header_content.append({
                            'type': 'image',
                            'path': element['path']
                        })

            elif mode == 'pagewise':
                page_content = []
                for element in elements:
                    if element['type'] == 'text':
                        if page_content and \
                                page_content[-1]['type'] == 'text':
                            page_content[-1]['content'] \
                                += " " + element['content']
                        else:
                            page_content.append({
                                'type': 'text',
                                'content': element['content']
                            })
                    elif element['type'] == 'image':
                        page_content.append({
                            'type': 'image',
                            'path': element['path']
                        })
                extraction_data.append({
                    'page': page_number + 1,
                    'content': page_content
                })

        # After the loop, finalize any remaining header content
        if mode == 'headerwise':
            add_current_header_content()
    finally:
        # Ensure the document handle is released even if extraction fails
        # mid-way (the original leaked it on any exception).
        pdf_document.close()

    return extraction_data
201
+
202
def get_word_font_sizes(pdf_path, target_words):
    """Collect every font size observed for each target word in a PDF.

    Returns a dict mapping each word in ``target_words`` to the list of
    font sizes at which it appeared (empty list if the word never occurs).
    """
    sizes_by_word = {w: [] for w in target_words}

    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            for token in current_page.extract_words(extra_attrs=['fontname', 'size']):
                stripped = token['text'].strip()
                if stripped in target_words:
                    sizes_by_word[stripped].append(token['size'])
    return sizes_by_word
213
+
214
def preview_pdf(pdf_path, num_pages=1):
    """Render the first ``num_pages`` pages of a PDF as PIL RGB images."""
    document = fitz.open(pdf_path)
    rendered = []

    page_limit = min(num_pages, document.page_count)
    for index in range(page_limit):
        pixmap = document.load_page(index).get_pixmap()
        rendered.append(
            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        )

    document.close()
    return rendered
226
+
227
# Streamlit UI

def main():
    """Streamlit entry point.

    Lets the user upload a PDF, preview it, configure extraction in the
    sidebar (mode, header font sizes, output folder, filters), run the
    extraction, and download the results as JSON or XLSX.
    """
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload pdf </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill the values at right in data extraction settings </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in desired format </h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf:
        # Save the uploaded PDF to a temporary file
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
            temp_pdf_path = temp_pdf.name

        # Collapsible PDF Preview
        with st.expander("PDF Preview", expanded=True):
            num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
            preview_images = preview_pdf(temp_pdf_path, num_pages)

            # enumerate instead of list.index(): index() is O(n) per page and
            # returns the wrong number when two pages render identically.
            for page_idx, img in enumerate(preview_images):
                st.image(img, caption=f"Page {page_idx + 1}", use_column_width=True)

        st.sidebar.title("DATA EXTRACTION SETTINGS")
        st.sidebar.write("How you want to extract data?")

        extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])
        # Font Size Detection
        st.sidebar.title("FONT SIZE DETECTION")
        st.sidebar.warning("[Only in case of headerwise extraction] if you dont know the font size for your headers or text then copy paste any of those words below")
        target_words_input = st.sidebar.text_input(
            "Target words (comma-separated)", "")
        # Drop empty fragments so a blank/trailing comma doesn't produce ""
        target_words = [word.strip() for word in target_words_input.split(",") if word.strip()]

        if st.sidebar.button("Get Font Sizes"):
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

        header_font_sizes_input = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
        # Skip blank fragments so clearing the field doesn't crash on
        # float(""); fall back to the widget's default value [0.0].
        header_font_sizes = [
            float(size.strip())
            for size in header_font_sizes_input.split(",")
            if size.strip()
        ] or [0.0]
        st.sidebar.title("OUTPUT FOLDER PATH")
        # Fix: the original default, os.path.join(os.path.dirname("Extracted_Data")),
        # evaluates to "" — extraction silently targeted the empty path.
        output_folder = st.sidebar.text_input(" ", value="Extracted_Data")
        st.sidebar.info("what do you want to include in data extraction?")
        extract_text = st.sidebar.checkbox("Extract Text", value=True)
        extract_images = st.sidebar.checkbox("Extract Images", value=True)

        minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

        if st.sidebar.button("Start Extraction"):
            os.makedirs(output_folder, exist_ok=True)

            extracted_data = extract_text_images(
                temp_pdf_path,
                output_folder,
                minimum_font_size=minimum_font_size,
                extract_text=extract_text,
                extract_images=extract_images,
                mode=extraction_mode,
                header_font_sizes=header_font_sizes
            )

            # Display extracted data as JSON
            st.json(extracted_data)

            # Convert extracted data to a pandas DataFrame
            def extract_to_dataframe(data):
                """Flatten headerwise/pagewise extraction output into rows."""
                rows = []
                for item in data:
                    if 'header' in item:
                        header = item['header']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Header': header, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Header': header, 'Content': f"Image: {content_item['path']}"})
                    elif 'page' in item:
                        page_num = item['page']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Page': page_num, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Page': page_num, 'Content': f"Image: {content_item['path']}"})
                return pd.DataFrame(rows)

            df = extract_to_dataframe(extracted_data)

            # Save DataFrame to an in-memory BytesIO buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Extracted Data')
            buffer.seek(0)

            # Preview the first 5 lines of the XLSX data
            st.subheader("Preview of Extracted Data (First 5 Lines)")
            preview_df = pd.read_excel(buffer, sheet_name='Extracted Data')
            st.dataframe(preview_df.head())

            # Provide download options
            st.download_button(
                label="Download JSON",
                data=json.dumps(extracted_data, ensure_ascii=False),
                file_name='extracted_data.json',
                mime='application/json'
            )

            st.download_button(
                label="Download XLSX",
                data=buffer,
                file_name='extracted_data.xlsx',
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )

            st.success("Extraction complete. Data displayed as JSON.")

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.3.0
2
+ attrs==24.1.0
3
+ blinker==1.8.2
4
+ cachetools==5.4.0
5
+ certifi==2024.7.4
6
+ cffi==1.16.0
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ colorama==0.4.6
10
+ cryptography==43.0.0
11
+ et-xmlfile==1.1.0
12
+ gitdb==4.0.11
13
+ GitPython==3.1.43
14
+ idna==3.7
15
+ Jinja2==3.1.4
16
+ jsonschema==4.23.0
17
+ jsonschema-specifications==2023.12.1
18
+ markdown-it-py==3.0.0
19
+ MarkupSafe==2.1.5
20
+ mdurl==0.1.2
21
+ numpy==2.0.1
22
+ openpyxl==3.1.5
23
+ packaging==24.1
24
+ pandas==2.2.2
25
+ pdfminer.six==20231228
26
+ pdfplumber==0.11.2
27
+ pillow==10.4.0
28
+ protobuf==5.27.3
29
+ pyarrow==17.0.0
30
+ pycparser==2.22
31
+ pydeck==0.9.1
32
+ Pygments==2.18.0
33
+ PyMuPDF==1.24.9
34
+ PyMuPDFb==1.24.9
35
+ pypdfium2==4.30.0
36
+ python-dateutil==2.9.0.post0
37
+ pytz==2024.1
38
+ referencing==0.35.1
39
+ requests==2.32.3
40
+ rich==13.7.1
41
+ rpds-py==0.19.1
42
+ six==1.16.0
43
+ smmap==5.0.1
44
+ streamlit==1.37.0
45
+ tenacity==8.5.0
46
+ toml==0.10.2
47
+ toolz==0.12.1
48
+ tornado==6.4.1
49
+ typing_extensions==4.12.2
50
+ tzdata==2024.1
51
+ urllib3==2.2.2
52
+ watchdog==4.0.1
53
+ XlsxWriter==3.2.0