import base64
import io
import os

import gradio as gr
import pandas as pd
import pymupdf
from PIL import Image


def image_to_bytes(image):
    """Return *image* encoded as PNG and base64-encoded into a UTF-8 string.

    Args:
        image: An object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        str: Base64 text of the PNG bytes (despite the function name, the
        result is a ``str``, not ``bytes``).
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def extract_pdfs(docs, doc_collection):
    """Replace the current document collection with newly supplied paths.

    Args:
        docs: Iterable of file path strings, or ``None``/empty when nothing
            was supplied.
        doc_collection: The collection currently held by the caller.

    Returns:
        tuple: ``(collection, tabs, filenames)`` where ``collection`` is a new
        list of ``docs`` when any were supplied (otherwise ``doc_collection``
        is returned untouched), ``tabs`` is ``gr.Tabs(selected=1)``, and
        ``filenames`` is a one-column ``"Filename"`` DataFrame holding the
        base name of every path in ``docs``.
    """
    # Materialise once: tolerates None and one-shot iterables, and guarantees
    # the caller's previous collection object is never mutated.
    uploaded = list(docs) if docs else []
    if uploaded:
        doc_collection = uploaded

    filenames = pd.DataFrame(
        # basename honours the platform's path separators, not just "/".
        [os.path.basename(path) for path in uploaded],
        columns=["Filename"],
    )
    return doc_collection, gr.Tabs(selected=1), filenames


def extract_images(docs):
    """Collect every image referenced by the pages of the given documents.

    Args:
        docs: Iterable of document paths that ``pymupdf.open`` can read.

    Returns:
        list: PIL images, in document/page/image order. Each image is
        re-encoded as JPEG in memory before being opened with PIL.
    """
    images = []
    for doc_path in docs:
        # The context manager closes the document even if extraction fails.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    xref = img[0]  # first entry is the image's XREF
                    pix = pymupdf.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    # The BytesIO buffer is independent of the document, so
                    # the image stays usable after the document is closed.
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images


def clean_text(text):
    """Flatten *text* to a single line with single spaces between tokens.

    Newlines and tabs become spaces, runs of spaces collapse to one space,
    and surrounding whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        str: The cleaned string.
    """
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # Splitting on a single space yields empty strings between consecutive
    # spaces; dropping them collapses every run of spaces to exactly one.
    collapsed = " ".join(piece for piece in flattened.split(" ") if piece)
    return collapsed.strip()