Spaces:
Running
Running
import pymupdf | |
from PIL import Image | |
import io | |
import gradio as gr | |
import pandas as pd | |
def image_to_bytes(image):
    """Encode *image* as PNG and return the result as base64 text.

    Despite the name, the return value is a ``str`` (base64 text decoded
    as UTF-8), not a ``bytes`` object.

    Args:
        image: Any object exposing ``save(fp, format=...)``; presumably a
            ``PIL.Image.Image``, given the file's imports.

    Returns:
        The PNG-encoded image as a base64 string.
    """
    # Imported here because the module's import block does not bring in
    # ``base64``; without this the call below raises NameError.
    import base64

    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    return base64.b64encode(png_buffer.getvalue()).decode("utf-8")
def extract_pdfs(docs, doc_collection):
    """Register newly supplied documents and build the filename table.

    Args:
        docs: Iterable of document path strings, or ``None``/empty when
            nothing was supplied.
        doc_collection: The previously stored collection. It is returned
            unchanged when ``docs`` is empty and replaced (not appended
            to) when ``docs`` has entries.

    Returns:
        A 3-tuple of:
        * the current document collection,
        * ``gr.Tabs(selected=1)`` (presumably switches the UI to the tab
          with id 1 -- confirm against the layout),
        * a one-column ``"Filename"`` DataFrame holding the last
          ``/``-separated component of each path in ``docs``.
    """
    # Treat None like an empty upload so the iteration below cannot raise
    # TypeError.
    uploaded = list(docs) if docs else []

    if uploaded:
        # A new upload replaces the previous collection; build a fresh list
        # so the caller's original list object is never mutated.
        doc_collection = list(uploaded)

    filenames = [path.split("/")[-1] for path in uploaded]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
def extract_images(docs):
    """Extract every embedded image from the given documents.

    Each document is opened with PyMuPDF, every page is scanned for image
    references, and each image is re-encoded as JPEG and loaded as a PIL
    image. An image referenced from several pages is returned once per
    reference.

    Args:
        docs: Iterable of document paths accepted by ``pymupdf.open``.
            ``None`` is treated as an empty iterable.

    Returns:
        A list of ``PIL.Image.Image`` objects in document, page and
        per-page image order.
    """
    images = []
    for doc_path in docs or []:
        # The context manager closes the document even if extraction fails
        # part-way through; previously the handle was never released.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    xref = img[0]  # first item of the image entry is its XREF
                    pix = pymupdf.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    # The JPEG bytes live in their own buffer, so the PIL
                    # image stays usable after the document is closed.
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images
def clean_text(text):
    """Flatten *text* to a single line with single spaces between words.

    Leading and trailing whitespace is stripped, newlines and tabs become
    spaces, and every run of consecutive spaces is collapsed to one.
    Other interior whitespace characters (e.g. ``"\\r"``) are left as-is.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``""`` if *text* is empty or all whitespace.
    """
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # Splitting on a single space and dropping the empty pieces collapses
    # runs of any length. A single ``replace`` of a double space would
    # leave runs of three or more only partly collapsed.
    return " ".join(piece for piece in flattened.split(" ") if piece)