anand004's picture
bug fixes, faster ocr and restructure
e70cddd unverified
raw
history blame
1.57 kB
import pymupdf
from PIL import Image
import io
import gradio as gr
import pandas as pd
def image_to_bytes(image):
    """Serialize an image to PNG and return it as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)``; presumably a
            ``PIL.Image.Image`` (verify against callers).

    Returns:
        str: The PNG bytes written by ``image.save``, base64-encoded and
        decoded as UTF-8 text.
    """
    # Imported here because the module's import block never brings in
    # ``base64``; without it the call below raised NameError.
    import base64

    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")
def extract_pdfs(docs, doc_collection):
    """Register uploaded documents and build a filename table for display.

    Args:
        docs: Uploaded document paths (strings using ``/`` separators), or a
            falsy value (``None`` / empty) when nothing was uploaded.
        doc_collection: The current collection of documents; replaced by a
            fresh list of ``docs`` when ``docs`` is non-empty, otherwise
            returned unchanged.

    Returns:
        tuple: ``(doc_collection, gr.Tabs(selected=1), filenames_df)`` where
        ``filenames_df`` is a one-column (``"Filename"``) DataFrame holding
        the last path component of each document.
        NOTE(review): ``gr.Tabs(selected=1)`` presumably switches the UI to
        the tab with id 1 -- confirm against the Blocks layout.
    """
    # A falsy ``docs`` (e.g. None) previously crashed on ``list(docs)``;
    # treat it as "no documents" instead.
    uploaded = list(docs) if docs else []
    if uploaded:
        doc_collection = uploaded
    filenames = [path.split("/")[-1] for path in uploaded]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
def extract_images(docs):
    """Extract the images referenced by every page of the given documents.

    Args:
        docs: Iterable of document paths (anything ``pymupdf.open`` accepts).

    Returns:
        list: One ``PIL.Image`` per image reference found, in document/page
        order. Each image is re-encoded as JPEG in memory before being opened
        with PIL.
    """
    images = []
    for doc_path in docs:
        # The context manager closes the document even if extraction fails;
        # the pixmaps are converted to in-memory bytes before it closes.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    xref = img[0]  # first entry is the image's XREF number
                    pix = pymupdf.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images
def clean_text(text):
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is removed, and every internal run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        str: The whitespace-normalized string; ``""`` if ``text`` is empty or
        whitespace only.
    """
    # ``str.split()`` with no argument splits on any whitespace run and drops
    # empty pieces, so joining with one space both trims and collapses. The
    # earlier chained ``replace`` calls left repeated spaces in place.
    return " ".join(text.split())