File size: 1,567 Bytes
e70cddd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import base64
import io

import gradio as gr
import pandas as pd
import pymupdf
from PIL import Image


def image_to_bytes(image):
    """Encode an image as PNG and return the data as base64 text.

    Despite the function name, the result is a ``str`` of base64
    characters, not a ``bytes`` object.

    Args:
        image: Object exposing ``save(fp, format=...)``; presumably a
            ``PIL.Image.Image`` -- confirm against callers.

    Returns:
        The PNG data written by ``image.save``, base64-encoded and decoded
        to a UTF-8 string.
    """
    # Render into memory so nothing has to touch the filesystem.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def extract_pdfs(docs, doc_collection):
    """Replace the stored document collection with newly supplied files.

    Args:
        docs: Iterable of file path strings, or ``None``/empty when nothing
            was supplied. Each entry must support ``str.split``.
        doc_collection: The current collection; returned unchanged when
            ``docs`` is ``None`` or empty.

    Returns:
        A 3-tuple of:
          * the collection to keep (a fresh list of ``docs`` when any were
            supplied, otherwise ``doc_collection`` as passed in),
          * ``gr.Tabs(selected=1)``,
          * a ``pandas.DataFrame`` with a single ``"Filename"`` column holding
            the last ``/``-separated component of every entry in ``docs``
            (empty when there are none).
    """
    # Normalise None to an empty list so building the filename table below
    # cannot fail with "'NoneType' object is not iterable".
    uploaded = list(docs) if docs else []
    if uploaded:
        # A new upload replaces the previous collection; it is not appended.
        doc_collection = uploaded
    filenames = [path.split("/")[-1] for path in uploaded]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )


def extract_images(docs):
    """Collect the images embedded in each document as PIL images.

    Args:
        docs: Iterable of document locations, each passed to
            ``pymupdf.open``.

    Returns:
        A list of images opened with ``PIL.Image.open`` from JPEG-encoded
        pixmap data, ordered by document, then page, then position in that
        page's image list. No de-duplication is performed.
    """
    images = []
    for doc_path in docs:
        # The context manager closes the document when the block exits, even
        # if decoding one of its images raises.
        with pymupdf.open(doc_path) as doc:
            for page in doc:  # iterate over pdf pages
                for img in page.get_images():
                    xref = img[0]  # get the XREF of the image
                    pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap

                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                    # The encoded bytes live in their own buffer, so the PIL
                    # image stays usable after the document is closed.
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images


def clean_text(text):
    """Flatten text onto one line with single spaces between words.

    Leading and trailing whitespace is stripped, newlines and tabs are
    turned into spaces, and every run of consecutive spaces is collapsed to
    exactly one space, whatever its length. Other interior whitespace
    characters (for example ``"\\r"``) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``""`` when ``text`` is empty or whitespace-only.
    """
    # Newlines and tabs are treated as ordinary spaces before collapsing.
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # Splitting on a single space produces an empty string for every extra
    # space in a run; dropping those collapses runs of any length, which a
    # single replace("  ", " ") pass cannot do.
    return " ".join(piece for piece in flattened.split(" ") if piece)