Multimodal-PDF-Chatbot

Running

Multimodal-PDF-Chatbot / utils.py

bug fixes, faster ocr and restructure

e70cddd unverified 30 days ago

No virus

1.57 kB

	import pymupdf
	from PIL import Image
	import io
	import gradio as gr
	import pandas as pd


	def image_to_bytes(image):
	    """Serialize an image to PNG and return it as a base64 string.

	    Args:
	        image: Object exposing ``save(fp, format=...)``, such as a
	            ``PIL.Image.Image``.

	    Returns:
	        The PNG data encoded as base64 and decoded to a ``str``. Despite
	        the function name, the result is text, not ``bytes``.
	    """
	    # Imported here because the module-level imports do not provide base64;
	    # without it the b64encode call below raises NameError.
	    import base64

	    buffer = io.BytesIO()
	    image.save(buffer, format="PNG")
	    return base64.b64encode(buffer.getvalue()).decode("utf-8")


	def extract_pdfs(docs, doc_collection):
	    """Replace the stored document collection with newly uploaded files.

	    Args:
	        docs: Uploaded document paths as strings (each is split on "/").
	            May be ``None`` or empty when nothing was uploaded.
	        doc_collection: The current collection; returned unchanged when
	            ``docs`` is falsy.

	    Returns:
	        A 3-tuple of:
	        * the collection (a fresh list of ``docs`` when any were given),
	        * ``gr.Tabs(selected=1)`` -- presumably switches the UI to the tab
	          with id 1; confirm against the Gradio layout,
	        * a one-column ``"Filename"`` DataFrame holding the last
	          "/"-separated component of each path.
	    """
	    # Normalise once so a ``None`` upload yields an empty table instead of
	    # raising TypeError when the filenames are listed below.
	    uploaded = list(docs) if docs else []
	    if uploaded:
	        # A new upload replaces the previous collection rather than
	        # appending to it.
	        doc_collection = uploaded
	    filenames = [path.split("/")[-1] for path in uploaded]
	    return (
	        doc_collection,
	        gr.Tabs(selected=1),
	        pd.DataFrame(filenames, columns=["Filename"]),
	    )


	def extract_images(docs):
	    """Extract the embedded images from the given PDF documents.

	    Args:
	        docs: Iterable of document paths accepted by ``pymupdf.open``.

	    Returns:
	        A list of ``PIL.Image.Image`` objects, one per entry returned by
	        ``page.get_images()`` for every page, in document and page order.
	        Each image is re-encoded as JPEG in memory.
	    """
	    images = []
	    for doc_path in docs:
	        # The context manager closes the document once its images have been
	        # copied out; the PIL images read from their own in-memory buffers.
	        with pymupdf.open(doc_path) as doc:
	            for page in doc:
	                for img in page.get_images():
	                    xref = img[0]  # first item is the image's XREF
	                    pix = pymupdf.Pixmap(doc, xref)

	                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
	                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

	                    images.append(
	                        Image.open(io.BytesIO(pix.pil_tobytes("JPEG")))
	                    )
	    return images


	def clean_text(text):
	    """Flatten text onto one line with single spaces between words.

	    Leading and trailing whitespace is stripped, newlines and tabs become
	    spaces, and every run of spaces is collapsed to one space. Other
	    whitespace characters inside the text (e.g. ``"\\r"``) are left as-is.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string; ``""`` when ``text`` is empty or whitespace-only.
	    """
	    flattened = text.strip().replace("\n", " ").replace("\t", " ")
	    # Splitting on a single space yields empty pieces for consecutive
	    # spaces; dropping them collapses runs of any length in one pass.
	    return " ".join(piece for piece in flattened.split(" ") if piece)