Spaces:

elia-waefler
/

ki_rag_classify

Runtime error

App Files Files Community

ki_rag_classify / ingest.py

elia-waefler

Upload 17 files

c2b923e verified 7 months ago

raw

history blame contribute delete

3.86 kB

	from PyPDF2 import PdfReader
	from langchain.text_splitter import CharacterTextSplitter
	import tabula
	import io
	import fitz # PyMuPDF
	import pdfplumber
	from PIL import Image
	import io


	def get_pdf_tables(pdf_bytes):
	    """
	    Collect the tables pdfplumber finds in an in-memory PDF as DataFrames.

	    Args:
	        pdf_bytes (bytes): The byte content of the PDF file.

	    Returns:
	        List[pandas.DataFrame]: One DataFrame per non-empty table, in page
	        order. The first row of each table becomes the column labels and
	        the remaining rows become the data.
	    """
	    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
	        # Flatten the per-page table lists into one list, keeping page order.
	        raw_tables = [
	            rows
	            for page in document.pages
	            for rows in page.extract_tables()
	        ]

	    # pandas is only needed for the conversion step, so it is imported here.
	    import pandas as pd

	    frames = []
	    for rows in raw_tables:
	        if not rows:
	            # Empty tables have no header row to build columns from.
	            continue
	        header, *body = rows
	        frames.append(pd.DataFrame(body, columns=header))
	    return frames


	def get_pdf_images(pdf_bytes):
	    """
	    Render each page of an in-memory PDF and collect its embedded images.

	    For every page, in document order, a PNG rendering of the page is
	    appended first, followed by the bytes of each image embedded on that
	    page. Embedded images are appended exactly as returned by
	    ``doc.extract_image`` (no conversion is applied).

	    Args:
	        pdf_bytes (bytes): The byte content of the PDF file.

	    Returns:
	        List[bytes]: Page renderings and embedded image bytes, interleaved
	        page by page as described above.
	    """
	    images = []
	    # The bytes are handed to PyMuPDF directly; no BytesIO round trip needed.
	    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	    try:
	        for page in doc:
	            # Page rendering first, so it precedes the page's own images.
	            images.append(page.get_pixmap().tobytes("png"))

	            for img in page.get_images(full=True):
	                xref = img[0]  # first entry of the image record is its xref
	                images.append(doc.extract_image(xref)["image"])
	    finally:
	        # Release the document even if rendering or extraction fails.
	        doc.close()
	    return images


	def get_pdf_old_tables(pdf_bytes):
	    """
	    Read the tables of an in-memory PDF with Tabula.

	    Args:
	        pdf_bytes (bytes): The byte content of the PDF file.

	    Returns:
	        The result of ``tabula.read_pdf`` for all pages with
	        ``multiple_tables=True`` (described by the original author as a
	        list of pandas DataFrames, one per extracted table).
	    """
	    # Tabula is given a file-like object wrapping the bytes.
	    return tabula.read_pdf(
	        io.BytesIO(pdf_bytes),
	        pages='all',
	        multiple_tables=True,
	    )


	def get_pdf_text(pdf_docs):
	    """
	    Concatenate the extracted text of every page of one or more PDFs.

	    Args:
	        pdf_docs: Either a single PDF source accepted by ``PdfReader``, or
	            a list/tuple of such sources (list subclasses are accepted too).

	    Returns:
	        str: The page texts joined in order with no separator. An empty
	        list or tuple yields an empty string.
	    """
	    # Normalise to a sequence so one loop serves both call styles.
	    sources = pdf_docs if isinstance(pdf_docs, (list, tuple)) else [pdf_docs]

	    page_texts = []
	    for source in sources:
	        for page in PdfReader(source).pages:
	            # Defensive: a falsy extraction result counts as empty text
	            # instead of breaking the concatenation.
	            page_texts.append(page.extract_text() or "")
	    return "".join(page_texts)


	def get_text_chunks(text):
	    """
	    Split text into chunks with langchain's ``CharacterTextSplitter``.

	    The splitter is configured with a newline separator, a chunk size of
	    1000, an overlap of 200, and ``len`` as the length function.

	    Args:
	        text: The text to split.

	    Returns:
	        The value returned by ``CharacterTextSplitter.split_text``.
	    """
	    splitter_options = {
	        "separator": "\n",
	        "chunk_size": 1000,
	        "chunk_overlap": 200,
	        "length_function": len,
	    }
	    return CharacterTextSplitter(**splitter_options).split_text(text)


	def extract_images_from_pdf_path(pdf_path):
	    """
	    Load the images embedded in a PDF file as PIL images.

	    Args:
	        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

	    Returns:
	        list: One PIL image per entry reported by ``doc.get_page_images``,
	        visited page by page in document order.
	    """
	    images = []
	    doc = fitz.open(pdf_path)
	    try:
	        for page_index in range(len(doc)):
	            for img in doc.get_page_images(page_index):
	                xref = img[0]  # first entry of the image record is its xref
	                img_bytes = doc.extract_image(xref)["image"]
	                # Each image reads from its own in-memory buffer, so it
	                # does not depend on the document staying open.
	                images.append(Image.open(io.BytesIO(img_bytes)))
	    finally:
	        # The original never closed the document; release it on every path.
	        doc.close()
	    return images


	def get_tables_from_pdf_path(pdf_path):
	    """
	    Read the tables on all pages of a PDF file with Tabula.

	    Args:
	        pdf_path: Location of the PDF; passed unchanged to
	            ``tabula.read_pdf``.

	    Returns:
	        The value returned by ``tabula.read_pdf`` with ``pages='all'``.
	    """
	    return tabula.read_pdf(pdf_path, pages='all')