# ki_rag_classify / ingest.py
# elia-waefler's picture
# Upload 17 files
# c2b923e verified
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import tabula
import io
import fitz # PyMuPDF
import pdfplumber
from PIL import Image
import io
def get_pdf_tables(pdf_bytes):
    """
    Extract every table pdfplumber finds in a PDF given as raw bytes.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per non-empty table, in page
        order. The first row of each table is used as the column labels and
        the remaining rows become the data.
    """
    raw_tables = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            # extract_tables() yields each table as a list of rows.
            raw_tables.extend(page.extract_tables())

    import pandas as pd

    dataframes = []
    for rows in raw_tables:
        if not rows:
            # Nothing to use as a header row; skip empty tables.
            continue
        header, *body = rows
        dataframes.append(pd.DataFrame(body, columns=header))
    return dataframes
def get_pdf_images(pdf_bytes):
    """
    Collect a rendered screenshot of every page plus its embedded images.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: For each page in order, the PNG bytes of the rendered
        page followed by the bytes stored under the ``"image"`` key of
        ``doc.extract_image`` for every image that page references. No
        de-duplication is done, so an image referenced by several pages
        appears once per page.
    """
    images = []
    # Open straight from the in-memory bytes; no intermediate stream needed.
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page in doc:
            # Page screenshot first, rendered at the default resolution.
            images.append(page.get_pixmap().tobytes("png"))

            # Then the images embedded in the page; item 0 is the xref.
            for img in page.get_images(full=True):
                xref = img[0]
                images.append(doc.extract_image(xref)["image"])
    finally:
        # Release the document even if rendering or extraction fails.
        doc.close()
    return images
def get_pdf_old_tables(pdf_bytes):
    """
    Extract tables from a PDF given as raw bytes, using Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: The result of ``tabula.read_pdf`` over all
        pages with ``multiple_tables=True`` (one DataFrame per table).
    """
    # Tabula is handed a file-like object wrapping the in-memory bytes.
    return tabula.read_pdf(
        io.BytesIO(pdf_bytes),
        pages="all",
        multiple_tables=True,
    )
def get_pdf_text(pdf_docs):
    """
    Concatenate the extracted text of every page of one or more PDFs.

    Args:
        pdf_docs: Either a single PDF source or a list/tuple of sources.
            Each source is passed unchanged to ``PdfReader``.

    Returns:
        str: The text of all pages, joined in source and page order with no
        separator. An empty list or tuple yields an empty string.
    """
    if isinstance(pdf_docs, (list, tuple)):
        sources = pdf_docs
    else:
        sources = [pdf_docs]

    parts = []
    for source in sources:
        for page in PdfReader(source).pages:
            # Fall back to "" so a page with no extractable text (None)
            # cannot break the concatenation.
            parts.append(page.extract_text() or "")
    # Join once instead of growing a string page by page.
    return "".join(parts)
def get_text_chunks(text, separator="\n", chunk_size=1000, chunk_overlap=200):
    """
    Split text into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text (str): The text to split.
        separator (str): Separator handed to the splitter. Defaults to a
            newline.
        chunk_size (int): Chunk size handed to the splitter, measured with
            ``len``. Defaults to 1000.
        chunk_overlap (int): Overlap handed to the splitter. Defaults to 200.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
def extract_images_from_pdf_path(pdf_path):
    """
    Load every image referenced by the pages of a PDF file as PIL images.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        List[PIL.Image.Image]: One image per page image reference, in page
        order. No de-duplication is done across pages.
    """
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            for img in doc.get_page_images(page_index):
                xref = img[0]  # item 0 is the xref passed to extract_image
                img_bytes = doc.extract_image(xref)["image"]
                # Each PIL image reads from its own in-memory buffer, so it
                # stays usable after the document is closed.
                images.append(Image.open(io.BytesIO(img_bytes)))
    finally:
        # Release the file handle even if extraction or decoding fails.
        doc.close()
    return images
def get_tables_from_pdf_path(pdf_path):
    """
    Extract tables from all pages of a PDF file using Tabula.

    Args:
        pdf_path: Location of the PDF, passed unchanged to
            ``tabula.read_pdf``.

    Returns:
        The result of ``tabula.read_pdf(pdf_path, pages="all")``.
    """
    return tabula.read_pdf(pdf_path, pages="all")