# Spaces: Runtime error
from PyPDF2 import PdfReader | |
from langchain.text_splitter import CharacterTextSplitter | |
import tabula | |
import io | |
import fitz # PyMuPDF | |
import pdfplumber | |
from PIL import Image | |
import io | |
def get_pdf_tables(pdf_bytes):
    """
    Pull every table out of an in-memory PDF using pdfplumber.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per non-empty table, in page
        order. The first row of each table supplies the column labels and
        the remaining rows become the data.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # Gather the raw list-of-rows tables from all pages in one pass.
        raw_tables = [
            rows
            for pdf_page in document.pages
            for rows in pdf_page.extract_tables()
        ]

    # pandas is only needed for the conversion step, so it is imported here.
    import pandas as pd

    frames = []
    for rows in raw_tables:
        if not rows:
            continue  # an empty table has no header row to build columns from
        header, *body = rows
        frames.append(pd.DataFrame(body, columns=header))
    return frames
def get_pdf_images(pdf_bytes):
    """
    Extracts images and captures screenshots of each page from a given PDF's bytes.

    For every page, the rendered page (PNG bytes) is appended first, followed
    by the bytes of each image embedded on that page, so the result is
    interleaved page by page.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: A list of image bytes extracted from the PDF, including
        screenshots of each page. Page screenshots are PNG-encoded; embedded
        images are whatever bytes ``extract_image`` returns under ``"image"``.
    """
    images = []
    # Open straight from the in-memory bytes; the context manager guarantees
    # the document is closed even if rendering or extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # Render the whole page and keep it as PNG bytes.
            images.append(page.get_pixmap().tobytes("png"))

            # Then collect every image embedded on this page.
            for img in page.get_images(full=True):
                xref = img[0]  # first entry of the image tuple is its xref
                images.append(doc.extract_image(xref)["image"])
    return images
def get_pdf_old_tables(pdf_bytes):
    """
    Read the tables of an in-memory PDF with Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: A list of DataFrames, each representing a
        table extracted from the PDF (the result of ``tabula.read_pdf``
        called for all pages with ``multiple_tables=True``).
    """
    # Tabula is handed a file-like object wrapping the raw bytes.
    return tabula.read_pdf(
        io.BytesIO(pdf_bytes),
        pages='all',
        multiple_tables=True,
    )
def get_pdf_text(pdf_docs):
    """
    Concatenate the extracted text of every page of one or more PDFs.

    Args:
        pdf_docs: Either a single PDF source accepted by ``PdfReader`` or a
            list/tuple of such sources.

    Returns:
        str: The text of all pages, in document and page order, joined
        without any separator. An empty list/tuple yields ``""``.
    """
    # Normalise to a sequence so one code path serves both call styles.
    if isinstance(pdf_docs, (list, tuple)):
        sources = pdf_docs
    else:
        sources = [pdf_docs]

    # A single join avoids repeated string concatenation; ``or ""`` guards
    # against a page whose extraction yields nothing.
    return "".join(
        page.extract_text() or ""
        for source in sources
        for page in PdfReader(source).pages
    )
def get_text_chunks(text):
    """
    Split text into chunks with a newline-separated CharacterTextSplitter.

    Args:
        text (str): The text to split.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    # Splitter configuration: newline separator, chunk size 1000 with an
    # overlap of 200, lengths measured with the built-in ``len``.
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def extract_images_from_pdf_path(pdf_path):
    """
    Extract the images embedded in a PDF file on disk as PIL images.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        list: One ``PIL.Image.Image`` per image reference found on each
        page, in page order.
    """
    images = []
    # The context manager closes the document even if extraction fails; the
    # returned images are built from copied bytes, so they outlive the doc.
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            for img in doc.get_page_images(page_index):
                xref = img[0]  # first entry of the image tuple is its xref
                img_bytes = doc.extract_image(xref)['image']
                images.append(Image.open(io.BytesIO(img_bytes)))
    return images
def get_tables_from_pdf_path(pdf_path):
    """
    Read the tables from all pages of a PDF file with Tabula.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``tabula.read_pdf`` returns for ``pages='all'`` (the
        extracted tables as pandas DataFrames).
    """
    return tabula.read_pdf(pdf_path, pages='all')