import io

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
import tabula
from langchain.text_splitter import CharacterTextSplitter
from PIL import Image
from PyPDF2 import PdfReader


def get_pdf_tables(pdf_bytes):
    """
    Extracts tables from a PDF file loaded directly from bytes.

    Every non-empty table found by pdfplumber is converted to a DataFrame
    whose column labels are the table's first row and whose data are the
    remaining rows.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: A list of DataFrames, each representing a
        table extracted from the PDF, in page order.
    """
    dataframes = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # Skip empty tables: there is no header row to use as columns.
                if table:
                    dataframes.append(
                        pd.DataFrame(table[1:], columns=table[0])
                    )
    return dataframes


def get_pdf_images(pdf_bytes):
    """
    Extracts images and captures screenshots of each page from a given
    PDF's bytes.

    For every page, the rendered page (PNG bytes) is appended first,
    followed by the raw bytes of each image embedded in that page.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: A list of image bytes extracted from the PDF,
        including screenshots of each page.
    """
    images = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page in doc:
            # Render the whole page and keep it as PNG bytes.
            images.append(page.get_pixmap().tobytes("png"))

            # The first item of each image entry is its xref, which is what
            # extract_image() needs to pull the embedded image data.
            for img in page.get_images(full=True):
                images.append(doc.extract_image(img[0])["image"])
    finally:
        # Close the document even when rendering or extraction fails.
        doc.close()
    return images


def get_pdf_old_tables(pdf_bytes):
    """
    Extracts tables from a given PDF's bytes using Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: A list of DataFrames, each representing a
        table extracted from the PDF.
    """
    pdf_stream = io.BytesIO(pdf_bytes)
    # Read PDF into list of DataFrame
    return tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)


def get_pdf_text(pdf_docs):
    """
    Extracts and concatenates the text of every page of one or more PDFs.

    Args:
        pdf_docs: Either a single PDF source accepted by ``PdfReader`` or a
            list/tuple of such sources.

    Returns:
        str: The text of all pages, concatenated in document and page order
        with no separator added between pages.
    """
    if isinstance(pdf_docs, (list, tuple)):
        sources = pdf_docs
    else:
        sources = [pdf_docs]

    parts = []
    for pdf in sources:
        for page in PdfReader(pdf).pages:
            # Treat a page that yields no text (None or "") as empty so the
            # final join cannot fail on a non-string value.
            parts.append(page.extract_text() or "")
    return "".join(parts)


def get_text_chunks(text):
    """
    Splits text into overlapping chunks using ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    1000 and an overlap of 200, both measured with ``len``.

    Args:
        text (str): The text to split.

    Returns:
        The chunks returned by ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)


def extract_images_from_pdf_path(pdf_path):
    """
    Extracts the images embedded in a PDF file as PIL images.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        List[PIL.Image.Image]: One image per embedded image entry, in page
        order. Each image is backed by an in-memory buffer, so it stays
        usable after the PDF document has been closed.
    """
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            for img in doc.get_page_images(page_index):
                img_data = doc.extract_image(img[0])
                images.append(Image.open(io.BytesIO(img_data['image'])))
    finally:
        # Release the file handle even when extraction or decoding fails.
        doc.close()
    return images


def get_tables_from_pdf_path(pdf_path):
    """
    Extracts tables from all pages of a PDF file using Tabula.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The tables returned by ``tabula.read_pdf`` for ``pages='all'``.
    """
    # read_pdf will save the pdf table into Pandas Dataframe
    return tabula.read_pdf(pdf_path, pages='all')