Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,773 Bytes
928f123 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import os
import fitz
import PyPDF2
def extract_text_and_figures(pdf_path):
    """
    Extracts text and figures from a PDF file, then deletes the file.

    Text is collected in two passes: first one entry per page from PyMuPDF
    (fitz), then one entry per page from PyPDF2. The returned text list
    therefore holds the PyMuPDF page texts followed by the PyPDF2 page texts.

    Args:
        pdf_path (str): The path to the PDF file. The file is removed from
            disk after extraction; removal failures are reported on stdout
            and do not raise.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks.
            * A list of extracted figures (as PNG-encoded bytes).
    """
    texts = []
    figures = []

    # Open the PDF with PyMuPDF (fitz). The context manager guarantees the
    # document is closed, which releases the file handle before the file is
    # deleted below (an open handle can block deletion on some platforms).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            texts.append(page.get_text("text"))  # Plain-text extraction

            # Process images on the page
            for img in page.get_images():
                xref = img[0]  # Image XREF
                pix = fitz.Pixmap(doc, xref)

                # PNG supports only grayscale/RGB. Subtract the alpha channel
                # before counting colour components so that CMYK without
                # alpha (n == 4) is also converted to RGB first.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))

    # Extract additional text using PyPDF2 (in case fitz didn't get everything)
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        texts.extend(page.extract_text() for page in pdf_reader.pages)

    # Best-effort cleanup of the processed file.
    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures
|