tldw / App_Function_Libraries /PDF_Ingestion_Lib.py
oceansweep's picture
?
ed28876
raw
history blame
5.73 kB
# PDF_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting PDF files.#
#
####################
# Function List
#
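# 1. convert_pdf_to_markdown(pdf_path)
# 2. process_and_ingest_pdf(file, title, author, keywords)
# 3. ingest_pdf_file(file_path, title=None, author=None, keywords=None)
# 4. process_and_cleanup_pdf(file, title, author, keywords)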
#
#
####################
# Import necessary libraries
from datetime import datetime
import logging
import subprocess
import os
import shutil
import tempfile
# Import Local
from App_Function_Libraries.SQLite_DB import add_media_with_keywords
#######################################################################################################################
# Function Definitions
#
# Convert PDF files to Markdown and ingest them into the database with Title/Author/Keywords
# Constants
# Largest PDF, in megabytes (1024 * 1024 bytes), that convert_pdf_to_markdown accepts
MAX_FILE_SIZE_MB = 50
# Seconds the external conversion subprocess may run before it is treated as timed out
CONVERSION_TIMEOUT_SECONDS = 300
def convert_pdf_to_markdown(pdf_path):
    """
    Turn a PDF into Markdown by running the converter script in the Marker venv.

    The work is delegated to Helper_Scripts/PDF_Converter.py, executed with the
    Python interpreter found under Helper_Scripts/marker_venv. Both paths are
    relative, so they are resolved against the current working directory.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The text the converter script wrote to standard output.

    Raises:
        ValueError: If the file is larger than MAX_FILE_SIZE_MB.
        Exception: If the converter exits with a non-zero status, or does not
            finish within CONVERSION_TIMEOUT_SECONDS.
    """
    logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")

    # Refuse oversized files before spawning the converter process
    bytes_per_mb = 1024 * 1024
    size_in_mb = os.path.getsize(pdf_path) / bytes_per_mb
    if size_in_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File size ({size_in_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")

    logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")

    # Interpreter of the other virtual environment, the conversion script, then its argument
    command = [
        "Helper_Scripts/marker_venv/bin/python",
        "Helper_Scripts/PDF_Converter.py",
        pdf_path,
    ]

    logging.debug("Marker: Attempting to convert PDF file to Markdown...")
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=CONVERSION_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")

    # A non-zero exit status means the script failed; surface its stderr
    if completed.returncode != 0:
        raise Exception(f"Conversion failed: {completed.stderr}")
    return completed.stdout
def process_and_ingest_pdf(file, title, author, keywords):
    """
    Copy an uploaded PDF into a temporary directory and ingest it.

    Args:
        file: Uploaded file object, or None when nothing was uploaded. Only its
            ``name`` attribute is read and it is used as the source path to copy.
        title: Title forwarded to ingest_pdf_file.
        author: Author forwarded to ingest_pdf_file.
        keywords: Keywords forwarded to ingest_pdf_file.

    Returns:
        str: A status message. Every path returns a plain string: the prompt to
        select a file, the message produced by ingest_pdf_file, or an
        "Error processing PDF: ..." message.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # The temporary directory (and the copied PDF) is removed when the block exits
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "temp.pdf")
            shutil.copy(file.name, temp_path)

            # ingest_pdf_file returns (message, file_path). The path points into
            # the temporary directory that is about to be deleted, so only the
            # message is passed on, keeping the return type a string on all paths.
            message, _ = ingest_pdf_file(temp_path, title, author, keywords)
            return message
    except Exception as e:
        logging.error(f"Error processing PDF: {str(e)}")
        return f"Error processing PDF: {str(e)}"
def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
    """
    Convert a PDF to Markdown and store the result in the database.

    Args:
        file_path: Path of the PDF file; also stored as the media entry's url.
        title: Title for the entry. When empty, the file name without its
            extension is used.
        author: Author for the entry. When empty, 'Unknown' is used.
        keywords: Extra comma-separated keywords, appended to the default
            'pdf_file,markdown_converted' keywords.

    Returns:
        tuple: (status message, file_path). Failures are reported through the
        message rather than raised. Any ValueError raised inside the try block
        is logged as a file size error.
    """
    try:
        markdown_text = convert_pdf_to_markdown(file_path)

        # Fall back to defaults for any metadata the caller left empty
        title = title or os.path.splitext(os.path.basename(file_path))[0]
        author = author or 'Unknown'
        keywords = (
            f'pdf_file,markdown_converted,{keywords}'
            if keywords
            else 'pdf_file,markdown_converted'
        )

        # Store the converted Markdown as a 'document' media entry
        entry = dict(
            url=file_path,
            title=title,
            media_type='document',
            content=markdown_text,
            keywords=keywords,
            prompt='No prompt for PDF files',
            summary='No summary for PDF files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d'),
        )
        add_media_with_keywords(**entry)

        status = f"PDF file '{title}' converted to Markdown and ingested successfully."
    except ValueError as err:
        logging.error(f"File size error: {str(err)}")
        status = f"Error: {str(err)}"
    except Exception as err:
        logging.error(f"Error ingesting PDF file: {str(err)}")
        status = f"Error ingesting PDF file: {str(err)}"

    return status, file_path
def process_and_cleanup_pdf(file, title, author, keywords):
    """
    Ingest an uploaded PDF via a temporary copy, then remove the temporary files.

    Args:
        file: Uploaded file object, or None when nothing was uploaded. Only its
            ``name`` attribute is read and it is used as the source path to copy.
        title: Title forwarded to ingest_pdf_file.
        author: Author forwarded to ingest_pdf_file.
        keywords: Keywords forwarded to ingest_pdf_file.

    Returns:
        str: The status message from ingest_pdf_file, or an "Error: ..." message
        if copying or ingestion raised. If the temporary directory could not be
        removed, a warning line is appended to that message.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "temp.pdf")

    # Bound up front so the cleanup handler can always append to it
    result = ""
    try:
        # Copy the uploaded file to a temporary location, then process the copy
        shutil.copy2(file.name, temp_file_path)
        result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
    except Exception as e:
        logging.error(f"Error in processing and cleanup: {str(e)}")
        result = f"Error: {str(e)}"
    finally:
        # Clean up the temporary directory and its contents
        try:
            shutil.rmtree(temp_dir)
            logging.info(f"Removed temporary directory: {temp_dir}")
        except Exception as cleanup_error:
            logging.error(f"Error during cleanup: {str(cleanup_error)}")
            result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"

    # Return only after cleanup has run, so an appended warning reaches the caller
    return result
#
#
#######################################################################################################################