oceansweep committed on
Commit
8846bdc
1 Parent(s): 0b0710f

Upload PDF_Ingestion_Lib.py

Browse files
App_Function_Libraries/PDF_Ingestion_Lib.py CHANGED
@@ -11,19 +11,12 @@
11
  #
12
  #
13
  ####################
14
-
15
 
16
  # Import necessary libraries
17
- from datetime import datetime
18
- import logging
19
- import subprocess
20
- import os
21
- import shutil
22
- import tempfile
23
 
24
 
25
  # Import Local
26
- from App_Function_Libraries.SQLite_DB import add_media_with_keywords
27
 
28
  #######################################################################################################################
29
  # Function Definitions
@@ -36,38 +29,222 @@ from App_Function_Libraries.SQLite_DB import add_media_with_keywords
36
  MAX_FILE_SIZE_MB = 50
37
  CONVERSION_TIMEOUT_SECONDS = 300
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- def convert_pdf_to_markdown(pdf_path):
41
- """
42
- Convert a PDF file to Markdown by calling a script in another virtual environment.
43
- """
44
 
45
- logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
46
- # Check if the file size exceeds the maximum allowed size
47
- file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
48
- if file_size_mb > MAX_FILE_SIZE_MB:
49
- raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
 
 
 
 
 
 
 
50
 
51
- logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
52
- # Path to the Python interpreter in the other virtual environment
53
- other_venv_python = "Helper_Scripts/marker_venv/bin/python"
54
 
55
- # Path to the conversion script
56
- converter_script = "Helper_Scripts/PDF_Converter.py"
57
 
58
- logging.debug("Marker: Attempting to convert PDF file to Markdown...")
 
 
 
59
  try:
60
- result = subprocess.run(
61
- [other_venv_python, converter_script, pdf_path],
62
- capture_output=True,
63
- text=True,
64
- timeout=CONVERSION_TIMEOUT_SECONDS
65
- )
66
- if result.returncode != 0:
67
- raise Exception(f"Conversion failed: {result.stderr}")
68
- return result.stdout
69
- except subprocess.TimeoutExpired:
70
- raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  def process_and_ingest_pdf(file, title, author, keywords):
@@ -83,84 +260,59 @@ def process_and_ingest_pdf(file, title, author, keywords):
83
  # Copy the contents of the uploaded file to the temporary file
84
  shutil.copy(file.name, temp_path)
85
 
86
- # Call the ingest_pdf_file function with the temporary file path
87
- result = ingest_pdf_file(temp_path, title, author, keywords)
88
-
89
- return result
90
- except Exception as e:
91
- return f"Error processing PDF: {str(e)}"
92
-
93
-
94
- def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
95
- try:
96
- # Convert PDF to Markdown
97
- markdown_content = convert_pdf_to_markdown(file_path)
98
-
99
- # If title is not provided, use the filename without extension
100
- if not title:
101
- title = os.path.splitext(os.path.basename(file_path))[0]
102
-
103
- # If author is not provided, set it to 'Unknown'
104
- if not author:
105
- author = 'Unknown'
106
-
107
- # If keywords are not provided, use a default keyword
108
- if not keywords:
109
- keywords = 'pdf_file,markdown_converted'
110
- else:
111
- keywords = f'pdf_file,markdown_converted,{keywords}'
112
-
113
- # Add the markdown content to the database
114
- add_media_with_keywords(
115
- url=file_path,
116
- title=title,
117
- media_type='document',
118
- content=markdown_content,
119
- keywords=keywords,
120
- prompt='No prompt for PDF files',
121
- summary='No summary for PDF files',
122
- transcription_model='None',
123
- author=author,
124
- ingestion_date=datetime.now().strftime('%Y-%m-%d')
125
- )
126
-
127
- return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
128
- except ValueError as e:
129
- logging.error(f"File size error: {str(e)}")
130
- return f"Error: {str(e)}", file_path
131
  except Exception as e:
132
  logging.error(f"Error ingesting PDF file: {str(e)}")
133
- return f"Error ingesting PDF file: {str(e)}", file_path
134
 
135
 
136
  def process_and_cleanup_pdf(file, title, author, keywords):
137
  if file is None:
138
  return "No file uploaded. Please upload a PDF file."
139
 
140
- temp_dir = tempfile.mkdtemp()
141
- temp_file_path = os.path.join(temp_dir, "temp.pdf")
142
-
143
  try:
144
- # Copy the uploaded file to a temporary location
145
- shutil.copy2(file.name, temp_file_path)
146
-
147
- # Process the file
148
- result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
149
-
150
  return result
151
  except Exception as e:
152
  logging.error(f"Error in processing and cleanup: {str(e)}")
153
  return f"Error: {str(e)}"
154
- finally:
155
- # Clean up the temporary directory and its contents
156
- try:
157
- shutil.rmtree(temp_dir)
158
- logging.info(f"Removed temporary directory: {temp_dir}")
159
- except Exception as cleanup_error:
160
- logging.error(f"Error during cleanup: {str(cleanup_error)}")
161
- result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
162
-
163
 
164
  #
165
- #
166
  #######################################################################################################################
 
11
  #
12
  #
13
  ####################
14
+ import re
15
 
16
  # Import necessary libraries
 
 
 
 
 
 
17
 
18
 
19
  # Import Local
 
20
 
21
  #######################################################################################################################
22
  # Function Definitions
 
29
  MAX_FILE_SIZE_MB = 50
30
  CONVERSION_TIMEOUT_SECONDS = 300
31
 
32
+ # Marker PDF solution
33
+ # def convert_pdf_to_markdown(pdf_path):
34
+ # """
35
+ # Convert a PDF file to Markdown by calling a script in another virtual environment.
36
+ # """
37
+ #
38
+ # logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
39
+ # # Check if the file size exceeds the maximum allowed size
40
+ # file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
41
+ # if file_size_mb > MAX_FILE_SIZE_MB:
42
+ # raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
43
+ #
44
+ # logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
45
+ # # Path to the Python interpreter in the other virtual environment
46
+ # other_venv_python = "Helper_Scripts/marker_venv/bin/python"
47
+ #
48
+ # # Path to the conversion script
49
+ # converter_script = "Helper_Scripts/PDF_Converter.py"
50
+ #
51
+ # logging.debug("Marker: Attempting to convert PDF file to Markdown...")
52
+ # try:
53
+ # result = subprocess.run(
54
+ # [other_venv_python, converter_script, pdf_path],
55
+ # capture_output=True,
56
+ # text=True,
57
+ # timeout=CONVERSION_TIMEOUT_SECONDS
58
+ # )
59
+ # if result.returncode != 0:
60
+ # raise Exception(f"Conversion failed: {result.stderr}")
61
+ # return result.stdout
62
+ # except subprocess.TimeoutExpired:
63
+ # raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
64
+ #
65
+ #
66
+ # def process_and_ingest_pdf(file, title, author, keywords):
67
+ # if file is None:
68
+ # return "Please select a PDF file to upload."
69
+ #
70
+ # try:
71
+ # # Create a temporary directory
72
+ # with tempfile.TemporaryDirectory() as temp_dir:
73
+ # # Create a path for the temporary PDF file
74
+ # temp_path = os.path.join(temp_dir, "temp.pdf")
75
+ #
76
+ # # Copy the contents of the uploaded file to the temporary file
77
+ # shutil.copy(file.name, temp_path)
78
+ #
79
+ # # Call the ingest_pdf_file function with the temporary file path
80
+ # result = ingest_pdf_file(temp_path, title, author, keywords)
81
+ #
82
+ # return result
83
+ # except Exception as e:
84
+ # return f"Error processing PDF: {str(e)}"
85
+ #
86
+ #
87
+ # def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
88
+ # try:
89
+ # # Convert PDF to Markdown
90
+ # markdown_content = convert_pdf_to_markdown(file_path)
91
+ #
92
+ # # If title is not provided, use the filename without extension
93
+ # if not title:
94
+ # title = os.path.splitext(os.path.basename(file_path))[0]
95
+ #
96
+ # # If author is not provided, set it to 'Unknown'
97
+ # if not author:
98
+ # author = 'Unknown'
99
+ #
100
+ # # If keywords are not provided, use a default keyword
101
+ # if not keywords:
102
+ # keywords = 'pdf_file,markdown_converted'
103
+ # else:
104
+ # keywords = f'pdf_file,markdown_converted,{keywords}'
105
+ #
106
+ # # Add the markdown content to the database
107
+ # add_media_with_keywords(
108
+ # url=file_path,
109
+ # title=title,
110
+ # media_type='document',
111
+ # content=markdown_content,
112
+ # keywords=keywords,
113
+ # prompt='No prompt for PDF files',
114
+ # summary='No summary for PDF files',
115
+ # transcription_model='None',
116
+ # author=author,
117
+ # ingestion_date=datetime.now().strftime('%Y-%m-%d')
118
+ # )
119
+ #
120
+ # return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
121
+ # except ValueError as e:
122
+ # logging.error(f"File size error: {str(e)}")
123
+ # return f"Error: {str(e)}", file_path
124
+ # except Exception as e:
125
+ # logging.error(f"Error ingesting PDF file: {str(e)}")
126
+ # return f"Error ingesting PDF file: {str(e)}", file_path
127
+ #
128
+ #
129
+ # def process_and_cleanup_pdf(file, title, author, keywords):
130
+ # # FIXME - Update to validate file upload/filetype is pdf....
131
+ # if file is None:
132
+ # return "No file uploaded. Please upload a PDF file."
133
+ #
134
+ # temp_dir = tempfile.mkdtemp()
135
+ # temp_file_path = os.path.join(temp_dir, "temp.pdf")
136
+ #
137
+ # try:
138
+ # # Copy the uploaded file to a temporary location
139
+ # shutil.copy2(file.name, temp_file_path)
140
+ #
141
+ # # Process the file
142
+ # result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
143
+ #
144
+ # return result
145
+ # except Exception as e:
146
+ # logging.error(f"Error in processing and cleanup: {str(e)}")
147
+ # return f"Error: {str(e)}"
148
+ # finally:
149
+ # # Clean up the temporary directory and its contents
150
+ # try:
151
+ # shutil.rmtree(temp_dir)
152
+ # logging.info(f"Removed temporary directory: {temp_dir}")
153
+ # except Exception as cleanup_error:
154
+ # logging.error(f"Error during cleanup: {str(cleanup_error)}")
155
+ # result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
156
 
 
 
 
 
157
 
158
+ import logging
159
+ #
160
+ #
161
+ #######################################################################################################################
162
+ #
163
+ # Non-Marker implementation
164
+ import os
165
+ import shutil
166
+ import tempfile
167
+ from datetime import datetime
168
+
169
+ import pymupdf
170
 
171
+ from App_Function_Libraries.SQLite_DB import add_media_with_keywords
 
 
172
 
 
 
173
 
174
def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.

    Every page is written under a "## Page N" heading and closed with a "---"
    separator. Each text block becomes one paragraph and each image block
    becomes an "[Image]" placeholder. Spans are decorated according to their
    font size (heading markers) and font flags (bold / italic emphasis).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The Markdown text of the whole document as a single string.

    Raises:
        Exception: Any error raised while opening or reading the PDF is logged
            and then re-raised unchanged.
    """
    # PyMuPDF span "flags" bits: 2**0 superscript, 2**1 italic, 2**2 serifed,
    # 2**3 monospaced, 2**4 bold.
    italic_flag = 2 ** 1
    bold_flag = 2 ** 4

    def _format_span(text, font_size, font_flags):
        """Return the Markdown form of one span's text."""
        core = text.strip()
        if not core:
            # Nothing visible to decorate; wrapping blanks would emit "** **".
            return text

        # Emphasis goes on first so the heading marker stays outermost;
        # the delimiters hug the text because "**word **" is not valid Markdown.
        if font_flags & bold_flag:
            core = f"**{core}**"
        if font_flags & italic_flag:
            core = f"*{core}*"

        if font_size > 20:
            core = f"# {core}"
        elif font_size > 16:
            core = f"## {core}"
        elif font_size > 14:
            core = f"### {core}"
        return core

    try:
        parts = []
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                parts.append(f"## Page {page_num}\n\n")
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # Text block
                        paragraph = ""
                        for line in block["lines"]:
                            line_text = ""
                            for span in line["spans"]:
                                line_text += _format_span(span["text"], span["size"], span["flags"]) + " "

                            # A trailing hyphen is treated as a word split across
                            # lines: drop it and join directly with the next line.
                            line_text = line_text.rstrip()
                            if line_text.endswith('-'):
                                line_text = line_text[:-1]
                            else:
                                line_text += " "

                            paragraph += line_text

                        # End of block: collapse whitespace runs and emit the paragraph
                        if paragraph:
                            paragraph = re.sub(r'\s+', ' ', paragraph).strip()
                            parts.append(paragraph + "\n\n")
                    elif block["type"] == 1:  # Image block
                        parts.append("[Image]\n\n")
                parts.append("\n---\n\n")  # Page separator

        markdown_text = "".join(parts)

        # Clean up hyphenated words that still straddle a line break
        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)

        return markdown_text
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise
235
+
236
+
237
def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The document's metadata dictionary. An empty dictionary is returned
        when the file cannot be opened/read (the error is logged) or when the
        document exposes no metadata.
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            # PyMuPDF documents `metadata` as possibly None; normalise it so
            # callers can always use dict methods on the result.
            return doc.metadata or {}
    except Exception as e:
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}
248
 
249
 
250
  def process_and_ingest_pdf(file, title, author, keywords):
 
260
  # Copy the contents of the uploaded file to the temporary file
261
  shutil.copy(file.name, temp_path)
262
 
263
+ # Extract text and convert to Markdown
264
+ markdown_text = extract_text_and_format_from_pdf(temp_path)
265
+
266
+ # Extract metadata from PDF
267
+ metadata = extract_metadata_from_pdf(temp_path)
268
+
269
+ # Use metadata for title and author if not provided
270
+ if not title:
271
+ title = metadata.get('title', os.path.splitext(os.path.basename(file.name))[0])
272
+ if not author:
273
+ author = metadata.get('author', 'Unknown')
274
+
275
+ # If keywords are not provided, use a default keyword
276
+ if not keywords:
277
+ keywords = 'pdf_file,markdown_converted'
278
+ else:
279
+ keywords = f'pdf_file,markdown_converted,{keywords}'
280
+
281
+ # Add metadata-based keywords
282
+ if 'subject' in metadata:
283
+ keywords += f",{metadata['subject']}"
284
+
285
+ # Add the PDF content to the database
286
+ add_media_with_keywords(
287
+ url=file.name,
288
+ title=title,
289
+ media_type='document',
290
+ content=markdown_text,
291
+ keywords=keywords,
292
+ prompt='No prompt for PDF files',
293
+ summary='No summary for PDF files',
294
+ transcription_model='None',
295
+ author=author,
296
+ ingestion_date=datetime.now().strftime('%Y-%m-%d')
297
+ )
298
+
299
+ return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
 
 
 
 
 
 
 
 
300
  except Exception as e:
301
  logging.error(f"Error ingesting PDF file: {str(e)}")
302
+ return f"Error ingesting PDF file: {str(e)}"
303
 
304
 
305
  def process_and_cleanup_pdf(file, title, author, keywords):
306
  if file is None:
307
  return "No file uploaded. Please upload a PDF file."
308
 
 
 
 
309
  try:
310
+ result = process_and_ingest_pdf(file, title, author, keywords)
 
 
 
 
 
311
  return result
312
  except Exception as e:
313
  logging.error(f"Error in processing and cleanup: {str(e)}")
314
  return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
315
 
316
  #
317
+ # End of PDF_Ingestion_Lib.py
318
  #######################################################################################################################