oceansweep committed on
Commit
0b8741e
1 Parent(s): 9a2dd5b

Update App_Function_Libraries/PDF_Ingestion_Lib.py

Browse files
App_Function_Libraries/PDF_Ingestion_Lib.py CHANGED
@@ -1,318 +1,318 @@
1
- # PDF_Ingestion_Lib.py
2
- #########################################
3
- # Library to hold functions for ingesting PDF files.#
4
- #
5
- ####################
6
- # Function List
7
- #
8
- # 1. convert_pdf_to_markdown(pdf_path)
9
- # 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
10
- # 3.
11
- #
12
- #
13
- ####################
14
- import re
15
-
16
- # Import necessary libraries
17
-
18
-
19
- # Import Local
20
-
21
- #######################################################################################################################
22
- # Function Definitions
23
- #
24
-
25
- # Ingest a text file into the database with Title/Author/Keywords
26
-
27
-
28
- # Constants
29
- MAX_FILE_SIZE_MB = 50
30
- CONVERSION_TIMEOUT_SECONDS = 300
31
-
32
- # Marker PDF solution
33
- # def convert_pdf_to_markdown(pdf_path):
34
- # """
35
- # Convert a PDF file to Markdown by calling a script in another virtual environment.
36
- # """
37
- #
38
- # logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
39
- # # Check if the file size exceeds the maximum allowed size
40
- # file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
41
- # if file_size_mb > MAX_FILE_SIZE_MB:
42
- # raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
43
- #
44
- # logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
45
- # # Path to the Python interpreter in the other virtual environment
46
- # other_venv_python = "Helper_Scripts/marker_venv/bin/python"
47
- #
48
- # # Path to the conversion script
49
- # converter_script = "Helper_Scripts/PDF_Converter.py"
50
- #
51
- # logging.debug("Marker: Attempting to convert PDF file to Markdown...")
52
- # try:
53
- # result = subprocess.run(
54
- # [other_venv_python, converter_script, pdf_path],
55
- # capture_output=True,
56
- # text=True,
57
- # timeout=CONVERSION_TIMEOUT_SECONDS
58
- # )
59
- # if result.returncode != 0:
60
- # raise Exception(f"Conversion failed: {result.stderr}")
61
- # return result.stdout
62
- # except subprocess.TimeoutExpired:
63
- # raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
64
- #
65
- #
66
- # def process_and_ingest_pdf(file, title, author, keywords):
67
- # if file is None:
68
- # return "Please select a PDF file to upload."
69
- #
70
- # try:
71
- # # Create a temporary directory
72
- # with tempfile.TemporaryDirectory() as temp_dir:
73
- # # Create a path for the temporary PDF file
74
- # temp_path = os.path.join(temp_dir, "temp.pdf")
75
- #
76
- # # Copy the contents of the uploaded file to the temporary file
77
- # shutil.copy(file.name, temp_path)
78
- #
79
- # # Call the ingest_pdf_file function with the temporary file path
80
- # result = ingest_pdf_file(temp_path, title, author, keywords)
81
- #
82
- # return result
83
- # except Exception as e:
84
- # return f"Error processing PDF: {str(e)}"
85
- #
86
- #
87
- # def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
88
- # try:
89
- # # Convert PDF to Markdown
90
- # markdown_content = convert_pdf_to_markdown(file_path)
91
- #
92
- # # If title is not provided, use the filename without extension
93
- # if not title:
94
- # title = os.path.splitext(os.path.basename(file_path))[0]
95
- #
96
- # # If author is not provided, set it to 'Unknown'
97
- # if not author:
98
- # author = 'Unknown'
99
- #
100
- # # If keywords are not provided, use a default keyword
101
- # if not keywords:
102
- # keywords = 'pdf_file,markdown_converted'
103
- # else:
104
- # keywords = f'pdf_file,markdown_converted,{keywords}'
105
- #
106
- # # Add the markdown content to the database
107
- # add_media_with_keywords(
108
- # url=file_path,
109
- # title=title,
110
- # media_type='document',
111
- # content=markdown_content,
112
- # keywords=keywords,
113
- # prompt='No prompt for PDF files',
114
- # summary='No summary for PDF files',
115
- # transcription_model='None',
116
- # author=author,
117
- # ingestion_date=datetime.now().strftime('%Y-%m-%d')
118
- # )
119
- #
120
- # return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
121
- # except ValueError as e:
122
- # logging.error(f"File size error: {str(e)}")
123
- # return f"Error: {str(e)}", file_path
124
- # except Exception as e:
125
- # logging.error(f"Error ingesting PDF file: {str(e)}")
126
- # return f"Error ingesting PDF file: {str(e)}", file_path
127
- #
128
- #
129
- # def process_and_cleanup_pdf(file, title, author, keywords):
130
- # # FIXME - Update to validate file upload/filetype is pdf....
131
- # if file is None:
132
- # return "No file uploaded. Please upload a PDF file."
133
- #
134
- # temp_dir = tempfile.mkdtemp()
135
- # temp_file_path = os.path.join(temp_dir, "temp.pdf")
136
- #
137
- # try:
138
- # # Copy the uploaded file to a temporary location
139
- # shutil.copy2(file.name, temp_file_path)
140
- #
141
- # # Process the file
142
- # result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
143
- #
144
- # return result
145
- # except Exception as e:
146
- # logging.error(f"Error in processing and cleanup: {str(e)}")
147
- # return f"Error: {str(e)}"
148
- # finally:
149
- # # Clean up the temporary directory and its contents
150
- # try:
151
- # shutil.rmtree(temp_dir)
152
- # logging.info(f"Removed temporary directory: {temp_dir}")
153
- # except Exception as cleanup_error:
154
- # logging.error(f"Error during cleanup: {str(cleanup_error)}")
155
- # result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
156
-
157
-
158
- import logging
159
- #
160
- #
161
- #######################################################################################################################
162
- #
163
- # Non-Marker implementation
164
- import os
165
- import shutil
166
- import tempfile
167
- from datetime import datetime
168
-
169
- import pymupdf
170
-
171
- from App_Function_Libraries.DB_Manager import add_media_with_keywords
172
-
173
-
174
# PyMuPDF span "flags" bits (TextPage span dictionary): bit 0 = superscript,
# bit 1 = italic, bit 2 = serifed, bit 3 = monospaced, bit 4 = bold.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4

# (font size that must be exceeded, Markdown heading prefix), largest first.
_HEADING_LEVELS = ((20, "# "), (16, "## "), (14, "### "))


def _emphasize_span(text, flags):
    """Wrap the visible part of *text* in Markdown bold/italic markers."""
    core = text.strip()
    if not core:
        # Nothing visible: wrapping would produce stray "****" markers.
        return text
    # Keep surrounding whitespace outside the markers; Markdown emphasis does
    # not render when the markers are adjacent to whitespace on the inside.
    lead = text[:len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    if flags & _SPAN_FLAG_BOLD:
        core = f"**{core}**"
    if flags & _SPAN_FLAG_ITALIC:
        core = f"*{core}*"
    return f"{lead}{core}{trail}"


def _line_to_markdown(line):
    """Render one line dict; return (markdown, joins_next_line)."""
    spans = [(span["text"], span["flags"]) for span in line["spans"]]
    last_visible = max(
        (index for index, (text, _) in enumerate(spans) if text.strip()),
        default=None,
    )
    joins_next_line = False
    if last_visible is not None:
        text, flags = spans[last_visible]
        stripped = text.rstrip()
        if stripped.endswith('-'):
            # Drop the line-break hyphen from the raw text, before emphasis
            # markers are added, so "**foo-**" is de-hyphenated as well.
            spans[last_visible] = (stripped[:-1], flags)
            del spans[last_visible + 1:]
            joins_next_line = True
    rendered = " ".join(_emphasize_span(text, flags) for text, flags in spans)
    return rendered.rstrip(), joins_next_line


def _block_heading_prefix(block):
    """Return the heading prefix for a text block, or "" for body text."""
    sizes = [
        span["size"]
        for line in block["lines"]
        for span in line["spans"]
        if span["text"].strip()
    ]
    if not sizes:
        return ""
    # Only treat the block as a heading when all of its visible text is large,
    # so a single oversized character does not promote a whole paragraph.
    smallest = min(sizes)
    for threshold, prefix in _HEADING_LEVELS:
        if smallest > threshold:
            return prefix
    return ""


def _text_block_to_markdown(block):
    """Render one text block as a single Markdown paragraph ("" if empty)."""
    pieces = []
    for line in block["lines"]:
        rendered, joins_next_line = _line_to_markdown(line)
        pieces.append(rendered if joins_next_line else rendered + " ")
    paragraph = re.sub(r'\s+', ' ', "".join(pieces)).strip()
    if not paragraph:
        return ""
    # The heading marker must open the paragraph to be valid Markdown.
    return _block_heading_prefix(block) + paragraph


def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.

    Each page starts with a "## Page N" header and ends with a "---" separator.
    Every text block becomes one paragraph; bold and italic spans are wrapped
    in Markdown emphasis, blocks set entirely in a large font get a heading
    prefix, and image blocks are replaced by an "[Image]" placeholder.

    Args:
        pdf_path: Path of the PDF file, passed to pymupdf.open().

    Returns:
        str: The Markdown text of the whole document.

    Raises:
        Exception: Any error raised while opening or reading the PDF is logged
            and re-raised unchanged.
    """
    try:
        parts = []
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                parts.append(f"## Page {page_num}\n\n")
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # Text block
                        paragraph = _text_block_to_markdown(block)
                        if paragraph:
                            parts.append(paragraph + "\n\n")
                    elif block["type"] == 1:  # Image block
                        parts.append("[Image]\n\n")
                parts.append("\n---\n\n")  # Page separator
        return "".join(parts)
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise
235
-
236
-
237
def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed to pymupdf.open().

    Returns:
        dict: A copy of the document's metadata dictionary. An empty dict is
        returned when the document exposes no metadata or when any error
        occurs while opening or reading the file (the error is logged).
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            # PyMuPDF documents ``metadata`` as a dict or None (a document
            # that still needs its password); always hand back a dict so
            # callers can use ``.get`` safely.
            return dict(doc.metadata or {})
    except Exception as e:
        # Metadata is best-effort: a failure here must not abort ingestion.
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}
248
-
249
-
250
def process_and_ingest_pdf(file, title, author, keywords):
    """
    Convert an uploaded PDF to Markdown and add it to the media database.

    Args:
        file: The upload. Either an object whose ``name`` attribute is the
            path of the uploaded file, or that path itself. ``None`` means
            nothing was uploaded.
        title: Title to store. When empty, the PDF's title metadata is used,
            falling back to the uploaded file's base name.
        author: Author to store. When empty, the PDF's author metadata is
            used, falling back to 'Unknown'.
        keywords: Comma-separated keywords to store in addition to the
            default 'pdf_file,markdown_converted'; may be empty.

    Returns:
        str: A success message, or a message describing the error. Errors are
        logged and reported in the return value rather than raised.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # Accept both an upload object exposing ``.name`` and a plain path.
        source_path = getattr(file, "name", file)

        # Work on a copy inside a temporary directory that is removed
        # automatically when the ``with`` block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "temp.pdf")
            shutil.copy(source_path, temp_path)

            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(temp_path)

            # Extract metadata from PDF
            metadata = extract_metadata_from_pdf(temp_path) or {}

            def metadata_value(key):
                # PyMuPDF supplies every standard key even when the PDF does
                # not set it (value None or ''), so test the value, not the key.
                value = metadata.get(key)
                return value.strip() if isinstance(value, str) else ''

            # Use metadata for title and author if not provided
            if not title:
                title = metadata_value('title') or os.path.splitext(os.path.basename(source_path))[0]
            if not author:
                author = metadata_value('author') or 'Unknown'

            # Default keywords, then user keywords, then the PDF subject.
            keyword_parts = ['pdf_file', 'markdown_converted']
            if keywords:
                keyword_parts.append(keywords)
            subject = metadata_value('subject')
            if subject:
                keyword_parts.append(subject)
            keywords = ','.join(keyword_parts)

            # Add the PDF content to the database
            add_media_with_keywords(
                url=source_path,
                title=title,
                media_type='document',
                content=markdown_text,
                keywords=keywords,
                prompt='No prompt for PDF files',
                summary='No summary for PDF files',
                transcription_model='None',
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )

        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}"
303
-
304
-
305
def process_and_cleanup_pdf(file, title, author, keywords):
    """
    Ingest an uploaded PDF and return a status message.

    Thin wrapper around process_and_ingest_pdf(), which does the actual work.

    Args:
        file: The upload, forwarded unchanged; ``None`` means nothing was
            uploaded.
        title: Forwarded unchanged to process_and_ingest_pdf().
        author: Forwarded unchanged to process_and_ingest_pdf().
        keywords: Forwarded unchanged to process_and_ingest_pdf().

    Returns:
        str: The message returned by process_and_ingest_pdf(), a prompt to
        upload a file when *file* is None, or "Error: ..." if the call raised.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    try:
        return process_and_ingest_pdf(file, title, author, keywords)
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # which a plain logging.error call would discard.
        logging.exception(f"Error in processing and cleanup: {str(e)}")
        return f"Error: {str(e)}"
315
-
316
- #
317
- # End of PDF_Ingestion_Lib.py
318
  #######################################################################################################################
 
1
+ # PDF_Ingestion_Lib.py
2
+ #########################################
3
+ # Library to hold functions for ingesting PDF files.#
4
+ #
5
+ ####################
6
+ # Function List
7
+ #
8
+ # 1. convert_pdf_to_markdown(pdf_path)
9
+ # 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
10
+ # 3.
11
+ #
12
+ #
13
+ ####################
14
+ import re
15
+
16
+ # Import necessary libraries
17
+
18
+
19
+ # Import Local
20
+
21
+ #######################################################################################################################
22
+ # Function Definitions
23
+ #
24
+
25
+ # Ingest a text file into the database with Title/Author/Keywords
26
+
27
+
28
+ # Constants
29
+ MAX_FILE_SIZE_MB = 50
30
+ CONVERSION_TIMEOUT_SECONDS = 300
31
+
32
+ # Marker PDF solution
33
+ # def convert_pdf_to_markdown(pdf_path):
34
+ # """
35
+ # Convert a PDF file to Markdown by calling a script in another virtual environment.
36
+ # """
37
+ #
38
+ # logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
39
+ # # Check if the file size exceeds the maximum allowed size
40
+ # file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
41
+ # if file_size_mb > MAX_FILE_SIZE_MB:
42
+ # raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
43
+ #
44
+ # logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
45
+ # # Path to the Python interpreter in the other virtual environment
46
+ # other_venv_python = "Helper_Scripts/marker_venv/bin/python"
47
+ #
48
+ # # Path to the conversion script
49
+ # converter_script = "Helper_Scripts/PDF_Converter.py"
50
+ #
51
+ # logging.debug("Marker: Attempting to convert PDF file to Markdown...")
52
+ # try:
53
+ # result = subprocess.run(
54
+ # [other_venv_python, converter_script, pdf_path],
55
+ # capture_output=True,
56
+ # text=True,
57
+ # timeout=CONVERSION_TIMEOUT_SECONDS
58
+ # )
59
+ # if result.returncode != 0:
60
+ # raise Exception(f"Conversion failed: {result.stderr}")
61
+ # return result.stdout
62
+ # except subprocess.TimeoutExpired:
63
+ # raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
64
+ #
65
+ #
66
+ # def process_and_ingest_pdf(file, title, author, keywords):
67
+ # if file is None:
68
+ # return "Please select a PDF file to upload."
69
+ #
70
+ # try:
71
+ # # Create a temporary directory
72
+ # with tempfile.TemporaryDirectory() as temp_dir:
73
+ # # Create a path for the temporary PDF file
74
+ # temp_path = os.path.join(temp_dir, "temp.pdf")
75
+ #
76
+ # # Copy the contents of the uploaded file to the temporary file
77
+ # shutil.copy(file.name, temp_path)
78
+ #
79
+ # # Call the ingest_pdf_file function with the temporary file path
80
+ # result = ingest_pdf_file(temp_path, title, author, keywords)
81
+ #
82
+ # return result
83
+ # except Exception as e:
84
+ # return f"Error processing PDF: {str(e)}"
85
+ #
86
+ #
87
+ # def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
88
+ # try:
89
+ # # Convert PDF to Markdown
90
+ # markdown_content = convert_pdf_to_markdown(file_path)
91
+ #
92
+ # # If title is not provided, use the filename without extension
93
+ # if not title:
94
+ # title = os.path.splitext(os.path.basename(file_path))[0]
95
+ #
96
+ # # If author is not provided, set it to 'Unknown'
97
+ # if not author:
98
+ # author = 'Unknown'
99
+ #
100
+ # # If keywords are not provided, use a default keyword
101
+ # if not keywords:
102
+ # keywords = 'pdf_file,markdown_converted'
103
+ # else:
104
+ # keywords = f'pdf_file,markdown_converted,{keywords}'
105
+ #
106
+ # # Add the markdown content to the database
107
+ # add_media_with_keywords(
108
+ # url=file_path,
109
+ # title=title,
110
+ # media_type='document',
111
+ # content=markdown_content,
112
+ # keywords=keywords,
113
+ # prompt='No prompt for PDF files',
114
+ # summary='No summary for PDF files',
115
+ # transcription_model='None',
116
+ # author=author,
117
+ # ingestion_date=datetime.now().strftime('%Y-%m-%d')
118
+ # )
119
+ #
120
+ # return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
121
+ # except ValueError as e:
122
+ # logging.error(f"File size error: {str(e)}")
123
+ # return f"Error: {str(e)}", file_path
124
+ # except Exception as e:
125
+ # logging.error(f"Error ingesting PDF file: {str(e)}")
126
+ # return f"Error ingesting PDF file: {str(e)}", file_path
127
+ #
128
+ #
129
+ # def process_and_cleanup_pdf(file, title, author, keywords):
130
+ # # FIXME - Update to validate file upload/filetype is pdf....
131
+ # if file is None:
132
+ # return "No file uploaded. Please upload a PDF file."
133
+ #
134
+ # temp_dir = tempfile.mkdtemp()
135
+ # temp_file_path = os.path.join(temp_dir, "temp.pdf")
136
+ #
137
+ # try:
138
+ # # Copy the uploaded file to a temporary location
139
+ # shutil.copy2(file.name, temp_file_path)
140
+ #
141
+ # # Process the file
142
+ # result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
143
+ #
144
+ # return result
145
+ # except Exception as e:
146
+ # logging.error(f"Error in processing and cleanup: {str(e)}")
147
+ # return f"Error: {str(e)}"
148
+ # finally:
149
+ # # Clean up the temporary directory and its contents
150
+ # try:
151
+ # shutil.rmtree(temp_dir)
152
+ # logging.info(f"Removed temporary directory: {temp_dir}")
153
+ # except Exception as cleanup_error:
154
+ # logging.error(f"Error during cleanup: {str(cleanup_error)}")
155
+ # result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
156
+
157
+
158
+ import logging
159
+ #
160
+ #
161
+ #######################################################################################################################
162
+ #
163
+ # Non-Marker implementation
164
+ import os
165
+ import shutil
166
+ import tempfile
167
+ from datetime import datetime
168
+
169
+ import pymupdf
170
+
171
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
172
+
173
+
174
# PyMuPDF span "flags" bits (TextPage span dictionary): bit 0 = superscript,
# bit 1 = italic, bit 2 = serifed, bit 3 = monospaced, bit 4 = bold.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4

# (font size that must be exceeded, Markdown heading prefix), largest first.
_HEADING_LEVELS = ((20, "# "), (16, "## "), (14, "### "))


def _emphasize_span(text, flags):
    """Wrap the visible part of *text* in Markdown bold/italic markers."""
    core = text.strip()
    if not core:
        # Nothing visible: wrapping would produce stray "****" markers.
        return text
    # Keep surrounding whitespace outside the markers; Markdown emphasis does
    # not render when the markers are adjacent to whitespace on the inside.
    lead = text[:len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    if flags & _SPAN_FLAG_BOLD:
        core = f"**{core}**"
    if flags & _SPAN_FLAG_ITALIC:
        core = f"*{core}*"
    return f"{lead}{core}{trail}"


def _line_to_markdown(line):
    """Render one line dict; return (markdown, joins_next_line)."""
    spans = [(span["text"], span["flags"]) for span in line["spans"]]
    last_visible = max(
        (index for index, (text, _) in enumerate(spans) if text.strip()),
        default=None,
    )
    joins_next_line = False
    if last_visible is not None:
        text, flags = spans[last_visible]
        stripped = text.rstrip()
        if stripped.endswith('-'):
            # Drop the line-break hyphen from the raw text, before emphasis
            # markers are added, so "**foo-**" is de-hyphenated as well.
            spans[last_visible] = (stripped[:-1], flags)
            del spans[last_visible + 1:]
            joins_next_line = True
    rendered = " ".join(_emphasize_span(text, flags) for text, flags in spans)
    return rendered.rstrip(), joins_next_line


def _block_heading_prefix(block):
    """Return the heading prefix for a text block, or "" for body text."""
    sizes = [
        span["size"]
        for line in block["lines"]
        for span in line["spans"]
        if span["text"].strip()
    ]
    if not sizes:
        return ""
    # Only treat the block as a heading when all of its visible text is large,
    # so a single oversized character does not promote a whole paragraph.
    smallest = min(sizes)
    for threshold, prefix in _HEADING_LEVELS:
        if smallest > threshold:
            return prefix
    return ""


def _text_block_to_markdown(block):
    """Render one text block as a single Markdown paragraph ("" if empty)."""
    pieces = []
    for line in block["lines"]:
        rendered, joins_next_line = _line_to_markdown(line)
        pieces.append(rendered if joins_next_line else rendered + " ")
    paragraph = re.sub(r'\s+', ' ', "".join(pieces)).strip()
    if not paragraph:
        return ""
    # The heading marker must open the paragraph to be valid Markdown.
    return _block_heading_prefix(block) + paragraph


def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.

    Each page starts with a "## Page N" header and ends with a "---" separator.
    Every text block becomes one paragraph; bold and italic spans are wrapped
    in Markdown emphasis, blocks set entirely in a large font get a heading
    prefix, and image blocks are replaced by an "[Image]" placeholder.

    Args:
        pdf_path: Path of the PDF file, passed to pymupdf.open().

    Returns:
        str: The Markdown text of the whole document.

    Raises:
        Exception: Any error raised while opening or reading the PDF is logged
            and re-raised unchanged.
    """
    try:
        parts = []
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                parts.append(f"## Page {page_num}\n\n")
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # Text block
                        paragraph = _text_block_to_markdown(block)
                        if paragraph:
                            parts.append(paragraph + "\n\n")
                    elif block["type"] == 1:  # Image block
                        parts.append("[Image]\n\n")
                parts.append("\n---\n\n")  # Page separator
        return "".join(parts)
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise
235
+
236
+
237
def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed to pymupdf.open().

    Returns:
        dict: A copy of the document's metadata dictionary. An empty dict is
        returned when the document exposes no metadata or when any error
        occurs while opening or reading the file (the error is logged).
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            # PyMuPDF documents ``metadata`` as a dict or None (a document
            # that still needs its password); always hand back a dict so
            # callers can use ``.get`` safely.
            return dict(doc.metadata or {})
    except Exception as e:
        # Metadata is best-effort: a failure here must not abort ingestion.
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}
248
+
249
+
250
def process_and_ingest_pdf(file, title, author, keywords):
    """
    Convert an uploaded PDF to Markdown and add it to the media database.

    Args:
        file: The upload. Either an object whose ``name`` attribute is the
            path of the uploaded file, or that path itself. ``None`` means
            nothing was uploaded.
        title: Title to store. When empty, the PDF's title metadata is used,
            falling back to the uploaded file's base name.
        author: Author to store. When empty, the PDF's author metadata is
            used, falling back to 'Unknown'.
        keywords: Comma-separated keywords to store in addition to the
            default 'pdf_file,markdown_converted'; may be empty.

    Returns:
        str: A success message, or a message describing the error. Errors are
        logged and reported in the return value rather than raised.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # Accept both an upload object exposing ``.name`` and a plain path.
        source_path = getattr(file, "name", file)

        # Work on a copy inside a temporary directory that is removed
        # automatically when the ``with`` block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "temp.pdf")
            shutil.copy(source_path, temp_path)

            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(temp_path)

            # Extract metadata from PDF
            metadata = extract_metadata_from_pdf(temp_path) or {}

            def metadata_value(key):
                # PyMuPDF supplies every standard key even when the PDF does
                # not set it (value None or ''), so test the value, not the key.
                value = metadata.get(key)
                return value.strip() if isinstance(value, str) else ''

            # Use metadata for title and author if not provided
            if not title:
                title = metadata_value('title') or os.path.splitext(os.path.basename(source_path))[0]
            if not author:
                author = metadata_value('author') or 'Unknown'

            # Default keywords, then user keywords, then the PDF subject.
            keyword_parts = ['pdf_file', 'markdown_converted']
            if keywords:
                keyword_parts.append(keywords)
            subject = metadata_value('subject')
            if subject:
                keyword_parts.append(subject)
            keywords = ','.join(keyword_parts)

            # Add the PDF content to the database
            add_media_with_keywords(
                url=source_path,
                title=title,
                media_type='document',
                content=markdown_text,
                keywords=keywords,
                prompt='No prompt for PDF files',
                summary='No summary for PDF files',
                transcription_model='None',
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )

        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}"
303
+
304
+
305
def process_and_cleanup_pdf(file, title, author, keywords):
    """
    Ingest an uploaded PDF and return a status message.

    Thin wrapper around process_and_ingest_pdf(), which does the actual work.

    Args:
        file: The upload, forwarded unchanged; ``None`` means nothing was
            uploaded.
        title: Forwarded unchanged to process_and_ingest_pdf().
        author: Forwarded unchanged to process_and_ingest_pdf().
        keywords: Forwarded unchanged to process_and_ingest_pdf().

    Returns:
        str: The message returned by process_and_ingest_pdf(), a prompt to
        upload a file when *file* is None, or "Error: ..." if the call raised.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    try:
        return process_and_ingest_pdf(file, title, author, keywords)
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # which a plain logging.error call would discard.
        logging.exception(f"Error in processing and cleanup: {str(e)}")
        return f"Error: {str(e)}"
315
+
316
+ #
317
+ # End of PDF_Ingestion_Lib.py
318
  #######################################################################################################################