Update utils.py
Browse files
utils.py
CHANGED
@@ -244,6 +244,31 @@ def create_directory_loader(file_type, directory_path):
|
|
244 |
################################################
|
245 |
# Custom Loader-Funktionen zu dem DirectoryLoader
|
246 |
# Custom loader functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
def load_pdf_with_metadata(file_path):
|
248 |
document = fitz.open(file_path)
|
249 |
documents = []
|
@@ -269,7 +294,7 @@ def load_word_with_metadata(file_path):
|
|
269 |
content = para.text
|
270 |
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
271 |
return contents
|
272 |
-
|
273 |
|
274 |
|
275 |
################################################
|
|
|
244 |
################################################
|
245 |
# Custom Loader-Funktionen zu dem DirectoryLoader
|
246 |
# Custom loader functions
|
247 |
+
def load_pdf_with_metadata(file_path):
    """Read a PDF page by page and wrap every page in a ``Document``.

    Args:
        file_path: Path of the PDF file, passed unchanged to ``fitz.open``
            and stored on every resulting ``Document`` as ``path``.

    Returns:
        A list with one ``Document`` per page, in page order. Each entry
        carries the page's plain text, the PDF title (or ``"Unbekannt"``
        when the file has no usable title) and the 1-based page number.
    """
    document = fitz.open(file_path)
    try:
        # The metadata mapping may be missing entirely, and the "title" key
        # may be present with a None/empty value, so a plain
        # ``.get("title", default)`` would never fall back to the default.
        metadata = document.metadata or {}
        title = metadata.get("title") or "Unbekannt"

        documents = []
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            documents.append(
                Document(
                    content=page.get_text("text"),
                    title=title,
                    page=page_num + 1,  # expose 1-based page numbers
                    path=file_path,
                )
            )
        return documents
    finally:
        # Release the underlying file handle even if a page fails to load.
        document.close()
|
257 |
+
|
258 |
+
def load_word_with_metadata(file_path):
    """Read a Word file and wrap every paragraph in a ``Document``.

    Args:
        file_path: Path of the Word file, passed unchanged to
            ``docx.Document`` and stored on every result as ``path``.

    Returns:
        A list with one ``Document`` per paragraph, in document order.
        Every entry uses the fixed title ``"Dokument"`` and page ``1``.
    """
    word_file = docx.Document(file_path)
    # Word documents have no page numbers in this context, so every
    # paragraph is reported as page 1.
    return [
        Document(content=paragraph.text, title="Dokument", page=1, path=file_path)
        for paragraph in word_file.paragraphs
    ]
|
268 |
+
|
269 |
+
|
270 |
+
"""
|
271 |
+
# Custom loader functions
|
272 |
def load_pdf_with_metadata(file_path):
|
273 |
document = fitz.open(file_path)
|
274 |
documents = []
|
|
|
294 |
content = para.text
|
295 |
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
296 |
return contents
|
297 |
+
"""
|
298 |
|
299 |
|
300 |
################################################
|