Update utils.py
utils.py
CHANGED
@@ -267,34 +267,6 @@ def load_word_with_metadata(file_path):
     return documents


-"""
-# Custom loader functions
-def load_pdf_with_metadata(file_path):
-    document = fitz.open(file_path)
-    documents = []
-    for page_num in range(len(document)):
-        page = document.load_page(page_num)
-        content = page.get_text("text")
-        metadata = {
-            "title": document.metadata.get("title", "Unbekannt"),
-            "page": page_num + 1,
-            "path": file_path
-        }
-        documents.append({"page_content": content, "metadata": metadata})
-    return documents
-
-def load_word_with_metadata(file_path):
-    document = docx.Document(file_path)
-    metadata = {
-        "title": "Dokument",
-        "path": file_path
-    }
-    contents = []
-    for para in document.paragraphs:
-        content = para.text
-        contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
-    return contents
-"""


 ################################################
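The comment block removed above held two custom loaders. For reference, a minimal runnable sketch of the same loaders, assuming PyMuPDF is imported as fitz and python-docx as docx (as in the original snippet); it is an illustration, not the retained code path:

# Sketch of the removed loaders, assuming PyMuPDF (fitz) and python-docx are installed.
import fitz   # PyMuPDF
import docx   # python-docx

def load_pdf_with_metadata(file_path):
    # One dict per PDF page: page text plus title/page/path metadata.
    document = fitz.open(file_path)
    documents = []
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        content = page.get_text("text")
        metadata = {
            "title": document.metadata.get("title", "Unbekannt"),
            "page": page_num + 1,
            "path": file_path,
        }
        documents.append({"page_content": content, "metadata": metadata})
    return documents

def load_word_with_metadata(file_path):
    # One dict per paragraph of a Word document, all mapped to page 1.
    document = docx.Document(file_path)
    metadata = {"title": "Dokument", "path": file_path}
    contents = []
    for para in document.paragraphs:
        contents.append({"page_content": para.text, "metadata": {**metadata, "page": 1}})
    return contents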
@@ -362,34 +334,7 @@ def document_retrieval_chroma(llm, prompt):
     return db


-############################################
-# rag_chain alternative for RAG with image upload, since the llm cannot be used this way here and the prompt with the RAG extensions is passed differently
-# use langchain to route the prompt to the llm, but search the vector DB first so that matching splits are added to the prompt
-# prompt with RAG!!!
-"""
-def rag_chainback(prompt, db, k=3):
-    rag_template = "Nutze ausschließlich die folgenden Kontext Teile am Ende, um die Frage zu beantworten . " + template + "Frage: " + prompt + "Kontext Teile: "
-    retrieved_chunks = db.similarity_search(prompt, k)
-
-    # Build a dictionary for the chunks
-    chunks_dict = []
-    for i, chunk in enumerate(retrieved_chunks):
-        chunk_dict = {
-            "chunk_index": i + 1,
-            "page_content": chunk.page_content,  # assuming chunk has page_content attribute
-            "metadata": chunk.metadata  # assuming chunk has metadata attribute
-        }
-        chunks_dict.append(chunk_dict)
-
-    # Build the new prompt
-    neu_prompt = rag_template
-    for chunk in chunks_dict:
-        neu_prompt += f"{chunk['chunk_index']}. {chunk['page_content']}\n"

-    print("dict.............................."+ json.dumps(chunks_dict, indent=4, ensure_ascii=False))
-
-    return neu_prompt, chunks_dict  # returning both the new prompt and the dictionary
-"""

 ###############################################
 #Set up LangChain for RAG chaining
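The deleted rag_chainback is still a useful illustration of manual prompt augmentation for the image-upload path, where the LLM is not called through a LangChain chain. A minimal sketch of the same idea, assuming db is a vector store exposing similarity_search; the module-level template variable from the original is folded into a plain instruction string here:

import json

def rag_chainback(prompt, db, k=3):
    # Instruction prefix telling the model to answer only from the retrieved context.
    rag_template = (
        "Nutze ausschließlich die folgenden Kontext Teile am Ende, um die Frage zu beantworten. "
        "Frage: " + prompt + " Kontext Teile: "
    )
    retrieved_chunks = db.similarity_search(prompt, k=k)

    # Collect the retrieved chunks as plain dicts for logging and inspection.
    chunks_dict = [
        {
            "chunk_index": i + 1,
            "page_content": chunk.page_content,
            "metadata": chunk.metadata,
        }
        for i, chunk in enumerate(retrieved_chunks)
    ]

    # Append each chunk to the instruction to form the augmented prompt.
    neu_prompt = rag_template
    for chunk in chunks_dict:
        neu_prompt += f"{chunk['chunk_index']}. {chunk['page_content']}\n"

    print(json.dumps(chunks_dict, indent=4, ensure_ascii=False))
    return neu_prompt, chunks_dict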
@@ -414,10 +359,7 @@ def rag_chain(llm, prompt, retriever):
     relevant_docs = retriever.get_relevant_documents(prompt)
     extracted_docs = extract_document_info(relevant_docs)

-
-    print("releant docs1......................")
     if (len(extracted_docs)>0):
-        print("releant docs2......................")
         print(extracted_docs)
     #llm_chain = LLMChain(llm = llm, prompt = RAG_CHAIN_PROMPT)
     #result = llm_chain.run({"context": relevant_docs, "question": prompt})
@@ -476,23 +418,6 @@ def extract_document_info(documents):
         }
         extracted_info.append(info)
     return extracted_info
-
-
-"""
-# Function to build the list of dictionaries
-def extract_document_info(documents):
-    extracted_info = []
-    for doc in documents:
-        info = {
-            'content' : doc["content"],
-            'metadaten' : doc["metadata"],
-            'titel' : metadaten.get("title", "Keine Überschrift"),
-            'seite' : metadaten.get("page", "Unbekannte Seite"),
-            'pfad' : metadaten.get("path", "Kein Pfad verfügbar")
-        }
-        extracted_info.append(info)
-    return extracted_info
-"""



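The removed duplicate of extract_document_info also carried a latent bug: it indexed doc["content"] and referenced an undefined metadaten variable. A hedged sketch of what the retained helper plausibly looks like, assuming the inputs are LangChain Document objects with page_content and metadata attributes; the kept body lies outside this hunk, so this is an approximation rather than the actual implementation:

def extract_document_info(documents):
    # One plain dict per retrieved document, for printing and serialization.
    extracted_info = []
    for doc in documents:
        metadaten = doc.metadata or {}
        info = {
            "content": doc.page_content,
            "metadaten": metadaten,
            "titel": metadaten.get("title", "Keine Überschrift"),
            "seite": metadaten.get("page", "Unbekannte Seite"),
            "pfad": metadaten.get("path", "Kein Pfad verfügbar"),
        }
        extracted_info.append(info)
    return extracted_info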