Update utils.py
Browse files
utils.py
CHANGED
@@ -102,16 +102,6 @@ HF_WRITE = os.getenv("HF_WRITE")
|
|
102 |
# HfApi-Instanz erstellen
|
103 |
api = HfApi()
|
104 |
|
105 |
-
# Überprüfen, ob das Repository existiert und zugänglich ist
|
106 |
-
try:
|
107 |
-
repo_info = api.list_repo_files(repo_id=STORAGE_REPO_ID, repo_type=REPO_TYPE, token=hf_token)
|
108 |
-
print(f"Repository '{STORAGE_REPO_ID}' enthält folgende Dateien: {repo_info}")
|
109 |
-
except Exception as e:
|
110 |
-
print(f"Fehler beim Zugriff auf das Repository: {e}")
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
|
116 |
#Mapping für die Splits (original und preprocessed)
|
117 |
split_to_original_mapping = []
|
@@ -260,10 +250,7 @@ def clean_text(text):
|
|
260 |
##################################################
|
261 |
#RAG Hilfsfunktionen - Dokumenten bearbeiten für Vektorstore
|
262 |
##################################################
|
263 |
-
|
264 |
-
# Directory Loader Konfigurieren um Text zu extrahieren
|
265 |
-
##################################################
|
266 |
-
|
267 |
def access_pdf(self, filename):
|
268 |
# Temporäre Datei erstellen
|
269 |
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
|
@@ -280,35 +267,8 @@ def access_pdf(self, filename):
|
|
280 |
|
281 |
return temp_path
|
282 |
|
283 |
-
|
284 |
-
#besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
285 |
-
def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
|
286 |
-
loaders = {
|
287 |
-
'.pdf': load_pdf_with_metadata,
|
288 |
-
'.word': load_word_with_metadata,
|
289 |
-
}
|
290 |
-
|
291 |
-
|
292 |
-
class CustomLoader:
|
293 |
-
|
294 |
-
def __init__(self, directory_path, file_type, loader_func):
|
295 |
-
self.directory_path = directory_path
|
296 |
-
self.file_type = file_type
|
297 |
-
self.loader_func = loader_func
|
298 |
-
|
299 |
-
def load(self):
|
300 |
-
documents = []
|
301 |
-
for root, _, files in os.walk(self.directory_path):
|
302 |
-
for file in files:
|
303 |
-
if file.endswith(self.file_type):
|
304 |
-
file_path = os.path.join(root, file)
|
305 |
-
documents.extend(self.loader_func(file_path))
|
306 |
-
return documents
|
307 |
-
|
308 |
-
return CustomLoader(directory_path, file_type, loaders[file_type])
|
309 |
-
"""
|
310 |
-
|
311 |
-
|
312 |
def create_custom_loader(file_type, file_list):
|
313 |
loaders = {
|
314 |
'.pdf': load_pdf_with_metadata,
|
@@ -330,7 +290,7 @@ def load_pdf_with_metadata(file_path):
|
|
330 |
documents.append(Document(content=content, title=title, page=page_number, path=file_path, split_id=None))
|
331 |
return documents
|
332 |
|
333 |
-
#für
|
334 |
def load_word_with_metadata(file_path):
|
335 |
document = docx.Document(file_path)
|
336 |
title = "Dokument"
|
@@ -345,22 +305,11 @@ def load_word_with_metadata(file_path):
|
|
345 |
|
346 |
|
347 |
################################################
|
348 |
-
#Vektorstore
|
349 |
################################################
|
350 |
-
|
351 |
-
"""
|
352 |
-
def list_files_in_hf_repo(repo_id, directory=""):
|
353 |
-
try:
|
354 |
-
repo_info = api.list_repo_files(repo_id=repo_id, repo_type=REPO_TYPE)
|
355 |
-
if directory:
|
356 |
-
repo_info = [file for file in repo_info if file.startswith(directory)]
|
357 |
-
return repo_info
|
358 |
-
except Exception as e:
|
359 |
-
print(f"Fehler beim Zugriff auf das Repository.........................:{repo_id} {e}")
|
360 |
-
return []
|
361 |
-
"""
|
362 |
################################################
|
363 |
-
# Document Splitting
|
364 |
################################################
|
365 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
366 |
# Funktion zum Splitten und Zuweisen der doc_id
|
@@ -382,34 +331,11 @@ def split_documents_with_id(docs, text_splitter):
|
|
382 |
|
383 |
|
384 |
########################################
|
385 |
-
#finally die Splits erzeugen und laden.....
|
386 |
def document_loading_splitting():
|
387 |
docs = []
|
388 |
print("Directory Loader neu............................")
|
389 |
|
390 |
-
|
391 |
-
|
392 |
-
# Verzeichnis für heruntergeladene Dateien
|
393 |
-
#download_dir = CHROMA_PDF
|
394 |
-
#os.makedirs(download_dir, exist_ok=True)
|
395 |
-
|
396 |
-
# Dateien im Hugging Face Space auflisten
|
397 |
-
"""
|
398 |
-
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
|
399 |
-
print("hier.....................................")
|
400 |
-
# Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
|
401 |
-
for file_name in files_in_repo:
|
402 |
-
if file_name.endswith('.pdf'):
|
403 |
-
local_file_path = os.path.join(CHROMA_PDF, os.path.basename(file_name))
|
404 |
-
download_file_from_hf(file_name, local_file_path)
|
405 |
-
if file_name.endswith('.docx'):
|
406 |
-
local_file_path = os.path.join(CHROMA_WORD, os.path.basename(file_name))
|
407 |
-
download_file_from_hf(file_name, local_file_path)
|
408 |
-
print("file_name..................."+str(file_name))
|
409 |
-
print("local_file_path..................."+str(local_file_path))
|
410 |
-
"""
|
411 |
-
|
412 |
-
|
413 |
# Dateien im Hugging Face Space auflisten
|
414 |
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token)
|
415 |
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
@@ -417,19 +343,9 @@ def document_loading_splitting():
|
|
417 |
|
418 |
|
419 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
420 |
-
# pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
421 |
-
#word_loader = create_directory_loader('.word', CHROMA_WORD)
|
422 |
-
|
423 |
pdf_loader = create_custom_loader('.pdf', pdf_files)
|
424 |
word_loader = create_custom_loader('.docx', word_files)
|
425 |
|
426 |
-
|
427 |
-
|
428 |
-
"""
|
429 |
-
# kreiere einen DirectoryLoader für jeden file type
|
430 |
-
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
431 |
-
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
432 |
-
"""
|
433 |
# Load the files
|
434 |
pdf_documents = pdf_loader.load()
|
435 |
word_documents = word_loader.load()
|
@@ -489,6 +405,7 @@ def document_storage_chroma(splits):
|
|
489 |
|
490 |
########################################################
|
491 |
#Splits für den Vektorstore speichern - bzw. laden
|
|
|
492 |
def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
493 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
494 |
if not os.path.exists(directory):
|
@@ -507,23 +424,7 @@ def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", pr
|
|
507 |
# Hochladen der Splits-Dateien zum Hugging Face Space
|
508 |
upload_file_to_huggingface(preprocessed_filepath, f"{directory}/{preprocessed_filename}")
|
509 |
upload_file_to_huggingface(original_filepath, f"{directory}/{original_filename}")
|
510 |
-
|
511 |
-
def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
512 |
-
# Vollständigen Pfad zur Datei erstellen
|
513 |
-
preprocessed_filepath = os.path.join(directory, preprocessed_filename)
|
514 |
-
original_filepath = os.path.join(directory, original_filename)
|
515 |
-
|
516 |
-
# Laden der vorverarbeiteten Splits aus der Datei
|
517 |
-
if os.path.exists(preprocessed_filepath) and os.path.exists(original_filepath):
|
518 |
-
with open(preprocessed_filepath, "rb") as f:
|
519 |
-
preprocessed_splits = pickle.load(f)
|
520 |
-
|
521 |
-
with open(original_filepath, "rb") as f:
|
522 |
-
original_splits = pickle.load(f)
|
523 |
-
|
524 |
-
return preprocessed_splits, original_splits
|
525 |
-
return None, None
|
526 |
-
"""
|
527 |
def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
528 |
preprocessed_splits = None
|
529 |
original_splits = None
|
@@ -554,20 +455,11 @@ def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_spli
|
|
554 |
|
555 |
return preprocessed_splits, original_splits
|
556 |
|
557 |
-
########################################################
|
558 |
-
#Vektorstore speichern - bzw. laden
|
559 |
-
#Laden des Vektorstores - aus den gespeicherten splits
|
560 |
-
"""
|
561 |
-
def load_vectorstore():
|
562 |
-
splits_and_metadata = load_splits_and_metadata()
|
563 |
-
if splits_and_metadata is not None:
|
564 |
-
PREPROCESSED_SPLITS, SPLIT_TO_ORIGINAL_MAPPING = splits_and_metadata
|
565 |
-
return document_storage_chroma(PREPROCESSED_SPLITS)
|
566 |
-
return None
|
567 |
-
"""
|
568 |
|
569 |
-
|
570 |
-
|
|
|
|
|
571 |
def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="mapping.pkl"):
|
572 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
573 |
if not os.path.exists(directory):
|
@@ -580,17 +472,7 @@ def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="ma
|
|
580 |
|
581 |
# Hochladen der Mapping-Datei zum Hugging Face Space
|
582 |
upload_file_to_huggingface(filepath, f"{directory}/{filename}")
|
583 |
-
|
584 |
-
def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
|
585 |
-
# Vollständigen Pfad zur Datei erstellen
|
586 |
-
filepath = os.path.join(directory, filename)
|
587 |
-
|
588 |
-
# Laden des Mappings aus der Datei
|
589 |
-
if os.path.exists(filepath):
|
590 |
-
with open(filepath, "rb") as f:
|
591 |
-
return pickle.load(f)
|
592 |
-
return None
|
593 |
-
"""
|
594 |
|
595 |
def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
|
596 |
try:
|
@@ -624,7 +506,7 @@ def upload_file_to_huggingface(file_path, upload_path):
|
|
624 |
)
|
625 |
|
626 |
|
627 |
-
#ein File aus dem Space mit der REPO_ID laden
|
628 |
def download_file_from_hf(file_name, save_path):
|
629 |
url = f"https://huggingface.co/{STORAGE_REPO_ID}/resolve/main/{file_name}"
|
630 |
response = requests.get(url, headers=HEADERS)
|
@@ -776,7 +658,7 @@ def rag_chain_simpel( prompt, retriever):
|
|
776 |
# keine relevanten Dokumente gefunden
|
777 |
result = {
|
778 |
"answer": "Keine relevanten Dokumente gefunden",
|
779 |
-
"relevant_docs":
|
780 |
}
|
781 |
|
782 |
return result
|
@@ -793,16 +675,7 @@ def extract_document_info(documents):
|
|
793 |
title = filename if filename else "Keine Überschrift"
|
794 |
doc_path = doc.metadata.get("path", "")
|
795 |
# Determine the document type and adjust the path accordingly
|
796 |
-
d_link = download_link(doc)
|
797 |
-
"""
|
798 |
-
doc_path = doc.metadata.get("path", "")
|
799 |
-
if doc_path.endswith('.pdf'):
|
800 |
-
download_link = download_link(doc) #f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/pdf/{title}"
|
801 |
-
elif doc_path.endswith('.docx'):
|
802 |
-
download_link = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/word/{title}"
|
803 |
-
else:
|
804 |
-
download_link = doc_path
|
805 |
-
"""
|
806 |
|
807 |
info = {
|
808 |
'content': doc.page_content,
|
@@ -838,16 +711,6 @@ def generate_prompt_with_history(text, history, max_length=4048):
|
|
838 |
return None
|
839 |
|
840 |
|
841 |
-
#############################################
|
842 |
-
#Prompt und History für Hugging Face Schnittstelle
|
843 |
-
def generate_prompt_with_history_hf(prompt, history):
|
844 |
-
history_transformer_format = history + [[prompt, ""]]
|
845 |
-
#stop = StopOnTokens()
|
846 |
-
|
847 |
-
messages = "".join(["".join(["\n<human>:"+item[0], "\n<bot>:"+item[1]]) #curr_system_message +
|
848 |
-
for item in history_transformer_format])
|
849 |
-
|
850 |
-
|
851 |
|
852 |
##########################################
|
853 |
#Hashing.... Für die Validierung........
|
@@ -869,20 +732,7 @@ def transfer_input(inputs):
|
|
869 |
|
870 |
|
871 |
########################################################
|
872 |
-
######## Hilfsfunktionen Datei-
|
873 |
-
"""
|
874 |
-
def download_link(doc):
|
875 |
-
# URL für das Herunterladen der Datei
|
876 |
-
# Check if doc is a dictionary and contains the key 'pfad'
|
877 |
-
if isinstance(doc, dict) and 'pfad' in doc:
|
878 |
-
file_url = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/{doc['pfad']}?token=hf_token"
|
879 |
-
return f'<b><a href="{file_url}" target="_blank" style="color: #BB70FC; font-weight: bold;">{doc["titel"]}</a></b>'
|
880 |
-
else:
|
881 |
-
file_url = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/{doc}?token=hf_token"
|
882 |
-
return f'<b><a href="{file_url}" target="_blank" style="color: #BB70FC; font-weight: bold;">{doc}</a></b>'
|
883 |
-
|
884 |
-
"""
|
885 |
-
|
886 |
def download_link(doc):
|
887 |
# Basis-URL für das Hugging Face Repository
|
888 |
base_url = f"https://huggingface.co/spaces/{STORAGE_REPO_ID}/resolve/main"
|
@@ -917,32 +767,6 @@ def download_link(doc):
|
|
917 |
#################################################
|
918 |
#File Liste beim Tab für File-Upload schön darstellen
|
919 |
#################################################
|
920 |
-
"""
|
921 |
-
def display_files():
|
922 |
-
files = os.listdir(DOCS_DIR_PDF)
|
923 |
-
files_table = "<table style='width:100%; border-collapse: collapse;'>"
|
924 |
-
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - PDF-Ordner</th></tr>"
|
925 |
-
for i, file in enumerate(files):
|
926 |
-
file_path = os.path.join(DOCS_DIR_PDF, file)
|
927 |
-
file_size = os.path.getsize(file_path) / 1024 # Größe in KB
|
928 |
-
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a" # Wechselnde Zeilenfarben
|
929 |
-
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|
930 |
-
files_table += f"<td><b>{download_link(file)}</b></td></tr>"
|
931 |
-
files_table += "</table>"
|
932 |
-
|
933 |
-
files = os.listdir(DOCS_DIR_WORD)
|
934 |
-
files_table += "<table style='width:100%; border-collapse: collapse;'>"
|
935 |
-
files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - Word-Ordner</th></tr>"
|
936 |
-
for i, file in enumerate(files):
|
937 |
-
file_path = os.path.join(DOCS_DIR_WORD, file)
|
938 |
-
file_size = os.path.getsize(file_path) / 1024 # Größe in KB
|
939 |
-
row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a" # Wechselnde Zeilenfarben
|
940 |
-
files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
|
941 |
-
files_table += f"<td><b>{download_link(file)}</b></td></tr>"
|
942 |
-
files_table += "</table>"
|
943 |
-
return files_table
|
944 |
-
"""
|
945 |
-
|
946 |
def display_files():
|
947 |
files_table = "<table style='width:100%; border-collapse: collapse;'>"
|
948 |
|
@@ -966,13 +790,6 @@ def display_files():
|
|
966 |
return files_table
|
967 |
|
968 |
|
969 |
-
# gefundene relevante Dokumente auflisten (links)
|
970 |
-
"""
|
971 |
-
def list_pdfs():
|
972 |
-
if not os.path.exists(DOCS_DIR):
|
973 |
-
return []
|
974 |
-
return [f for f in os.listdir(SAVE_DIR) if f.endswith('.pdf')]
|
975 |
-
"""
|
976 |
##########################################
|
977 |
#Extension des hochgeladenen Files bestimmen
|
978 |
def analyze_file(file):
|
@@ -1012,8 +829,9 @@ class State:
|
|
1012 |
self.interrupted = False
|
1013 |
shared_state = State()
|
1014 |
|
1015 |
-
|
1016 |
#Für die relevanten Dokumente - damit sie passend zum Dictionary die Attribute haben
|
|
|
1017 |
class Document:
|
1018 |
def __init__(self, content, title, page, path, split_id=None):
|
1019 |
self.page_content = content
|
|
|
102 |
# HfApi-Instanz erstellen
|
103 |
api = HfApi()
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
#Mapping für die Splits (original und preprocessed)
|
107 |
split_to_original_mapping = []
|
|
|
250 |
##################################################
|
251 |
#RAG Hilfsfunktionen - Dokumenten bearbeiten für Vektorstore
|
252 |
##################################################
|
253 |
+
#Files aus anderem Repo downloaden
|
|
|
|
|
|
|
254 |
def access_pdf(self, filename):
|
255 |
# Temporäre Datei erstellen
|
256 |
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
|
|
|
267 |
|
268 |
return temp_path
|
269 |
|
270 |
+
################################################
|
271 |
+
#besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
def create_custom_loader(file_type, file_list):
|
273 |
loaders = {
|
274 |
'.pdf': load_pdf_with_metadata,
|
|
|
290 |
documents.append(Document(content=content, title=title, page=page_number, path=file_path, split_id=None))
|
291 |
return documents
|
292 |
|
293 |
+
#für Word Dokumente
|
294 |
def load_word_with_metadata(file_path):
|
295 |
document = docx.Document(file_path)
|
296 |
title = "Dokument"
|
|
|
305 |
|
306 |
|
307 |
################################################
|
308 |
+
#für den Vektorstore
|
309 |
################################################
|
310 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
################################################
|
312 |
+
# Document Splitting - und id für das Mapping
|
313 |
################################################
|
314 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
315 |
# Funktion zum Splitten und Zuweisen der doc_id
|
|
|
331 |
|
332 |
|
333 |
########################################
|
334 |
+
#finally die Splits erzeugen und laden..... für den Vektorstore
|
335 |
def document_loading_splitting():
|
336 |
docs = []
|
337 |
print("Directory Loader neu............................")
|
338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
# Dateien im Hugging Face Space auflisten
|
340 |
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token)
|
341 |
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
|
|
343 |
|
344 |
|
345 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
|
|
|
|
|
|
346 |
pdf_loader = create_custom_loader('.pdf', pdf_files)
|
347 |
word_loader = create_custom_loader('.docx', word_files)
|
348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
# Load the files
|
350 |
pdf_documents = pdf_loader.load()
|
351 |
word_documents = word_loader.load()
|
|
|
405 |
|
406 |
########################################################
|
407 |
#Splits für den Vektorstore speichern - bzw. laden
|
408 |
+
########################################################
|
409 |
def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
410 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
411 |
if not os.path.exists(directory):
|
|
|
424 |
# Hochladen der Splits-Dateien zum Hugging Face Space
|
425 |
upload_file_to_huggingface(preprocessed_filepath, f"{directory}/{preprocessed_filename}")
|
426 |
upload_file_to_huggingface(original_filepath, f"{directory}/{original_filename}")
|
427 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
|
429 |
preprocessed_splits = None
|
430 |
original_splits = None
|
|
|
455 |
|
456 |
return preprocessed_splits, original_splits
|
457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
458 |
|
459 |
+
|
460 |
+
########################################
|
461 |
+
#das Mapping der Original-Splits und der preprocessed Splits speichern - und laden
|
462 |
+
########################################
|
463 |
def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="mapping.pkl"):
|
464 |
# Erstellen des Verzeichnisses, falls es nicht existiert
|
465 |
if not os.path.exists(directory):
|
|
|
472 |
|
473 |
# Hochladen der Mapping-Datei zum Hugging Face Space
|
474 |
upload_file_to_huggingface(filepath, f"{directory}/{filename}")
|
475 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
476 |
|
477 |
def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
|
478 |
try:
|
|
|
506 |
)
|
507 |
|
508 |
|
509 |
+
#ein File aus dem Space mit der REPO_ID laden - Authentifizierung über den HEADER
|
510 |
def download_file_from_hf(file_name, save_path):
|
511 |
url = f"https://huggingface.co/{STORAGE_REPO_ID}/resolve/main/{file_name}"
|
512 |
response = requests.get(url, headers=HEADERS)
|
|
|
658 |
# keine relevanten Dokumente gefunden
|
659 |
result = {
|
660 |
"answer": "Keine relevanten Dokumente gefunden",
|
661 |
+
"relevant_docs": None
|
662 |
}
|
663 |
|
664 |
return result
|
|
|
675 |
title = filename if filename else "Keine Überschrift"
|
676 |
doc_path = doc.metadata.get("path", "")
|
677 |
# Determine the document type and adjust the path accordingly
|
678 |
+
d_link = download_link(doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
679 |
|
680 |
info = {
|
681 |
'content': doc.page_content,
|
|
|
711 |
return None
|
712 |
|
713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
714 |
|
715 |
##########################################
|
716 |
#Hashing.... Für die Validierung........
|
|
|
732 |
|
733 |
|
734 |
########################################################
|
735 |
+
######## Hilfsfunktionen Datei-Download ##################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
736 |
def download_link(doc):
|
737 |
# Basis-URL für das Hugging Face Repository
|
738 |
base_url = f"https://huggingface.co/spaces/{STORAGE_REPO_ID}/resolve/main"
|
|
|
767 |
#################################################
|
768 |
#File Liste beim Tab für File-Upload schön darstellen
|
769 |
#################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
def display_files():
|
771 |
files_table = "<table style='width:100%; border-collapse: collapse;'>"
|
772 |
|
|
|
790 |
return files_table
|
791 |
|
792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
793 |
##########################################
|
794 |
#Extension des hochgeladenen Files bestimmen
|
795 |
def analyze_file(file):
|
|
|
829 |
self.interrupted = False
|
830 |
shared_state = State()
|
831 |
|
832 |
+
###############################################
|
833 |
#Für die relevanten Dokumente - damit sie passend zum Dictionary die Attribute haben
|
834 |
+
###############################################
|
835 |
class Document:
|
836 |
def __init__(self, content, title, page, path, split_id=None):
|
837 |
self.page_content = content
|