Update utils.py
Browse files
utils.py
CHANGED
@@ -348,11 +348,20 @@ def document_loading_splitting():
|
|
348 |
# Document splitting
|
349 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
350 |
splits = text_splitter.split_documents(preprocessed_docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
print("Splits...........................")
|
352 |
-
for split in splits:
|
353 |
if 'divis' in split.page_content:
|
354 |
print("DIVIS found in chunk:", split)
|
355 |
-
|
|
|
356 |
|
357 |
###########################################
|
358 |
#Chroma DB die splits ablegen - vektorisiert...
|
|
|
348 |
# Document splitting
|
349 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
350 |
splits = text_splitter.split_documents(preprocessed_docs)
|
351 |
+
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
352 |
+
original_splits = text_splitter.split_documents(docs)
|
353 |
+
preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
|
354 |
+
|
355 |
+
# Mapping von vorverarbeiteten Splits zu Originalsplits
|
356 |
+
split_to_original_mapping = {p_split: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
357 |
+
|
358 |
+
|
359 |
print("Splits...........................")
|
360 |
+
for split in preprocessed_splits:
|
361 |
if 'divis' in split.page_content:
|
362 |
print("DIVIS found in chunk:", split)
|
363 |
+
|
364 |
+
return preprocessed_splits, split_to_original_mapping
|
365 |
|
366 |
###########################################
|
367 |
#Chroma DB die splits ablegen - vektorisiert...
|