Spaces:
Sleeping
Sleeping
oceansweep
committed on
Commit
•
5616cfc
1
Parent(s):
fa4b240
Upload Chunk_Lib.py
Browse files
App_Function_Libraries/Chunk_Lib.py
CHANGED
@@ -476,22 +476,22 @@ def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100, unit='
|
|
476 |
#
|
477 |
# Embedding Chunking
|
478 |
|
479 |
-
def chunk_for_embedding(text: str, file_name: str,
|
480 |
options = chunk_options.copy()
|
481 |
if custom_chunk_options:
|
482 |
options.update(custom_chunk_options)
|
483 |
|
|
|
484 |
chunks = improved_chunking_process(text, options)
|
485 |
total_chunks = len(chunks)
|
|
|
486 |
|
487 |
chunked_text_with_headers = []
|
488 |
for i, chunk in enumerate(chunks, 1):
|
489 |
chunk_text = chunk['text']
|
490 |
chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
|
491 |
-
|
492 |
chunk_header = f"""
|
493 |
Original Document: {file_name}
|
494 |
-
Full Document Summary: {full_summary or "Full document summary not available."}
|
495 |
Chunk: {i} of {total_chunks}
|
496 |
Position: {chunk_position}
|
497 |
|
|
|
476 |
#
|
477 |
# Embedding Chunking
|
478 |
|
479 |
+
def chunk_for_embedding(text: str, file_name: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
480 |
options = chunk_options.copy()
|
481 |
if custom_chunk_options:
|
482 |
options.update(custom_chunk_options)
|
483 |
|
484 |
+
logging.info(f"Chunking options: {options}")
|
485 |
chunks = improved_chunking_process(text, options)
|
486 |
total_chunks = len(chunks)
|
487 |
+
logging.info(f"Total chunks created: {total_chunks}")
|
488 |
|
489 |
chunked_text_with_headers = []
|
490 |
for i, chunk in enumerate(chunks, 1):
|
491 |
chunk_text = chunk['text']
|
492 |
chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
|
|
|
493 |
chunk_header = f"""
|
494 |
Original Document: {file_name}
|
|
|
495 |
Chunk: {i} of {total_chunks}
|
496 |
Position: {chunk_position}
|
497 |
|