YchKhan committed
Commit 7667045
1 Parent(s): dde97ad

Update split_files_to_excel.py

Files changed (1):
  1. split_files_to_excel.py +62 -14
split_files_to_excel.py CHANGED
@@ -29,8 +29,8 @@ import requests
import json

MODEL = "thenlper/gte-base"
- CHUNK_SIZE = 1000
- CHUNK_OVERLAP = 200
+ CHUNK_SIZE = 1500
+ CHUNK_OVERLAP = 400

embeddings = HuggingFaceEmbeddings(
    model_name=MODEL,
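The two constants above are raised from 1000/200 to 1500/400. As a point of reference, here is a minimal sketch of how CHUNK_SIZE and CHUNK_OVERLAP are typically fed to a character-based splitter; the RecursiveCharacterTextSplitter shown here is an assumption, since the code that actually consumes these constants lies outside this hunk.

# Hypothetical consumer of CHUNK_SIZE / CHUNK_OVERLAP -- not shown in this hunk.
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1500    # size of each chunk (was 1000 before this commit); units depend on the splitter's length function
CHUNK_OVERLAP = 400  # amount shared between consecutive chunks (was 200)

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# chunks = splitter.split_documents(docs)  # docs: a list of LangChain Document objects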
@@ -323,15 +323,41 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
    # Create an empty list to store the resized documents
    resized = []
    previous_file = ""
+     to_encode = ""
+     skip_next = False
    # Iterate through the original documents list
-     for doc in documents:
+     for i, doc in enumerate(documents):
+         if skip_next:
+             skip_next = False
+             continue
        current_file = doc.metadata['source']
        if current_file != previous_file:  # chunk counting
            previous_file = current_file
            chunk_counter = 0
            is_first_chunk = True  # Keep track of the first chunk in the document
-         encoded = tokenizer.encode(doc.page_content)  # encode the current document
+         to_encode += doc.page_content
+         # if the last chunk of a file is < min_chunk_size, add it to the previous chunk before splitting
+         if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size:  # the next doc is the last doc of the current file or the last of the corpus
+             # print('SAME DOC')
+             skip_next = True
+             to_encode += documents[i+1].page_content
+         # print(f"to_encode:\n{to_encode}")
+         encoded = tokenizer.encode(to_encode)  # encode the current document (plus any merged tail)
+         if len(encoded) < min_chunk_size and not skip_next:
+             # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
+             continue
+         elif skip_next:
+             split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+             split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+             resized.append(split_doc)
+             # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
+             to_encode = ""
+             continue
+         else:
+             # print(f"len(encoded):{len(encoded)}>=min_chunk_size:{min_chunk_size}")
+             to_encode = ""
        if len(encoded) > max_length:
+             # print(f"len(encoded):{len(encoded)}>=max_length:{max_length}")
            remaining_encoded = encoded
            is_last_chunk = False
            while len(remaining_encoded) > 1 and not is_last_chunk:
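The added to_encode / skip_next bookkeeping is a one-document lookahead: when the next chunk is the last one of the current file (or of the whole corpus) and is shorter than min_chunk_size, it is folded into the current chunk and skipped on the following iteration. Below is a standalone sketch of that behaviour only, with a whitespace tokenizer and plain dicts standing in for the real tokenizer and Document objects, and with explicit bounds checks on the i+1 / i+2 lookahead.

# Standalone sketch of the "merge a short trailing chunk" lookahead.
# The whitespace tokenizer and dict-based documents are stand-ins for illustration only.
def merge_short_tail_chunks(documents, min_chunk_size=20):
    tokenize = lambda text: text.split()          # stand-in for tokenizer.encode
    merged, skip_next = [], False
    for i, doc in enumerate(documents):
        if skip_next:                             # the short tail was already folded in
            skip_next = False
            continue
        text = doc["page_content"]
        nxt = documents[i + 1] if i + 1 < len(documents) else None
        last_of_file = nxt is not None and (
            i + 2 >= len(documents) or documents[i + 2]["source"] != nxt["source"]
        )
        if last_of_file and len(tokenize(nxt["page_content"])) < min_chunk_size:
            text += nxt["page_content"]           # fold the short tail into this chunk
            skip_next = True
        merged.append({"source": doc["source"], "page_content": text})
    return merged

docs = [
    {"source": "a.pdf", "page_content": "long section " * 30},
    {"source": "a.pdf", "page_content": "tiny tail."},        # shorter than min_chunk_size
    {"source": "b.pdf", "page_content": "another document " * 30},
]
print([len(d["page_content"].split()) for d in merge_short_tail_chunks(docs)])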
@@ -339,47 +365,69 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
                overlap_text = tokenizer.decode(remaining_encoded[:overlap])  # Index by token
                period_index_b = overlap_text.find('.')  # Index by character
                if len(remaining_encoded) > max_length + min_chunk_size:
+                     # print("len(remaining_encoded)>max_length + min_chunk_size")
                    current_encoded = remaining_encoded[:max(10, max_length)]
                else:
+                     # print("not len(remaining_encoded)>max_length + min_chunk_size")
-                     current_encoded = remaining_encoded[:max(10, max_length + min_chunk_size)]  # if the last chunk is too small, concatenate it with the previous one
+                     current_encoded = remaining_encoded  # if the last chunk is too small, concatenate it with the previous one
                    is_last_chunk = True
+                     split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+                     split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+                     resized.append(split_doc)
+                     # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
+                     break
-                 period_index_e = len(doc.page_content)  # a number of characters that is sure to be greater than or equal to the max length of a chunk; could have done len(tokenizer.decode(current_encoded))
+                 period_index_e = -1  # a number of characters that is sure to be greater than or equal to the max length of a chunk; could have done len(tokenizer.decode(current_encoded))
                if len(remaining_encoded) > max_length + min_chunk_size:  # If it is not the last sub chunk
+                     # print("len(remaining_encoded)>max_length+min_chunk_size")
                    overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
                    period_index_last = overlap_text_last.find('.')
                    if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
-                         # print(f"period index last found at {period_index_last}")
-                         period_index_e = period_index_last - len(overlap_text_last) + 1
-                         # print(f"period_index_e :{period_index_e}")
-                         # print(f"last :{overlap_text_last}")
+                         # print(f"period index last found at {period_index_last}")
+                         period_index_e = period_index_last - len(overlap_text_last)
+                         # print(f"period_index_e :{period_index_e}")
+                         # print(f"last :{overlap_text_last}")
                if not is_first_chunk:  # starting after the period in overlap
+                     # print("not is_first_chunk", period_index_b)
                    if period_index_b == -1:  # Period not found in overlap
-                         # print(". not found in overlap")
+                         # print(". not found in overlap")
                        split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # Keep regular splitting
                    else:
                        if is_last_chunk:  # not the first but the last
+                             # print("is_last_chunk")
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
                            # print("Should start after \".\"")
                        else:
+                             # print("not is_last_chunk", period_index_e, len(to_encode))
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy())  # Split at the beginning and the end
                else:  # first chunk
+                     # print("else")
                    split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # split only at the end if it is the first chunk
                if 'titles' in split_doc.metadata:
+                     # print("title in metadata")
                    chunk_counter += 1
                    split_doc.metadata['chunk_id'] = chunk_counter
                # A1 We could round chunk length in tokens if we ignore the '.' position in the overlap and save computation time
                split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
                resized.append(split_doc)
+                 print(f"Added a document of {split_doc.metadata['token_length']} tokens 3")
                remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
                is_first_chunk = False
-                 # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
+                 # # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content[:50], "\n-----------------")
+                 # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+                 # print(split_doc.page_content[:100])
+                 # # print("😂😂😂😂")
+                 # print(split_doc.page_content[-100:])
+                 # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-             elif len(encoded) > min_chunk_size:  # ignore the chunks that are too small
+             else:  # len(encoded) > min_chunk_size: ignore the chunks that are too small
+                 print(f"found a chunk with the perfect size:{len(encoded)}")
                # print(f"◀Document:{{ {doc.page_content} }} was not added because too short▶")
                if 'titles' in doc.metadata:  # check if it was split by split_docx
                    chunk_counter += 1
                    doc.metadata['chunk_id'] = chunk_counter
-                 doc.metadata['token_length'] = len(encoded)
+                 doc.metadata['token_length'] = len(encoded)
+                 doc.page_content = tokenizer.decode(encoded)
                resized.append(doc)
+                 print(f"Added a document of {doc.metadata['token_length']} tokens 4")
    print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
    return resized
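The boundary handling above trims each sub-chunk to sentence boundaries: period_index_b is the first '.' inside the leading overlap, and period_index_e is now expressed as a negative index relative to the trailing overlap (defaulting to -1) instead of a large positive sentinel. Below is a simplified sketch of that trimming on plain strings, ignoring the first-chunk and last-chunk special cases handled in the diff.

# Simplified sketch of the sentence-boundary trimming, on plain strings.
def trim_chunk(text, leading_overlap, trailing_overlap, first_chunk=False):
    """Drop the partial sentence before the first '.' of the leading overlap
    and from the first '.' of the trailing overlap onward."""
    start = 0
    if not first_chunk:
        p = leading_overlap.find('.')
        if p != -1:
            start = p + 1                         # begin right after the period in the overlap
    end = len(text)
    p = trailing_overlap.find('.')
    if p != -1 and p < len(trailing_overlap) - 1:
        end = p - len(trailing_overlap)           # negative index: cut before the period near the end
    return text[start:end]

chunk = "half sentence. A full sentence kept in the middle. trailing fragment"
print(trim_chunk(chunk, leading_overlap=chunk[:15], trailing_overlap=chunk[-20:]))

Slicing with text[start:end] where end is negative counts from the end of the string, which is why the same offset works for decoded chunks of different lengths.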
 
 
433