YchKhan committed
Commit 7667045
1 Parent(s): dde97ad

Update split_files_to_excel.py

Files changed (1):
  1. split_files_to_excel.py +62 -14
split_files_to_excel.py CHANGED
@@ -29,8 +29,8 @@ import requests
import json

MODEL = "thenlper/gte-base"
- CHUNK_SIZE = 1000
- CHUNK_OVERLAP = 200
+ CHUNK_SIZE = 1500
+ CHUNK_OVERLAP = 400

embeddings = HuggingFaceEmbeddings(
    model_name=MODEL,
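The two constants above are raised from 1000/200 to 1500/400. As a point of reference, here is a minimal sketch of how CHUNK_SIZE and CHUNK_OVERLAP are typically fed to a character-based splitter; the RecursiveCharacterTextSplitter shown here is an assumption, since the code that actually consumes these constants lies outside this hunk.

# Hypothetical consumer of CHUNK_SIZE / CHUNK_OVERLAP -- not shown in this hunk.
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1500    # size of each chunk (was 1000 before this commit); units depend on the splitter's length function
CHUNK_OVERLAP = 400  # amount shared between consecutive chunks (was 200)

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# chunks = splitter.split_documents(docs)  # docs: a list of LangChain Document objects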
@@ -323,15 +323,41 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
    # Create an empty list to store the resized documents
    resized = []
    previous_file = ""
+     to_encode = ""
+     skip_next = False
    # Iterate through the original documents list
-     for doc in documents:
+     for i, doc in enumerate(documents):
+         if skip_next:
+             skip_next = False
+             continue
        current_file = doc.metadata['source']
        if current_file != previous_file:  # chunk counting
            previous_file = current_file
            chunk_counter = 0
            is_first_chunk = True  # Keep track of the first chunk in the document
-         encoded = tokenizer.encode(doc.page_content)  # encode the current document
+         to_encode += doc.page_content
+         # if the last chunk of a file is < min_chunk_size, add it to the previous chunk before splitting
+         if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size:  # the next doc is the last doc of the current file or the last of the corpus
+             # print('SAME DOC')
+             skip_next = True
+             to_encode += documents[i+1].page_content
+         # print(f"to_encode:\n{to_encode}")
+         encoded = tokenizer.encode(to_encode)  # encode the current document (plus any merged tail)
+         if len(encoded) < min_chunk_size and not skip_next:
+             # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
+             continue
+         elif skip_next:
+             split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+             split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+             resized.append(split_doc)
+             # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
+             to_encode = ""
+             continue
+         else:
+             # print(f"len(encoded):{len(encoded)}>=min_chunk_size:{min_chunk_size}")
+             to_encode = ""
        if len(encoded) > max_length:
+             # print(f"len(encoded):{len(encoded)}>=max_length:{max_length}")
            remaining_encoded = encoded
            is_last_chunk = False
            while len(remaining_encoded) > 1 and not is_last_chunk:
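The added to_encode / skip_next bookkeeping is a one-document lookahead: when the next chunk is the last one of the current file (or of the whole corpus) and is shorter than min_chunk_size, it is folded into the current chunk and skipped on the following iteration. Below is a standalone sketch of that behaviour only, with a whitespace tokenizer and plain dicts standing in for the real tokenizer and Document objects, and with explicit bounds checks on the i+1 / i+2 lookahead.

# Standalone sketch of the "merge a short trailing chunk" lookahead.
# The whitespace tokenizer and dict-based documents are stand-ins for illustration only.
def merge_short_tail_chunks(documents, min_chunk_size=20):
    tokenize = lambda text: text.split()          # stand-in for tokenizer.encode
    merged, skip_next = [], False
    for i, doc in enumerate(documents):
        if skip_next:                             # the short tail was already folded in
            skip_next = False
            continue
        text = doc["page_content"]
        nxt = documents[i + 1] if i + 1 < len(documents) else None
        last_of_file = nxt is not None and (
            i + 2 >= len(documents) or documents[i + 2]["source"] != nxt["source"]
        )
        if last_of_file and len(tokenize(nxt["page_content"])) < min_chunk_size:
            text += nxt["page_content"]           # fold the short tail into this chunk
            skip_next = True
        merged.append({"source": doc["source"], "page_content": text})
    return merged

docs = [
    {"source": "a.pdf", "page_content": "long section " * 30},
    {"source": "a.pdf", "page_content": "tiny tail."},        # shorter than min_chunk_size
    {"source": "b.pdf", "page_content": "another document " * 30},
]
print([len(d["page_content"].split()) for d in merge_short_tail_chunks(docs)])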
@@ -339,47 +365,69 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
                overlap_text = tokenizer.decode(remaining_encoded[:overlap])  # Index by token
                period_index_b = overlap_text.find('.')  # Index by character
                if len(remaining_encoded) > max_length + min_chunk_size:
+                     # print("len(remaining_encoded)>max_length + min_chunk_size")
                    current_encoded = remaining_encoded[:max(10, max_length)]
                else:
+                     # print("not len(remaining_encoded)>max_length + min_chunk_size")
-                     current_encoded = remaining_encoded[:max(10, max_length + min_chunk_size)]  # if the last chunk is too small, concatenate it with the previous one
+                     current_encoded = remaining_encoded  # if the last chunk is too small, concatenate it with the previous one
                    is_last_chunk = True
+                     split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+                     split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+                     resized.append(split_doc)
+                     # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
+                     break
-                 period_index_e = len(doc.page_content)  # a number of characters that is sure to be greater than or equal to the max length of a chunk; could have done len(tokenizer.decode(current_encoded))
+                 period_index_e = -1  # a number of characters that is sure to be greater than or equal to the max length of a chunk; could have done len(tokenizer.decode(current_encoded))
                if len(remaining_encoded) > max_length + min_chunk_size:  # If it is not the last sub chunk
+                     # print("len(remaining_encoded)>max_length+min_chunk_size")
                    overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
                    period_index_last = overlap_text_last.find('.')
                    if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
-                         # print(f"period index last found at {period_index_last}")
-                         period_index_e = period_index_last - len(overlap_text_last) + 1
-                         # print(f"period_index_e :{period_index_e}")
-                         # print(f"last :{overlap_text_last}")
+                         # print(f"period index last found at {period_index_last}")
+                         period_index_e = period_index_last - len(overlap_text_last)
+                         # print(f"period_index_e :{period_index_e}")
+                         # print(f"last :{overlap_text_last}")
                if not is_first_chunk:  # starting after the period in overlap
+                     # print("not is_first_chunk", period_index_b)
                    if period_index_b == -1:  # Period not found in overlap
-                         # print(". not found in overlap")
+                         # print(". not found in overlap")
                        split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # Keep regular splitting
                    else:
                        if is_last_chunk:  # not the first but the last
+                             # print("is_last_chunk")
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
                            # print("Should start after \".\"")
                        else:
+                             # print("not is_last_chunk", period_index_e, len(to_encode))
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy())  # Split at the beginning and the end
                else:  # first chunk
+                     # print("else")
                    split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # split only at the end if it is the first chunk
                if 'titles' in split_doc.metadata:
+                     # print("title in metadata")
                    chunk_counter += 1
                    split_doc.metadata['chunk_id'] = chunk_counter
                # A1 We could round chunk length in tokens if we ignore the '.' position in the overlap and save computation time
                split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
                resized.append(split_doc)
+                 print(f"Added a document of {split_doc.metadata['token_length']} tokens 3")
                remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
                is_first_chunk = False
-                 # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
+                 # # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content[:50], "\n-----------------")
+                 # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+                 # print(split_doc.page_content[:100])
+                 # # print("😂😂😂😂")
+                 # print(split_doc.page_content[-100:])
+                 # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-             elif len(encoded) > min_chunk_size:  # ignore the chunks that are too small
+             else:  # len(encoded) > min_chunk_size: ignore the chunks that are too small
+                 print(f"found a chunk with the perfect size:{len(encoded)}")
                # print(f"◀Document:{{ {doc.page_content} }} was not added because too short▶")
                if 'titles' in doc.metadata:  # check if it was split by split_docx
                    chunk_counter += 1
                    doc.metadata['chunk_id'] = chunk_counter
-                 doc.metadata['token_length'] = len(encoded)
+                 doc.metadata['token_length'] = len(encoded)
+                 doc.page_content = tokenizer.decode(encoded)
                resized.append(doc)
+                 print(f"Added a document of {doc.metadata['token_length']} tokens 4")
    print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
    return resized
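The boundary handling above trims each sub-chunk to sentence boundaries: period_index_b is the first '.' inside the leading overlap, and period_index_e is now expressed as a negative index relative to the trailing overlap (defaulting to -1) instead of a large positive sentinel. Below is a simplified sketch of that trimming on plain strings, ignoring the first-chunk and last-chunk special cases handled in the diff.

# Simplified sketch of the sentence-boundary trimming, on plain strings.
def trim_chunk(text, leading_overlap, trailing_overlap, first_chunk=False):
    """Drop the partial sentence before the first '.' of the leading overlap
    and from the first '.' of the trailing overlap onward."""
    start = 0
    if not first_chunk:
        p = leading_overlap.find('.')
        if p != -1:
            start = p + 1                         # begin right after the period in the overlap
    end = len(text)
    p = trailing_overlap.find('.')
    if p != -1 and p < len(trailing_overlap) - 1:
        end = p - len(trailing_overlap)           # negative index: cut before the period near the end
    return text[start:end]

chunk = "half sentence. A full sentence kept in the middle. trailing fragment"
print(trim_chunk(chunk, leading_overlap=chunk[:15], trailing_overlap=chunk[-20:]))

Slicing with text[start:end] where end is negative counts from the end of the string, which is why the same offset works for decoded chunks of different lengths.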
 
 
433