Standard_Intelligence_Dev

Sleeping

YchKhan committed on Jun 18, 2024

Commit

b5d29e3

verified ·

1 Parent(s): ecf9456

Update split_files_to_excel.py

Files changed (1) hide show

split_files_to_excel.py CHANGED Viewed

@@ -471,19 +471,20 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
     docs = []
     for i, filename in enumerate(input_folder):
         path = filename#os.path.join(input_folder, filename)
-        print(f"Treating file {i}/{len(input_folder)}")
         # Select the appropriate document loader
         chunks=[]
         if path.endswith(".pdf"):
             try:
                 print("Treatment of pdf file", path)
                 raw_chunks = split_pdf(path, input_folder)
-                for raw_chunk in raw_chunks:
-                    print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
-                    raw_chunk.metadata["Base Folder"] = base_folders[i]
                 sb_chunks = group_chunks_by_section(raw_chunks)
                 if nb_pages > 0:
                     for sb_chunk in sb_chunks:
                         if int(sb_chunk.metadata["page_number"])<nb_pages:
                             chunks.append(sb_chunk)
                     else:
@@ -602,7 +603,7 @@ def split_in_df(files, nb_pages):
         else:
             processed_files.append(file_path)
             base_folders.append("")
-    print(f"BASE FOLDERS LIST : {base_folders}")
     print("Finished processing zip files\nSplitting files into chunks...")
     documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
     re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)

     docs = []
     for i, filename in enumerate(input_folder):
         path = filename#os.path.join(input_folder, filename)
+        print(f"Treating file {i+1}/{len(input_folder)}")
         # Select the appropriate document loader
         chunks=[]
         if path.endswith(".pdf"):
             try:
                 print("Treatment of pdf file", path)
                 raw_chunks = split_pdf(path, input_folder)
+                for j, raw_chunk in enumerate(raw_chunks):
+                    print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
+                    raw_chunk.metadata["Base Folder"] = base_folders[j]
                 sb_chunks = group_chunks_by_section(raw_chunks)
                 if nb_pages > 0:
                     for sb_chunk in sb_chunks:
+                        print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
                         if int(sb_chunk.metadata["page_number"])<nb_pages:
                             chunks.append(sb_chunk)
                     else:
         else:
             processed_files.append(file_path)
             base_folders.append("")
+    print(f"BASE FOLDERS LIST : {base_folders}, FILES LIST : {processed_files}")
     print("Finished processing zip files\nSplitting files into chunks...")
     documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
     re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)