YchKhan committed
Commit ef5d30c
1 Parent(s): fc7e90c

Create split_files_to_excel.py

Files changed (1):
  1. split_files_to_excel.py +474 -0

split_files_to_excel.py ADDED
@@ -0,0 +1,474 @@
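# Module overview: load PDF, DOCX and DOC files, split them into section-aware
# chunks (by font size for PDFs, by element category for DOCX), re-split the
# chunks on token counts at sentence boundaries, embed them with a HuggingFace
# model and build or update a FAISS index stored in a managed output folder.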
import numpy as np
import io
import os
import logging
import collections
import tempfile
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from bs4 import BeautifulSoup
import re
from langchain.docstore.document import Document

import unstructured
from unstructured.partition.docx import partition_docx
from unstructured.partition.auto import partition

from transformers import AutoTokenizer

MODEL = "thenlper/gte-base"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

embeddings = HuggingFaceEmbeddings(
    model_name=MODEL,
    cache_folder=os.getenv("SENTENCE_TRANSFORMERS_HOME")
)

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side="left"
)

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

## PDF Functions

def group_text_by_font_size(content):
    cur_fs = []
    cur_text = ''
    cur_page = -1
    cur_c = content[0]
    multi_fs = False
    snippets = []   # first collect all snippets that have the same font size
    for c in content:
        # print(f"c={c}\n\n")
        if c.find('a') is not None and c.find('a').get('name'):
            cur_page = int(c.find('a').get('name'))
        sp_list = c.find_all('span')
        if not sp_list:
            continue
        for sp in sp_list:
            # print(f"sp={sp}\n\n")
            if not sp:
                continue
            st = sp.get('style')
            if not st:
                continue
            fs = re.findall(r'font-size:(\d+)px', st)
            # print(f"fs={fs}\n\n")
            if not fs:
                continue
            fs = [int(fs[0])]
            if len(cur_fs) == 0:
                cur_fs = fs
            if fs == cur_fs:
                cur_text += sp.text
            elif not sp.find('br') and cur_c == c:
                cur_text += sp.text
                cur_fs.extend(fs)
                multi_fs = True
            elif sp.find('br') and multi_fs:  # a br tag in a different font size ends a multi-font-size line
                cur_fs.extend(fs)
                snippets.append((cur_text + sp.text, max(cur_fs), cur_page))
                cur_fs = []
                cur_text = ''
                cur_c = c
                multi_fs = False
            else:
                snippets.append((cur_text, max(cur_fs), cur_page))
                cur_fs = fs
                cur_text = sp.text
                cur_c = c
                multi_fs = False
    snippets.append((cur_text, max(cur_fs), cur_page))
    return snippets
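# group_text_by_font_size returns a list of (text, font_size_px, page_number) tuples,
# one per run of consecutive text sharing a font size (multi-font lines keep the
# largest size); page numbers come from the <a name="..."> anchors and font sizes
# from the span style attributes of the PDFMiner HTML output.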

def get_titles_fs(fs_list):
    filtered_fs_list = [item[0] for item in fs_list if item[0] > fs_list[0][0]]
    return sorted(filtered_fs_list, reverse=True)

def calculate_total_characters(snippets):
    font_sizes = {}  # dictionary mapping font size -> total number of characters

    for text, font_size, _ in snippets:
        # remove newlines (and optionally digits)
        cleaned_text = text.replace('\n', '')
        # cleaned_text = re.sub(r'\d+', '', cleaned_text)
        total_characters = len(cleaned_text)

        # update the dictionary
        if font_size in font_sizes:
            font_sizes[font_size] += total_characters
        else:
            font_sizes[font_size] = total_characters
    # convert the dictionary into a list of tuples sorted by character count
    size_charac_list = sorted(font_sizes.items(), key=lambda x: x[1], reverse=True)

    return size_charac_list

def create_documents(source, snippets, font_sizes):
    docs = []

    titles_fs = get_titles_fs(font_sizes)

    for snippet in snippets:
        cur_fs = snippet[1]
        if cur_fs > font_sizes[0][0] and len(snippet[0]) > 2:
            content = min((titles_fs.index(cur_fs) + 1), 3) * "#" + " " + snippet[0].replace(" ", " ")
            category = "Title"
        else:
            content = snippet[0].replace(" ", " ")
            category = "Paragraph"
        metadata = {"source": source, "filename": source.split("/")[-1], "file_directory": "/".join(source.split("/")[:-1]), "file_category": "", "file_sub-cat": "", "file_sub2-cat": "", "category": category, "filetype": source.split(".")[-1], "page_number": snippet[2]}
        categories = source.split("/")
        cat_update = {}
        if len(categories) > 4:
            cat_update = {"file_category": categories[1], "file_sub-cat": categories[2], "file_sub2-cat": categories[3]}
        elif len(categories) > 3:
            cat_update = {"file_category": categories[1], "file_sub-cat": categories[2]}
        elif len(categories) > 2:
            cat_update = {"file_category": categories[1]}
        metadata.update(cat_update)
        docs.append(Document(page_content=content, metadata=metadata))
    return docs
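# create_documents turns each snippet into a langchain Document: snippets whose font
# size is larger than the dominant body size become markdown-style headings ("#",
# "##" or "###" depending on the title-font rank, capped at three), everything else
# is kept as a "Paragraph", and the source path is mapped to file_category /
# file_sub-cat / file_sub2-cat metadata.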

## Group Chunks docx or pdf

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def group_chunks_by_section(chunks, min_chunk_size=512):
    filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']  # Add more filters if needed
    # print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
    new_chunks = []
    seen_paragraph = False
    new_title = True  # switches when there is a new paragraph, to create a new chunk
    for i, chunk in enumerate(filtered_chunks):
        # print(f"\n\n\n#{i}:METADATA: {chunk.metadata['category']}")
        if new_title:
            # print(f"<-- NEW title DETECTED -->")
            new_chunk = chunk
            new_title = False
            add_content = False
            new_chunk.metadata['titles'] = ""
            # print(f"CONTENT: {new_chunk.page_content}\nMETADATA: {new_chunk.metadata['category']} \n title: {new_chunk.metadata['title']}")

        if chunk.metadata['category'].lower() == 'title':
            new_chunk.metadata['titles'] += f"{chunk.page_content} ~~ "
        else:
            # Activates when a paragraph is seen after one or more titles
            seen_paragraph = True

        # Avoid adding the title twice to the page content
        if add_content:  # and chunk.page_content not in new_chunk.page_content
            new_chunk.page_content += f"\n{chunk.page_content}"
            # update the end_page number; the last element keeps its place
            try:
                new_chunk.metadata['end_page'] = chunk.metadata['page_number']
            except KeyError:
                pass  # no page number in metadata

        add_content = True

        # If filtered_chunks[i+1] raises an error, this is probably the last chunk
        try:
            # If the next chunk is a title, a paragraph has already been seen and the current content is long enough, start a new document
            if filtered_chunks[i + 1].metadata['category'].lower() == "title" and seen_paragraph and len(new_chunk.page_content) > min_chunk_size:
                if 'category' in new_chunk.metadata:
                    new_chunk.metadata.pop('category')
                new_chunks.append(new_chunk)
                new_title = True
                seen_paragraph = False
        # index out of range
        except IndexError:
            new_chunks.append(new_chunk)
            # print('🆘 Gone through all chunks 🆘')
            break
    return new_chunks
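# Grouped chunks carry the accumulated section titles in metadata['titles'] (joined
# with " ~~ ") and, when page numbers are available, the page of the last merged
# element in metadata['end_page']; the per-element 'category' key is popped before a
# completed section is appended.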

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
## Split documents by font

def split_pdf(file_path, folder):
    loader = PDFMinerPDFasHTMLLoader(file_path)

    data = loader.load()[0]  # the entire pdf is loaded as a single Document
    soup = BeautifulSoup(data.page_content, 'html.parser')
    content = soup.find_all('div')  # list of all elements in div tags
    try:
        snippets = group_text_by_font_size(content)
    except Exception as e:
        print("ERROR WHILE GROUPING BY FONT SIZE", e)
        snippets = [("ERROR WHILE GROUPING BY FONT SIZE", 0, -1)]
    font_sizes = calculate_total_characters(snippets)  # get the amount of characters for each font size
    chunks = create_documents(file_path, snippets, font_sizes)
    return chunks
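# Example (illustrative sketch, not taken from the original code; the path is
# hypothetical and the 'folder' argument is unused by split_pdf):
#   raw_chunks = split_pdf("docs/example.pdf", None)
#   section_chunks = group_chunks_by_section(raw_chunks)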

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def split_docx(file_path, folder):
    chunks_elms = partition_docx(filename=file_path)
    chunks = []
    file_categories = file_path.split("/")
    for chunk_elm in chunks_elms:
        category = chunk_elm.category
        if category == "Title":
            chunk = Document(page_content=min(chunk_elm.metadata.to_dict()['category_depth'] + 1, 3) * "#" + ' ' + chunk_elm.text, metadata=chunk_elm.metadata.to_dict())
        else:
            chunk = Document(page_content=chunk_elm.text, metadata=chunk_elm.metadata.to_dict())
        metadata = {"source": file_path, "filename": file_path.split("/")[-1], "file_category": "", "file_sub-cat": "", "file_sub2-cat": "", "category": category, "filetype": file_path.split(".")[-1]}
        cat_update = {}
        if len(file_categories) > 4:
            cat_update = {"file_category": file_categories[1], "file_sub-cat": file_categories[2], "file_sub2-cat": file_categories[3]}
        elif len(file_categories) > 3:
            cat_update = {"file_category": file_categories[1], "file_sub-cat": file_categories[2]}
        elif len(file_categories) > 2:
            cat_update = {"file_category": file_categories[1]}
        metadata.update(cat_update)
        chunk.metadata.update(metadata)
        chunks.append(chunk)
    return chunks
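# Example (illustrative sketch, hypothetical path): the DOCX path mirrors the PDF one.
#   raw_chunks = split_docx("docs/example.docx", None)
#   section_chunks = group_chunks_by_section(raw_chunks)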

# Load the index of documents (if it has already been built)

def rebuild_index(input_folder, output_folder):
    paths_time = []  # expected to hold (source_path, last_modified) pairs for the files currently in input_folder
    to_keep = set()
    print(f'number of files {len(paths_time)}')
    if len(output_folder.list_paths_in_partition()) > 0:
        with tempfile.TemporaryDirectory() as temp_dir:
            for f in output_folder.list_paths_in_partition():
                with output_folder.get_download_stream(f) as stream:
                    with open(os.path.join(temp_dir, os.path.basename(f)), "wb") as f2:
                        f2.write(stream.read())
            index = FAISS.load_local(temp_dir, embeddings)
            to_remove = []
            logging.info(f"{len(index.docstore._dict)} vectors loaded")
            for idx, doc in index.docstore._dict.items():
                source = (doc.metadata["source"], doc.metadata["last_modified"])
                if source in paths_time:
                    # Identify documents already indexed and still present in the source folder
                    to_keep.add(source)
                else:
                    # Identify documents removed from the source folder
                    to_remove.append(idx)

            docstore_id_to_index = {v: k for k, v in index.index_to_docstore_id.items()}

            # Remove documents that have been deleted from the source folder
            vectors_to_remove = []
            for idx in to_remove:
                del index.docstore._dict[idx]
                ind = docstore_id_to_index[idx]
                del index.index_to_docstore_id[ind]
                vectors_to_remove.append(ind)
            index.index.remove_ids(np.array(vectors_to_remove, dtype=np.int64))

            index.index_to_docstore_id = {
                i: ind
                for i, ind in enumerate(index.index_to_docstore_id.values())
            }
            logging.info(f"{len(to_remove)} vectors removed")
    else:
        index = None
    to_add = [path[0] for path in paths_time if path not in to_keep]
    print(f'to_keep: {to_keep}')
    print(f'to_add: {to_add}')
    return index, to_add
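# Note: rebuild_index appears to assume a managed-folder API on output_folder
# (list_paths_in_partition, get_download_stream), e.g. a Dataiku folder. As written,
# paths_time stays empty, so every previously indexed vector is flagged for removal
# and to_add comes back empty; it is presumably meant to be filled with
# (source_path, last_modified) pairs for the files currently in input_folder.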

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def split_chunks_by_tokens(documents, max_length=170, overlap=10):
    # Create an empty list to store the resized documents
    resized = []

    # Iterate through the original documents list
    for doc in documents:
        encoded = tokenizer.encode(doc.page_content)
        if len(encoded) > max_length:
            remaining_encoded = tokenizer.encode(doc.page_content)
            while len(remaining_encoded) > 0:
                split_doc = Document(page_content=tokenizer.decode(remaining_encoded[:max(10, max_length)]), metadata=doc.metadata.copy())
                resized.append(split_doc)
                remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
        else:
            resized.append(doc)
    print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
    return resized
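# split_chunks_by_tokens is the simpler fixed-window variant: chunks longer than
# max_length tokens are re-cut into windows of max_length tokens, with `overlap`
# tokens shared between consecutive windows and no search for sentence ends, e.g.:
#   resized = split_chunks_by_tokens(chunks, max_length=170, overlap=10)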

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chunk_size=20):
    # Create an empty list to store the resized documents
    resized = []
    previous_file = ""
    # Iterate through the original documents list
    for doc in documents:
        current_file = doc.metadata['source']
        if current_file != previous_file:  # chunk counting
            previous_file = current_file
            chunk_counter = 0
        is_first_chunk = True  # Keep track of the first chunk in the document
        encoded = tokenizer.encode(doc.page_content)  # encode the current document
        if len(encoded) > max_length:
            remaining_encoded = encoded
            is_last_chunk = False
            while len(remaining_encoded) > 1 and not is_last_chunk:
                # Check for a period in the first 'overlap' tokens
                overlap_text = tokenizer.decode(remaining_encoded[:overlap])  # Index by token
                period_index_b = overlap_text.find('.')  # Index by character
                if len(remaining_encoded) > max_length + min_chunk_size:
                    current_encoded = remaining_encoded[:max(10, max_length)]
                else:
                    current_encoded = remaining_encoded[:max(10, max_length + min_chunk_size)]  # if the last chunk is too small, concatenate it with the previous one
                    is_last_chunk = True
                period_index_e = len(doc.page_content)  # a character count guaranteed to be >= the maximum length of a chunk; len(tokenizer.decode(current_encoded)) would also work
                if len(remaining_encoded) > max_length + min_chunk_size:  # If it is not the last sub-chunk
                    overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
                    period_index_last = overlap_text_last.find('.')
                    if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
                        # print(f"period index last found at {period_index_last}")
                        period_index_e = period_index_last - len(overlap_text_last) + 1
                        # print(f"period_index_e :{period_index_e}")
                        # print(f"last :{overlap_text_last}")
                if not is_first_chunk:  # start after the period found in the overlap
                    if period_index_b == -1:  # Period not found in overlap
                        # print(". not found in overlap")
                        split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # Keep regular splitting
                    else:
                        if is_last_chunk:  # not the first but the last
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b + 1:], metadata=doc.metadata.copy())
                            # print("Should start after \".\"")
                        else:
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b + 1:period_index_e], metadata=doc.metadata.copy())  # Split at the beginning and the end
                else:  # first chunk
                    split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy())  # split only at the end if it is the first chunk
                if 'titles' in split_doc.metadata:
                    chunk_counter += 1
                    split_doc.metadata['chunk_id'] = chunk_counter
                # A1: we could round the chunk length in tokens if we ignored the '.' position in the overlap, and save computation time
                split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
                resized.append(split_doc)
                remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
                is_first_chunk = False
                # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
        elif len(encoded) > min_chunk_size:  # ignore the chunks that are too small
            # print(f"◀Document:{{ {doc.page_content} }} was not added because too short▶")
            if 'titles' in doc.metadata:  # check whether it came from the pdf/docx path (group_chunks_by_section sets 'titles')
                chunk_counter += 1
                doc.metadata['chunk_id'] = chunk_counter
            doc.metadata['token_length'] = len(encoded)
            resized.append(doc)
    print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
    return resized
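# Example (illustrative, mirrors the call made in resplit_by_end_of_sentence below):
#   resized = split_chunks_by_tokens_period(chunks, max_length=200, overlap=40, min_chunk_size=20)
# The overlap region is scanned for a '.' so that, where possible, each sub-chunk
# starts just after the last sentence of the previous window and ends on a sentence
# boundary inside its own trailing overlap.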

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE

def split_doc_in_chunks(input_folder):
    docs = []
    for i, filename in enumerate(input_folder):
        path = filename  # os.path.join(input_folder, filename)
        print(f"Treating file {i}/{len(input_folder)}")
        # Select the appropriate document loader
        chunks = []
        if path.endswith(".pdf"):
            try:
                print("Treatment of pdf file", path)
                raw_chunks = split_pdf(path, input_folder)
                chunks = group_chunks_by_section(raw_chunks)
                print(f"Document split into {len(chunks)} chunks")
                # for chunk in chunks:
                #     print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
            except Exception as e:
                print("Error while splitting the pdf file: ", e)
        elif path.endswith(".docx"):
            try:
                print("Treatment of docx file", path)
                raw_chunks = split_docx(path, input_folder)
                # print(f"RAW :\n***\n{raw_chunks}")
                chunks = group_chunks_by_section(raw_chunks)
                print(f"Document split into {len(chunks)} chunks")
                # if "cards-Jan 2022-SP.docx" in path:
                #     for chunk in chunks:
                #         print(f"\n\n____\n\n\nDOCX CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
            except Exception as e:
                print("Error while splitting the docx file: ", e)
        elif path.endswith(".doc"):
            try:
                loader = UnstructuredFileLoader(path)
                # Load the documents and split them in chunks
                chunks = loader.load_and_split(text_splitter=text_splitter)
                counter, counter2 = collections.Counter(), collections.Counter()
                filename = os.path.basename(path)
                # Define a unique id for each chunk
                for chunk in chunks:
                    chunk.metadata["filename"] = filename.split("/")[-1]
                    chunk.metadata["file_directory"] = filename.split("/")[:-1]
                    chunk.metadata["filetype"] = filename.split(".")[-1]
                    if "page" in chunk.metadata:
                        counter[chunk.metadata['page']] += 1
                        for i in range(len(chunks)):
                            counter2[chunks[i].metadata['page']] += 1
                            chunks[i].metadata['source'] = filename
                    else:
                        if len(chunks) == 1:
                            chunks[0].metadata['source'] = filename
            # The file type is not supported (e.g. .xlsx)
            except Exception as e:
                print(f"An error occurred: {e}")
        try:
            if len(chunks) > 0:
                docs += chunks
        except NameError as e:
            print(f"An error has occurred: {e}")
    return docs
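# Example (illustrative sketch, hypothetical paths): split_doc_in_chunks takes an
# iterable of file paths and dispatches on the extension (.pdf / .docx / .doc).
#   docs = split_doc_in_chunks(["manuals/guide.pdf", "manuals/notes.docx"])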

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def resplit_by_end_of_sentence(docs):
    print("❌❌\nResplitting docs by end of sentence\n❌❌")
    resized_docs = split_chunks_by_tokens_period(docs, max_length=200, overlap=40, min_chunk_size=20)
    try:
        # add the chunk title to all resplit chunks  # todo: move this into split_chunks_by_tokens_period behind a boolean parameter (e.g. inject_title=True)
        cur_source = ""
        cpt_chunk = 1
        for resized_doc in resized_docs:
            try:
                title = resized_doc.metadata['titles'].split(' ~~ ')[-2]  # get the last title of the chunk and add it to the content if it is not already there
                if title not in resized_doc.page_content:
                    resized_doc.page_content = title + "\n" + resized_doc.page_content
                if cur_source == resized_doc.metadata["source"]:
                    resized_doc.metadata['chunk_number'] = cpt_chunk
                else:
                    cpt_chunk = 1
                    cur_source = resized_doc.metadata["source"]
                    resized_doc.metadata['chunk_number'] = cpt_chunk
            except Exception as e:  # either the title was not found or 'titles' is absent from the metadata
                print("An error occurred: ", e)
                # print(f"METADATA:\n{resized_doc.metadata}")
            cpt_chunk += 1
    except Exception as e:
        print('AN ERROR OCCURRED: ', e)
    return resized_docs

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def build_index(docs, index, output_folder):
    if len(docs) > 0:
        if index is not None:
            # Compute the embedding of each chunk and index these chunks
            new_index = FAISS.from_documents(docs, embeddings)
            index.merge_from(new_index)
        else:
            index = FAISS.from_documents(docs, embeddings)
        with tempfile.TemporaryDirectory() as temp_dir:
            index.save_local(temp_dir)
            for f in os.listdir(temp_dir):
                output_folder.upload_file(f, os.path.join(temp_dir, f))
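# End-to-end sketch (illustrative, not part of the original file; the paths and the
# local index directory are hypothetical, and the index is saved locally instead of
# being uploaded to the output folder):
#   if __name__ == "__main__":
#       files = ["manuals/guide.pdf", "manuals/notes.docx"]
#       chunks = resplit_by_end_of_sentence(split_doc_in_chunks(files))
#       index = FAISS.from_documents(chunks, embeddings)
#       index.save_local("faiss_index")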