ppsingh committed on
Commit
7ebdd15
1 Parent(s): c4bb3a1

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +10 -2
auditqa/doc_process.py CHANGED
@@ -27,6 +27,14 @@ def process_pdf():
27
  chunk_overlap=int(chunk_size / 10),
28
  add_start_index=True,
29
  strip_whitespace=True,
30
- separators=["\n\n", "\n", ".", " ", ""],
31
  )
32
-
 
 
 
 
 
 
 
 
 
27
  chunk_overlap=int(chunk_size / 10),
28
  add_start_index=True,
29
  strip_whitespace=True,
30
+ separators=["\n\n", "\n"],
31
  )
32
+ all_documents = {}
33
+ for file,value in docs.items():
34
+ doc_processed = text_splitter.split_documents(value)
35
+ for doc in doc_processed:
36
+ doc.metadata["source"] = file
37
+ all_documents[file] = doc_processed
38
+
39
+ print(all_documents.keys())
40
+ print(all_documents['ABC'])