ppsingh committed on
Commit
3b6480c
1 Parent(s): c83f30f

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +10 -2
auditqa/doc_process.py CHANGED
@@ -37,7 +37,15 @@ def process_pdf():
37
  all_documents = {}
38
  categories = list(files.keys())
39
  for category in categories:
40
- all_documents[category] = {}
 
 
 
 
 
 
 
 
41
 
42
- print(all_documents)
43
 
 
37
  all_documents = {}
38
  categories = list(files.keys())
39
  for category in categories:
40
+ all_documents[category] = []
41
+ subtypes = list(files[category].keys())
42
+ for subtype in subtypes:
43
+ for file in files[category][subtype]:
44
+ doc_processed = text_splitter.split_documents(docs[file])
45
+ for doc in doc_processed:
46
+ doc.metadata["source"] = category
47
+ doc.metadata["subtype"] = subtype
48
+ doc.metadata["year"] = file[-4:]
49
 
50
+ all_documents[category].append(doc_processed)
51