ppsingh committed on
Commit
1593f40
1 Parent(s): 6240195

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +15 -7
auditqa/doc_process.py CHANGED
@@ -31,29 +31,37 @@ def process_pdf():
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,
34
- chunk_overlap=int(chunk_size / 10),
35
  add_start_index=True,
36
  strip_whitespace=True,
37
  separators=["\n\n", "\n"],
38
  )
39
- all_documents = {}
 
 
40
  for file,value in docs.items():
41
  doc_processed = text_splitter.split_documents(value)
42
  for doc in doc_processed:
43
  doc.metadata["source"] = file
44
  doc.metadata["year"] = file[-4:]
45
- all_documents[file] = doc_processed
46
-
47
- print(all_documents.keys())
 
48
 
49
-
 
 
 
 
50
  embeddings = HuggingFaceEmbeddings(
51
  model_kwargs = {'device': 'cpu'},
52
  encode_kwargs = {'normalize_embeddings': True},
53
  model_name="BAAI/bge-small-en-v1.5"
54
  )
55
-
56
  qdrant_collections = {}
 
57
  for file,value in all_documents.items():
58
  print("emebddings for:",file)
59
  qdrant_collections[file] = Qdrant.from_documents(
 
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,
34
+ chunk_overlap=int(chunk_size / 20),
35
  add_start_index=True,
36
  strip_whitespace=True,
37
  separators=["\n\n", "\n"],
38
  )
39
+
40
+ all_documents = {'Consolidated':[], 'MWTS':[]}
41
+
42
  for file,value in docs.items():
43
  doc_processed = text_splitter.split_documents(value)
44
  for doc in doc_processed:
45
  doc.metadata["source"] = file
46
  doc.metadata["year"] = file[-4:]
47
+ for key in all_documents:
48
+ if key in file:
49
+ print(key)
50
+ all_documents[key].append(doc_processed)
51
 
52
+ for key, docs_processed in all_documents.items():
53
+ docs_processed = [item for sublist in docs_processed for item in sublist]
54
+ all_documents[key] = docs_processed
55
+
56
+
57
  embeddings = HuggingFaceEmbeddings(
58
  model_kwargs = {'device': 'cpu'},
59
  encode_kwargs = {'normalize_embeddings': True},
60
  model_name="BAAI/bge-small-en-v1.5"
61
  )
62
+
63
  qdrant_collections = {}
64
+
65
  for file,value in all_documents.items():
66
  print("emebddings for:",file)
67
  qdrant_collections[file] = Qdrant.from_documents(