ppsingh committed on
Commit
6702158
1 Parent(s): 1a3a52c

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +25 -7
auditqa/doc_process.py CHANGED
@@ -8,13 +8,15 @@ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInf
8
  from langchain_community.vectorstores import Qdrant
9
  from auditqa.reports import files, report_list
10
  device = 'cuda' if cuda.is_available() else 'cpu'
11
- #from dotenv import load_dotenv
12
- #load_dotenv()
13
 
14
- #HF_token = os.environ["HF_TOKEN"]
15
  path_to_data = "./data/pdf/"
16
 
17
  def process_pdf():
 
 
 
 
18
  docs = {}
19
  for file in report_list:
20
  try:
@@ -22,6 +24,7 @@ def process_pdf():
22
  except Exception as e:
23
  print("Exception: ", e)
24
 
 
25
  # text splitter based on the tokenizer of a model of your choosing
26
  # to make texts fit exactly a transformer's context window size
27
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
@@ -34,35 +37,49 @@ def process_pdf():
34
  strip_whitespace=True,
35
  separators=["\n\n", "\n"],
36
  )
 
 
 
37
  all_documents = {}
38
  categories = list(files.keys())
 
39
  for category in categories:
40
  print(category)
41
  all_documents[category] = []
42
  subtypes = list(files[category].keys())
 
 
43
  for subtype in subtypes:
44
  print(subtype)
45
  for file in files[category][subtype]:
 
 
46
  doc_processed = text_splitter.split_documents(docs[file])
 
 
47
  for doc in doc_processed:
48
  doc.metadata["source"] = category
49
  doc.metadata["subtype"] = subtype
50
  doc.metadata["year"] = file[-4:]
51
 
52
  all_documents[category].append(doc_processed)
53
-
 
54
  for key, docs_processed in all_documents.items():
55
  docs_processed = [item for sublist in docs_processed for item in sublist]
56
  all_documents[key] = docs_processed
57
-
 
 
58
  embeddings = HuggingFaceEmbeddings(
59
  model_kwargs = {'device': device},
60
  encode_kwargs = {'normalize_embeddings': True},
61
  model_name="BAAI/bge-small-en-v1.5"
62
  )
63
-
64
  qdrant_collections = {}
65
-
 
66
  for file,value in all_documents.items():
67
  print("emebddings for:",file)
68
  qdrant_collections[file] = Qdrant.from_documents(
@@ -71,6 +88,7 @@ def process_pdf():
71
  location=":memory:",
72
  collection_name=file,
73
  )
 
74
  print("done")
75
  return qdrant_collections
76
 
 
8
  from langchain_community.vectorstores import Qdrant
9
  from auditqa.reports import files, report_list
10
  device = 'cuda' if cuda.is_available() else 'cpu'
 
 
11
 
12
+ # path to the pdf files
13
  path_to_data = "./data/pdf/"
14
 
15
  def process_pdf():
16
+ """
17
+ This method reads through the files and report_list to create the vector database.
18
+ """
19
+ # load all the files using PyMuPDFLoader
20
  docs = {}
21
  for file in report_list:
22
  try:
 
24
  except Exception as e:
25
  print("Exception: ", e)
26
 
27
+
28
  # text splitter based on the tokenizer of a model of your choosing
29
  # to make texts fit exactly a transformer's context window size
30
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
 
37
  strip_whitespace=True,
38
  separators=["\n\n", "\n"],
39
  )
40
+ # we iterate through the files, which carry information about their
41
+ # 'source'=='category' and 'subtype'; these are used in the UI for document selection
42
+ # which will be used later for filtering database
43
  all_documents = {}
44
  categories = list(files.keys())
45
+ # iterate through 'source'
46
  for category in categories:
47
  print(category)
48
  all_documents[category] = []
49
  subtypes = list(files[category].keys())
50
+ # iterate through 'subtype' within the source
51
+ # example: source/category == 'District' has subtypes, which are district names
52
  for subtype in subtypes:
53
  print(subtype)
54
  for file in files[category][subtype]:
55
+
56
+ # create the chunks
57
  doc_processed = text_splitter.split_documents(docs[file])
58
+
59
+ # add metadata information
60
  for doc in doc_processed:
61
  doc.metadata["source"] = category
62
  doc.metadata["subtype"] = subtype
63
  doc.metadata["year"] = file[-4:]
64
 
65
  all_documents[category].append(doc_processed)
66
+
67
+ # convert list of list to flat list
68
  for key, docs_processed in all_documents.items():
69
  docs_processed = [item for sublist in docs_processed for item in sublist]
70
  all_documents[key] = docs_processed
71
+ all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
72
+ all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
73
+ # define embedding model
74
  embeddings = HuggingFaceEmbeddings(
75
  model_kwargs = {'device': device},
76
  encode_kwargs = {'normalize_embeddings': True},
77
  model_name="BAAI/bge-small-en-v1.5"
78
  )
79
+ # placeholder for collection
80
  qdrant_collections = {}
81
+
82
+
83
  for file,value in all_documents.items():
84
  print("emebddings for:",file)
85
  qdrant_collections[file] = Qdrant.from_documents(
 
88
  location=":memory:",
89
  collection_name=file,
90
  )
91
+
92
  print("done")
93
  return qdrant_collections
94