ppsingh committed on
Commit
22b7264
1 Parent(s): 7ebdd15

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +18 -1
auditqa/doc_process.py CHANGED
@@ -3,6 +3,8 @@ import os
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
  from transformers import AutoTokenizer
5
  from langchain_community.document_loaders import PyMuPDFLoader
 
 
6
  path_to_data = "./data/"
7
 
8
 
@@ -37,4 +39,19 @@ def process_pdf():
37
  all_documents[file] = doc_processed
38
 
39
  print(all_documents.keys())
40
- print(all_documents['ABC'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
  from transformers import AutoTokenizer
5
  from langchain_community.document_loaders import PyMuPDFLoader
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import Qdrant
8
  path_to_data = "./data/"
9
 
10
 
 
39
  all_documents[file] = doc_processed
40
 
41
  print(all_documents.keys())
42
+ print(all_documents['ABC'])
43
+
44
+
45
+
46
# NOTE(review): this diff view strips indentation; the hunk header suggests
# these statements form the tail of process_pdf() -- re-indent to match the
# enclosing scope when applying.

# Embedding model handed to every Qdrant collection built below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# One Qdrant collection per entry of all_documents, keyed by the same name
# and also used as the collection name; location=":memory:" is passed so
# nothing is written to a server or disk path here.
qdrant_collections = {}
# Iterate .items(): looping over the dict itself yields only its keys, so
# `for file, value in all_documents` would try to unpack each key string
# into two names and fail (or mis-assign for two-character keys).
for file, value in all_documents.items():
    qdrant_collections[file] = Qdrant.from_documents(
        value,
        embeddings,
        location=":memory:",
        collection_name=file,
    )