Spaces:
Sleeping
Sleeping
Merge pull request #6 from almutareb/create_vector_store
Browse files
example.env
CHANGED
@@ -5,4 +5,7 @@ HUGGINGFACEHUB_API_TOKEN=
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
-
SERPAPI_API_KEY=
|
|
|
|
|
|
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
+
SERPAPI_API_KEY=
|
9 |
+
|
10 |
+
# for chromadb
|
11 |
+
VECTOR_DATABASE_LOCATION=
|
innovation_pathfinder_ai/vector_store/chroma_vector_store.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# got some of the code from https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df
|
2 |
+
|
3 |
+
import PyPDF2
|
4 |
+
import io
|
5 |
+
import os
|
6 |
+
from langchain_community.vectorstores import Chroma
|
7 |
+
from langchain.document_loaders import PyPDFLoader
|
8 |
+
from langchain.text_splitter import CharacterTextSplitter
|
9 |
+
from langchain.vectorstores import Chroma
|
10 |
+
import chromadb
|
11 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
12 |
+
import dotenv
|
13 |
+
|
14 |
+
dotenv.load_dotenv()
|
15 |
+
|
16 |
+
|
17 |
+
VECTOR_DATABASE_LOCATION = os.getenv("VECTOR_DATABASE_LOCATION")
|
18 |
+
|
19 |
+
|
20 |
+
def extract_text_from_pdf(file) -> list[str]:
    """Extract the text of each page of a PDF as a list of strings.

    Args:
        file: a path or binary file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        list[str]: one entry per page (page text plus a trailing newline).
        On any read/parse error the pages extracted so far are returned,
        so callers always get a list (possibly empty).
    """
    documents = []
    try:
        reader = PyPDF2.PdfReader(file)
        # Iterate pages directly instead of indexing via range(len(...)).
        for page in reader.pages:
            documents.append(page.extract_text() + "\n")
    except Exception as e:
        # Best-effort: log and fall through to return partial results.
        # NOTE(review): a broad Exception catch hides real parse errors;
        # consider narrowing to PyPDF2.errors.PyPdfError.
        print(e)
    # Bug fix: the original returned from inside ``finally``, which would
    # silently swallow any exception escaping the handler; a plain return
    # after the try/except has identical success behavior without that trap.
    return documents
|
33 |
+
|
34 |
+
|
35 |
+
def add_pdf_to_vector_store(
    vector_store: Chroma,
    pdf_file_location: str,
    text_chunk_size: int = 1000,
    text_chunk_overlap: int = 10,
) -> None:
    """
    ## Summary
    Given the location of a PDF file, chunk its contents and add the
    chunks to the given vector store.

    ## Arguments
        vector_store (Chroma): the vector store the chunks are added to
        pdf_file_location (str): location of the PDF file
        text_chunk_size (int): maximum characters per chunk
        text_chunk_overlap (int): characters of overlap between chunks

    ## Return
        None
    """
    loader = PyPDFLoader(pdf_file_location)
    documents = loader.load()

    text_splitter = CharacterTextSplitter(
        chunk_size=text_chunk_size,
        chunk_overlap=text_chunk_overlap,
    )
    chunked_documents = text_splitter.split_documents(documents)

    # Bug fix: the original built a brand-new Chroma via Chroma.from_documents
    # and ignored the ``vector_store`` argument entirely, contradicting its own
    # docstring. Add the chunks to the store the caller passed in.
    vector_store.add_documents(chunked_documents)
|
74 |
+
|
75 |
+
|
76 |
+
def load_chunk_persist_pdf(
    pdf_folder_path: str = "mydir",
    vector_db_location: str = VECTOR_DATABASE_LOCATION,
) -> Chroma:
    """
    ## Summary
    Load every ``.pdf`` in a folder, chunk the text, embed it, and persist
    it as a Chroma vector store on disk.

    ## Arguments
        pdf_folder_path (str): folder scanned (non-recursively) for PDF files
        vector_db_location (str): directory the Chroma store is persisted to

    ## Return
        Chroma: the persisted vector store containing the chunked documents
    """
    documents = []
    for file_name in os.listdir(pdf_folder_path):
        if file_name.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file_name)
            documents.extend(PyPDFLoader(pdf_path).load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)

    client = chromadb.Client()
    # Bug fix: the original condition was inverted — it tried to CREATE the
    # collection when collections already existed, and printed "already
    # exists" when there were none. Create only when it is actually missing.
    existing_names = {collection.name for collection in client.list_collections()}
    if "consent_collection" not in existing_names:
        client.create_collection("consent_collection")
    else:
        print("Collection already exists")

    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=HuggingFaceEmbeddings(),
        # Bug fix: honour the ``vector_db_location`` parameter instead of
        # always using the module-level VECTOR_DATABASE_LOCATION.
        persist_directory=vector_db_location,
    )
    vectordb.persist()
    return vectordb
|
101 |
+
|
102 |
+
|
103 |
+
def load_vector_store(
    vector_store_location: str = os.getenv("VECTOR_DATABASE_LOCATION"),
    embeddings=None,
) -> Chroma:
    """
    ## Summary
    Get the Chroma vector store persisted at the given location.

    ## Arguments
        vector_store_location (str): the location of the vector store
        embeddings: the embedding function used by the store; defaults to
            a fresh ``HuggingFaceEmbeddings()`` when None

    ## Return
        Chroma: the Chroma vector store
    """
    if embeddings is None:
        # Bug fix: the original used ``HuggingFaceEmbeddings()`` as the
        # default argument value, which loads the embedding model once at
        # import time and shares that single instance across every call.
        # Defer construction until the function is actually invoked.
        embeddings = HuggingFaceEmbeddings()

    db = Chroma(
        persist_directory=vector_store_location,
        embedding_function=embeddings,
    )
    return db
|
125 |
+
|
126 |
+
|
127 |
+
if __name__ == "__main__":
    # Smoke test: open the persisted store, then ingest a sample PDF into it.
    store = load_vector_store()

    # pdf_file_location = "mydir/181000551.pdf"
    sample_pdf = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"

    add_pdf_to_vector_store(
        vector_store=store,
        pdf_file_location=sample_pdf,
    )
|