isayahc committed on
Commit
26a0cad
1 Parent(s): 509813e

code for vector store

Browse files
innovation_pathfinder_ai/vector_store/chroma_vector_store.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # got some of the code from https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df
2
+
3
+ import PyPDF2
4
+ import io
5
+ import os
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain.document_loaders import PyPDFLoader
8
+ from langchain.text_splitter import CharacterTextSplitter
9
+ from langchain.vectorstores import Chroma
10
+ import chromadb
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ import dotenv
13
+
14
+ dotenv.load_dotenv()
15
+
16
+
17
+ VECTOR_DATABASE_LOCATION = os.getenv("VECTOR_DATABASE_LOCATION")
18
+
19
+
20
def extract_text_from_pdf(file) -> list[str]:
    """
    ## Summary
    read a pdf with PyPDF2 and collect the text of each page

    ## Arguments
    file : the pdf to read; handed unchanged to `PyPDF2.PdfReader`
           (presumably a path or a binary file object - confirm with callers)

    ## Return
    a list with one newline-terminated string per page. Reading is best
    effort: if an error occurs it is printed and the pages collected so far
    (possibly none) are returned.
    """
    documents = []
    try:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Guard against a missing extraction result so one text-less page
            # does not abort the remaining pages.
            text = (page.extract_text() or "") + "\n"
            documents.append(text)
    except Exception as e:
        # Deliberate best effort: report the problem and keep what was read.
        print(e)
    # Returning here (not from a `finally` clause) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently discarded.
    return documents
33
+
34
+
35
def add_pdf_to_vector_store(
    vector_store: Chroma,
    pdf_file_location: str,
    text_chunk_size=1000,
    text_chunk_overlap=10,
) -> None:
    """
    ## Summary
    given the location of a pdf file this will chunk its contents
    and store the chunks in the given vector store

    ## Arguments
    vector_store (Chroma) : the vector store the chunks are added to
    pdf_file_location (str) : location of pdf file
    text_chunk_size (int) : `chunk_size` passed to CharacterTextSplitter
    text_chunk_overlap (int) : `chunk_overlap` passed to CharacterTextSplitter

    ## Return
    None
    """

    loader = PyPDFLoader(pdf_file_location)

    text_splitter = CharacterTextSplitter(
        chunk_size=text_chunk_size,
        chunk_overlap=text_chunk_overlap,
    )

    chunked_documents = text_splitter.split_documents(loader.load())

    # Nothing was extracted from the pdf, so there is nothing to embed.
    if not chunked_documents:
        return

    # Write into the store the caller supplied; its own embedding function
    # and persist directory are used.
    vector_store.add_documents(chunked_documents)
74
+
75
+
76
def load_chunk_persist_pdf(
    pdf_folder_path: str = "mydir",
    vector_db_location: str = VECTOR_DATABASE_LOCATION,
) -> Chroma:
    """
    ## Summary
    load every pdf found directly inside a folder, chunk the contents and
    persist them in a Chroma vector store

    ## Arguments
    pdf_folder_path (str) : folder that is scanned (non-recursively) for pdf files
    vector_db_location (str) : directory the vector store is persisted to

    ## Return
    the Chroma vector store built from the chunked documents
    """

    documents = []
    # sorted() makes the ingestion order independent of the file system.
    for file_name in sorted(os.listdir(pdf_folder_path)):
        # Compare case-insensitively so "REPORT.PDF" is picked up as well.
        if file_name.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder_path, file_name)
            documents.extend(PyPDFLoader(pdf_path).load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)

    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=HuggingFaceEmbeddings(),
        # Honour the caller's location rather than the module-level constant.
        persist_directory=vector_db_location,
    )
    vectordb.persist()
    return vectordb
101
+
102
+
103
def load_vector_store(
    vector_store_location=os.getenv("VECTOR_DATABASE_LOCATION"),
    embeddings=None,
) -> Chroma:
    """
    ## Summary
    get the vector_store

    ## Arguments
    vector_store_location (str) : the location of the vector store; the
        default is the VECTOR_DATABASE_LOCATION environment variable as it
        was when this function was defined
    embeddings : the embedding object handed to Chroma as its
        `embedding_function`; when omitted (None) a new
        `HuggingFaceEmbeddings()` is created for this call

    ## Return
    returns the chroma db vector store
    """

    # Create the default lazily: a `HuggingFaceEmbeddings()` default argument
    # would be instantiated once at import time and shared by every call.
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings()

    return Chroma(
        persist_directory=vector_store_location,
        embedding_function=embeddings,
    )
125
+
126
+
127
if __name__ == "__main__":
    # Manual run: open the persisted vector store with its defaults and
    # ingest a single pdf into it.
    sample_pdf = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"

    add_pdf_to_vector_store(
        vector_store=load_vector_store(),
        pdf_file_location=sample_pdf,
    )