isayahc committed on
Commit
cd0ca96
1 Parent(s): 3f9dc1c

allowing the embedding model to be changed via .env

Browse files
example.env CHANGED
@@ -11,4 +11,6 @@ SERPAPI_API_KEY=
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
- CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 
 
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
+ CONVERSATION_COLLECTION_NAME="ConversationMemory"
15
+
16
+ EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
- model_name="all-MiniLM-L6-v2",
56
  )
57
 
58
  vector_db = Chroma(
@@ -78,7 +78,7 @@ def knowledgeBase_search(query:str) -> str:
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
- model_name="all-MiniLM-L6-v2",
82
  )
83
 
84
  vector_db = Chroma(
@@ -152,7 +152,7 @@ def embed_arvix_paper(paper_id:str) -> None:
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
- model_name="all-MiniLM-L6-v2",
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
 
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
+ model_name=os.getenv("EMBEDDING_MODEL"),
56
  )
57
 
58
  vector_db = Chroma(
 
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
+ model_name=os.getenv("EMBEDDING_MODEL"),
82
  )
83
 
84
  vector_db = Chroma(
 
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
+ model_name=os.getenv("EMBEDDING_MODEL"),
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED
@@ -99,9 +99,9 @@ def add_markdown_to_collection(
99
  name=collection_name,
100
  )
101
 
102
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
103
- api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
104
- )
105
 
106
  documents_page_content:list = [i.page_content for i in splits]
107
 
@@ -111,7 +111,7 @@ def add_markdown_to_collection(
111
  collection.add(
112
  ids=[generate_uuid()], # give each document a uuid
113
  documents=documents_page_content[i], # contents of document
114
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
115
  metadatas=data.metadata, # type: ignore
116
  )
117
 
@@ -181,13 +181,9 @@ def add_pdf_to_vector_store(
181
  name=collection_name,
182
  )
183
 
184
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
185
- api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN"),
186
- model_name= "sentence-transformers/all-MiniLM-L6-v2" # added model name for clarity
187
- )
188
-
189
- # create the open-source embedding function
190
- # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
191
 
192
  documents_page_content:list = [i.page_content for i in split_docs]
193
 
@@ -198,7 +194,7 @@ def add_pdf_to_vector_store(
198
  collection.add(
199
  ids=[generate_uuid()], # give each document a uuid
200
  documents=documents_page_content[i], # contents of document
201
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
202
  metadatas=data.metadata, # type: ignore
203
  )
204
 
@@ -244,7 +240,7 @@ if __name__ == "__main__":
244
 
245
  # create the open-source embedding function
246
  embedding_function = SentenceTransformerEmbeddings(
247
- model_name="all-MiniLM-L6-v2",
248
  )
249
 
250
  #method of integrating Chroma and Langchain
 
99
  name=collection_name,
100
  )
101
 
102
+ embedding_function = SentenceTransformerEmbeddings(
103
+ model_name=os.getenv("EMBEDDING_MODEL"),
104
+ )
105
 
106
  documents_page_content:list = [i.page_content for i in splits]
107
 
 
111
  collection.add(
112
  ids=[generate_uuid()], # give each document a uuid
113
  documents=documents_page_content[i], # contents of document
114
+ embeddings=embedding_function(documents_page_content[i]),
115
  metadatas=data.metadata, # type: ignore
116
  )
117
 
 
181
  name=collection_name,
182
  )
183
 
184
+ embedding_function = SentenceTransformerEmbeddings(
185
+ model_name=os.getenv("EMBEDDING_MODEL"),
186
+ )
 
 
 
 
187
 
188
  documents_page_content:list = [i.page_content for i in split_docs]
189
 
 
194
  collection.add(
195
  ids=[generate_uuid()], # give each document a uuid
196
  documents=documents_page_content[i], # contents of document
197
+ embeddings=embedding_function(documents_page_content[i]),
198
  metadatas=data.metadata, # type: ignore
199
  )
200
 
 
240
 
241
  # create the open-source embedding function
242
  embedding_function = SentenceTransformerEmbeddings(
243
+ model_name=os.getenv("EMBEDDING_MODEL"),
244
  )
245
 
246
  #method of integrating Chroma and Langchain