sabazo committed on
Commit
67264eb
2 Parent(s): 3f9dc1c 446904e

Merge pull request #43 from almutareb/one_embedding_model

Browse files
example.env CHANGED
@@ -11,4 +11,6 @@ SERPAPI_API_KEY=
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
- CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 
 
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
+ CONVERSATION_COLLECTION_NAME="ConversationMemory"
15
+
16
+ EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
- model_name="all-MiniLM-L6-v2",
56
  )
57
 
58
  vector_db = Chroma(
@@ -78,7 +78,7 @@ def knowledgeBase_search(query:str) -> str:
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
- model_name="all-MiniLM-L6-v2",
82
  )
83
 
84
  vector_db = Chroma(
@@ -152,7 +152,7 @@ def embed_arvix_paper(paper_id:str) -> None:
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
- model_name="all-MiniLM-L6-v2",
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
 
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
+ model_name=os.getenv("EMBEDDING_MODEL"),
56
  )
57
 
58
  vector_db = Chroma(
 
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
+ model_name=os.getenv("EMBEDDING_MODEL"),
82
  )
83
 
84
  vector_db = Chroma(
 
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
+ model_name=os.getenv("EMBEDDING_MODEL"),
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED
@@ -8,7 +8,6 @@
8
  # https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
9
 
10
  import chromadb
11
- import chromadb.utils.embedding_functions as embedding_functions
12
 
13
  from langchain.text_splitter import CharacterTextSplitter
14
  from langchain_text_splitters import MarkdownHeaderTextSplitter
@@ -99,9 +98,9 @@ def add_markdown_to_collection(
99
  name=collection_name,
100
  )
101
 
102
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
103
- api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
104
- )
105
 
106
  documents_page_content:list = [i.page_content for i in splits]
107
 
@@ -111,7 +110,7 @@ def add_markdown_to_collection(
111
  collection.add(
112
  ids=[generate_uuid()], # give each document a uuid
113
  documents=documents_page_content[i], # contents of document
114
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
115
  metadatas=data.metadata, # type: ignore
116
  )
117
 
@@ -181,13 +180,9 @@ def add_pdf_to_vector_store(
181
  name=collection_name,
182
  )
183
 
184
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
185
- api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN"),
186
- model_name= "sentence-transformers/all-MiniLM-L6-v2" # added model name for clariity
187
- )
188
-
189
- # create the open-source embedding function
190
- # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
191
 
192
  documents_page_content:list = [i.page_content for i in split_docs]
193
 
@@ -198,7 +193,7 @@ def add_pdf_to_vector_store(
198
  collection.add(
199
  ids=[generate_uuid()], # give each document a uuid
200
  documents=documents_page_content[i], # contents of document
201
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
202
  metadatas=data.metadata, # type: ignore
203
  )
204
 
@@ -244,7 +239,7 @@ if __name__ == "__main__":
244
 
245
  # create the open-source embedding function
246
  embedding_function = SentenceTransformerEmbeddings(
247
- model_name="all-MiniLM-L6-v2",
248
  )
249
 
250
  #method of integrating Chroma and Langchain
 
8
  # https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
9
 
10
  import chromadb
 
11
 
12
  from langchain.text_splitter import CharacterTextSplitter
13
  from langchain_text_splitters import MarkdownHeaderTextSplitter
 
98
  name=collection_name,
99
  )
100
 
101
+ embedding_function = SentenceTransformerEmbeddings(
102
+ model_name=os.getenv("EMBEDDING_MODEL"),
103
+ )
104
 
105
  documents_page_content:list = [i.page_content for i in splits]
106
 
 
110
  collection.add(
111
  ids=[generate_uuid()], # give each document a uuid
112
  documents=documents_page_content[i], # contents of document
113
+ embeddings=embedding_function(documents_page_content[i]),
114
  metadatas=data.metadata, # type: ignore
115
  )
116
 
 
180
  name=collection_name,
181
  )
182
 
183
+ embedding_function = SentenceTransformerEmbeddings(
184
+ model_name=os.getenv("EMBEDDING_MODEL"),
185
+ )
 
 
 
 
186
 
187
  documents_page_content:list = [i.page_content for i in split_docs]
188
 
 
193
  collection.add(
194
  ids=[generate_uuid()], # give each document a uuid
195
  documents=documents_page_content[i], # contents of document
196
+ embeddings=embedding_function(documents_page_content[i]),
197
  metadatas=data.metadata, # type: ignore
198
  )
199
 
 
239
 
240
  # create the open-source embedding function
241
  embedding_function = SentenceTransformerEmbeddings(
242
+ model_name=os.getenv("EMBEDDING_MODEL"),
243
  )
244
 
245
  #method of integrating Chroma and Langchain