sabazo committed on
Commit
67264eb
2 Parent(s): 3f9dc1c 446904e

Merge pull request #43 from almutareb/one_embedding_model

Browse files
example.env CHANGED
@@ -11,4 +11,6 @@ SERPAPI_API_KEY=
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
- CONVERSATION_COLLECTION_NAME="ConversationMemory"
 
 
 
11
  VECTOR_DATABASE_LOCATION=
12
 
13
  # Name for the Conversation Memory Collection
14
+ CONVERSATION_COLLECTION_NAME="ConversationMemory"
15
+
16
+ EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
- model_name="all-MiniLM-L6-v2",
56
  )
57
 
58
  vector_db = Chroma(
@@ -78,7 +78,7 @@ def knowledgeBase_search(query:str) -> str:
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
- model_name="all-MiniLM-L6-v2",
82
  )
83
 
84
  vector_db = Chroma(
@@ -152,7 +152,7 @@ def embed_arvix_paper(paper_id:str) -> None:
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
- model_name="all-MiniLM-L6-v2",
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
 
52
  #store using envar
53
 
54
  embedding_function = SentenceTransformerEmbeddings(
55
+ model_name=os.getenv("EMBEDDING_MODEL"),
56
  )
57
 
58
  vector_db = Chroma(
 
78
  #store using envar
79
 
80
  embedding_function = SentenceTransformerEmbeddings(
81
+ model_name=os.getenv("EMBEDDING_MODEL"),
82
  )
83
 
84
  vector_db = Chroma(
 
152
  #store using envar
153
 
154
  embedding_function = SentenceTransformerEmbeddings(
155
+ model_name=os.getenv("EMBEDDING_MODEL"),
156
  )
157
 
158
  full_path = os.path.join(pdf_directory, pdf_file_name)
innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED
@@ -8,7 +8,6 @@
8
  # https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
9
 
10
  import chromadb
11
- import chromadb.utils.embedding_functions as embedding_functions
12
 
13
  from langchain.text_splitter import CharacterTextSplitter
14
  from langchain_text_splitters import MarkdownHeaderTextSplitter
@@ -99,9 +98,9 @@ def add_markdown_to_collection(
99
  name=collection_name,
100
  )
101
 
102
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
103
- api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
104
- )
105
 
106
  documents_page_content:list = [i.page_content for i in splits]
107
 
@@ -111,7 +110,7 @@ def add_markdown_to_collection(
111
  collection.add(
112
  ids=[generate_uuid()], # give each document a uuid
113
  documents=documents_page_content[i], # contents of document
114
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
115
  metadatas=data.metadata, # type: ignore
116
  )
117
 
@@ -181,13 +180,9 @@ def add_pdf_to_vector_store(
181
  name=collection_name,
182
  )
183
 
184
- embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
185
- api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN"),
186
- model_name= "sentence-transformers/all-MiniLM-L6-v2" # added model name for clariity
187
- )
188
-
189
- # create the open-source embedding function
190
- # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
191
 
192
  documents_page_content:list = [i.page_content for i in split_docs]
193
 
@@ -198,7 +193,7 @@ def add_pdf_to_vector_store(
198
  collection.add(
199
  ids=[generate_uuid()], # give each document a uuid
200
  documents=documents_page_content[i], # contents of document
201
- embeddings=embed_data.embed_with_retries(documents_page_content[i]),
202
  metadatas=data.metadata, # type: ignore
203
  )
204
 
@@ -244,7 +239,7 @@ if __name__ == "__main__":
244
 
245
  # create the open-source embedding function
246
  embedding_function = SentenceTransformerEmbeddings(
247
- model_name="all-MiniLM-L6-v2",
248
  )
249
 
250
  #method of integrating Chroma and Langchain
 
8
  # https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
9
 
10
  import chromadb
 
11
 
12
  from langchain.text_splitter import CharacterTextSplitter
13
  from langchain_text_splitters import MarkdownHeaderTextSplitter
 
98
  name=collection_name,
99
  )
100
 
101
+ embedding_function = SentenceTransformerEmbeddings(
102
+ model_name=os.getenv("EMBEDDING_MODEL"),
103
+ )
104
 
105
  documents_page_content:list = [i.page_content for i in splits]
106
 
 
110
  collection.add(
111
  ids=[generate_uuid()], # give each document a uuid
112
  documents=documents_page_content[i], # contents of document
113
+ embeddings=embedding_function(documents_page_content[i]),
114
  metadatas=data.metadata, # type: ignore
115
  )
116
 
 
180
  name=collection_name,
181
  )
182
 
183
+ embedding_function = SentenceTransformerEmbeddings(
184
+ model_name=os.getenv("EMBEDDING_MODEL"),
185
+ )
 
 
 
 
186
 
187
  documents_page_content:list = [i.page_content for i in split_docs]
188
 
 
193
  collection.add(
194
  ids=[generate_uuid()], # give each document a uuid
195
  documents=documents_page_content[i], # contents of document
196
+ embeddings=embedding_function(documents_page_content[i]),
197
  metadatas=data.metadata, # type: ignore
198
  )
199
 
 
239
 
240
  # create the open-source embedding function
241
  embedding_function = SentenceTransformerEmbeddings(
242
+ model_name=os.getenv("EMBEDDING_MODEL"),
243
  )
244
 
245
  #method of integrating Chroma and Langchain