Spaces:
Sleeping
Sleeping
Asaad Almutareb
committed on
Commit
·
9237552
1
Parent(s):
fa99d8f
corrected streaming callback handler
Browse files
replaced sentence-transformers with Embed4All from GPT4All
Updated requirements.txt, example.env and README to reflect this repo's settings
- README.md +11 -31
- app/api/v1/agents/hf_mixtral_agent.py +1 -1
- app/structured_tools/structured_tools.py +14 -16
- app/vector_store/chroma_vector_store.py +16 -12
- example.env +16 -3
- requirements.txt +3 -5
README.md
CHANGED
@@ -1,37 +1,17 @@
|
|
1 |
-
|
2 |
-
title: Innovation Pathfinder AI
|
3 |
-
emoji: 🚀
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: gray
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.2.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
# InnovationPathfinderAI
|
13 |
GenAI Research Assistant for Innovation Labs
|
14 |
|
15 |
-
##
|
16 |
-
|
17 |
-
however it is difficult to manage all of this information in a central location. With our tool we
|
18 |
-
want to enable people with the capability to discover and manage knowledge bases.
|
19 |
-
|
20 |
-
## Vector Store
|
21 |
-
Documents are embedded and stored inside a Chroma vector store
|
22 |
-
|
23 |
-
## Agents
|
24 |
-
|
25 |
-
with agents our application is able to discover and refine the information it collects based on
|
26 |
-
the needs and sentiment of the user.
|
27 |
-
|
28 |
-
## Agent Tools
|
29 |
-
The tools our agents have access to. More are being created
|
30 |
|
31 |
-
|
|
|
32 |
|
33 |
-
|
34 |
|
35 |
-
|
|
|
36 |
|
37 |
-
|
|
|
|
|
|
1 |
+
# FastAPI Backend for InnovationPathfinderAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
GenAI Research Assistant for Innovation Labs
|
3 |
|
4 |
+
## Getting Started
|
5 |
+
To get started:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
1. install requirements:
|
8 |
+
pip install -r requirements.txt
|
9 |
|
10 |
+
2. copy example.env to .env and add your API keys and variables
|
11 |
|
12 |
+
3. run uvicorn:
|
13 |
+
uvicorn app.main:app
|
14 |
|
15 |
+
## ToDo
|
16 |
+
we are testing replacing the sentence-transformers with GPT4All's Embed4All
|
17 |
+
Code still needs to be cleaned
|
app/api/v1/agents/hf_mixtral_agent.py
CHANGED
@@ -74,7 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
74 |
|
75 |
await websocket.send_json(resp.model_dump())
|
76 |
message_id: str = utils.generate_uuid()
|
77 |
-
custom_handler =
|
78 |
websocket, message_id=message_id
|
79 |
)
|
80 |
|
|
|
74 |
|
75 |
await websocket.send_json(resp.model_dump())
|
76 |
message_id: str = utils.generate_uuid()
|
77 |
+
custom_handler = CustomAsyncCallbackHandler(
|
78 |
websocket, message_id=message_id
|
79 |
)
|
80 |
|
app/structured_tools/structured_tools.py
CHANGED
@@ -8,6 +8,7 @@ from langchain_community.utilities import GoogleSearchAPIWrapper
|
|
8 |
from langchain_community.embeddings.sentence_transformer import (
|
9 |
SentenceTransformerEmbeddings,
|
10 |
)
|
|
|
11 |
from app.core.config import settings
|
12 |
from langchain_community.vectorstores import Chroma
|
13 |
import arxiv
|
@@ -51,10 +52,11 @@ def memory_search(query:str) -> str:
|
|
51 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
52 |
#store using envar
|
53 |
|
54 |
-
embedding_function = SentenceTransformerEmbeddings(
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
|
59 |
vector_db = Chroma(
|
60 |
client=client, # client for Chroma
|
@@ -78,15 +80,16 @@ def knowledgeBase_search(query:str) -> str:
|
|
78 |
collection_name="ArxivPapers"
|
79 |
#store using envar
|
80 |
|
81 |
-
embedding_function = SentenceTransformerEmbeddings(
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
85 |
|
86 |
vector_db = Chroma(
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
)
|
91 |
|
92 |
retriever = vector_db.as_retriever()
|
@@ -153,11 +156,6 @@ def embed_arvix_paper(paper_id:str) -> None:
|
|
153 |
collection_name="ArxivPapers"
|
154 |
#store using envar
|
155 |
|
156 |
-
embedding_function = SentenceTransformerEmbeddings(
|
157 |
-
#model_name=os.getenv("EMBEDDING_MODEL"),
|
158 |
-
model_name=settings.EMBEDDING_MODEL
|
159 |
-
)
|
160 |
-
|
161 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
162 |
|
163 |
add_pdf_to_vector_store(
|
|
|
8 |
from langchain_community.embeddings.sentence_transformer import (
|
9 |
SentenceTransformerEmbeddings,
|
10 |
)
|
11 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
12 |
from app.core.config import settings
|
13 |
from langchain_community.vectorstores import Chroma
|
14 |
import arxiv
|
|
|
52 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
53 |
#store using envar
|
54 |
|
55 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
56 |
+
# model_name=settings.EMBEDDING_MODEL
|
57 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
58 |
+
# )
|
59 |
+
embedding_function = GPT4AllEmbeddings()
|
60 |
|
61 |
vector_db = Chroma(
|
62 |
client=client, # client for Chroma
|
|
|
80 |
collection_name="ArxivPapers"
|
81 |
#store using envar
|
82 |
|
83 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
84 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
85 |
+
# model_name=settings.EMBEDDING_MODEL
|
86 |
+
# )
|
87 |
+
embedding_function = GPT4AllEmbeddings()
|
88 |
|
89 |
vector_db = Chroma(
|
90 |
+
client=client, # client for Chroma
|
91 |
+
collection_name=collection_name,
|
92 |
+
embedding_function=embedding_function,
|
93 |
)
|
94 |
|
95 |
retriever = vector_db.as_retriever()
|
|
|
156 |
collection_name="ArxivPapers"
|
157 |
#store using envar
|
158 |
|
|
|
|
|
|
|
|
|
|
|
159 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
160 |
|
161 |
add_pdf_to_vector_store(
|
app/vector_store/chroma_vector_store.py
CHANGED
@@ -20,6 +20,7 @@ from langchain_community.vectorstores import Chroma
|
|
20 |
from langchain_community.embeddings.sentence_transformer import (
|
21 |
SentenceTransformerEmbeddings,
|
22 |
)
|
|
|
23 |
from app.utils.utils import (
|
24 |
generate_uuid
|
25 |
)
|
@@ -97,10 +98,11 @@ def add_markdown_to_collection(
|
|
97 |
name=collection_name,
|
98 |
)
|
99 |
|
100 |
-
embedding_function = SentenceTransformerEmbeddings(
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
104 |
|
105 |
documents_page_content:list = [i.page_content for i in splits]
|
106 |
|
@@ -180,10 +182,11 @@ def add_pdf_to_vector_store(
|
|
180 |
name=collection_name,
|
181 |
)
|
182 |
|
183 |
-
embedding_function = SentenceTransformerEmbeddings(
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
187 |
|
188 |
documents_page_content:list = [i.page_content for i in split_docs]
|
189 |
|
@@ -239,10 +242,11 @@ if __name__ == "__main__":
|
|
239 |
collection_name="ArxivPapers"
|
240 |
|
241 |
# create the open-source embedding function
|
242 |
-
embedding_function = SentenceTransformerEmbeddings(
|
243 |
-
|
244 |
-
|
245 |
-
|
|
|
246 |
|
247 |
#method of integrating Chroma and Langchain
|
248 |
vector_db = Chroma(
|
|
|
20 |
from langchain_community.embeddings.sentence_transformer import (
|
21 |
SentenceTransformerEmbeddings,
|
22 |
)
|
23 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
24 |
from app.utils.utils import (
|
25 |
generate_uuid
|
26 |
)
|
|
|
98 |
name=collection_name,
|
99 |
)
|
100 |
|
101 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
102 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
103 |
+
# model_name=settings.EMBEDDING_MODEL
|
104 |
+
# )
|
105 |
+
embedding_function = GPT4AllEmbeddings()
|
106 |
|
107 |
documents_page_content:list = [i.page_content for i in splits]
|
108 |
|
|
|
182 |
name=collection_name,
|
183 |
)
|
184 |
|
185 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
186 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
187 |
+
# model_name=settings.EMBEDDING_MODEL
|
188 |
+
# )
|
189 |
+
embedding_function = GPT4AllEmbeddings()
|
190 |
|
191 |
documents_page_content:list = [i.page_content for i in split_docs]
|
192 |
|
|
|
242 |
collection_name="ArxivPapers"
|
243 |
|
244 |
# create the open-source embedding function
|
245 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
246 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
247 |
+
# model_name=settings.EMBEDDING_MODEL
|
248 |
+
# )
|
249 |
+
embedding_function = GPT4AllEmbeddings()
|
250 |
|
251 |
#method of integrating Chroma and Langchain
|
252 |
vector_db = Chroma(
|
example.env
CHANGED
@@ -5,14 +5,27 @@ HUGGINGFACEHUB_API_TOKEN=
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
-
SERPAPI_API_KEY=
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# for chromadb
|
11 |
-
VECTOR_DATABASE_LOCATION="
|
12 |
|
13 |
# Name for the Conversation Memory Collection
|
14 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
15 |
|
16 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
17 |
|
18 |
-
SOURCES_CACHE="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
+
#SERPAPI_API_KEY=
|
9 |
+
|
10 |
+
# we are using Google Custom Search Engine now
|
11 |
+
GOOGLE_CSE_ID=
|
12 |
+
GOOGLE_API_KEY=
|
13 |
|
14 |
# for chromadb
|
15 |
+
VECTOR_DATABASE_LOCATION="app/knowledge_base/"
|
16 |
|
17 |
# Name for the Conversation Memory Collection
|
18 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
19 |
|
20 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
21 |
|
22 |
+
SOURCES_CACHE="app/database/sources_cache.sqlite3"
|
23 |
+
|
24 |
+
# local cache
|
25 |
+
LOCAL_CACHE=".cache.db"
|
26 |
+
|
27 |
+
# project name
|
28 |
+
PROJECT_NAME=innovation_pathfinder_ai
|
29 |
+
|
30 |
+
# restricting access to the backend resources, for development it's set to * ('all')
|
31 |
+
BACKEND_CORS_ORIGINS=["*"]
|
requirements.txt
CHANGED
@@ -2,10 +2,8 @@ langchain-community
|
|
2 |
langchain
|
3 |
google-search-results
|
4 |
langchainhub
|
5 |
-
text_generation
|
6 |
arxiv
|
7 |
wikipedia
|
8 |
-
gradio==3.48.0
|
9 |
chromadb
|
10 |
google_api_python_client
|
11 |
pypdf2
|
@@ -13,6 +11,6 @@ sqlmodel
|
|
13 |
rich
|
14 |
fastapi
|
15 |
uvicorn
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
2 |
langchain
|
3 |
google-search-results
|
4 |
langchainhub
|
|
|
5 |
arxiv
|
6 |
wikipedia
|
|
|
7 |
chromadb
|
8 |
google_api_python_client
|
9 |
pypdf2
|
|
|
11 |
rich
|
12 |
fastapi
|
13 |
uvicorn
|
14 |
+
adaptive-cards-py
|
15 |
+
pydantic_settings
|
16 |
+
gpt4all
|