Spaces:
Sleeping
Sleeping
Asaad Almutareb
committed on
Commit
·
9237552
1
Parent(s):
fa99d8f
corrected streaming callback handler
Browse files
replaced sentence-transformers with Embed4All from GPT4All
Updated requirements.txt, example.env and README to reflect this repo's settings
- README.md +11 -31
- app/api/v1/agents/hf_mixtral_agent.py +1 -1
- app/structured_tools/structured_tools.py +14 -16
- app/vector_store/chroma_vector_store.py +16 -12
- example.env +16 -3
- requirements.txt +3 -5
README.md
CHANGED
@@ -1,37 +1,17 @@
|
|
1 |
-
|
2 |
-
title: Innovation Pathfinder AI
|
3 |
-
emoji: 🚀
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: gray
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.2.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
# InnovationPathfinderAI
|
13 |
GenAI Research Assistant for Innovation Labs
|
14 |
|
15 |
-
##
|
16 |
-
|
17 |
-
however it is difficult to manage all of this information in a central location. With our tool we
|
18 |
-
want to enable people with the capability to discover and manage knowledge bases.
|
19 |
-
|
20 |
-
## Vector Store
|
21 |
-
Documents are embedded and stored inside a Chroma vector store
|
22 |
-
|
23 |
-
## Agents
|
24 |
-
|
25 |
-
with agents our application is able to discover and refine the information it collects based on
|
26 |
-
the needs and sentiment of the user.
|
27 |
-
|
28 |
-
## Agent Tools
|
29 |
-
The tools our agents have access to. More are being created
|
30 |
|
31 |
-
|
|
|
32 |
|
33 |
-
|
34 |
|
35 |
-
|
|
|
36 |
|
37 |
-
|
|
|
|
|
|
1 |
+
# FastAPI Backend for InnovationPathfinderAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
GenAI Research Assistant for Innovation Labs
|
3 |
|
4 |
+
## Getting Started
|
5 |
+
To get started:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
1. install requirements:
|
8 |
+
pip install -r requirements.txt
|
9 |
|
10 |
+
2. copy example.env to .env and add your API keys and variables
|
11 |
|
12 |
+
3. run uvicorn:
|
13 |
+
uvicorn app.main:app
|
14 |
|
15 |
+
## ToDo
|
16 |
+
we are testing replacing the sentence-transformers with GPT4All's Embed4All
|
17 |
+
Code still needs to be cleaned
|
app/api/v1/agents/hf_mixtral_agent.py
CHANGED
@@ -74,7 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
74 |
|
75 |
await websocket.send_json(resp.model_dump())
|
76 |
message_id: str = utils.generate_uuid()
|
77 |
-
custom_handler =
|
78 |
websocket, message_id=message_id
|
79 |
)
|
80 |
|
|
|
74 |
|
75 |
await websocket.send_json(resp.model_dump())
|
76 |
message_id: str = utils.generate_uuid()
|
77 |
+
custom_handler = CustomAsyncCallbackHandler(
|
78 |
websocket, message_id=message_id
|
79 |
)
|
80 |
|
app/structured_tools/structured_tools.py
CHANGED
@@ -8,6 +8,7 @@ from langchain_community.utilities import GoogleSearchAPIWrapper
|
|
8 |
from langchain_community.embeddings.sentence_transformer import (
|
9 |
SentenceTransformerEmbeddings,
|
10 |
)
|
|
|
11 |
from app.core.config import settings
|
12 |
from langchain_community.vectorstores import Chroma
|
13 |
import arxiv
|
@@ -51,10 +52,11 @@ def memory_search(query:str) -> str:
|
|
51 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
52 |
#store using envar
|
53 |
|
54 |
-
embedding_function = SentenceTransformerEmbeddings(
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
|
59 |
vector_db = Chroma(
|
60 |
client=client, # client for Chroma
|
@@ -78,15 +80,16 @@ def knowledgeBase_search(query:str) -> str:
|
|
78 |
collection_name="ArxivPapers"
|
79 |
#store using envar
|
80 |
|
81 |
-
embedding_function = SentenceTransformerEmbeddings(
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
85 |
|
86 |
vector_db = Chroma(
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
)
|
91 |
|
92 |
retriever = vector_db.as_retriever()
|
@@ -153,11 +156,6 @@ def embed_arvix_paper(paper_id:str) -> None:
|
|
153 |
collection_name="ArxivPapers"
|
154 |
#store using envar
|
155 |
|
156 |
-
embedding_function = SentenceTransformerEmbeddings(
|
157 |
-
#model_name=os.getenv("EMBEDDING_MODEL"),
|
158 |
-
model_name=settings.EMBEDDING_MODEL
|
159 |
-
)
|
160 |
-
|
161 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
162 |
|
163 |
add_pdf_to_vector_store(
|
|
|
8 |
from langchain_community.embeddings.sentence_transformer import (
|
9 |
SentenceTransformerEmbeddings,
|
10 |
)
|
11 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
12 |
from app.core.config import settings
|
13 |
from langchain_community.vectorstores import Chroma
|
14 |
import arxiv
|
|
|
52 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
53 |
#store using envar
|
54 |
|
55 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
56 |
+
# model_name=settings.EMBEDDING_MODEL
|
57 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
58 |
+
# )
|
59 |
+
embedding_function = GPT4AllEmbeddings()
|
60 |
|
61 |
vector_db = Chroma(
|
62 |
client=client, # client for Chroma
|
|
|
80 |
collection_name="ArxivPapers"
|
81 |
#store using envar
|
82 |
|
83 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
84 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
85 |
+
# model_name=settings.EMBEDDING_MODEL
|
86 |
+
# )
|
87 |
+
embedding_function = GPT4AllEmbeddings()
|
88 |
|
89 |
vector_db = Chroma(
|
90 |
+
client=client, # client for Chroma
|
91 |
+
collection_name=collection_name,
|
92 |
+
embedding_function=embedding_function,
|
93 |
)
|
94 |
|
95 |
retriever = vector_db.as_retriever()
|
|
|
156 |
collection_name="ArxivPapers"
|
157 |
#store using envar
|
158 |
|
|
|
|
|
|
|
|
|
|
|
159 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
160 |
|
161 |
add_pdf_to_vector_store(
|
app/vector_store/chroma_vector_store.py
CHANGED
@@ -20,6 +20,7 @@ from langchain_community.vectorstores import Chroma
|
|
20 |
from langchain_community.embeddings.sentence_transformer import (
|
21 |
SentenceTransformerEmbeddings,
|
22 |
)
|
|
|
23 |
from app.utils.utils import (
|
24 |
generate_uuid
|
25 |
)
|
@@ -97,10 +98,11 @@ def add_markdown_to_collection(
|
|
97 |
name=collection_name,
|
98 |
)
|
99 |
|
100 |
-
embedding_function = SentenceTransformerEmbeddings(
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
104 |
|
105 |
documents_page_content:list = [i.page_content for i in splits]
|
106 |
|
@@ -180,10 +182,11 @@ def add_pdf_to_vector_store(
|
|
180 |
name=collection_name,
|
181 |
)
|
182 |
|
183 |
-
embedding_function = SentenceTransformerEmbeddings(
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
187 |
|
188 |
documents_page_content:list = [i.page_content for i in split_docs]
|
189 |
|
@@ -239,10 +242,11 @@ if __name__ == "__main__":
|
|
239 |
collection_name="ArxivPapers"
|
240 |
|
241 |
# create the open-source embedding function
|
242 |
-
embedding_function = SentenceTransformerEmbeddings(
|
243 |
-
|
244 |
-
|
245 |
-
|
|
|
246 |
|
247 |
#method of integrating Chroma and Langchain
|
248 |
vector_db = Chroma(
|
|
|
20 |
from langchain_community.embeddings.sentence_transformer import (
|
21 |
SentenceTransformerEmbeddings,
|
22 |
)
|
23 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
24 |
from app.utils.utils import (
|
25 |
generate_uuid
|
26 |
)
|
|
|
98 |
name=collection_name,
|
99 |
)
|
100 |
|
101 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
102 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
103 |
+
# model_name=settings.EMBEDDING_MODEL
|
104 |
+
# )
|
105 |
+
embedding_function = GPT4AllEmbeddings()
|
106 |
|
107 |
documents_page_content:list = [i.page_content for i in splits]
|
108 |
|
|
|
182 |
name=collection_name,
|
183 |
)
|
184 |
|
185 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
186 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
187 |
+
# model_name=settings.EMBEDDING_MODEL
|
188 |
+
# )
|
189 |
+
embedding_function = GPT4AllEmbeddings()
|
190 |
|
191 |
documents_page_content:list = [i.page_content for i in split_docs]
|
192 |
|
|
|
242 |
collection_name="ArxivPapers"
|
243 |
|
244 |
# create the open-source embedding function
|
245 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
246 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
247 |
+
# model_name=settings.EMBEDDING_MODEL
|
248 |
+
# )
|
249 |
+
embedding_function = GPT4AllEmbeddings()
|
250 |
|
251 |
#method of integrating Chroma and Langchain
|
252 |
vector_db = Chroma(
|
example.env
CHANGED
@@ -5,14 +5,27 @@ HUGGINGFACEHUB_API_TOKEN=
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
-
SERPAPI_API_KEY=
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# for chromadb
|
11 |
-
VECTOR_DATABASE_LOCATION="
|
12 |
|
13 |
# Name for the Conversation Memory Collection
|
14 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
15 |
|
16 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
17 |
|
18 |
-
SOURCES_CACHE="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
OLLMA_BASE_URL=
|
6 |
|
7 |
# environment variables needed to use tools
|
8 |
+
#SERPAPI_API_KEY=
|
9 |
+
|
10 |
+
# we are using Google Custom Search Engine now
|
11 |
+
GOOGLE_CSE_ID=
|
12 |
+
GOOGLE_API_KEY=
|
13 |
|
14 |
# for chromadb
|
15 |
+
VECTOR_DATABASE_LOCATION="app/knowledge_base/"
|
16 |
|
17 |
# Name for the Conversation Memory Collection
|
18 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
19 |
|
20 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
21 |
|
22 |
+
SOURCES_CACHE="app/database/sources_cache.sqlite3"
|
23 |
+
|
24 |
+
# local cache
|
25 |
+
LOCAL_CACHE=".cache.db"
|
26 |
+
|
27 |
+
# project name
|
28 |
+
PROJECT_NAME=innovation_pathfinder_ai
|
29 |
+
|
30 |
+
# restricting access to the backend resources, for development it's set to * ('all')
|
31 |
+
BACKEND_CORS_ORIGINS=["*"]
|
requirements.txt
CHANGED
@@ -2,10 +2,8 @@ langchain-community
|
|
2 |
langchain
|
3 |
google-search-results
|
4 |
langchainhub
|
5 |
-
text_generation
|
6 |
arxiv
|
7 |
wikipedia
|
8 |
-
gradio==3.48.0
|
9 |
chromadb
|
10 |
google_api_python_client
|
11 |
pypdf2
|
@@ -13,6 +11,6 @@ sqlmodel
|
|
13 |
rich
|
14 |
fastapi
|
15 |
uvicorn
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
2 |
langchain
|
3 |
google-search-results
|
4 |
langchainhub
|
|
|
5 |
arxiv
|
6 |
wikipedia
|
|
|
7 |
chromadb
|
8 |
google_api_python_client
|
9 |
pypdf2
|
|
|
11 |
rich
|
12 |
fastapi
|
13 |
uvicorn
|
14 |
+
adaptive-cards-py
|
15 |
+
pydantic_settings
|
16 |
+
gpt4all
|