sabazo committed on
Commit
f2ccb97
2 Parent(s): 160d7e4 bf79622

Merge pull request #12 from almutareb/add_document_to_chroma_tool

Browse files
innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -5,8 +5,15 @@ from langchain_community.tools import WikipediaQueryRun
5
  from langchain_community.utilities import WikipediaAPIWrapper
6
  #from langchain.tools import Tool
7
  from langchain_community.utilities import GoogleSearchAPIWrapper
 
 
 
 
8
  import arxiv
9
  import ast
 
 
 
10
  # hacky and should be replaced with a database
11
  from innovation_pathfinder_ai.source_container.container import (
12
  all_sources
@@ -18,6 +25,15 @@ from innovation_pathfinder_ai.database.db_handler import (
18
  add_many
19
  )
20
 
 
 
 
 
 
 
 
 
 
21
  @tool
22
  def arxiv_search(query: str) -> str:
23
  """Search arxiv database for scientific research papers and studies. This is your primary information source.
@@ -72,9 +88,71 @@ def wikipedia_search(query: str) -> str:
72
  api_wrapper = WikipediaAPIWrapper()
73
  wikipedia_search = WikipediaQueryRun(api_wrapper=api_wrapper)
74
  wikipedia_results = wikipedia_search.run(query)
75
- formatted_summaries = format_wiki_summaries(wikipedia_results)
76
- all_sources += formatted_summaries
77
- parsed_summaries = parse_list_to_dicts(formatted_summaries)
78
- add_many(parsed_summaries)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- return wikipedia_results.__str__()
 
 
 
 
 
5
  from langchain_community.utilities import WikipediaAPIWrapper
6
  #from langchain.tools import Tool
7
  from langchain_community.utilities import GoogleSearchAPIWrapper
8
+ from langchain_community.embeddings.sentence_transformer import (
9
+ SentenceTransformerEmbeddings,
10
+ )
11
+ from langchain_community.vectorstores import Chroma
12
  import arxiv
13
  import ast
14
+
15
+ import chromadb
16
+
17
  # hacky and should be replaced with a database
18
  from innovation_pathfinder_ai.source_container.container import (
19
  all_sources
 
25
  add_many
26
  )
27
 
28
+ from innovation_pathfinder_ai.vector_store.chroma_vector_store import (
29
+ add_pdf_to_vector_store
30
+ )
31
+ from innovation_pathfinder_ai.utils.utils import (
32
+ create_wikipedia_urls_from_text, create_folder_if_not_exists,
33
+ )
34
+ import os
35
+ # from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
36
+
37
  @tool
38
  def arxiv_search(query: str) -> str:
39
  """Search arxiv database for scientific research papers and studies. This is your primary information source.
 
88
  api_wrapper = WikipediaAPIWrapper()
89
  wikipedia_search = WikipediaQueryRun(api_wrapper=api_wrapper)
90
  wikipedia_results = wikipedia_search.run(query)
91
+ all_sources += create_wikipedia_urls_from_text(wikipedia_results)
92
+ return wikipedia_results
93
+
94
@tool
def chroma_search(query: str) -> str:
    """Search the Arxiv vector store for documents and relevant chunks."""
    # NOTE: the docstring above is what the agent sees as the tool
    # description, so keep it short and correctly spelled.

    # No explicit path is passed, so chromadb falls back to its default
    # persistence location.
    # TODO: make the persist directory configurable.
    client = chromadb.PersistentClient()

    # TODO: read the collection name from an environment variable.
    collection_name = "ArxivPapers"

    embedding_function = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
    )

    vector_db = Chroma(
        client=client,  # client for Chroma
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)

    # The tool contract is str -> str, so return the string form of the
    # retrieved documents.
    return str(docs)
118
+
119
+
120
@tool
def embed_arvix_paper(paper_id: str) -> None:
    """Download a paper from arxiv and add it to the "ArxivPapers" vector store.
    To download a paper please input the arxiv id such as "1605.08386v1".
    This tool is named embed_arvix_paper.
    If you input "http://arxiv.org/abs/2312.02813", this will break the code. Only input
    "2312.02813". In addition please download one paper at a time. Please keep the
    inputs/output free of additional information, only have the id.
    """
    # NOTE: the docstring above is the tool description shown to the agent,
    # so the tool name it mentions must match this function's name.

    # code from https://lukasschwab.me/arxiv.py/arxiv.html
    # next() raises StopIteration when the search yields no result for the id.
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))

    # The file name is the paper id with the periods stripped,
    # e.g. "2312.02813" -> "231202813.pdf".
    pdf_file_name = f"{paper_id.replace('.', '')}.pdf"

    pdf_directory = "./downloaded_papers"
    create_folder_if_not_exists(pdf_directory)

    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=pdf_file_name)

    # TODO: read the collection name from an environment variable.
    # No Chroma client or embedding model is built here: the original created
    # both but never used them; only the collection name and the PDF path are
    # handed to add_pdf_to_vector_store.
    add_pdf_to_vector_store(
        collection_name="ArxivPapers",
        pdf_file_location=os.path.join(pdf_directory, pdf_file_name),
    )
158
+
innovation_pathfinder_ai/utils/utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import hashlib
2
  import datetime
 
3
 
4
  from innovation_pathfinder_ai.utils import logger
5
 
@@ -168,4 +169,17 @@ def hash_text(text: str) -> str:
168
 
169
 
170
  def convert_timestamp_to_datetime(timestamp: str) -> str:
171
- return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import hashlib
2
  import datetime
3
+ import os
4
 
5
  from innovation_pathfinder_ai.utils import logger
6
 
 
169
 
170
 
171
  def convert_timestamp_to_datetime(timestamp: str) -> str:
172
+ return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
173
+
174
def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Missing parent directories are created as well. A message reporting
    whether the folder was created or was already present is printed.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    # Attempt the creation and react to the failure instead of checking
    # os.path.exists() first: the check-then-create sequence can race with
    # another process creating the same folder in between.
    try:
        os.makedirs(folder_path)
    except FileExistsError:
        print(f"Folder '{folder_path}' already exists.")
    else:
        print(f"Folder '{folder_path}' created.")