Baskar2005 committed on
Commit
83c0ff1
·
verified ·
1 Parent(s): c94ebab

Rename cowen.py to app.py

Browse files
Files changed (1) hide show
  1. cowen.py → app.py +322 -324
cowen.py → app.py RENAMED
@@ -1,325 +1,323 @@
1
- from langchain.text_splitter import CharacterTextSplitter
2
- from langchain.vectorstores import FAISS
3
- from langchain.chat_models import ChatOpenAI
4
- from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
5
- from langchain.memory import ConversationBufferMemory
6
- from langchain.chains import ConversationChain
7
- from langchain.chains import ConversationalRetrievalChain
8
- from langchain.document_loaders import UnstructuredFileLoader
9
- from typing import List, Dict, Tuple
10
- import gradio as gr
11
- import validators
12
- import requests
13
- import mimetypes
14
- import tempfile
15
- import os
16
- from langchain.chains.question_answering import load_qa_chain
17
- from langchain.llms import OpenAI
18
- from langchain.prompts import PromptTemplate
19
- from langchain.prompts.prompt import PromptTemplate
20
- import pandas as pd
21
- from langchain_experimental.agents.agent_toolkits import create_csv_agent
22
- from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
23
- from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
24
- from langchain.agents.agent_types import AgentType
25
- # from langchain.agents import create_csv_agent
26
- from langchain import OpenAI, LLMChain
27
- from openai import AzureOpenAI
28
-
29
- os.environ['AZURE_OPENAI_API_KEY'] = "***REDACTED***"  <!-- NOTE(review): a live API key was committed here; it must be rotated. Value redacted from this page. -->
30
- os.environ['AZURE_OPENAI_ENDPOINT'] = "https://eastus2.api.cognitive.microsoft.com/"
31
- os.environ['OPENAI_API_VERSION'] = "2024-02-01"
32
-
33
- class ChatDocumentQA:
34
- def __init__(self) -> None:
35
- pass
36
-
37
- def _get_empty_state(self) -> Dict[str, None]:
38
- """Create an empty knowledge base."""
39
- return {"knowledge_base": None}
40
-
41
- def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
42
- """Extract text content from PDF files.
43
-
44
- Args:
45
- file_paths (List[str]): List of file paths.
46
-
47
- Returns:
48
- List[str]: Extracted text from the PDFs.
49
- """
50
- docs = []
51
- loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
52
- for loader in loaders:
53
- docs.extend(loader.load())
54
- return docs
55
-
56
- def _get_content_from_url(self, urls: str) -> List[str]:
57
- """Fetch content from given URLs.
58
-
59
- Args:
60
- urls (str): Comma-separated URLs.
61
-
62
- Returns:
63
- List[str]: List of text content fetched from the URLs.
64
- """
65
- file_paths = []
66
- for url in urls.split(','):
67
- if validators.url(url):
68
- headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
69
- r = requests.get(url, headers=headers)
70
- if r.status_code != 200:
71
- raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
72
- content_type = r.headers.get("content-type")
73
- file_extension = mimetypes.guess_extension(content_type)
74
- temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
75
- temp_file.write(r.content)
76
- file_paths.append(temp_file.name)
77
-
78
- print("File_Paths:",file_paths)
79
- docs = self._extract_text_from_pdfs(file_paths)
80
- return docs
81
-
82
- def _split_text_into_chunks(self, text: str) -> List[str]:
83
- """Split text into smaller chunks.
84
-
85
- Args:
86
- text (str): Input text to be split.
87
-
88
- Returns:
89
- List[str]: List of smaller text chunks.
90
- """
91
- text_splitter = CharacterTextSplitter(separator="\n", chunk_size=6000, chunk_overlap=0, length_function=len)
92
-
93
- chunks = text_splitter.split_documents(text)
94
-
95
- return chunks
96
-
97
- def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
98
- """Create a vector store from text chunks.
99
-
100
- Args:
101
- text_chunks (List[str]): List of text chunks.
102
-
103
- Returns:
104
- FAISS: Vector store created from the text chunks.
105
- """
106
- embeddings = AzureOpenAIEmbeddings(
107
- azure_deployment="text-embedding-3-large",
108
- )
109
-
110
- return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
111
-
112
-
113
- def _create_conversation_chain(self,vectorstore):
114
-
115
- _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
116
-
117
- Chat History: {chat_history}
118
- Follow Up Input: {question}
119
- Standalone question:"""
120
- CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
121
-
122
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
123
-
124
- # llm = ChatOpenAI(temperature=0)
125
- llm=AzureChatOpenAI(azure_deployment = "GPT-4o")
126
-
127
- return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(),
128
- condense_question_prompt=CONDENSE_QUESTION_PROMPT,
129
- memory=memory)
130
-
131
- def _get_documents_knowledge_base(self, file_paths: List[str]) -> Tuple[str, Dict[str, FAISS]]:
132
- """Build knowledge base from uploaded files.
133
-
134
- Args:
135
- file_paths (List[str]): List of file paths.
136
-
137
- Returns:
138
- Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
139
- """
140
- file_path = file_paths[0].name
141
- file_extension = os.path.splitext(file_path)[1]
142
-
143
- if file_extension == '.csv':
144
- # agent = self.create_agent(file_path)
145
- # tools = self.get_agent_tools(agent)
146
- # memory,tools,prompt = self.create_memory_for_csv_qa(tools)
147
- # agent_chain = self.create_agent_chain_for_csv_qa(memory,tools,prompt)
148
- agent_chain = create_csv_agent(
149
- AzureChatOpenAI(azure_deployment = "GPT-4o"),
150
- file_path,
151
- verbose=True,
152
- agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
153
- )
154
- return "file uploaded", {"knowledge_base": agent_chain}
155
-
156
- else:
157
- pdf_docs = [file_path.name for file_path in file_paths]
158
- raw_text = self._extract_text_from_pdfs(pdf_docs)
159
- text_chunks = self._split_text_into_chunks(raw_text)
160
- vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
161
- return "file uploaded", {"knowledge_base": vectorstore}
162
-
163
-
164
- def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, FAISS]]:
165
- """Build knowledge base from URLs.
166
-
167
- Args:
168
- urls (str): Comma-separated URLs.
169
-
170
- Returns:
171
- Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
172
- """
173
- webpage_text = self._get_content_from_url(urls)
174
- text_chunks = self._split_text_into_chunks(webpage_text)
175
- vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
176
- return "file uploaded", {"knowledge_base": vectorstore}
177
-
178
- #************************
179
- # csv qa
180
- #************************
181
- def create_agent(self,file_path):
182
- agent_chain = create_csv_agent(
183
- AzureChatOpenAI(azure_deployment = "GPT-4o"),
184
- file_path,
185
- verbose=True,
186
- agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
187
- )
188
- return agent_chain
189
- def get_agent_tools(self,agent):
190
- # search = agent
191
- tools = [
192
- Tool(
193
- name="dataframe qa",
194
- func=agent.run,
195
- description="useful for when you need to answer questions about table data and dataframe data",
196
- )
197
- ]
198
- return tools
199
-
200
- def create_memory_for_csv_qa(self,tools):
201
- prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
202
- suffix = """Begin!"
203
-
204
- {chat_history}
205
- Question: {input}
206
- {agent_scratchpad}"""
207
-
208
- prompt = ZeroShotAgent.create_prompt(
209
- tools,
210
- prefix=prefix,
211
- suffix=suffix,
212
- input_variables=["input", "chat_history", "agent_scratchpad"],
213
- )
214
- memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True)
215
-
216
- return memory,tools,prompt
217
-
218
- def create_agent_chain_for_csv_qa(self,memory,tools,prompt):
219
-
220
- llm_chain = LLMChain(llm=AzureChatOpenAI(azure_deployment = "GPT-4o"), prompt=prompt)
221
- agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
222
- agent_chain = AgentExecutor.from_agent_and_tools(
223
- agent=agent, tools=tools, verbose=True, memory=memory
224
- )
225
-
226
- return agent_chain
227
-
228
- def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, FAISS],file_paths) -> Tuple[str, List[Tuple[str, str]]]:
229
- """Get a response from the chatbot.
230
-
231
- Args:
232
- message (str): User's message/question.
233
- chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
234
- state (dict): State containing the knowledge base.
235
-
236
- Returns:
237
- Tuple[str, List[Tuple[str, str]]]: Tuple containing a status message and updated chat history.
238
- """
239
- try:
240
- if file_paths:
241
- file_path = file_paths[0].name
242
- file_extension = os.path.splitext(file_path)[1]
243
-
244
- if file_extension == '.csv':
245
- agent_chain = state["knowledge_base"]
246
- response = agent_chain.run(input = message)
247
- chat_history.append((message, response))
248
- return "", chat_history
249
-
250
- else:
251
- vectorstore = state["knowledge_base"]
252
- chat = self._create_conversation_chain(vectorstore)
253
- response = chat({"question": message,"chat_history": chat_history})
254
- chat_history.append((message, response["answer"]))
255
- return "", chat_history
256
- else:
257
- vectorstore = state["knowledge_base"]
258
- chat = self._create_conversation_chain(vectorstore)
259
- response = chat({"question": message,"chat_history": chat_history})
260
- chat_history.append((message, response["answer"]))
261
- return "", chat_history
262
- except:
263
- chat_history.append((message, "Please Upload Document or URL"))
264
- return "", chat_history
265
-
266
- def gradio_interface(self) -> None:
267
- """Create a Gradio interface for the chatbot."""
268
- with gr.Blocks(css="#textbox_id textarea {color: white}",theme='SherlockRamos/Feliz') as demo:
269
- gr.HTML("""
270
- <style>
271
- .footer {
272
- display: none !important;
273
- }
274
- footer {
275
- display: none !important;
276
- }
277
- #foot {
278
- display: none !important;
279
- }
280
- .svelte-1fzp3xt {
281
- display: none !important;
282
- }
283
- #root > div > div > div {
284
- padding-bottom: 0 !important;
285
- }
286
- .custom-footer {
287
- text-align: center;
288
- padding: 10px;
289
- font-size: 14px;
290
- color: #333;
291
- }
292
- </style>
293
- """)
294
- gr.HTML("""<div><img src="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQDw4NDQ8PDg0OEA0NDQ4NDQ8NCQ0NFhEWFhURFRUYHSgsJBomGxMVIT0hJSs3Li4wFx8/RDM4QygtLisBCgoKDg0OGhAQFysdHiYrKy0tKy0rNy0tLS0tLSstKy0tLSs3LS0tLS0tLS0tLS0tLS0tLSsrLS0tKystLS0tLf/AABEIAMgAyAMBEQACEQEDEQH/xAAbAAEBAQEBAQEBAAAAAAAAAAAAAQcGBQMCBP/EAEcQAAIBAgEDDA8GBQUAAAAAAAABAgMEEQYSUQUHFiExQVRzgZKToRMVIjM0NVNhY4ORsbLB0SMycXLC0hRis/DxJEJSgqL/xAAaAQEBAAMBAQAAAAAAAAAAAAAAAQQFBgMC/8QAMhEBAAECAgYIBwADAQAAAAAAAAECAwQREiEyQVHwBRMUFTFhodEiIzM0gbHBcZHhQv/aAAwDAQACEQMRAD8AhqXHgAAAAAUKgRQIBQIAQAAAAAAAAAAAoVAigQCgQCkyVCoAAKBAAAABQqBAAAAoEApMlQqAACgQABQoEQAAAAUCAAAAABQIAAAUKBEAAAAFAgAABQIAAoVCIpRAKFEEQABQIAAAAKBAAFCoRFKIAAoVAgAAAAAAABQqBAAAJ5qDJAooVAgAAAAAAAAAAAAAAAAAVAQAAAAAAAAAAAAAAAAAAUKAQIiJORTTIvMM4SnRkxLkaS8vUFUCYAOXqJkqYlfOa4+cGkmIM15eoipiXJNIQfJgNT60RIapSKZgJ4J/l+ivRAgAAAAAFAgHSZGah0brs/Zs77PsTjmvDdz8ceaZFiimuGf0fhrdyas/L+umWQ9n6TpGZHZ6c2wp6NsRERl+/c2D2npekZOz0Pru2zw9Z912D2fpOeOz0J3dZ4JsHtPSdIx2ehe7bPD9+5sHtPSdIx2eg7ts8P37mwe09J0jHZ6Du2zw/fubBrT0nSMdRQndtnh6z7rsHtPSdIx2ehe7bPD9+5sHtNFTnjs9Cd3WeBsHs9FTnjs9B3dZ4JsHtPSdIx2ehe7bPD9+6SyGtN7skfwksfcOz0Hd1ndTk/hutb+GGNGvJPRUgpJ8scDzqwtM73hV0XGfwzl+P+uX1W1AuLbbqxTh5SDcqXmx0cp4V2Zpa29g7tqc5l5mB5Qx51qVQCBAAAAoVAgB1mt/qhRou67PVhTzuw5ufJRxwz8ff1mTh6qYhs+jLtNM1xVPD+uxWUFnwmjz0ZWnTxbSMVaiNr0ldkNnwmjz0TrKeK9qtcfSSOr9o2krmk22klnrHERcpXtNrj6S9Q9HuAefdarW9KTp1a1OnNJNxlNKSR8zVFLxrv0UTlMvlshs+E0eej562ni+e02+PpK7IbPhNHnodZTxO1WuPobIbPhNLnounTxO1WuPpJshs+E0ueidZSdqtcfSRZQWfCqPSJDrKV7Va4+kv7La5hUWdTnGcdMJKUeo+4qiXpTVFWuJffAr7l+KlNSTjJKUWmmmk4taNsnkkxmzPK/J7+Gmq1Lweo2mvJy23h+GGOBhXrWjrc/jsH1VWnGzPi53Exohr9JCnioUAAAAAABEhqhNe5SZL4eIIV9bLvsOMhvfzI+qS1rmW2o2rrwDMMtYZ2qDi9yUaMW1upPBPAwL0Z3Ms3PY6M8RllzlDodgNt5Wvzqf7T37PRm2HdtqZ1psCtvK1/bT/YXs9JHRtnPNyOU2pcLW4dGnKUlmRljPByxe7uJbW0Yl6iKWqxdjqrmUc6oeWeWbFMBmr62N5UozVSjNwksNtPaa0PSfdNc0vq1eqtzpUS1LJrVlXdBVNpVI9zVitzO0rzM2FqrSh0mGxEXqM458XsnoyX8Wq1jGvRq0ZYd3FpN/7Zb0vbgz5rp0oyed23FyjQljk4OLcZLCUW4yW+mtpr2o1blNyRD5pAoAAAAKFQIAVARkh
Kn1su+0+Mh8aLS+7e221G2dcAZnlj4yXqPfEwL220GN+554Q0sz5b+ADMtcDw2XF0fezAxO257pL6s/j+OcR4MBAIvoSUjVrdTrdXTjdTo49zVhJ4aZRbw6s4ybE/HzwbLoqrRrqo58ZaUZzfgGR5W2yp3txFbjl2RaHnwzn1tmuvR8znycxj6Mr087nkL6HkxPHJ+g+kAoEAAAAACgCK/dl32n+eHxlpLe225G2deAZllj4zXqPfEwb2257G/dc+TTDOl0MAGZa4HhsuLo+9mBidtz3SX1Z/H8c2eDAABDyetkhUzb63f8zjyShNfM9rG2ycDOWIj8fqprhsXTgGZa4UML1PTRg3yORg34+Nz/AElHzeeEOaRjNdTqCqqAgFCoEAAAABSblfSx75T4yHxn1StrabajauuAMzyx8ZL1H6TAvbbn8b9zzwhpZnuggAzLXA8NlxVP5mBidtz/AEj9Wed0ObPBrwCLfI+ad70smX/rbbjIe6R7WY+OPyyMHV8+nndU2E2LqgDNNcbwuHEU/wCpIwsT4tB0r9SOd9Ll/wC+sxmt3qFAKBAAFCgRAKTzVEI8B9bHvlPjIfGWktbTbUbZ14BmeWPjJeo/SYF7bc/jfueeENLM+XQQAZjrgeGviofMwMRtud6S+rPO6HOHgwVAi3yT4JTvetkjTzr63WiWdzYSfyPeztx+WVgYzvRzuqa4bB04BmWuFPG9w0UYRf8A6fzMHE7Tn+k5zuZc+EOaRjtd/wCgKoVAgAAAAKBAKRX0se+U+Mh8ZaS1tNtRtnXgGZ5Y+Mug/SYF7bc/jfueeENLM90EAGZa4HhsuKp/MwMTtuf6R+rPO6HNng14ATGW4p+LU6bW8tc67dTepU3t6JPCKXscjIw0Z1aTYdF2/mTPD2lpaM50ABkuV9wql7cS3oyUFt7mbDB9aZrr+uvnyczj6om9M8+DyMcOo8mJ4QBQAAAAAAAABSK+ll32nxkPjR9Urb222o2rrgDM8sfGXQfIwL225/G/c88IaWZ7oI8ADMtcDw2XFU/mYGI23P8ASP1Z53Q5s8GvVAIRbaik220kkm5Nt4JLDfJlnrKac9TUMjtSHa0O7X21XCVTTHDajHkXvZsrVGjS6PA4fqbeW/2zdAerNfy6oXcaNKpWn92EZSe9jgtwky+LlcUxmxqtUcpTnLblOUpSfnbbx9rNVM51OTrqzzl+dPITeBUUKgQAAAAAAA0kV9LLvtP88PjLSW9ttptnXAVmeWPjJeo/SYN/bc9jZ0cVq51Q0sznQqB4Oq+S1C5qdmqSqKTjGOEXHMwWO8152eVdqmpiXsHbvbT+HYFa+Ur86H7T57PQ8O7LKrIO1/51n/2hh8JOooO7LT1dTMnrW3edSprP3M+bcqi/DRyHpTbincyrWFt2tmHrH2yEHmM9y61eVR/wlGWMItOrJPalJP7vJ78NBiYi5nGUNJ0hitL5dE86nImK1QBQqBFAgFAgFJkqFQAqIr92jSqU28ElOm23tJJS222fVJb22udvLThVDpofU2WnTxdT19vidvLThVDpofUadPE6+3xcBlVc053/AGSE4Th9j3cZxdPdWO3iYV6r42jxdVE4iapnnKHfrVy04Vb9NT+pnTU3kX7fE7e2nCqHTQ+o04Ovt8Tt7acKodND6jTg663xO3lpwq36aH1Jp08V6+3xO3tpwqh00PqNOlOvt8X4qZQWi3bmjyTTfUOsp4vmcTajxq9Hn3eWlnD7s5VnopQlu/i8D4m/RG9419I2afCc/wDfs5XVrK+vXThSXYKTxW0260l+b2bhj14iZ1Q1t/pCu7T8vVz/AIc2v8GN4a5a3VKlfQAAoEAAUKBEAAAAFAgAAAAAUCAAAFCgRAAAABQIAAAUCAAKFQiKUQChRBEAAUCAAAACgQABQqERSiAAKFQIAAAAAAAAUKgQAACeagyQKKFQIAAAAAAAAAAAAAAAAAFQEAAAAAAAAAAAAAAAAAAFCgECKFQIA
AAAAAAAUKgQAAUKAQIoVAgAAAAAFAgFCgECAAABQqBFAgAATzUKgBQIBQoBAgAAAUKgQAoVAgAAAAAFQEAoAAFQIoVAgBQqBAAAA//Z" alt="Intercontinental Exchange" style="float:left;width:80px;height:80px;"><h1 style="color:#000;margin-left:4in;padding-top:10px">Virtual Assistant Chatbot</h1></div>""")
295
- state = gr.State(self._get_empty_state())
296
- chatbot = gr.Chatbot()
297
-
298
- with gr.Row():
299
- with gr.Column(scale=0.85):
300
- msg = gr.Textbox(label="Question", elem_id="textbox_id")
301
- with gr.Column(scale=0.15):
302
- file_output = gr.Textbox(label="File Status")
303
- with gr.Row():
304
- with gr.Column(scale=0.85):
305
- clear = gr.ClearButton([msg, chatbot])
306
- with gr.Column(scale=0.15):
307
- upload_button = gr.UploadButton(
308
- "Browse File",
309
- file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
310
- file_count="multiple", variant="primary"
311
- )
312
- with gr.Row():
313
- with gr.Column(scale=1):
314
- input_url = gr.Textbox(label="urls", elem_id="textbox_id")
315
-
316
- input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
317
- upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
318
- msg.submit(self._get_response, [msg, chatbot, state,upload_button], [msg, chatbot])
319
-
320
- demo.launch(debug=True,allowed_paths=["/content/"])
321
-
322
-
323
- if __name__ == "__main__":
324
- chatdocumentqa = ChatDocumentQA()
325
  chatdocumentqa.gradio_interface()
 
1
+ from langchain.text_splitter import CharacterTextSplitter
2
+ from langchain.vectorstores import FAISS
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.chains import ConversationChain
7
+ from langchain.chains import ConversationalRetrievalChain
8
+ from langchain.document_loaders import UnstructuredFileLoader
9
+ from typing import List, Dict, Tuple
10
+ import gradio as gr
11
+ import validators
12
+ import requests
13
+ import mimetypes
14
+ import tempfile
15
+ import os
16
+ from langchain.chains.question_answering import load_qa_chain
17
+ from langchain.llms import OpenAI
18
+ from langchain.prompts import PromptTemplate
19
+ from langchain.prompts.prompt import PromptTemplate
20
+ import pandas as pd
21
+ from langchain_experimental.agents.agent_toolkits import create_csv_agent
22
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
23
+ from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
24
+ from langchain.agents.agent_types import AgentType
25
+ # from langchain.agents import create_csv_agent
26
+ from langchain import OpenAI, LLMChain
27
+ from openai import AzureOpenAI
28
+
29
+
30
+
31
class ChatDocumentQA:
    """Gradio chatbot that answers questions over uploaded documents, CSV files, or URLs.

    Non-CSV uploads (PDF/DOC/TXT) are loaded with UnstructuredFileLoader, chunked,
    embedded with Azure OpenAI embeddings and indexed in a FAISS vector store; questions
    are then answered by a ConversationalRetrievalChain. CSV uploads are answered by a
    langchain CSV agent instead. URLs are downloaded to temp files and treated like
    document uploads.
    """

    def __init__(self) -> None:
        pass

    def _get_empty_state(self) -> Dict[str, None]:
        """Create an empty knowledge base."""
        return {"knowledge_base": None}

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List:
        """Extract text content from document files.

        Args:
            file_paths (List[str]): List of local file paths.

        Returns:
            List: langchain Document objects loaded from the files.
        """
        docs = []
        loaders = [UnstructuredFileLoader(path, strategy="fast") for path in file_paths]
        for loader in loaders:
            docs.extend(loader.load())
        return docs

    def _get_content_from_url(self, urls: str) -> List:
        """Fetch content from given URLs.

        Args:
            urls (str): Comma-separated URLs. Entries that are not valid URLs are skipped.

        Returns:
            List: Document objects loaded from the downloaded files.

        Raises:
            ValueError: If a URL responds with a non-200 status code.
        """
        file_paths = []
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
        for url in urls.split(','):
            if not validators.url(url):
                continue
            # timeout added so an unresponsive host cannot hang the request forever
            r = requests.get(url, headers=headers, timeout=60)
            if r.status_code != 200:
                raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
            content_type = r.headers.get("content-type")
            # guess_extension() raises on None, so only call it when a content-type header exists
            file_extension = mimetypes.guess_extension(content_type) if content_type else None
            # context manager flushes and closes the handle; delete=False keeps the file on disk
            # so the loader can read it afterwards
            with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
                temp_file.write(r.content)
                file_paths.append(temp_file.name)

        print("File_Paths:", file_paths)
        docs = self._extract_text_from_pdfs(file_paths)
        return docs

    def _split_text_into_chunks(self, text) -> List:
        """Split loaded documents into smaller chunks.

        Args:
            text (List): langchain Document objects to be split (despite the name,
                this is a list of documents, not a raw string).

        Returns:
            List: Smaller Document chunks.
        """
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=6000, chunk_overlap=0, length_function=len)
        chunks = text_splitter.split_documents(text)
        return chunks

    def _create_vector_store_from_text_chunks(self, text_chunks: List) -> FAISS:
        """Create a vector store from text chunks.

        Args:
            text_chunks (List): Document chunks to index.

        Returns:
            FAISS: Vector store created from the text chunks.
        """
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
        )
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def _create_conversation_chain(self, vectorstore):
        """Build a ConversationalRetrievalChain over *vectorstore* with buffer memory."""
        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History: {chat_history}
        Follow Up Input: {question}
        Standalone question:"""
        CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        llm = AzureChatOpenAI(azure_deployment="GPT-4o")

        return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(),
                                                     condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                                                     memory=memory)

    def _get_documents_knowledge_base(self, file_paths: List) -> Tuple[str, Dict]:
        """Build a knowledge base from uploaded files.

        Args:
            file_paths (List): Uploaded file objects (each exposes a ``.name`` path).

        Returns:
            Tuple[str, Dict]: Status message and the knowledge-base state dict
            (a CSV agent for ``.csv`` uploads, otherwise a FAISS vector store).
        """
        file_path = file_paths[0].name
        file_extension = os.path.splitext(file_path)[1]

        if file_extension == '.csv':
            agent_chain = create_csv_agent(
                AzureChatOpenAI(azure_deployment="GPT-4o"),
                file_path,
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            )
            return "file uploaded", {"knowledge_base": agent_chain}

        # renamed the loop variable so it no longer shadows file_path above
        pdf_docs = [uploaded.name for uploaded in file_paths]
        raw_text = self._extract_text_from_pdfs(pdf_docs)
        text_chunks = self._split_text_into_chunks(raw_text)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict]:
        """Build a knowledge base from URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            Tuple[str, Dict]: Status message and the knowledge-base state dict.
        """
        webpage_text = self._get_content_from_url(urls)
        text_chunks = self._split_text_into_chunks(webpage_text)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    # ************************
    # csv qa
    # ************************
    def create_agent(self, file_path):
        """Create a langchain CSV agent for *file_path*."""
        agent_chain = create_csv_agent(
            AzureChatOpenAI(azure_deployment="GPT-4o"),
            file_path,
            verbose=True,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        )
        return agent_chain

    def get_agent_tools(self, agent):
        """Wrap *agent* as a single Tool usable by a ZeroShotAgent."""
        tools = [
            Tool(
                name="dataframe qa",
                func=agent.run,
                description="useful for when you need to answer questions about table data and dataframe data",
            )
        ]
        return tools

    def create_memory_for_csv_qa(self, tools):
        """Build the prompt and conversation memory for CSV question answering.

        Returns:
            Tuple: (memory, tools, prompt) for assembling an AgentExecutor.
        """
        prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
        suffix = """Begin!"

        {chat_history}
        Question: {input}
        {agent_scratchpad}"""

        prompt = ZeroShotAgent.create_prompt(
            tools,
            prefix=prefix,
            suffix=suffix,
            input_variables=["input", "chat_history", "agent_scratchpad"],
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        return memory, tools, prompt

    def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
        """Assemble a memory-backed AgentExecutor from the CSV-QA prompt and tools."""
        llm_chain = LLMChain(llm=AzureChatOpenAI(azure_deployment="GPT-4o"), prompt=prompt)
        agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
        agent_chain = AgentExecutor.from_agent_and_tools(
            agent=agent, tools=tools, verbose=True, memory=memory
        )
        return agent_chain

    def _answer_from_vectorstore(self, message, chat_history, state):
        """Answer *message* via the vector-store conversation chain (shared helper).

        Appends (message, answer) to *chat_history* and returns ("", chat_history)
        in the shape Gradio expects.
        """
        vectorstore = state["knowledge_base"]
        chat = self._create_conversation_chain(vectorstore)
        response = chat({"question": message, "chat_history": chat_history})
        chat_history.append((message, response["answer"]))
        return "", chat_history

    def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict, file_paths) -> Tuple[str, List[Tuple[str, str]]]:
        """Get a response from the chatbot.

        Args:
            message (str): User's message/question.
            chat_history (List[Tuple[str, str]]): (user_message, bot_response) pairs.
            state (dict): State containing the knowledge base.
            file_paths: Uploaded file objects (or falsy when nothing was uploaded).

        Returns:
            Tuple[str, List[Tuple[str, str]]]: Cleared textbox value and updated chat history.
        """
        try:
            if file_paths:
                file_path = file_paths[0].name
                file_extension = os.path.splitext(file_path)[1]

                if file_extension == '.csv':
                    agent_chain = state["knowledge_base"]
                    response = agent_chain.run(input=message)
                    chat_history.append((message, response))
                    return "", chat_history

                return self._answer_from_vectorstore(message, chat_history, state)

            return self._answer_from_vectorstore(message, chat_history, state)
        # narrowed from a bare except so SystemExit/KeyboardInterrupt still propagate;
        # the common failure here is that no knowledge base has been built yet
        except Exception:
            chat_history.append((message, "Please Upload Document or URL"))
            return "", chat_history

    def gradio_interface(self) -> None:
        """Create a Gradio interface for the chatbot and launch it (blocks)."""
        with gr.Blocks(css="#textbox_id textarea {color: white}", theme='SherlockRamos/Feliz') as demo:
            # Hide Gradio's footer and tighten layout.
            gr.HTML("""
            <style>
            .footer {
                display: none !important;
            }
            footer {
                display: none !important;
            }
            #foot {
                display: none !important;
            }
            .svelte-1fzp3xt {
                display: none !important;
            }
            #root > div > div > div {
                padding-bottom: 0 !important;
            }
            .custom-footer {
                text-align: center;
                padding: 10px;
                font-size: 14px;
                color: #333;
            }
            </style>
            """)
            # Inline base64 logo + page title (kept byte-for-byte; it is user-visible output).
            gr.HTML("""<div><img src="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQDw4NDQ8PDg0OEA0NDQ4NDQ8NCQ0NFhEWFhURFRUYHSgsJBomGxMVIT0hJSs3Li4wFx8/RDM4QygtLisBCgoKDg0OGhAQFysdHiYrKy0tKy0rNy0tLS0tLSstKy0tLSs3LS0tLS0tLS0tLS0tLS0tLSsrLS0tKystLS0tLf/AABEIAMgAyAMBEQACEQEDEQH/xAAbAAEBAQEBAQEBAAAAAAAAAAAAAQcGBQMCBP/EAEcQAAIBAgEDDA8GBQUAAAAAAAABAgMEEQYSUQUHFiExQVRzgZKToRMVIjM0NVNhY4ORsbLB0SMycXLC0hRis/DxJEJSgqL/xAAaAQEBAAMBAQAAAAAAAAAAAAAAAQQFBgMC/8QAMhEBAAECAgYIBwADAQAAAAAAAAECAwQREiEyQVHwBRMUFTFhodEiIzM0gbHBcZHhQv/aAAwDAQACEQMRAD8AhqXHgAAAAAUKgRQIBQIAQAAAAAAAAAAAoVAigQCgQCkyVCoAAKBAAAABQqBAAAAoEApMlQqAACgQABQoEQAAAAUCAAAAABQIAAAUKBEAAAAFAgAABQIAAoVCIpRAKFEEQABQIAAAAKBAAFCoRFKIAAoVAgAAAAAAABQqBAAAJ5qDJAooVAgAAAAAAAAAAAAAAAAAVAQAAAAAAAAAAAAAAAAAAUKAQIiJORTTIvMM4SnRkxLkaS8vUFUCYAOXqJkqYlfOa4+cGkmIM15eoipiXJNIQfJgNT60RIapSKZgJ4J/l+ivRAgAAAAAFAgHSZGah0brs/Zs77PsTjmvDdz8ceaZFiimuGf0fhrdyas/L+umWQ9n6TpGZHZ6c2wp6NsRERl+/c2D2npekZOz0Pru2zw9Z912D2fpOeOz0J3dZ4JsHtPSdIx2ehe7bPD9+5sHtPSdIx2eg7ts8P37mwe09J0jHZ6Du2zw/fubBrT0nSMdRQndtnh6z7rsHtPSdIx2ehe7bPD9+5sHtNFTnjs9Cd3WeBsHs9FTnjs9B3dZ4JsHtPSdIx2ehe7bPD9+6SyGtN7skfwksfcOz0Hd1ndTk/hutb+GGNGvJPRUgpJ8scDzqwtM73hV0XGfwzl+P+uX1W1AuLbbqxTh5SDcqXmx0cp4V2Zpa29g7tqc5l5mB5Qx51qVQCBAAAAoVAgB1mt/qhRou67PVhTzuw5ufJRxwz8ff1mTh6qYhs+jLtNM1xVPD+uxWUFnwmjz0ZWnTxbSMVaiNr0ldkNnwmjz0TrKeK9qtcfSSOr9o2krmk22klnrHERcpXtNrj6S9Q9HuAefdarW9KTp1a1OnNJNxlNKSR8zVFLxrv0UTlMvlshs+E0eej562ni+e02+PpK7IbPhNHnodZTxO1WuPobIbPhNLnounTxO1WuPpJshs+E0ueidZSdqtcfSRZQWfCqPSJDrKV7Va4+kv7La5hUWdTnGcdMJKUeo+4qiXpTVFWuJffAr7l+KlNSTjJKUWmmmk4taNsnkkxmzPK/J7+Gmq1Lweo2mvJy23h+GGOBhXrWjrc/jsH1VWnGzPi53Exohr9JCnioUAAAAAABEhqhNe5SZL4eIIV9bLvsOMhvfzI+qS1rmW2o2rrwDMMtYZ2qDi9yUaMW1upPBPAwL0Z3Ms3PY6M8RllzlDodgNt5Wvzqf7T37PRm2HdtqZ1psCtvK1/bT/YXs9JHRtnPNyOU2pcLW4dGnKUlmRljPByxe7uJbW0Yl6iKWqxdjqrmUc6oeWeWbFMBmr62N5UozVSjNwksNtPaa0PSfdNc0vq1eqtzpUS1LJrVlXdBVNpVI9zVitzO0rzM2FqrSh0mGxEXqM458XsnoyX8Wq1jGvRq0ZYd3FpN/7Zb0vbgz5rp0oyed23FyjQljk4OLcZLCUW4yW+mtpr2o1blNyRD5pAoAAAAKFQIAVARkhKn1su+0+Mh8aLS+7e221G2dcAZnlj4yXqPfEwL220GN+554Q0sz5b+ADMtcDw2XF0fezAxO257pL6s/j+OcR4MBAIvoSUjVrdTrdXTjdTo49zVhJ4aZRbw6s4ybE/HzwbLoqrRrqo58ZaUZzfgGR5W2yp3txFbjl2RaHnwzn1tmuvR8znycxj6Mr087nkL6HkxPHJ+g+kAoEAAAAACgCK/dl32n+eHxlpLe225G2deAZllj4zXqPfEwb2257G/dc+TTDOl0MAGZa4HhsuLo+9mBidtz3SX1Z/H8c2eDAABDyetkhUzb63f8zjyShNfM9rG2ycDOWIj8fqprhsXTgGZa4UML1PTRg3yORg34+Nz/AElHzeeEOaRjNdTqCqqAgFCoEAAAABSblfSx75T4yHxn1StrabajauuAMzyx8ZL1H6TAvbbn8b9zzwhpZnuggAzLXA8NlxVP5mBidtz/AEj9Wed0ObPBrwCLfI+ad70smX/rbbjIe6R7WY+OPyyMHV8+nndU2E2LqgDNNcbwuHEU/wCpIwsT4tB0r9SOd9Ll/wC+sxmt3qFAKBAAFCgRAKTzVEI8B9bHvlPjIfGWktbTbUbZ14BmeWPjJeo/SYF7bc/jfueeENLM+XQQAZjrgeGviofMwMRtud6S+rPO6HOHgwVAi3yT4JTvetkjTzr63WiWdzYSfyPeztx+WVgYzvRzuqa4bB04BmWuFPG9w0UYRf8A6fzMHE7Tn+k5zuZc+EOaRjtd/wCgKoVAgAAAAKBAKRX0se+U+Mh8ZaS1tNtRtnXgGZ5Y+Mug/SYF7bc/jfueeENLM90EAGZa4HhsuKp/MwMTtuf6R+rPO6HNng14ATGW4p+LU6bW8tc67dTepU3t6JPCKXscjIw0Z1aTYdF2/mTPD2lpaM50ABkuV9wql7cS3oyUFt7mbDB9aZrr+uvnyczj6om9M8+DyMcOo8mJ4QBQAAAAAAAABSK+ll32nxkPjR9Urb222o2rrgDM8sfGXQfIwL225/G/c88IaWZ7oI8ADMtcDw2XFU/mYGI23P8ASP1Z53Q5s8GvVAIRbaik220kkm5Nt4JLDfJlnrKac9TUMjtSHa0O7X21XCVTTHDajHkXvZsrVGjS6PA4fqbeW/2zdAerNfy6oXcaNKpWn92EZSe9jgtwky+LlcUxmxqtUcpTnLblOUpSfnbbx9rNVM51OTrqzzl+dPITeBUUKgQAAAAAAA0kV9LLvtP88PjLSW9ttptnXAVmeWPjJeo/SYN/bc9jZ0cVq51Q0sznQqB4Oq+S1C5qdmqSqKTjGOEXHMwWO8152eVdqmpiXsHbvbT+HYFa+Ur86H7T57PQ8O7LKrIO1/51n/2hh8JOooO7LT1dTMnrW3edSprP3M+bcqi/DRyHpTbincyrWFt2tmHrH2yEHmM9y61eVR/wlGWMItOrJPalJP7vJ78NBiYi5nGUNJ0hitL5dE86nImK1QBQqBFAgFAgFJkqFQAqIr92jSqU28ElOm23tJJS222fVJb22udvLThVDpofU2WnTxdT19vidvLThVDpofUadPE6+3xcBlVc053/AGSE4Th9j3cZxdPdWO3iYV6r42jxdVE4iapnnKHfrVy04Vb9NT+pnTU3kX7fE7e2nCqHTQ+o04Ovt8Tt7acKodND6jTg663xO3lpwq36aH1Jp08V6+3xO3tpwqh00PqNOlOvt8X4qZQWi3bmjyTTfUOsp4vmcTajxq9Hn3eWlnD7s5VnopQlu/i8D4m/RG9419I2afCc/wDfs5XVrK+vXThSXYKTxW0260l+b2bhj14iZ1Q1t/pCu7T8vVz/AIc2v8GN4a5a3VKlfQAAoEAAUKBEAAAAFAgAAAAAUCAAAFCgRAAAABQIAAAUCAAKFQiKUQChRBEAAUCAAAACgQABQqERSiAAKFQIAAAAAAAAUKgQAACeagyQKKFQIAAAAAAAAAAAAAAAAAFQEAAAAAAAAAAAAAAAAAAFCgECKFQIAAAAAAAAUKgQAAUKAQIoVAgAAAAAFAgFCgECAAABQqBFAgAATzUKgBQIBQoBAgAAAUKgQAoVAgAAAAAFQEAoAAFQIoVAgBQqBAAAA//Z" alt="Intercontinental Exchange" style="float:left;width:80px;height:80px;"><h1 style="color:#000;margin-left:4in;padding-top:10px">Virtual Assistant Chatbot</h1></div>""")
            state = gr.State(self._get_empty_state())
            chatbot = gr.Chatbot()

            with gr.Row():
                with gr.Column(scale=0.85):
                    msg = gr.Textbox(label="Question", elem_id="textbox_id")
                with gr.Column(scale=0.15):
                    file_output = gr.Textbox(label="File Status")
            with gr.Row():
                with gr.Column(scale=0.85):
                    clear = gr.ClearButton([msg, chatbot])
                with gr.Column(scale=0.15):
                    upload_button = gr.UploadButton(
                        "Browse File",
                        file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
                        file_count="multiple", variant="primary"
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    input_url = gr.Textbox(label="urls", elem_id="textbox_id")

            # Wire events: URL submit / file upload build the knowledge base;
            # message submit answers from it.
            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
            msg.submit(self._get_response, [msg, chatbot, state, upload_button], [msg, chatbot])

        demo.launch(debug=True, allowed_paths=["/content/"])
321
if __name__ == "__main__":
    # Build the app and launch the Gradio UI (blocks until the server is stopped).
    ChatDocumentQA().gradio_interface()