Spaces:
Running
Running
arslan-ahmed
committed on
Commit
•
1b37f68
1
Parent(s):
c138c11
added Watsonx models
Browse files- README.md +1 -1
- app.py +94 -45
- requirements.txt +3 -1
- ttyd_consts.py +6 -4
- ttyd_functions.py +40 -20
README.md
CHANGED
@@ -29,7 +29,7 @@ You can develop and deploy your own personal chatbot (similar to https://hugging
|
|
29 |
|
30 |
docker pull arslan2k12/ttyd_base (https://hub.docker.com/r/arslan2k12/ttyd_base) <br/>
|
31 |
docker pull arslan2k12/arslanbot (https://hub.docker.com/r/arslan2k12/arslanbot)<br/>
|
32 |
-
docker run --rm -d -p 7860:7860 --env-file ./.env arslan2k12/
|
33 |
|
34 |
|
35 |
Contents of `.env` file:
|
|
|
29 |
|
30 |
docker pull arslan2k12/ttyd_base (https://hub.docker.com/r/arslan2k12/ttyd_base) <br/>
|
31 |
docker pull arslan2k12/arslanbot (https://hub.docker.com/r/arslan2k12/arslanbot)<br/>
|
32 |
+
docker run --rm -d -p 7860:7860 --env-file ./.env arslan2k12/ttyd_arslanbot
|
33 |
|
34 |
|
35 |
Contents of `.env` file:
|
app.py
CHANGED
@@ -9,6 +9,7 @@ from langchain.vectorstores import Chroma
|
|
9 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
10 |
from langchain.chains import ConversationalRetrievalChain
|
11 |
from langchain.chains import RetrievalQA
|
|
|
12 |
|
13 |
import os
|
14 |
from langchain.chat_models import ChatOpenAI
|
@@ -16,6 +17,12 @@ from langchain import OpenAI
|
|
16 |
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
|
17 |
from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
from collections import deque
|
20 |
import re
|
21 |
from bs4 import BeautifulSoup
|
@@ -31,7 +38,7 @@ from ttyd_consts import *
|
|
31 |
|
32 |
load_dotenv()
|
33 |
|
34 |
-
# select the mode
|
35 |
if (os.getenv("TTYD_MODE",'')).split('_')[0]=='personalBot':
|
36 |
mode = mode_arslan
|
37 |
gDriveUrl = (os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
|
@@ -48,8 +55,8 @@ else:
|
|
48 |
|
49 |
|
50 |
if mode.type!='userInputDocs':
|
51 |
-
# local vector store as opposed to gradio state vector store
|
52 |
-
vsDict_hard = localData_vecStore(
|
53 |
|
54 |
###############################################################################################
|
55 |
|
@@ -57,30 +64,27 @@ if mode.type!='userInputDocs':
|
|
57 |
|
58 |
###############################################################################################
|
59 |
|
60 |
-
# initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
|
61 |
-
def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
|
62 |
-
progress(0.1, waitText_initialize)
|
63 |
-
qa_chain_st = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
|
64 |
-
progress(0.5, waitText_initialize)
|
65 |
-
#generate welcome message
|
66 |
-
if mode.welcomeMsg:
|
67 |
-
welMsg = mode.welcomeMsg
|
68 |
-
else:
|
69 |
-
welMsg = qa_chain_st({'question': initialize_prompt, 'chat_history':[]})['answer']
|
70 |
-
print('Chatbot initialized at ', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
|
71 |
-
|
72 |
-
return qa_chain_st, btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
|
73 |
-
, oaiKey_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', welMsg)])
|
74 |
-
|
75 |
-
|
76 |
def setOaiApiKey(api_key):
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
try:
|
79 |
-
|
|
|
80 |
api_key_st = api_key
|
81 |
-
return
|
82 |
except Exception as e:
|
83 |
-
return
|
84 |
|
85 |
# convert user uploaded data to vectorstore
|
86 |
def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.Progress()):
|
@@ -103,8 +107,7 @@ def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.P
|
|
103 |
docs = split_docs(documents)
|
104 |
# Embeddings
|
105 |
try:
|
106 |
-
|
107 |
-
embeddings = OpenAIEmbeddings(openai_api_key=api_key_st)
|
108 |
except Exception as e:
|
109 |
return {}, str(e), *[x.update() for x in opComponents]
|
110 |
|
@@ -117,18 +120,57 @@ def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.P
|
|
117 |
progress(1, 'Data loaded')
|
118 |
return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
# just update the QA Chain, no updates to any UI
|
121 |
-
def updateQaChain(temp, k,
|
122 |
# if we are not adding data from ui, then use vsDict_hard as vectorstore
|
123 |
if vsDict_st=={} and mode.type!='userInputDocs': vsDict_st=vsDict_hard
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
# settingsUpdated = 'Settings updated:'+ ' Model=' + modelName + ', Temp=' + str(temp)+ ', k=' + str(k)
|
133 |
# gr.Info(settingsUpdated)
|
134 |
|
@@ -150,7 +192,7 @@ def updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st):
|
|
150 |
return_generated_question=True
|
151 |
)
|
152 |
|
153 |
-
return qa_chain_st
|
154 |
|
155 |
|
156 |
def respond(message, chat_history, qa_chain):
|
@@ -172,7 +214,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
|
|
172 |
# Initialize state variables - stored in this browser session - these can only be used within input or output of .click/.submit etc, not as a python var coz they are not stored in backend, only as a frontend gradio component
|
173 |
# but if you initialize it with a default value, that value will be stored in backend and accessible across all users. You can also change it with statear.value='newValue'
|
174 |
qa_state = gr.State()
|
175 |
-
api_key_state = gr.State(getPersonalBotApiKey() if mode.type=='personalBot' else
|
176 |
chromaVS_state = gr.State({})
|
177 |
|
178 |
|
@@ -183,9 +225,14 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
|
|
183 |
with gr.Row():
|
184 |
with gr.Column():
|
185 |
oaiKey_tb = gr.Textbox(label="OpenAI API Key", type='password'\
|
186 |
-
, info='You can find OpenAI API key at https://platform.openai.com/account/api-keys'
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
189 |
with gr.Row(visible=mode.uiAddDataVis):
|
190 |
upload_fb = gr.Files(scale=5, label="Upload (multiple) Files - pdf/txt/docx supported", file_types=['.doc', '.docx', 'text', '.pdf', '.csv'])
|
191 |
urls_tb = gr.Textbox(scale=5, label="Enter URLs starting with https (comma separated)"\
|
@@ -203,8 +250,6 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
|
|
203 |
with gr.Row():
|
204 |
btn = gr.Button("Send Message", interactive=False, variant="primary")
|
205 |
clear = gr.ClearButton(components=[msg, chatbot, srcDocs], value="Clear chat history")
|
206 |
-
# exp_comp = gr.Dataset(scale=0.7, samples=[['123'],['456'], ['123'],['456'],['456']], components=[msg], label='Examples (auto generated by LLM)', visible=False)
|
207 |
-
# gr.Examples(examples=exps, inputs=msg)
|
208 |
with gr.Accordion("Advance Settings - click to expand", open=False):
|
209 |
with gr.Row():
|
210 |
with gr.Column():
|
@@ -220,23 +265,27 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
|
|
220 |
|
221 |
### Setup the Gradio Event Listeners
|
222 |
|
223 |
-
# API button
|
224 |
-
oaiKey_btn_args = {'fn':setOaiApiKey, 'inputs':[oaiKey_tb], 'outputs':[oaiKey_tb, oaiKey_btn, api_key_state]}
|
225 |
oaiKey_btn.click(**oaiKey_btn_args)
|
226 |
oaiKey_tb.submit(**oaiKey_btn_args)
|
227 |
|
|
|
|
|
|
|
|
|
228 |
# Data Ingest Button
|
229 |
data_ingest_event = data_ingest_btn.click(uiData_vecStore, [upload_fb, urls_tb, api_key_state, chromaVS_state], [chromaVS_state, status_tb, data_ingest_btn, upload_fb, urls_tb])
|
230 |
|
231 |
# Adv Settings
|
232 |
-
advSet_args = {'fn':updateQaChain, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state]}
|
233 |
temp_sld.release(**advSet_args)
|
234 |
k_sld.release(**advSet_args)
|
235 |
model_dd.change(**advSet_args)
|
236 |
stdlQs_rb.change(**advSet_args)
|
237 |
|
238 |
# Initialize button
|
239 |
-
initCb_args = {'fn':initializeChatbot, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state, btn, initChatbot_btn, oaiKey_tb, tabs, chatbot]}
|
240 |
if mode.type=='personalBot':
|
241 |
demo.load(**initCb_args) # load Chatbot UI directly on startup
|
242 |
initChatbot_btn.click(**initCb_args)
|
|
|
9 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
10 |
from langchain.chains import ConversationalRetrievalChain
|
11 |
from langchain.chains import RetrievalQA
|
12 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
13 |
|
14 |
import os
|
15 |
from langchain.chat_models import ChatOpenAI
|
|
|
17 |
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
|
18 |
from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader
|
19 |
|
20 |
+
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
|
21 |
+
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
|
22 |
+
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
|
23 |
+
from ibm_watson_machine_learning.foundation_models import Model
|
24 |
+
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
|
25 |
+
|
26 |
from collections import deque
|
27 |
import re
|
28 |
from bs4 import BeautifulSoup
|
|
|
38 |
|
39 |
load_dotenv()
|
40 |
|
41 |
+
# select the mode when starting container - modes options are in ttyd_consts.py
|
42 |
if (os.getenv("TTYD_MODE",'')).split('_')[0]=='personalBot':
|
43 |
mode = mode_arslan
|
44 |
gDriveUrl = (os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
|
|
|
55 |
|
56 |
|
57 |
if mode.type!='userInputDocs':
|
58 |
+
# local vector store (as opposed to the gradio state vector store), used when the user is not uploading the docs
|
59 |
+
vsDict_hard = localData_vecStore(getPersonalBotApiKey(), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list)
|
60 |
|
61 |
###############################################################################################
|
62 |
|
|
|
64 |
|
65 |
###############################################################################################
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
def setOaiApiKey(api_key):
    """Validate an OpenAI API key and produce the matching UI updates.

    The raw key is wrapped into a credentials dict with getOaiCreds and
    checked by listing the models available to it.

    Returns a tuple with one entry per output of the OpenAI key event:
    on success the key textbox shows an acceptance message and is locked,
    the other credential widgets are made non-interactive, and the
    credentials dict is returned as the new API-key state; on failure the
    key textbox shows the error text and every other output is left as is.
    """
    # Widgets that get locked once OpenAI credentials have been accepted.
    otherCredComps = [oaiKey_btn, wxKey_tb, wxPid_tb, wxKey_btn]
    creds = getOaiCreds(api_key)
    try:
        # Listing models is only a probe to test the API key.
        openai.Model.list(api_key=creds.get('oai_key', 'Null'))
        acceptedBox = oaiKey_tb.update('API Key accepted', interactive=False, type='text')
        lockedComps = [comp.update(interactive=False) for comp in otherCredComps]
        return acceptedBox, *lockedComps, creds
    except Exception as err:
        errorBox = oaiKey_tb.update(str(err), type='text')
        untouched = [comp.update() for comp in otherCredComps + [api_key_state]]
        return errorBox, *untouched
|
76 |
+
|
77 |
+
|
78 |
+
def setWxApiKey(key, p_id):
    """Validate Watsonx credentials and produce the matching UI updates.

    The key and project ID are wrapped into a credentials dict with
    getWxCreds and checked by constructing a throwaway Model.

    Returns a tuple with one entry per output of the Watsonx credentials
    event: on success both Watsonx textboxes show an acceptance message and
    are locked, the remaining credential widgets are made non-interactive,
    and the credentials dict is returned as the new API-key state; on
    failure both Watsonx textboxes show the error text and every other
    output is left as is.
    """
    wxTextboxes = [wxKey_tb, wxPid_tb]
    # Widgets that get locked once Watsonx credentials have been accepted.
    otherCredComps = [wxKey_btn, oaiKey_tb, oaiKey_btn]
    creds = getWxCreds(key, p_id)
    try:
        # Building a model object is only a probe to test the API key.
        probe = Model(model_id=ModelTypes.FLAN_UL2, credentials=creds['credentials'], project_id=creds['project_id'])
        del probe
        acceptedBoxes = [box.update('Watsonx credentials accepted', interactive=False, type='text') for box in wxTextboxes]
        lockedComps = [comp.update(interactive=False) for comp in otherCredComps]
        return *acceptedBoxes, *lockedComps, creds
    except Exception as err:
        errorBoxes = [box.update(str(err), type='text') for box in wxTextboxes]
        untouched = [comp.update() for comp in otherCredComps + [api_key_state]]
        return *errorBoxes, *untouched
|
88 |
|
89 |
# convert user uploaded data to vectorstore
|
90 |
def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.Progress()):
|
|
|
107 |
docs = split_docs(documents)
|
108 |
# Embeddings
|
109 |
try:
|
110 |
+
embeddings = getEmbeddingFunc(api_key_st)
|
|
|
111 |
except Exception as e:
|
112 |
return {}, str(e), *[x.update() for x in opComponents]
|
113 |
|
|
|
120 |
progress(1, 'Data loaded')
|
121 |
return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
|
122 |
|
123 |
+
# initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
|
124 |
+
def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
    """Build the QA chain and switch the UI into chat-ready state.

    Delegates chain construction to updateQaChain, then obtains a welcome
    message: the mode's fixed welcomeMsg when one is set, otherwise the
    chain's answer to initialize_prompt with an empty chat history.

    Returns the QA chain, the model-dropdown update produced by
    updateQaChain, and the updates that enable the send button, disable the
    initialize button, leave the OpenAI key textbox unchanged, select the
    'cb' tab and seed the chatbot with the welcome message.
    """
    progress(0.1, waitText_initialize)
    qa_chain_st, modelDdUpdate = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
    progress(0.5, waitText_initialize)

    # Generate the welcome message (asks the LLM only when no fixed one exists).
    welMsg = (
        mode.welcomeMsg
        if mode.welcomeMsg
        else qa_chain_st({'question': initialize_prompt, 'chat_history': []})['answer']
    )

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('Chatbot initialized at ', timestamp)

    sendBtnUpdate = btn.update(interactive=True)
    initBtnUpdate = initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)
    keyBoxUpdate = oaiKey_tb.update()
    tabsUpdate = gr.Tabs.update(selected='cb')
    chatbotUpdate = chatbot.update(value=[('', welMsg)])
    return qa_chain_st, modelDdUpdate, sendBtnUpdate, initBtnUpdate, keyBoxUpdate, tabsUpdate, chatbotUpdate
|
138 |
+
|
139 |
# just update the QA Chain, no updates to any UI
|
140 |
+
def updateQaChain(temp, k, modelNameDD, stdlQs, api_key_st, vsDict_st):
|
141 |
# if we are not adding data from ui, then use vsDict_hard as vectorstore
|
142 |
if vsDict_st=={} and mode.type!='userInputDocs': vsDict_st=vsDict_hard
|
143 |
+
|
144 |
+
if api_key_st.get('service')=='openai':
|
145 |
+
if not 'openai' in modelNameDD:
|
146 |
+
modelNameDD = 'gpt-3.5-turbo (openai)' # default model for openai
|
147 |
+
modelName = modelNameDD.split('(')[0].strip()
|
148 |
+
# check if the input model is chat model or legacy model
|
149 |
+
try:
|
150 |
+
ChatOpenAI(openai_api_key=api_key_st.get('oai_key','Null'), temperature=0,model_name=modelName,max_tokens=1).predict('')
|
151 |
+
llm = ChatOpenAI(openai_api_key=api_key_st.get('oai_key','Null'), temperature=float(temp),model_name=modelName)
|
152 |
+
except:
|
153 |
+
OpenAI(openai_api_key=api_key_st.get('oai_key','Null'), temperature=0,model_name=modelName,max_tokens=1).predict('')
|
154 |
+
llm = OpenAI(openai_api_key=api_key_st.get('oai_key','Null'), temperature=float(temp),model_name=modelName)
|
155 |
+
elif api_key_st.get('service')=='watsonx':
|
156 |
+
if not 'watsonx' in modelNameDD:
|
157 |
+
modelNameDD = 'meta-llama/llama-2-70b-chat (watsonx)' # default model for watsonx
|
158 |
+
modelName = modelNameDD.split('(')[0].strip()
|
159 |
+
wxModelParams = {
|
160 |
+
GenParams.DECODING_METHOD: DecodingMethods.SAMPLE,
|
161 |
+
GenParams.MAX_NEW_TOKENS: 1000,
|
162 |
+
GenParams.MIN_NEW_TOKENS: 1,
|
163 |
+
GenParams.TEMPERATURE: float(temp),
|
164 |
+
GenParams.TOP_K: 50,
|
165 |
+
GenParams.TOP_P: 1
|
166 |
+
}
|
167 |
+
flan_ul2_model = Model(
|
168 |
+
model_id=modelName,
|
169 |
+
params=wxModelParams,
|
170 |
+
credentials=api_key_st['credentials'], project_id=api_key_st['project_id'])
|
171 |
+
llm = WatsonxLLM(model=flan_ul2_model)
|
172 |
+
else:
|
173 |
+
raise Exception('Error: Invalid or None Credentials')
|
174 |
# settingsUpdated = 'Settings updated:'+ ' Model=' + modelName + ', Temp=' + str(temp)+ ', k=' + str(k)
|
175 |
# gr.Info(settingsUpdated)
|
176 |
|
|
|
192 |
return_generated_question=True
|
193 |
)
|
194 |
|
195 |
+
return qa_chain_st, model_dd.update(value=modelNameDD)
|
196 |
|
197 |
|
198 |
def respond(message, chat_history, qa_chain):
|
|
|
214 |
# Initialize state variables - stored in this browser session - these can only be used within input or output of .click/.submit etc, not as a python var coz they are not stored in backend, only as a frontend gradio component
|
215 |
# but if you initialize it with a default value, that value will be stored in backend and accessible across all users. You can also change it with statear.value='newValue'
|
216 |
qa_state = gr.State()
|
217 |
+
api_key_state = gr.State(getPersonalBotApiKey() if mode.type=='personalBot' else {}) # can be string (OpenAI) or dict (WX)
|
218 |
chromaVS_state = gr.State({})
|
219 |
|
220 |
|
|
|
225 |
with gr.Row():
|
226 |
with gr.Column():
|
227 |
oaiKey_tb = gr.Textbox(label="OpenAI API Key", type='password'\
|
228 |
+
, info='You can find OpenAI API key at https://platform.openai.com/account/api-keys')
|
229 |
+
oaiKey_btn = gr.Button("Submit OpenAI API Key")
|
230 |
+
with gr.Column():
|
231 |
+
wxKey_tb = gr.Textbox(label="Watsonx API Key", type='password'\
|
232 |
+
, info='You can find IBM Cloud API Key at Manage > Access (IAM) > API keys on https://cloud.ibm.com/iam/overview')
|
233 |
+
wxPid_tb = gr.Textbox(label="Watsonx Project ID"\
|
234 |
+
, info='You can find Project ID at Project -> Manage -> General -> Details on https://dataplatform.cloud.ibm.com/wx/home')
|
235 |
+
wxKey_btn = gr.Button("Submit Watsonx Credentials")
|
236 |
with gr.Row(visible=mode.uiAddDataVis):
|
237 |
upload_fb = gr.Files(scale=5, label="Upload (multiple) Files - pdf/txt/docx supported", file_types=['.doc', '.docx', 'text', '.pdf', '.csv'])
|
238 |
urls_tb = gr.Textbox(scale=5, label="Enter URLs starting with https (comma separated)"\
|
|
|
250 |
with gr.Row():
|
251 |
btn = gr.Button("Send Message", interactive=False, variant="primary")
|
252 |
clear = gr.ClearButton(components=[msg, chatbot, srcDocs], value="Clear chat history")
|
|
|
|
|
253 |
with gr.Accordion("Advance Settings - click to expand", open=False):
|
254 |
with gr.Row():
|
255 |
with gr.Column():
|
|
|
265 |
|
266 |
### Setup the Gradio Event Listeners
|
267 |
|
268 |
+
# OpenAI API button
|
269 |
+
oaiKey_btn_args = {'fn':setOaiApiKey, 'inputs':[oaiKey_tb], 'outputs':[oaiKey_tb, oaiKey_btn, wxKey_tb, wxPid_tb, wxKey_btn, api_key_state]}
|
270 |
oaiKey_btn.click(**oaiKey_btn_args)
|
271 |
oaiKey_tb.submit(**oaiKey_btn_args)
|
272 |
|
273 |
+
# Watsonx Creds button
|
274 |
+
wxKey_btn_args = {'fn':setWxApiKey, 'inputs':[wxKey_tb, wxPid_tb], 'outputs':[wxKey_tb, wxPid_tb, wxKey_btn, oaiKey_tb, oaiKey_btn, api_key_state]}
|
275 |
+
wxKey_btn.click(**wxKey_btn_args)
|
276 |
+
|
277 |
# Data Ingest Button
|
278 |
data_ingest_event = data_ingest_btn.click(uiData_vecStore, [upload_fb, urls_tb, api_key_state, chromaVS_state], [chromaVS_state, status_tb, data_ingest_btn, upload_fb, urls_tb])
|
279 |
|
280 |
# Adv Settings
|
281 |
+
advSet_args = {'fn':updateQaChain, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state, model_dd]}
|
282 |
temp_sld.release(**advSet_args)
|
283 |
k_sld.release(**advSet_args)
|
284 |
model_dd.change(**advSet_args)
|
285 |
stdlQs_rb.change(**advSet_args)
|
286 |
|
287 |
# Initialize button
|
288 |
+
initCb_args = {'fn':initializeChatbot, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state, model_dd, btn, initChatbot_btn, oaiKey_tb, tabs, chatbot]}
|
289 |
if mode.type=='personalBot':
|
290 |
demo.load(**initCb_args) # load Chatbot UI directly on startup
|
291 |
initChatbot_btn.click(**initCb_args)
|
requirements.txt
CHANGED
@@ -7,4 +7,6 @@ pypdf
|
|
7 |
gradio
|
8 |
PyMuPDF
|
9 |
gdown
|
10 |
-
docx2txt
|
|
|
|
|
|
7 |
gradio
|
8 |
PyMuPDF
|
9 |
gdown
|
10 |
+
docx2txt
|
11 |
+
sentence-transformers
|
12 |
+
ibm-watson-machine-learning
|
ttyd_consts.py
CHANGED
@@ -1,10 +1,12 @@
|
|
|
|
|
|
1 |
exp_query = 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'
|
2 |
|
3 |
waitText_initialize = 'Preparing the documents, please wait...'
|
4 |
|
5 |
-
initialize_prompt =
|
6 |
If this data is about a person, mention his name instead of using pronouns. After describing the overview, you should mention top 3 example questions that the user can ask about this data.\
|
7 |
-
\n\nYour response should be short and precise. Format of your response should be Summary:\n{Description and Summary} \n\n Example Questions:\n{Example Questions}
|
8 |
|
9 |
nustian_exps = ['Tell me about NUSTIAN',
|
10 |
'Who is the NUSTIAN regional lead for Silicon Valley?',
|
@@ -24,7 +26,7 @@ stdlQs_rb_choices = ['Retrieve relavant docs using original question, send orig
|
|
24 |
|
25 |
model_dd_info = 'You can also input any OpenAI model name, compatible with /v1/completions or /v1/chat/completions endpoint. Details: https://platform.openai.com/docs/models/'
|
26 |
|
27 |
-
model_dd_choices = ['gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'gpt-4', 'text-davinci-003 (Legacy)', 'text-curie-001 (Legacy)', 'babbage-002']
|
28 |
|
29 |
url_tb_info = 'Upto 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'
|
30 |
|
@@ -33,7 +35,7 @@ url_tb_ph = 'https://example.com, https://another.com, https://anyremotedocument
|
|
33 |
|
34 |
md_title_general = """
|
35 |
## Chat with your documents and websites<br>
|
36 |
-
Step 1) Enter your OpenAI API
|
37 |
Step 2) Upload your documents and/or enter URLs, then click Load Data.<br>
|
38 |
Step 3) Once data is loaded, click Initialize Chatbot (at the bottom of the page) to start talking to your data.<br>
|
39 |
|
|
|
1 |
+
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
|
2 |
+
|
3 |
exp_query = 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'
|
4 |
|
5 |
waitText_initialize = 'Preparing the documents, please wait...'
|
6 |
|
7 |
+
initialize_prompt = """Write a short welcome message to the user. Describe the data with a comprehensive overview including short summary.\
|
8 |
If this data is about a person, mention his name instead of using pronouns. After describing the overview, you should mention top 3 example questions that the user can ask about this data.\
|
9 |
+
\n\nYour response should be short and precise. Format of your response should be Summary:\n{Description and Summary} \n\n Example Questions:\n{Example Questions}"""
|
10 |
|
11 |
nustian_exps = ['Tell me about NUSTIAN',
|
12 |
'Who is the NUSTIAN regional lead for Silicon Valley?',
|
|
|
26 |
|
27 |
model_dd_info = 'You can also input any OpenAI model name, compatible with /v1/completions or /v1/chat/completions endpoint. Details: https://platform.openai.com/docs/models/'
|
28 |
|
29 |
+
model_dd_choices = ['gpt-3.5-turbo (openai)', 'gpt-3.5-turbo-16k (openai)', 'gpt-4 (openai)', 'text-davinci-003 (Legacy - openai)', 'text-curie-001 (Legacy - openai)', 'babbage-002 (openai)'] + [model.value+' (watsonx)' for model in ModelTypes]
|
30 |
|
31 |
url_tb_info = 'Upto 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'
|
32 |
|
|
|
35 |
|
36 |
md_title_general = """
|
37 |
## Chat with your documents and websites<br>
|
38 |
+
Step 1) Enter your OpenAI API or Watsonx Credentials, and click Submit.<br>
|
39 |
Step 2) Upload your documents and/or enter URLs, then click Load Data.<br>
|
40 |
Step 3) Once data is loaded, click Initialize Chatbot (at the bottom of the page) to start talking to your data.<br>
|
41 |
|
ttyd_functions.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
|
2 |
import datetime
|
3 |
import uuid
|
|
|
4 |
from langchain.embeddings import OpenAIEmbeddings
|
5 |
from langchain.vectorstores import Chroma
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
7 |
|
8 |
import os
|
9 |
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
|
@@ -25,15 +27,31 @@ mimetypes.init()
|
|
25 |
media_files = tuple([x for x in mimetypes.types_map if mimetypes.types_map[x].split('/')[0] in ['image', 'video', 'audio']])
|
26 |
filter_strings = ['/email-protection#']
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
else:
|
35 |
-
return
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_hyperlinks(url):
|
38 |
try:
|
39 |
reqs = requests.get(url)
|
@@ -226,6 +244,18 @@ def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
|
|
226 |
src_docs = '\n'.join(([f"{i+1}) {x}" for i,x in enumerate(sorted(list(setSrc), key=str.casefold))]))
|
227 |
return src_docs, len(setSrc)
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
def getVsDict(embeddingFunc, docs, vsDict={}):
|
231 |
# create chroma client if doesnt exist
|
@@ -241,13 +271,13 @@ def getVsDict(embeddingFunc, docs, vsDict={}):
|
|
241 |
return vsDict
|
242 |
|
243 |
# used for hardcoded documents only - not uploaded by user (uiData_vecStore is a separate function)
|
244 |
-
def localData_vecStore(
|
245 |
documents = data_ingestion(inputDir, file_list, url_list)
|
246 |
if not documents:
|
247 |
-
|
248 |
docs = split_docs(documents)
|
249 |
# Embeddings
|
250 |
-
embeddings =
|
251 |
# create chroma client if doesnt exist
|
252 |
vsDict_hd = getVsDict(embeddings, docs, vsDict)
|
253 |
# get sources from metadata
|
@@ -263,13 +293,3 @@ def num_tokens_from_string(string, encoding_name = "cl100k_base"):
|
|
263 |
num_tokens = len(encoding.encode(string))
|
264 |
return num_tokens
|
265 |
|
266 |
-
def getPersonalBotApiKey():
|
267 |
-
if os.getenv("OPENAI_API_KEY"):
|
268 |
-
return os.getenv("OPENAI_API_KEY")
|
269 |
-
elif os.getenv("WX_API_KEY"):
|
270 |
-
wxCreds = {'credentials' : {"url": "https://us-south.ml.cloud.ibm.com", "apikey": os.getenv("WX_API_KEY") },
|
271 |
-
'project_id': os.getenv("WX_PROJECT_ID")
|
272 |
-
}
|
273 |
-
return wxCreds
|
274 |
-
else:
|
275 |
-
return None
|
|
|
1 |
|
2 |
import datetime
|
3 |
import uuid
|
4 |
+
import openai
|
5 |
from langchain.embeddings import OpenAIEmbeddings
|
6 |
from langchain.vectorstores import Chroma
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
9 |
|
10 |
import os
|
11 |
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
|
|
|
27 |
media_files = tuple([x for x in mimetypes.types_map if mimetypes.types_map[x].split('/')[0] in ['image', 'video', 'audio']])
|
28 |
filter_strings = ['/email-protection#']
|
29 |
|
30 |
+
def getOaiCreds(key):
    """Build the credentials dict that identifies and authenticates OpenAI.

    Args:
        key: OpenAI API key as typed by the user or read from the
            environment. May be None or empty.

    Returns:
        {'service': 'openai', 'oai_key': <key>} when a non-blank key is
        given, otherwise an empty dict (meaning "no credentials").
    """
    # Keys pasted into a textbox or read from a .env file often carry stray
    # spaces or newlines; strip them so they do not break authentication and
    # so a whitespace-only value is not mistaken for a real key.
    if isinstance(key, str):
        key = key.strip()
    if not key:
        return {}
    return {'service': 'openai', 'oai_key': key}
|
37 |
+
|
38 |
+
def getWxCreds(key, p_id, url="https://us-south.ml.cloud.ibm.com"):
    """Build the credentials dict that identifies and authenticates Watsonx.

    Args:
        key: IBM Cloud API key. May be None or empty.
        p_id: Watsonx project ID. May be None or empty.
        url: Watsonx endpoint URL. Defaults to the us-south endpoint that was
            previously hard-coded, so existing two-argument callers are
            unaffected; pass another URL for projects hosted elsewhere.

    Returns:
        {'service': 'watsonx', 'credentials': {'url': ..., 'apikey': ...},
        'project_id': ...} when both key and project ID are non-blank,
        otherwise an empty dict (meaning "no credentials").
    """
    # Strip stray whitespace from pasted/.env values so it neither breaks
    # authentication nor makes a blank value look like a real credential.
    if isinstance(key, str):
        key = key.strip()
    if isinstance(p_id, str):
        p_id = p_id.strip()
    if not (key and p_id):
        return {}
    return {
        'service': 'watsonx',
        'credentials': {"url": url, "apikey": key},
        'project_id': p_id,
    }
|
46 |
|
47 |
+
def getPersonalBotApiKey():
    """Read personal-bot credentials from environment variables.

    OPENAI_API_KEY is checked first; if it is set, OpenAI credentials are
    returned. Otherwise, Watsonx credentials are returned when both
    WX_API_KEY and WX_PROJECT_ID are set. With neither available an empty
    dict is returned.
    """
    oaiKey = os.getenv("OPENAI_API_KEY")
    if oaiKey:
        return getOaiCreds(oaiKey)

    wxKey = os.getenv("WX_API_KEY")
    wxProjectId = os.getenv("WX_PROJECT_ID")
    if wxKey and wxProjectId:
        return getWxCreds(wxKey, wxProjectId)

    return {}
|
54 |
+
|
55 |
def get_hyperlinks(url):
|
56 |
try:
|
57 |
reqs = requests.get(url)
|
|
|
244 |
src_docs = '\n'.join(([f"{i+1}) {x}" for i,x in enumerate(sorted(list(setSrc), key=str.casefold))]))
|
245 |
return src_docs, len(setSrc)
|
246 |
|
247 |
+
def getEmbeddingFunc(creds):
    """Return the embedding function matching the credentials' service.

    Args:
        creds: credentials dict carrying a 'service' entry of 'openai' or
            'watsonx' (plus 'oai_key' for OpenAI).

    Returns:
        OpenAIEmbeddings for OpenAI credentials, or a
        SentenceTransformerEmbeddings instance for Watsonx credentials.

    Raises:
        Exception: when the service is neither 'openai' nor 'watsonx'.
    """
    service = creds.get('service')

    # OpenAI key used
    if service == 'openai':
        return OpenAIEmbeddings(openai_api_key=creds.get('oai_key', 'Null'))

    # WX key used: for now an open-source model does the embedding, since
    # WX doesn't have any embedding model.
    if service == 'watsonx':
        return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    raise Exception('Error: Invalid or None Credentials')
|
259 |
|
260 |
def getVsDict(embeddingFunc, docs, vsDict={}):
|
261 |
# create chroma client if doesnt exist
|
|
|
271 |
return vsDict
|
272 |
|
273 |
# used for hardcoded documents only - not uploaded by user (uiData_vecStore is a separate function)
|
274 |
+
def localData_vecStore(embKey={}, inputDir=None, file_list=[], url_list=[], vsDict={}):
|
275 |
documents = data_ingestion(inputDir, file_list, url_list)
|
276 |
if not documents:
|
277 |
+
raise Exception('Error: No Documents Found')
|
278 |
docs = split_docs(documents)
|
279 |
# Embeddings
|
280 |
+
embeddings = getEmbeddingFunc(embKey)
|
281 |
# create chroma client if doesnt exist
|
282 |
vsDict_hd = getVsDict(embeddings, docs, vsDict)
|
283 |
# get sources from metadata
|
|
|
293 |
num_tokens = len(encoding.encode(string))
|
294 |
return num_tokens
|
295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|