Spaces:
Running
Running
minor fixes
Browse files- app.py +3 -2
- utils.py +13 -2
- webchat.py +11 -24
app.py
CHANGED
@@ -10,6 +10,7 @@ watsonx_project_id = ""
|
|
10 |
api_key = ""
|
11 |
def main():
|
12 |
utils.get_credentials()
|
|
|
13 |
st.set_page_config(layout="wide", page_title="RAG Web Demo", page_icon="")
|
14 |
utils.load_css("styles.css")
|
15 |
# Streamlit app title with style
|
@@ -44,7 +45,7 @@ def main():
|
|
44 |
#collection_name = utils.create_collection_name(user_url)
|
45 |
if button_clicked and user_url:
|
46 |
# Invoke the LLM when the button is clicked
|
47 |
-
response = webchat.answer_questions_from_web(api_key, watsonx_project_id, user_url, question, collection_name)
|
48 |
st.write(response)
|
49 |
else:
|
50 |
st.warning("Please provide API Key and Project ID in the sidebar.")
|
@@ -55,7 +56,7 @@ def main():
|
|
55 |
clean_button_clicked = st.sidebar.button("Clean Memory")
|
56 |
if clean_button_clicked :
|
57 |
if collection_name: # Check if collection_name is defined and not empty
|
58 |
-
utils.clear_collection(collection_name)
|
59 |
st.sidebar.success("Memory cleared successfully!")
|
60 |
print("Memory cleared successfully!")
|
61 |
else:
|
|
|
10 |
api_key = ""
|
11 |
def main():
|
12 |
utils.get_credentials()
|
13 |
+
client=utils.chromadb_client()
|
14 |
st.set_page_config(layout="wide", page_title="RAG Web Demo", page_icon="")
|
15 |
utils.load_css("styles.css")
|
16 |
# Streamlit app title with style
|
|
|
45 |
#collection_name = utils.create_collection_name(user_url)
|
46 |
if button_clicked and user_url:
|
47 |
# Invoke the LLM when the button is clicked
|
48 |
+
response = webchat.answer_questions_from_web(api_key, watsonx_project_id, user_url, question, collection_name,client)
|
49 |
st.write(response)
|
50 |
else:
|
51 |
st.warning("Please provide API Key and Project ID in the sidebar.")
|
|
|
56 |
clean_button_clicked = st.sidebar.button("Clean Memory")
|
57 |
if clean_button_clicked :
|
58 |
if collection_name: # Check if collection_name is defined and not empty
|
59 |
+
utils.clear_collection(collection_name, client)
|
60 |
st.sidebar.success("Memory cleared successfully!")
|
61 |
print("Memory cleared successfully!")
|
62 |
else:
|
utils.py
CHANGED
@@ -19,9 +19,20 @@ def create_collection_name(url):
|
|
19 |
return domain_parts[-2] # Extracting the second-level domain
|
20 |
else:
|
21 |
return "base"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
def clear_collection(collection_name):
|
24 |
-
client = chromadb.Client()
|
25 |
try:
|
26 |
collection = client.get_collection(collection_name)
|
27 |
if collection:
|
|
|
19 |
return domain_parts[-2] # Extracting the second-level domain
|
20 |
else:
|
21 |
return "base"
|
22 |
+
|
23 |
+
def chromadb_client():
|
24 |
+
import chromadb
|
25 |
+
# Set up cache directory (consider user-defined location)
|
26 |
+
current_dir = os.getcwd()
|
27 |
+
# Replace 'my_custom_cache_path' with your desired location
|
28 |
+
custom_cache_path = os.path.join(current_dir, ".cache")
|
29 |
+
# Create settings object with custom cache path
|
30 |
+
settings = chromadb.Settings(persist_directory=custom_cache_path)
|
31 |
+
# Initialize client with custom settings
|
32 |
+
client = chromadb.Client(settings)
|
33 |
+
return client
|
34 |
|
35 |
+
def clear_collection(collection_name,client):
|
|
|
36 |
try:
|
37 |
collection = client.get_collection(collection_name)
|
38 |
if collection:
|
webchat.py
CHANGED
@@ -15,6 +15,7 @@ from bs4 import BeautifulSoup
|
|
15 |
import spacy
|
16 |
import chromadb
|
17 |
import en_core_web_md
|
|
|
18 |
|
19 |
# Important: hardcoding the API key in Python code is not a best practice. We are using
|
20 |
# this approach for the ease of demo setup. In a production application these variables
|
@@ -79,8 +80,6 @@ def get_model_test(model_type, max_tokens, min_tokens, decoding, temperature):
|
|
79 |
|
80 |
return model
|
81 |
|
82 |
-
|
83 |
-
|
84 |
# Set up cache directory (consider user-defined location)
|
85 |
current_dir = os.getcwd()
|
86 |
cache_dir = os.path.join(current_dir, ".cache")
|
@@ -95,6 +94,7 @@ model_name = 'sentence-transformers/all-MiniLM-L6-v2'
|
|
95 |
model = SentenceTransformer(model_name, cache_folder=cache_dir)
|
96 |
# Print confirmation message
|
97 |
print(f"Model '{model_name}' downloaded and loaded from cache directory: {cache_dir}")
|
|
|
98 |
# Embedding function
|
99 |
class MiniLML6V2EmbeddingFunction(EmbeddingFunction):
|
100 |
MODEL = model
|
@@ -122,7 +122,6 @@ def extract_text(url):
|
|
122 |
# Remove \xa0 (non-breaking space), which HTML uses to keep words from breaking across lines.
|
123 |
cleaned_text = raw_web_text.replace("\xa0", " ")
|
124 |
return cleaned_text
|
125 |
-
|
126 |
else:
|
127 |
print(f"Failed to retrieve the page. Status code: {response.status_code}")
|
128 |
|
@@ -137,22 +136,10 @@ def split_text_into_sentences(text):
|
|
137 |
cleaned_sentences = [s.strip() for s in sentences]
|
138 |
return cleaned_sentences
|
139 |
|
140 |
-
|
141 |
-
def create_embedding(url, collection_name):
|
142 |
-
# Set up cache directory (consider user-defined location)
|
143 |
-
current_dir = os.getcwd()
|
144 |
-
# Replace 'my_custom_cache_path' with your desired location
|
145 |
-
custom_cache_path = os.path.join(current_dir, ".cache")
|
146 |
-
# Create settings object with custom cache path
|
147 |
-
settings = chromadb.Settings(persist_directory=custom_cache_path)
|
148 |
-
|
149 |
cleaned_text = extract_text(url)
|
150 |
cleaned_sentences = split_text_into_sentences(cleaned_text)
|
151 |
-
# Initialize client with custom settings
|
152 |
-
client = chromadb.Client(settings)
|
153 |
-
|
154 |
collection = client.get_or_create_collection(collection_name)
|
155 |
-
|
156 |
# Upload text to chroma
|
157 |
collection.upsert(
|
158 |
documents=cleaned_sentences,
|
@@ -163,9 +150,9 @@ def create_embedding(url, collection_name):
|
|
163 |
return collection
|
164 |
|
165 |
|
166 |
-
def create_prompt_old(url, question, collection_name):
|
167 |
# Create embeddings for the text file
|
168 |
-
collection = create_embedding(url, collection_name)
|
169 |
|
170 |
# query relevant information
|
171 |
relevant_chunks = collection.query(
|
@@ -181,10 +168,10 @@ def create_prompt_old(url, question, collection_name):
|
|
181 |
|
182 |
return prompt
|
183 |
|
184 |
-
def create_prompt(url, question, collection_name):
|
185 |
try:
|
186 |
# Create embeddings for the text file
|
187 |
-
collection = create_embedding(url, collection_name)
|
188 |
except Exception as e:
|
189 |
return f"Error creating embeddings: {e}"
|
190 |
|
@@ -222,7 +209,7 @@ def main():
|
|
222 |
|
223 |
# Get the API key and project id and update global variables
|
224 |
get_credentials()
|
225 |
-
|
226 |
# Try different URLs and questions
|
227 |
url = "https://www.usbank.com/financialiq/manage-your-household/buy-a-car/own-electric-vehicles-learned-buying-driving-EVs.html"
|
228 |
|
@@ -231,10 +218,10 @@ def main():
|
|
231 |
# question = "Can an EV be plugged in to a household outlet?"
|
232 |
collection_name = "test_web_RAG"
|
233 |
|
234 |
-
answer_questions_from_web(api_key, watsonx_project_id, url, question, collection_name)
|
235 |
|
236 |
|
237 |
-
def answer_questions_from_web(request_api_key, request_project_id, url, question, collection_name):
|
238 |
# Update the global variable
|
239 |
globals()["api_key"] = request_api_key
|
240 |
globals()["watsonx_project_id"] = request_project_id
|
@@ -253,7 +240,7 @@ def answer_questions_from_web(request_api_key, request_project_id, url, question
|
|
253 |
model = get_model(model_type, max_tokens, min_tokens, decoding, temperature, top_k, top_p)
|
254 |
|
255 |
# Get the prompt
|
256 |
-
complete_prompt = create_prompt(url, question, collection_name)
|
257 |
|
258 |
# Let's review the prompt
|
259 |
print("----------------------------------------------------------------------------------------------------")
|
|
|
15 |
import spacy
|
16 |
import chromadb
|
17 |
import en_core_web_md
|
18 |
+
from utils import chromadb_client
|
19 |
|
20 |
# Important: hardcoding the API key in Python code is not a best practice. We are using
|
21 |
# this approach for the ease of demo setup. In a production application these variables
|
|
|
80 |
|
81 |
return model
|
82 |
|
|
|
|
|
83 |
# Set up cache directory (consider user-defined location)
|
84 |
current_dir = os.getcwd()
|
85 |
cache_dir = os.path.join(current_dir, ".cache")
|
|
|
94 |
model = SentenceTransformer(model_name, cache_folder=cache_dir)
|
95 |
# Print confirmation message
|
96 |
print(f"Model '{model_name}' downloaded and loaded from cache directory: {cache_dir}")
|
97 |
+
|
98 |
# Embedding function
|
99 |
class MiniLML6V2EmbeddingFunction(EmbeddingFunction):
|
100 |
MODEL = model
|
|
|
122 |
# Remove \xa0 (non-breaking space), which HTML uses to keep words from breaking across lines.
|
123 |
cleaned_text = raw_web_text.replace("\xa0", " ")
|
124 |
return cleaned_text
|
|
|
125 |
else:
|
126 |
print(f"Failed to retrieve the page. Status code: {response.status_code}")
|
127 |
|
|
|
136 |
cleaned_sentences = [s.strip() for s in sentences]
|
137 |
return cleaned_sentences
|
138 |
|
139 |
+
def create_embedding(url, collection_name,client):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
cleaned_text = extract_text(url)
|
141 |
cleaned_sentences = split_text_into_sentences(cleaned_text)
|
|
|
|
|
|
|
142 |
collection = client.get_or_create_collection(collection_name)
|
|
|
143 |
# Upload text to chroma
|
144 |
collection.upsert(
|
145 |
documents=cleaned_sentences,
|
|
|
150 |
return collection
|
151 |
|
152 |
|
153 |
+
def create_prompt_old(url, question, collection_name, client):
|
154 |
# Create embeddings for the text file
|
155 |
+
collection = create_embedding(url, collection_name, client)
|
156 |
|
157 |
# query relevant information
|
158 |
relevant_chunks = collection.query(
|
|
|
168 |
|
169 |
return prompt
|
170 |
|
171 |
+
def create_prompt(url, question, collection_name,client):
|
172 |
try:
|
173 |
# Create embeddings for the text file
|
174 |
+
collection = create_embedding(url, collection_name,client)
|
175 |
except Exception as e:
|
176 |
return f"Error creating embeddings: {e}"
|
177 |
|
|
|
209 |
|
210 |
# Get the API key and project id and update global variables
|
211 |
get_credentials()
|
212 |
+
client=chromadb_client()
|
213 |
# Try different URLs and questions
|
214 |
url = "https://www.usbank.com/financialiq/manage-your-household/buy-a-car/own-electric-vehicles-learned-buying-driving-EVs.html"
|
215 |
|
|
|
218 |
# question = "Can an EV be plugged in to a household outlet?"
|
219 |
collection_name = "test_web_RAG"
|
220 |
|
221 |
+
answer_questions_from_web(api_key, watsonx_project_id, url, question, collection_name,client)
|
222 |
|
223 |
|
224 |
+
def answer_questions_from_web(request_api_key, request_project_id, url, question, collection_name,client):
|
225 |
# Update the global variable
|
226 |
globals()["api_key"] = request_api_key
|
227 |
globals()["watsonx_project_id"] = request_project_id
|
|
|
240 |
model = get_model(model_type, max_tokens, min_tokens, decoding, temperature, top_k, top_p)
|
241 |
|
242 |
# Get the prompt
|
243 |
+
complete_prompt = create_prompt(url, question, collection_name,client)
|
244 |
|
245 |
# Let's review the prompt
|
246 |
print("----------------------------------------------------------------------------------------------------")
|