Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,7 @@
|
|
1 |
import os
|
2 |
import streamlit as st
|
3 |
from openai import OpenAI
|
4 |
-
from PyPDF2 import PdfReader
|
5 |
import requests
|
6 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
7 |
-
from urllib.parse import urlparse, parse_qs
|
8 |
from pinecone import Pinecone
|
9 |
import uuid
|
10 |
from dotenv import load_dotenv
|
@@ -21,27 +18,22 @@ from pymongo import MongoClient
|
|
21 |
from pymongo.errors import ConnectionFailure
|
22 |
from datetime import datetime
|
23 |
|
24 |
-
# Set page config at the very beginning
|
25 |
st.set_page_config(layout="wide")
|
26 |
|
27 |
-
# Load environment variables
|
28 |
load_dotenv()
|
29 |
|
30 |
-
# Set up OpenAI client
|
31 |
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
32 |
|
33 |
-
# Set up Pinecone
|
34 |
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
35 |
|
36 |
-
index_name = "lyca"
|
37 |
index = pc.Index(index_name)
|
38 |
|
39 |
-
# Set up MongoDB connection
|
40 |
mongo_uri = os.getenv("MONGODB_URI")
|
41 |
|
42 |
try:
|
43 |
client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
|
44 |
-
client.server_info()
|
45 |
db = client['lyca']
|
46 |
sim_swap_collection = db['sim_swap_requests']
|
47 |
except ConnectionFailure:
|
@@ -52,47 +44,30 @@ def get_embedding(text):
|
|
52 |
response = client.embeddings.create(input=text, model="text-embedding-3-large")
|
53 |
return response.data[0].embedding
|
54 |
|
55 |
-
def process_pdf(file):
|
56 |
-
reader = PdfReader(file)
|
57 |
-
text = ""
|
58 |
-
for page in reader.pages:
|
59 |
-
text += page.extract_text() + "\n"
|
60 |
-
return text
|
61 |
-
|
62 |
def process_web_link(url):
|
63 |
try:
|
64 |
-
# Set up Selenium options
|
65 |
chrome_options = Options()
|
66 |
-
chrome_options.add_argument("--headless")
|
67 |
chrome_options.add_argument("--no-sandbox")
|
68 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
69 |
|
70 |
-
# Install the Chrome driver automatically using webdriver-manager
|
71 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
|
72 |
|
73 |
-
# Navigate to the URL
|
74 |
driver.get(url)
|
75 |
|
76 |
-
# Give the page some time to load fully
|
77 |
time.sleep(3)
|
78 |
|
79 |
-
# Extract the rendered page's content
|
80 |
page_source = driver.page_source
|
81 |
|
82 |
-
# Close the browser after extracting content
|
83 |
driver.quit()
|
84 |
|
85 |
-
# Parse the page content using BeautifulSoup
|
86 |
soup = BeautifulSoup(page_source, 'lxml')
|
87 |
|
88 |
-
# Remove script and style elements
|
89 |
for script in soup(["script", "style"]):
|
90 |
script.decompose()
|
91 |
|
92 |
-
# Get text
|
93 |
text = soup.get_text()
|
94 |
|
95 |
-
# Clean up the text
|
96 |
lines = (line.strip() for line in text.splitlines())
|
97 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
98 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
@@ -102,38 +77,14 @@ def process_web_link(url):
|
|
102 |
print(f"Error processing web link {url}: {str(e)}")
|
103 |
return f"Error processing {url}: {str(e)}"
|
104 |
|
105 |
-
def process_youtube_link(url):
|
106 |
-
video_id = extract_video_id(url)
|
107 |
-
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
108 |
-
return " ".join([entry['text'] for entry in transcript])
|
109 |
-
|
110 |
-
def extract_video_id(url):
|
111 |
-
parsed_url = urlparse(url)
|
112 |
-
if parsed_url.hostname == 'youtu.be':
|
113 |
-
return parsed_url.path[1:]
|
114 |
-
if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
|
115 |
-
if parsed_url.path == '/watch':
|
116 |
-
return parse_qs(parsed_url.query)['v'][0]
|
117 |
-
if parsed_url.path[:7] == '/embed/':
|
118 |
-
return parsed_url.path.split('/')[2]
|
119 |
-
if parsed_url.path[:3] == '/v/':
|
120 |
-
return parsed_url.path.split('/')[2]
|
121 |
-
return None
|
122 |
-
|
123 |
def process_upload(upload_type, file_or_link, file_name=None):
|
124 |
print(f"Starting process_upload for {upload_type}")
|
125 |
doc_id = str(uuid.uuid4())
|
126 |
print(f"Generated doc_id: {doc_id}")
|
127 |
|
128 |
-
if upload_type == "PDF":
|
129 |
-
content = process_pdf(file_or_link)
|
130 |
-
doc_name = file_name or "Uploaded PDF"
|
131 |
-
elif upload_type == "Web Link":
|
132 |
content = process_web_link(file_or_link)
|
133 |
doc_name = file_or_link
|
134 |
-
elif upload_type == "YouTube Link":
|
135 |
-
content = process_youtube_link(file_or_link)
|
136 |
-
doc_name = f"YouTube: {file_or_link}"
|
137 |
else:
|
138 |
print("Invalid upload type")
|
139 |
return "Invalid upload type"
|
@@ -141,7 +92,6 @@ def process_upload(upload_type, file_or_link, file_name=None):
|
|
141 |
content_length = len(content)
|
142 |
print(f"Content extracted, length: {content_length}")
|
143 |
|
144 |
-
# Dynamically adjust chunk size based on content length
|
145 |
if content_length < 10000:
|
146 |
chunk_size = 1000
|
147 |
elif content_length < 100000:
|
@@ -158,7 +108,6 @@ def process_upload(upload_type, file_or_link, file_name=None):
|
|
158 |
|
159 |
for future in as_completed(futures):
|
160 |
vectors.append(future.result())
|
161 |
-
# Update progress
|
162 |
progress = len(vectors) / len(chunks)
|
163 |
st.session_state.upload_progress.progress(progress)
|
164 |
|
@@ -186,14 +135,12 @@ def get_relevant_context(query, top_k=5):
|
|
186 |
search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
|
187 |
print(f"Found {len(search_results['matches'])} relevant results")
|
188 |
|
189 |
-
# Sort results by doc_id and chunk_index to maintain document structure
|
190 |
sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
|
191 |
|
192 |
context = "\n".join([result['metadata']['text'] for result in sorted_results])
|
193 |
return context, sorted_results
|
194 |
|
195 |
def check_lyca_data_loaded():
|
196 |
-
# Check if there are any vectors in the index
|
197 |
stats = index.describe_index_stats()
|
198 |
return stats['total_vector_count'] > 0
|
199 |
|
@@ -218,12 +165,10 @@ def general_conversation(message):
|
|
218 |
|
219 |
def is_sim_swap_request(message):
|
220 |
sim_swap_keywords = {'sim', 'swap', 'change', 'new', 'replace'}
|
221 |
-
# Remove the question mark at the end if it exists
|
222 |
message = message.rstrip('?')
|
223 |
message_words = set(message.lower().split())
|
224 |
return len(sim_swap_keywords.intersection(message_words)) >= 2
|
225 |
|
226 |
-
# Add a print statement for debugging
|
227 |
print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
|
228 |
|
229 |
def trigger_sim_swap_workflow():
|
@@ -294,7 +239,6 @@ def chat_with_ai(message):
|
|
294 |
for result in results
|
295 |
]
|
296 |
else:
|
297 |
-
# Fallback to general conversation if no relevant context is found or similarity is low
|
298 |
ai_response = general_conversation(message)
|
299 |
sources = []
|
300 |
|
@@ -309,7 +253,6 @@ def clear_database():
|
|
309 |
print("Database cleared")
|
310 |
return "Database cleared successfully."
|
311 |
|
312 |
-
# Streamlit UI
|
313 |
st.title("Lyca Mobile Assistant")
|
314 |
|
315 |
if 'workflow' not in st.session_state:
|
@@ -320,7 +263,6 @@ if 'workflow' not in st.session_state:
|
|
320 |
if 'chat_history' not in st.session_state:
|
321 |
st.session_state.chat_history = []
|
322 |
|
323 |
-
# Create two columns instead of three
|
324 |
col1, col2 = st.columns([2, 1])
|
325 |
|
326 |
with col1:
|
@@ -329,14 +271,12 @@ with col1:
|
|
329 |
if st.session_state.workflow == 'sim_swap':
|
330 |
process_sim_swap_workflow()
|
331 |
else:
|
332 |
-
# Display chat history
|
333 |
for message in st.session_state.chat_history:
|
334 |
st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")
|
335 |
|
336 |
user_input = st.text_input("How can I assist you with Lyca Mobile today?")
|
337 |
if st.button("Send"):
|
338 |
if user_input:
|
339 |
-
# Add debug print
|
340 |
print(f"User input: {user_input}")
|
341 |
is_swap_request = is_sim_swap_request(user_input)
|
342 |
print(f"Is sim swap request: {is_swap_request}")
|
@@ -348,20 +288,16 @@ with col1:
|
|
348 |
st.session_state.workflow = 'sim_swap'
|
349 |
else:
|
350 |
print("Proceeding with regular chat flow")
|
351 |
-
# Existing code for non-sim-swap requests
|
352 |
st.session_state.chat_progress = st.progress(0)
|
353 |
response, sources = chat_with_ai(user_input)
|
354 |
st.session_state.chat_progress.progress(1.0)
|
355 |
|
356 |
-
# Add to chat history
|
357 |
st.session_state.chat_history.append({"role": "user", "content": user_input})
|
358 |
st.session_state.chat_history.append({"role": "assistant", "content": response})
|
359 |
|
360 |
-
# Display the latest messages
|
361 |
st.markdown("**You:** " + user_input)
|
362 |
st.markdown("**AI:** " + response)
|
363 |
|
364 |
-
# Store sources in session state for display in col2
|
365 |
st.session_state.sources = sources
|
366 |
st.session_state.chat_progress.empty()
|
367 |
else:
|
|
|
1 |
import os
|
2 |
import streamlit as st
|
3 |
from openai import OpenAI
|
|
|
4 |
import requests
|
|
|
|
|
5 |
from pinecone import Pinecone
|
6 |
import uuid
|
7 |
from dotenv import load_dotenv
|
|
|
18 |
from pymongo.errors import ConnectionFailure
|
19 |
from datetime import datetime
|
20 |
|
|
|
21 |
st.set_page_config(layout="wide")
|
22 |
|
|
|
23 |
load_dotenv()
|
24 |
|
|
|
25 |
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
26 |
|
|
|
27 |
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
28 |
|
29 |
+
index_name = "lyca"
|
30 |
index = pc.Index(index_name)
|
31 |
|
|
|
32 |
mongo_uri = os.getenv("MONGODB_URI")
|
33 |
|
34 |
try:
|
35 |
client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
|
36 |
+
client.server_info()
|
37 |
db = client['lyca']
|
38 |
sim_swap_collection = db['sim_swap_requests']
|
39 |
except ConnectionFailure:
|
|
|
44 |
response = client.embeddings.create(input=text, model="text-embedding-3-large")
|
45 |
return response.data[0].embedding
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def process_web_link(url):
|
48 |
try:
|
|
|
49 |
chrome_options = Options()
|
50 |
+
chrome_options.add_argument("--headless")
|
51 |
chrome_options.add_argument("--no-sandbox")
|
52 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
53 |
|
|
|
54 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
|
55 |
|
|
|
56 |
driver.get(url)
|
57 |
|
|
|
58 |
time.sleep(3)
|
59 |
|
|
|
60 |
page_source = driver.page_source
|
61 |
|
|
|
62 |
driver.quit()
|
63 |
|
|
|
64 |
soup = BeautifulSoup(page_source, 'lxml')
|
65 |
|
|
|
66 |
for script in soup(["script", "style"]):
|
67 |
script.decompose()
|
68 |
|
|
|
69 |
text = soup.get_text()
|
70 |
|
|
|
71 |
lines = (line.strip() for line in text.splitlines())
|
72 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
73 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
|
|
77 |
print(f"Error processing web link {url}: {str(e)}")
|
78 |
return f"Error processing {url}: {str(e)}"
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
def process_upload(upload_type, file_or_link, file_name=None):
|
81 |
print(f"Starting process_upload for {upload_type}")
|
82 |
doc_id = str(uuid.uuid4())
|
83 |
print(f"Generated doc_id: {doc_id}")
|
84 |
|
85 |
+
if upload_type == "Web Link":
|
|
|
|
|
|
|
86 |
content = process_web_link(file_or_link)
|
87 |
doc_name = file_or_link
|
|
|
|
|
|
|
88 |
else:
|
89 |
print("Invalid upload type")
|
90 |
return "Invalid upload type"
|
|
|
92 |
content_length = len(content)
|
93 |
print(f"Content extracted, length: {content_length}")
|
94 |
|
|
|
95 |
if content_length < 10000:
|
96 |
chunk_size = 1000
|
97 |
elif content_length < 100000:
|
|
|
108 |
|
109 |
for future in as_completed(futures):
|
110 |
vectors.append(future.result())
|
|
|
111 |
progress = len(vectors) / len(chunks)
|
112 |
st.session_state.upload_progress.progress(progress)
|
113 |
|
|
|
135 |
search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
|
136 |
print(f"Found {len(search_results['matches'])} relevant results")
|
137 |
|
|
|
138 |
sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
|
139 |
|
140 |
context = "\n".join([result['metadata']['text'] for result in sorted_results])
|
141 |
return context, sorted_results
|
142 |
|
143 |
def check_lyca_data_loaded():
|
|
|
144 |
stats = index.describe_index_stats()
|
145 |
return stats['total_vector_count'] > 0
|
146 |
|
|
|
165 |
|
166 |
def is_sim_swap_request(message):
|
167 |
sim_swap_keywords = {'sim', 'swap', 'change', 'new', 'replace'}
|
|
|
168 |
message = message.rstrip('?')
|
169 |
message_words = set(message.lower().split())
|
170 |
return len(sim_swap_keywords.intersection(message_words)) >= 2
|
171 |
|
|
|
172 |
print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
|
173 |
|
174 |
def trigger_sim_swap_workflow():
|
|
|
239 |
for result in results
|
240 |
]
|
241 |
else:
|
|
|
242 |
ai_response = general_conversation(message)
|
243 |
sources = []
|
244 |
|
|
|
253 |
print("Database cleared")
|
254 |
return "Database cleared successfully."
|
255 |
|
|
|
256 |
st.title("Lyca Mobile Assistant")
|
257 |
|
258 |
if 'workflow' not in st.session_state:
|
|
|
263 |
if 'chat_history' not in st.session_state:
|
264 |
st.session_state.chat_history = []
|
265 |
|
|
|
266 |
col1, col2 = st.columns([2, 1])
|
267 |
|
268 |
with col1:
|
|
|
271 |
if st.session_state.workflow == 'sim_swap':
|
272 |
process_sim_swap_workflow()
|
273 |
else:
|
|
|
274 |
for message in st.session_state.chat_history:
|
275 |
st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")
|
276 |
|
277 |
user_input = st.text_input("How can I assist you with Lyca Mobile today?")
|
278 |
if st.button("Send"):
|
279 |
if user_input:
|
|
|
280 |
print(f"User input: {user_input}")
|
281 |
is_swap_request = is_sim_swap_request(user_input)
|
282 |
print(f"Is sim swap request: {is_swap_request}")
|
|
|
288 |
st.session_state.workflow = 'sim_swap'
|
289 |
else:
|
290 |
print("Proceeding with regular chat flow")
|
|
|
291 |
st.session_state.chat_progress = st.progress(0)
|
292 |
response, sources = chat_with_ai(user_input)
|
293 |
st.session_state.chat_progress.progress(1.0)
|
294 |
|
|
|
295 |
st.session_state.chat_history.append({"role": "user", "content": user_input})
|
296 |
st.session_state.chat_history.append({"role": "assistant", "content": response})
|
297 |
|
|
|
298 |
st.markdown("**You:** " + user_input)
|
299 |
st.markdown("**AI:** " + response)
|
300 |
|
|
|
301 |
st.session_state.sources = sources
|
302 |
st.session_state.chat_progress.empty()
|
303 |
else:
|