poemsforaphrodite committed on
Commit
f5c431c
·
verified ·
1 Parent(s): 52c4f09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -68
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import os
2
  import streamlit as st
3
  from openai import OpenAI
4
- from PyPDF2 import PdfReader
5
  import requests
6
- from youtube_transcript_api import YouTubeTranscriptApi
7
- from urllib.parse import urlparse, parse_qs
8
  from pinecone import Pinecone
9
  import uuid
10
  from dotenv import load_dotenv
@@ -21,27 +18,22 @@ from pymongo import MongoClient
21
  from pymongo.errors import ConnectionFailure
22
  from datetime import datetime
23
 
24
- # Set page config at the very beginning
25
  st.set_page_config(layout="wide")
26
 
27
- # Load environment variables
28
  load_dotenv()
29
 
30
- # Set up OpenAI client
31
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
32
 
33
- # Set up Pinecone
34
  pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
35
 
36
- index_name = "lyca" # Your index name
37
  index = pc.Index(index_name)
38
 
39
- # Set up MongoDB connection
40
  mongo_uri = os.getenv("MONGODB_URI")
41
 
42
  try:
43
  client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
44
- client.server_info() # This will raise an exception if the connection fails
45
  db = client['lyca']
46
  sim_swap_collection = db['sim_swap_requests']
47
  except ConnectionFailure:
@@ -52,47 +44,30 @@ def get_embedding(text):
52
  response = client.embeddings.create(input=text, model="text-embedding-3-large")
53
  return response.data[0].embedding
54
 
55
def process_pdf(file):
    """Extract the plain text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by PyPDF2's
            ``PdfReader``.

    Returns:
        str: The text of all pages concatenated, with a newline after
        each page (matching the original per-page ``+ "\n"`` layout).
    """
    reader = PdfReader(file)
    # extract_text() may return None for image-only/scanned pages;
    # coalesce to "" so the join cannot raise TypeError. join() also
    # avoids the quadratic cost of repeated string concatenation.
    return "\n".join((page.extract_text() or "") for page in reader.pages) + "\n"
61
-
62
  def process_web_link(url):
63
  try:
64
- # Set up Selenium options
65
  chrome_options = Options()
66
- chrome_options.add_argument("--headless") # Run in headless mode for performance
67
  chrome_options.add_argument("--no-sandbox")
68
  chrome_options.add_argument("--disable-dev-shm-usage")
69
 
70
- # Install the Chrome driver automatically using webdriver-manager
71
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
72
 
73
- # Navigate to the URL
74
  driver.get(url)
75
 
76
- # Give the page some time to load fully
77
  time.sleep(3)
78
 
79
- # Extract the rendered page's content
80
  page_source = driver.page_source
81
 
82
- # Close the browser after extracting content
83
  driver.quit()
84
 
85
- # Parse the page content using BeautifulSoup
86
  soup = BeautifulSoup(page_source, 'lxml')
87
 
88
- # Remove script and style elements
89
  for script in soup(["script", "style"]):
90
  script.decompose()
91
 
92
- # Get text
93
  text = soup.get_text()
94
 
95
- # Clean up the text
96
  lines = (line.strip() for line in text.splitlines())
97
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
98
  text = '\n'.join(chunk for chunk in chunks if chunk)
@@ -102,38 +77,14 @@ def process_web_link(url):
102
  print(f"Error processing web link {url}: {str(e)}")
103
  return f"Error processing {url}: {str(e)}"
104
 
105
def process_youtube_link(url):
    """Fetch and flatten the transcript of a YouTube video.

    Args:
        url (str): Any common YouTube URL form (watch, youtu.be, embed, /v/).

    Returns:
        str: The full transcript joined into one space-separated string.

    Raises:
        ValueError: If no video id can be extracted from *url*.
    """
    video_id = extract_video_id(url)
    # Fail fast with a clear message instead of handing None to the
    # transcript API, which would otherwise surface as an opaque API error.
    if video_id is None:
        raise ValueError(f"Could not extract a video id from URL: {url}")
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(entry['text'] for entry in transcript)
109
-
110
def extract_video_id(url):
    """Return the YouTube video id embedded in *url*, or None.

    Supports the four common URL shapes:
      - ``https://youtu.be/<id>``
      - ``https://(www.)youtube.com/watch?v=<id>``
      - ``https://(www.)youtube.com/embed/<id>``
      - ``https://(www.)youtube.com/v/<id>``

    Args:
        url (str): The URL to inspect.

    Returns:
        str | None: The video id, or None when none can be found.
    """
    parsed_url = urlparse(url)
    if parsed_url.hostname == 'youtu.be':
        return parsed_url.path[1:]
    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
        if parsed_url.path == '/watch':
            # .get() keeps a /watch URL without a ?v= parameter from
            # raising KeyError; return None per the function's contract.
            return parse_qs(parsed_url.query).get('v', [None])[0]
        if parsed_url.path.startswith('/embed/'):
            return parsed_url.path.split('/')[2]
        if parsed_url.path.startswith('/v/'):
            return parsed_url.path.split('/')[2]
    return None
122
-
123
  def process_upload(upload_type, file_or_link, file_name=None):
124
  print(f"Starting process_upload for {upload_type}")
125
  doc_id = str(uuid.uuid4())
126
  print(f"Generated doc_id: {doc_id}")
127
 
128
- if upload_type == "PDF":
129
- content = process_pdf(file_or_link)
130
- doc_name = file_name or "Uploaded PDF"
131
- elif upload_type == "Web Link":
132
  content = process_web_link(file_or_link)
133
  doc_name = file_or_link
134
- elif upload_type == "YouTube Link":
135
- content = process_youtube_link(file_or_link)
136
- doc_name = f"YouTube: {file_or_link}"
137
  else:
138
  print("Invalid upload type")
139
  return "Invalid upload type"
@@ -141,7 +92,6 @@ def process_upload(upload_type, file_or_link, file_name=None):
141
  content_length = len(content)
142
  print(f"Content extracted, length: {content_length}")
143
 
144
- # Dynamically adjust chunk size based on content length
145
  if content_length < 10000:
146
  chunk_size = 1000
147
  elif content_length < 100000:
@@ -158,7 +108,6 @@ def process_upload(upload_type, file_or_link, file_name=None):
158
 
159
  for future in as_completed(futures):
160
  vectors.append(future.result())
161
- # Update progress
162
  progress = len(vectors) / len(chunks)
163
  st.session_state.upload_progress.progress(progress)
164
 
@@ -186,14 +135,12 @@ def get_relevant_context(query, top_k=5):
186
  search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
187
  print(f"Found {len(search_results['matches'])} relevant results")
188
 
189
- # Sort results by doc_id and chunk_index to maintain document structure
190
  sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
191
 
192
  context = "\n".join([result['metadata']['text'] for result in sorted_results])
193
  return context, sorted_results
194
 
195
def check_lyca_data_loaded():
    """Return True when the Pinecone index already holds at least one vector."""
    index_stats = index.describe_index_stats()
    vector_count = index_stats['total_vector_count']
    return vector_count > 0
199
 
@@ -218,12 +165,10 @@ def general_conversation(message):
218
 
219
def is_sim_swap_request(message):
    """Heuristically decide whether *message* asks for a SIM swap.

    A message counts as a SIM-swap request when at least two trigger
    keywords appear in it (case-insensitive; trailing '?' ignored).

    Args:
        message (str): The raw user message.

    Returns:
        bool: True when two or more trigger keywords are present.
    """
    keywords = {'sim', 'swap', 'change', 'new', 'replace'}
    # Strip trailing question marks so "my sim?" still matches "sim".
    words = set(message.rstrip('?').lower().split())
    return len(words & keywords) >= 2
225
 
226
- # Add a print statement for debugging
227
  print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
228
 
229
  def trigger_sim_swap_workflow():
@@ -294,7 +239,6 @@ def chat_with_ai(message):
294
  for result in results
295
  ]
296
  else:
297
- # Fallback to general conversation if no relevant context is found or similarity is low
298
  ai_response = general_conversation(message)
299
  sources = []
300
 
@@ -309,7 +253,6 @@ def clear_database():
309
  print("Database cleared")
310
  return "Database cleared successfully."
311
 
312
- # Streamlit UI
313
  st.title("Lyca Mobile Assistant")
314
 
315
  if 'workflow' not in st.session_state:
@@ -320,7 +263,6 @@ if 'workflow' not in st.session_state:
320
  if 'chat_history' not in st.session_state:
321
  st.session_state.chat_history = []
322
 
323
- # Create two columns instead of three
324
  col1, col2 = st.columns([2, 1])
325
 
326
  with col1:
@@ -329,14 +271,12 @@ with col1:
329
  if st.session_state.workflow == 'sim_swap':
330
  process_sim_swap_workflow()
331
  else:
332
- # Display chat history
333
  for message in st.session_state.chat_history:
334
  st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")
335
 
336
  user_input = st.text_input("How can I assist you with Lyca Mobile today?")
337
  if st.button("Send"):
338
  if user_input:
339
- # Add debug print
340
  print(f"User input: {user_input}")
341
  is_swap_request = is_sim_swap_request(user_input)
342
  print(f"Is sim swap request: {is_swap_request}")
@@ -348,20 +288,16 @@ with col1:
348
  st.session_state.workflow = 'sim_swap'
349
  else:
350
  print("Proceeding with regular chat flow")
351
- # Existing code for non-sim-swap requests
352
  st.session_state.chat_progress = st.progress(0)
353
  response, sources = chat_with_ai(user_input)
354
  st.session_state.chat_progress.progress(1.0)
355
 
356
- # Add to chat history
357
  st.session_state.chat_history.append({"role": "user", "content": user_input})
358
  st.session_state.chat_history.append({"role": "assistant", "content": response})
359
 
360
- # Display the latest messages
361
  st.markdown("**You:** " + user_input)
362
  st.markdown("**AI:** " + response)
363
 
364
- # Store sources in session state for display in col2
365
  st.session_state.sources = sources
366
  st.session_state.chat_progress.empty()
367
  else:
 
1
  import os
2
  import streamlit as st
3
  from openai import OpenAI
 
4
  import requests
 
 
5
  from pinecone import Pinecone
6
  import uuid
7
  from dotenv import load_dotenv
 
18
  from pymongo.errors import ConnectionFailure
19
  from datetime import datetime
20
 
 
21
  st.set_page_config(layout="wide")
22
 
 
23
  load_dotenv()
24
 
 
25
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
26
 
 
27
  pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
28
 
29
+ index_name = "lyca"
30
  index = pc.Index(index_name)
31
 
 
32
  mongo_uri = os.getenv("MONGODB_URI")
33
 
34
  try:
35
  client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
36
+ client.server_info()
37
  db = client['lyca']
38
  sim_swap_collection = db['sim_swap_requests']
39
  except ConnectionFailure:
 
44
  response = client.embeddings.create(input=text, model="text-embedding-3-large")
45
  return response.data[0].embedding
46
 
 
 
 
 
 
 
 
47
  def process_web_link(url):
48
  try:
 
49
  chrome_options = Options()
50
+ chrome_options.add_argument("--headless")
51
  chrome_options.add_argument("--no-sandbox")
52
  chrome_options.add_argument("--disable-dev-shm-usage")
53
 
 
54
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
55
 
 
56
  driver.get(url)
57
 
 
58
  time.sleep(3)
59
 
 
60
  page_source = driver.page_source
61
 
 
62
  driver.quit()
63
 
 
64
  soup = BeautifulSoup(page_source, 'lxml')
65
 
 
66
  for script in soup(["script", "style"]):
67
  script.decompose()
68
 
 
69
  text = soup.get_text()
70
 
 
71
  lines = (line.strip() for line in text.splitlines())
72
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
73
  text = '\n'.join(chunk for chunk in chunks if chunk)
 
77
  print(f"Error processing web link {url}: {str(e)}")
78
  return f"Error processing {url}: {str(e)}"
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def process_upload(upload_type, file_or_link, file_name=None):
81
  print(f"Starting process_upload for {upload_type}")
82
  doc_id = str(uuid.uuid4())
83
  print(f"Generated doc_id: {doc_id}")
84
 
85
+ if upload_type == "Web Link":
 
 
 
86
  content = process_web_link(file_or_link)
87
  doc_name = file_or_link
 
 
 
88
  else:
89
  print("Invalid upload type")
90
  return "Invalid upload type"
 
92
  content_length = len(content)
93
  print(f"Content extracted, length: {content_length}")
94
 
 
95
  if content_length < 10000:
96
  chunk_size = 1000
97
  elif content_length < 100000:
 
108
 
109
  for future in as_completed(futures):
110
  vectors.append(future.result())
 
111
  progress = len(vectors) / len(chunks)
112
  st.session_state.upload_progress.progress(progress)
113
 
 
135
  search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
136
  print(f"Found {len(search_results['matches'])} relevant results")
137
 
 
138
  sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
139
 
140
  context = "\n".join([result['metadata']['text'] for result in sorted_results])
141
  return context, sorted_results
142
 
143
  def check_lyca_data_loaded():
 
144
  stats = index.describe_index_stats()
145
  return stats['total_vector_count'] > 0
146
 
 
165
 
166
  def is_sim_swap_request(message):
167
  sim_swap_keywords = {'sim', 'swap', 'change', 'new', 'replace'}
 
168
  message = message.rstrip('?')
169
  message_words = set(message.lower().split())
170
  return len(sim_swap_keywords.intersection(message_words)) >= 2
171
 
 
172
  print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
173
 
174
  def trigger_sim_swap_workflow():
 
239
  for result in results
240
  ]
241
  else:
 
242
  ai_response = general_conversation(message)
243
  sources = []
244
 
 
253
  print("Database cleared")
254
  return "Database cleared successfully."
255
 
 
256
  st.title("Lyca Mobile Assistant")
257
 
258
  if 'workflow' not in st.session_state:
 
263
  if 'chat_history' not in st.session_state:
264
  st.session_state.chat_history = []
265
 
 
266
  col1, col2 = st.columns([2, 1])
267
 
268
  with col1:
 
271
  if st.session_state.workflow == 'sim_swap':
272
  process_sim_swap_workflow()
273
  else:
 
274
  for message in st.session_state.chat_history:
275
  st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")
276
 
277
  user_input = st.text_input("How can I assist you with Lyca Mobile today?")
278
  if st.button("Send"):
279
  if user_input:
 
280
  print(f"User input: {user_input}")
281
  is_swap_request = is_sim_swap_request(user_input)
282
  print(f"Is sim swap request: {is_swap_request}")
 
288
  st.session_state.workflow = 'sim_swap'
289
  else:
290
  print("Proceeding with regular chat flow")
 
291
  st.session_state.chat_progress = st.progress(0)
292
  response, sources = chat_with_ai(user_input)
293
  st.session_state.chat_progress.progress(1.0)
294
 
 
295
  st.session_state.chat_history.append({"role": "user", "content": user_input})
296
  st.session_state.chat_history.append({"role": "assistant", "content": response})
297
 
 
298
  st.markdown("**You:** " + user_input)
299
  st.markdown("**AI:** " + response)
300
 
 
301
  st.session_state.sources = sources
302
  st.session_state.chat_progress.empty()
303
  else: