samiee2213 committed on
Commit
14a0aaa
·
verified ·
1 Parent(s): 98915c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -109
app.py CHANGED
@@ -7,6 +7,9 @@ from googleapiclient.discovery import build
7
  from streamlit_chat import message as st_message
8
  import plotly.express as px
9
  import re
 
 
 
10
  import warnings
11
  import time
12
  from langchain.schema import HumanMessage, SystemMessage, AIMessage
@@ -18,10 +21,13 @@ from langchain.agents import initialize_agent, Tool
18
  from langchain.agents import AgentType
19
  from langchain_groq import ChatGroq
20
  import numpy as np
 
21
  from dotenv import load_dotenv
22
 
23
  warnings.filterwarnings("ignore", category=DeprecationWarning)
24
-
 
 
25
  #environment
26
  load_dotenv()
27
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
@@ -56,45 +62,70 @@ agent = initialize_agent(
56
  )
57
 
58
  # Function to perform the web search and get results
59
- def perform_web_search(query):
60
- search_results = search.run(query)
61
- return search_results
62
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # Function to get LLM response for dynamic queries
 
64
  def get_llm_response(entity, query, web_results):
65
  prompt = f"""
66
  Extract relevant {query} (e.g., email, phone number) from the following web results for the entity: {entity}.
67
  Web Results: {web_results}
68
  """
69
-
70
  human_message_content = f"""
71
  Entity: {entity}
72
  Query: {query}
73
  Web Results: {web_results}
74
  """
75
-
76
- response = agent.invoke([system_message_content, human_message_content])
77
- extracted_info = response.get("output", "Information not available").strip()
78
 
79
- # Clean up irrelevant parts of the response
80
- cleaned_info = re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
81
- return cleaned_info
82
 
 
 
 
 
 
83
  # Retry logic for multiple web searches if necessary
84
  def refine_answer_with_searches(entity, query, max_retries=3):
85
  search_results = perform_web_search(query.format(entity=entity))
86
  extracted_answer = get_llm_response(entity, query, search_results)
87
-
88
- retries = 0
89
- while retries < max_retries:
90
- if len(extracted_answer.split()) <= 2 or "not available" in extracted_answer.lower():
91
- retries += 1
92
- time.sleep(2)
93
- search_results = perform_web_search(query.format(entity=entity))
94
- extracted_answer = get_llm_response(entity, query, search_results)
95
- else:
96
- break
97
-
98
  return extracted_answer, search_results
99
 
100
  # Setup Google Sheets data fetch
@@ -122,24 +153,22 @@ with st.sidebar:
122
  )
123
 
124
  if selected == "Home":
125
-
126
  st.markdown("""
127
  <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
128
- <p style="text-align:center; font-size: 18px;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
129
  """, unsafe_allow_html=True)
130
 
131
  st.markdown("""---""")
 
132
  def feature_card(title, description, icon, page):
133
  col1, col2 = st.columns([1, 4])
134
  with col1:
135
- st.markdown(f"<div style='font-size: 40px;'>{icon}</div>", unsafe_allow_html=True)
136
  with col2:
137
- if st.button(f"{title}", key=title):
138
  st.session_state.selected_page = page
139
- st.write(description)
140
 
141
-
142
-
143
  col1, col2 = st.columns([1, 1])
144
 
145
  with col1:
@@ -183,7 +212,7 @@ elif selected == "Upload Data":
183
  if data_source == "CSV Files":
184
  if "data" in st.session_state:
185
  st.success("Data uploaded successfully! Here is a preview:")
186
- st.dataframe(st.session_state["data"])
187
  else:
188
  uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
189
 
@@ -200,45 +229,69 @@ elif selected == "Upload Data":
200
  full_data = pd.concat(dfs, ignore_index=True)
201
  st.session_state["data"] = full_data
202
  st.success("Data uploaded successfully! Here is a preview:")
203
- st.dataframe(full_data)
204
  else:
205
  st.warning("No valid data found in the uploaded files.")
 
 
 
 
206
 
207
  elif data_source == "Google Sheets":
208
  sheet_id = st.text_input("Enter Google Sheet ID")
209
  range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
210
 
211
- if st.button("Fetch Data"):
212
- try:
213
- data = get_google_sheet_data(sheet_id, range_name)
214
- st.session_state["data"] = data
215
- st.write("Data fetched successfully. Here is a preview:")
216
- st.dataframe(data)
217
- except Exception as e:
218
- st.error(f"Error fetching data: {e}")
 
 
 
 
 
219
 
220
  elif selected == "Define Query":
221
  st.header("Define Your Custom Query")
222
-
223
  if "data" not in st.session_state or st.session_state["data"] is None:
224
- st.warning("Please upload data first!")
225
  else:
226
- column = st.selectbox("Select entity column", st.session_state["data"].columns)
 
 
 
 
227
 
228
- st.markdown(f"""
229
  <style>
230
  div[data-baseweb="select"] div[data-id="select"] {{
231
  background-color: #f0f8ff;
232
  }}
233
  </style>
234
  """, unsafe_allow_html=True)
235
-
236
  st.subheader("Define Fields to Extract")
237
- num_fields = st.number_input("Number of fields to extract", min_value=1, value=1, step=1)
 
 
 
 
 
 
238
 
239
  fields = []
240
  for i in range(num_fields):
241
- field = st.text_input(f"Field {i+1} name", key=f"field_{i}")
 
 
 
 
 
242
  if field:
243
  fields.append(field)
244
 
@@ -246,7 +299,8 @@ elif selected == "Define Query":
246
  st.subheader("Query Template")
247
  query_template = st.text_area(
248
  "Enter query template (Use '{entity}' to represent each entity)",
249
- value=f"Find the {', '.join(fields)} for {{entity}}"
 
250
  )
251
 
252
  if "{entity}" in query_template:
@@ -256,11 +310,15 @@ elif selected == "Define Query":
256
  st.code(example_query)
257
 
258
  if st.button("Save Query Configuration"):
259
- st.session_state["column_selection"] = column
260
- st.session_state["query_template"] = query_template
261
- st.session_state["extraction_fields"] = fields
262
- st.success("Query configuration saved!")
263
-
 
 
 
 
264
 
265
  elif selected == "Extract Information":
266
  st.header("Extract Information")
@@ -274,51 +332,41 @@ elif selected == "Extract Information":
274
  st.write("### Selected Entity Column:")
275
  st.dataframe(entities_column)
276
 
277
- st.write("Data extraction is in progress. This may take a few moments.")
278
-
279
- # Custom styled progress bar
280
- progress_bar = st.progress(0)
281
-
282
- # Custom CSS for a cute progress bar style
283
- st.markdown("""
284
- <style>
285
- .stProgress > div {
286
- background-color: #FFB6C1; /* Light pink */
287
- border-radius: 20px;
288
- height: 15px;
289
- }
290
- </style>
291
- """, unsafe_allow_html=True)
292
-
293
- try:
294
- results = []
295
- for i, selected_entity in enumerate(entities_column):
296
- user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
297
- final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
298
- results.append({
299
- "Entity": selected_entity,
300
- "Extracted Information": final_answer,
301
- "Search Results": search_results
302
- })
303
-
304
- # Update progress bar with a smooth and cute animation
305
- progress_bar.progress(int((i + 1) / len(entities_column) * 100))
306
-
307
- st.session_state["results"] = results
308
-
309
- st.write("### Extracted Information")
310
- for result in results:
311
- st.write(f"**Entity:** {result['Entity']}")
312
- st.write(f"**Extracted Information:** {result['Extracted Information']}")
313
 
314
- st.write("### Web Results:")
315
- for result in results:
316
- st.write(result["Search Results"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
- except Exception as e:
319
- st.error(f"An error occurred while extracting information: {e}")
320
  else:
321
  st.warning("Please upload your data and define the query template.")
 
322
  elif selected == "View & Download":
323
  st.header("View & Download Results")
324
 
@@ -326,27 +374,58 @@ elif selected == "View & Download":
326
  results_df = pd.DataFrame(st.session_state["results"])
327
  st.write("### Results Preview")
328
 
 
329
  st.dataframe(results_df.style.applymap(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
330
 
331
- st.download_button(
332
- label="Download all results as CSV",
333
- data=results_df.to_csv(index=False),
334
- file_name="extracted_results.csv",
335
- mime="text/csv"
336
  )
337
 
338
- st.download_button(
339
- label="Download Extracted Information as CSV",
340
- data=results_df[["Entity", "Extracted Information"]].to_csv(index=False),
341
- file_name="extracted_information.csv",
342
- mime="text/csv"
343
- )
344
 
345
  st.download_button(
346
- label="Download Web Results as CSV",
347
- data=results_df[["Entity", "Search Results"]].to_csv(index=False),
348
- file_name="web_results.csv",
349
  mime="text/csv"
350
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  else:
352
- st.warning("No results available to view. Please run the extraction process.")
 
7
  from streamlit_chat import message as st_message
8
  import plotly.express as px
9
  import re
10
+ import streamlit as st
11
+ import gspread
12
+ from google.oauth2.service_account import Credentials
13
  import warnings
14
  import time
15
  from langchain.schema import HumanMessage, SystemMessage, AIMessage
 
21
  from langchain.agents import AgentType
22
  from langchain_groq import ChatGroq
23
  import numpy as np
24
+ import gspread
25
  from dotenv import load_dotenv
26
 
27
  warnings.filterwarnings("ignore", category=DeprecationWarning)
28
+ scopes = ["https://www.googleapis.com/auth/spreadsheets"]
29
+ creds = Credentials.from_service_account_file("credentials.json", scopes=scopes)
30
+ client = gspread.authorize(creds)
31
  #environment
32
  load_dotenv()
33
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
62
  )
63
 
64
  # Function to perform the web search and get results
65
def perform_web_search(query, max_retries=3, delay=2):
    """Run a web search for *query*, retrying on failure.

    Parameters
    ----------
    query : str
        The search query string.
    max_retries : int
        Maximum number of attempts before giving up.
    delay : int or float
        Seconds to sleep between attempts.

    Returns
    -------
    str
        The raw search results, or the sentinel string "NaN" when every
        attempt failed (callers treat "NaN" as "no data").
    """
    for attempt in range(1, max_retries + 1):
        try:
            return search.run(query)
        except Exception as e:
            # Include the failure reason so the user can distinguish a
            # transient network error from a malformed query (the original
            # bound `e` but never reported it).
            st.warning(
                f"Web search failed for query '{query}': {e}. "
                f"Retrying ({attempt}/{max_retries})..."
            )
            time.sleep(delay)
    st.error(f"Failed to perform web search for query '{query}' after {max_retries} retries.")
    return "NaN"
77
def update_google_sheet(sheet_id, range_name, data):
    """Replace the contents of a Google Sheet worksheet with *data*.

    The worksheet name is taken from the part of *range_name* before the
    '!' separator. Failures are reported in the Streamlit UI rather than
    raised to the caller.

    Parameters
    ----------
    sheet_id : str
        The spreadsheet key from the sheet's URL.
    range_name : str
        An A1-style range including the worksheet name, e.g. "Sheet1!A1:C100".
    data : pandas.DataFrame
        Rows to write; the column labels become the header row.
    """
    try:
        # Authorize a fresh gspread client from the local service-account file.
        sheet_scopes = ["https://www.googleapis.com/auth/spreadsheets"]
        credentials = Credentials.from_service_account_file("credentials.json", scopes=sheet_scopes)
        gs_client = gspread.authorize(credentials)

        worksheet_name = range_name.split("!")[0]
        worksheet = gs_client.open_by_key(sheet_id).worksheet(worksheet_name)

        # Header row first, then the values, mirroring the DataFrame layout.
        payload = [data.columns.tolist()] + data.values.tolist()

        # Wipe the worksheet before writing so stale rows do not survive.
        worksheet.clear()
        worksheet.update(range_name, payload)

        st.success("Data successfully updated in the Google Sheet!")
    except Exception as e:
        st.error(f"Error updating Google Sheet: {e}")
97
  # Function to get LLM response for dynamic queries
98
+
99
def get_llm_response(entity, query, web_results):
    """Ask the agent to extract the queried fields for *entity* from web results.

    Parameters
    ----------
    entity : str
        The entity (e.g. a company or person) the query is about.
    query : str
        Description of the information to extract (e.g. "email, phone number").
    web_results : str
        Raw web-search output to mine for the answer.

    Returns
    -------
    str
        The cleaned agent output, or "NaN" when the agent call fails
        (callers treat "NaN" as "no data").
    """
    # System-style instruction telling the agent what to extract.
    # NOTE: the previous version built this string as `prompt` and never
    # used it, while `agent.invoke` referenced an undefined
    # `system_message_content` — so every call raised NameError and was
    # swallowed into "NaN". Binding the prompt to the referenced name
    # fixes that.
    system_message_content = f"""
    Extract relevant {query} (e.g., email, phone number) from the following web results for the entity: {entity}.
    Web Results: {web_results}
    """
    human_message_content = f"""
    Entity: {entity}
    Query: {query}
    Web Results: {web_results}
    """
    try:
        response = agent.invoke([system_message_content, human_message_content], handle_parsing_errors=True)
        extracted_info = response.get("output", "Information not available").strip()
        # Strip leftover agent-scratchpad markers such as "Thought:"/"Action:".
        return re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
    except Exception:
        return "NaN"
120
  # Retry logic for multiple web searches if necessary
121
def refine_answer_with_searches(entity, query, max_retries=3):
    """Search the web and extract an answer for *entity*, retrying weak answers.

    An answer is considered weak when it has two words or fewer, or when it
    contains "not available". The search/extract cycle is repeated until a
    non-weak answer is obtained or *max_retries* extra attempts are spent.
    (The previous version accepted ``max_retries`` but ignored it, retrying
    exactly once.)

    Parameters
    ----------
    entity : str
        The entity substituted into the query template.
    query : str
        A template containing an ``{entity}`` placeholder.
    max_retries : int
        Maximum number of additional search/extract attempts.

    Returns
    -------
    tuple
        ``(extracted_answer, search_results)`` — the final answer and the
        raw search output that produced it.
    """
    search_results = perform_web_search(query.format(entity=entity))
    extracted_answer = get_llm_response(entity, query, search_results)

    retries = 0
    while retries < max_retries and (
        len(extracted_answer.split()) <= 2
        or "not available" in extracted_answer.lower()
    ):
        retries += 1
        # Brief pause between attempts to avoid hammering the search backend.
        time.sleep(2)
        search_results = perform_web_search(query.format(entity=entity))
        extracted_answer = get_llm_response(entity, query, search_results)

    return extracted_answer, search_results
130
 
131
  # Setup Google Sheets data fetch
 
153
  )
154
 
155
  if selected == "Home":
 
156
  st.markdown("""
157
  <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
158
+ <p style="text-align:center; font-size: 18px; color:#333;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
159
  """, unsafe_allow_html=True)
160
 
161
  st.markdown("""---""")
162
+
163
  def feature_card(title, description, icon, page):
164
  col1, col2 = st.columns([1, 4])
165
  with col1:
166
+ st.markdown(f"<div style='font-size: 40px; text-align:center;'>{icon}</div>", unsafe_allow_html=True)
167
  with col2:
168
+ if st.button(f"{title}", key=title, help=description):
169
  st.session_state.selected_page = page
170
+ st.markdown(f"<p style='font-size: 14px; color:#555;'>{description}</p>", unsafe_allow_html=True)
171
 
 
 
172
  col1, col2 = st.columns([1, 1])
173
 
174
  with col1:
 
212
  if data_source == "CSV Files":
213
  if "data" in st.session_state:
214
  st.success("Data uploaded successfully! Here is a preview:")
215
+ st.dataframe(st.session_state["data"].head(10)) # Display only the first 10 rows for a cleaner view
216
  else:
217
  uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
218
 
 
229
  full_data = pd.concat(dfs, ignore_index=True)
230
  st.session_state["data"] = full_data
231
  st.success("Data uploaded successfully! Here is a preview:")
232
+ st.dataframe(full_data.head(10)) # Show preview of first 10 rows
233
  else:
234
  st.warning("No valid data found in the uploaded files.")
235
+
236
+ if st.button("Clear Data"):
237
+ del st.session_state["data"]
238
+ st.success("Data has been cleared!")
239
 
240
  elif data_source == "Google Sheets":
241
  sheet_id = st.text_input("Enter Google Sheet ID")
242
  range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
243
 
244
+ if sheet_id and range_name:
245
+ if st.button("Fetch Data"):
246
+ with st.spinner("Fetching data from Google Sheets..."):
247
+ try:
248
+ data = get_google_sheet_data(sheet_id, range_name)
249
+ st.session_state["data"] = data
250
+ st.success("Data fetched successfully! Here is a preview:")
251
+ st.dataframe(data.head(10)) # Show preview of first 10 rows
252
+ except Exception as e:
253
+ st.error(f"Error fetching data: {e}")
254
+ else:
255
+ st.warning("Please enter both Sheet ID and Range name before fetching data.")
256
+
257
 
258
  elif selected == "Define Query":
259
  st.header("Define Your Custom Query")
260
+
261
  if "data" not in st.session_state or st.session_state["data"] is None:
262
+ st.warning("Please upload data first! Use the 'Upload Data' section to upload your data.")
263
  else:
264
+ column = st.selectbox(
265
+ "Select entity column",
266
+ st.session_state["data"].columns,
267
+ help="Select the column that contains the entities for which you want to define queries."
268
+ )
269
 
270
+ st.markdown("""
271
  <style>
272
  div[data-baseweb="select"] div[data-id="select"] {{
273
  background-color: #f0f8ff;
274
  }}
275
  </style>
276
  """, unsafe_allow_html=True)
277
+
278
  st.subheader("Define Fields to Extract")
279
+ num_fields = st.number_input(
280
+ "Number of fields to extract",
281
+ min_value=1,
282
+ value=1,
283
+ step=1,
284
+ help="Specify how many fields you want to extract from each entity."
285
+ )
286
 
287
  fields = []
288
  for i in range(num_fields):
289
+ field = st.text_input(
290
+ f"Field {i+1} name",
291
+ key=f"field_{i}",
292
+ placeholder=f"Enter field name for {i+1}",
293
+ help="Name the field you want to extract from the entity."
294
+ )
295
  if field:
296
  fields.append(field)
297
 
 
299
  st.subheader("Query Template")
300
  query_template = st.text_area(
301
  "Enter query template (Use '{entity}' to represent each entity)",
302
+ value=f"Find the {', '.join(fields)} for {{entity}}",
303
+ help="You can use {entity} as a placeholder to represent each entity in the query."
304
  )
305
 
306
  if "{entity}" in query_template:
 
310
  st.code(example_query)
311
 
312
  if st.button("Save Query Configuration"):
313
+ if not fields:
314
+ st.error("Please define at least one field to extract.")
315
+ elif not query_template:
316
+ st.error("Please enter a query template.")
317
+ else:
318
+ st.session_state["column_selection"] = column
319
+ st.session_state["query_template"] = query_template
320
+ st.session_state["extraction_fields"] = fields
321
+ st.success("Query configuration saved successfully!")
322
 
323
  elif selected == "Extract Information":
324
  st.header("Extract Information")
 
332
  st.write("### Selected Entity Column:")
333
  st.dataframe(entities_column)
334
 
335
+ if st.button("Start Extraction"):
336
+ st.write("Data extraction is in progress. This may take a few moments.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
+ # Custom styled progress bar
339
+ progress_bar = st.progress(0)
340
+ try:
341
+ results = []
342
+ for i, selected_entity in enumerate(entities_column):
343
+ user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
344
+ final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
345
+ results.append({
346
+ "Entity": selected_entity,
347
+ "Extracted Information": final_answer,
348
+ "Search Results": search_results
349
+ })
350
+
351
+ # Update progress bar with a smooth and cute animation
352
+ progress_bar.progress(int((i + 1) / len(entities_column) * 100))
353
+
354
+ st.session_state["results"] = results
355
+
356
+ st.write("### Extracted Information")
357
+ for result in results:
358
+ st.write(f"**Entity:** {result['Entity']}")
359
+ st.write(f"**Extracted Information:** {result['Extracted Information']}")
360
+
361
+ st.write("### Web Results:")
362
+ for result in results:
363
+ st.write(result["Search Results"])
364
 
365
+ except Exception as e:
366
+ st.error(f"An error occurred while extracting information: {e}")
367
  else:
368
  st.warning("Please upload your data and define the query template.")
369
+
370
  elif selected == "View & Download":
371
  st.header("View & Download Results")
372
 
 
374
  results_df = pd.DataFrame(st.session_state["results"])
375
  st.write("### Results Preview")
376
 
377
+ # Display results with some background color for the relevant columns
378
  st.dataframe(results_df.style.applymap(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
379
 
380
+ download_option = st.selectbox(
381
+ "Select data to download:",
382
+ ["All Results", "Extracted Information", "Web Results"]
 
 
383
  )
384
 
385
+ if download_option == "All Results":
386
+ data_to_download = results_df
387
+ elif download_option == "Extracted Information":
388
+ data_to_download = results_df[["Entity", "Extracted Information"]]
389
+ elif download_option == "Web Results":
390
+ data_to_download = results_df[["Entity", "Search Results"]]
391
 
392
  st.download_button(
393
+ label=f"Download {download_option} as CSV",
394
+ data=data_to_download.to_csv(index=False),
395
+ file_name=f"{download_option.lower().replace(' ', '_')}.csv",
396
  mime="text/csv"
397
  )
398
+
399
+ # To ensure the inputs and button are persistent, store their values in session_state
400
+ if 'sheet_id' not in st.session_state:
401
+ st.session_state.sheet_id = ''
402
+ if 'range_name' not in st.session_state:
403
+ st.session_state.range_name = ''
404
+
405
+ sheet_id = st.text_input("Enter Google Sheet ID", value=st.session_state.sheet_id)
406
+ range_name = st.text_input("Enter Range (e.g., 'Sheet1!A1')", value=st.session_state.range_name)
407
+
408
+ if sheet_id and range_name:
409
+ st.session_state.sheet_id = sheet_id
410
+ st.session_state.range_name = range_name
411
+
412
+ # Define data_to_update to update the Google Sheet
413
+ data_to_update = [results_df.columns.tolist()] + results_df.values.tolist()
414
+
415
+ # Update Google Sheets button
416
+ if st.button("Update Google Sheet"):
417
+ try:
418
+ if '!' not in range_name:
419
+ st.error("Invalid range format. Please use the format 'SheetName!Range'.")
420
+ else:
421
+ sheet_name, cell_range = range_name.split('!', 1)
422
+ sheet = client.open_by_key(sheet_id).worksheet(sheet_name)
423
+ sheet.clear() # Clear the existing data before updating
424
+ sheet.update(f"{cell_range}", data_to_update) # Update the data to the specified range
425
+ st.success("Data updated in the Google Sheet!")
426
+ except Exception as e:
427
+ st.error(f"Error updating Google Sheet: {e}")
428
+ else:
429
+ st.warning("Please enter both the Sheet ID and Range name before updating.")
430
  else:
431
+ st.warning("No results available to view. Please run the extraction process.")