samiee2213 committed on
Commit
98915c7
·
verified ·
1 Parent(s): d8796fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +286 -147
app.py CHANGED
@@ -6,59 +6,112 @@ from google.oauth2 import service_account
6
  from googleapiclient.discovery import build
7
  from streamlit_chat import message as st_message
8
  import plotly.express as px
 
 
 
9
  from langchain.schema import HumanMessage, SystemMessage, AIMessage
10
  from langchain.chat_models import ChatOpenAI
11
  from langchain.memory import ConversationBufferWindowMemory
12
  from langchain.prompts import PromptTemplate
13
- import warnings
14
- import time
 
15
  from langchain_groq import ChatGroq
16
  import numpy as np
17
  from dotenv import load_dotenv
18
- import re
19
 
20
  warnings.filterwarnings("ignore", category=DeprecationWarning)
21
 
22
- # Load environment variables
23
  load_dotenv()
24
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
25
  llm = ChatGroq(model="llama-3.1-70b-versatile")
26
 
27
- PROMPT_TEMPLATE = """
28
- You are an expert information extraction assistant designed to obtain specific details from the web and external sources.
29
- You’ll be provided with an entity name and a query that specifies the type of information needed about that entity.
30
- Please follow the instructions carefully and return only the most relevant, accurate information.
31
 
32
- #### Entity Name: {entity}
33
- #### Query: {query}
34
 
35
- Instructions:
36
- 1. Extract the information directly related to the entity.
37
- 2. If available, include only verified, publicly accessible data.
38
- 3. Provide information in a single sentence or a short, structured response.
39
- 4. If the requested information isn’t available or verifiable, respond with "Information not available."
40
 
41
- #### Example Output Format:
42
- "Company: {entity} | Requested Information: {extracted_information}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- Begin extraction.
45
- """
 
46
 
47
- # Function to get response from LLM
48
- def get_llm_response(entity, query):
49
- # Format the prompt with the entity and query
50
- prompt = PROMPT_TEMPLATE.format(entity=entity, query=query)
51
 
52
- # Request response from the LLM
53
- response = llm([SystemMessage(content=prompt)])
 
 
 
 
 
 
 
54
 
55
- # Return content or default message
56
- return response[0].content if response else "Information not available"
57
 
58
- # Streamlit app setup
 
 
 
 
 
 
 
 
 
 
 
 
59
  st.set_page_config(page_title="DataScribe", page_icon=":notebook_with_decorative_cover:", layout="wide")
60
 
61
- # Sidebar navigation
62
  with st.sidebar:
63
  selected = option_menu(
64
  "DataScribe Menu",
@@ -68,146 +121,232 @@ with st.sidebar:
68
  default_index=0
69
  )
70
 
71
- # Main header
72
- st.title("DataScribe: AI-Powered Information Extractor")
73
-
74
- # Initialize session states for data and results
75
- if "data" not in st.session_state:
76
- st.session_state["data"] = None
77
- if "results" not in st.session_state:
78
- st.session_state["results"] = None
79
- if "column_selection" not in st.session_state:
80
- st.session_state["column_selection"] = None
81
-
82
- # Helper function for Google Sheets API setup
83
- def get_google_sheet_data(sheet_id, range_name):
84
- credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"])
85
- service = build('sheets', 'v4', credentials=credentials)
86
- sheet = service.spreadsheets()
87
- result = sheet.values().get(spreadsheetId=sheet_id, range=range_name).execute()
88
- values = result.get('values', [])
89
- return pd.DataFrame(values[1:], columns=values[0])
90
-
91
- # Function to write results back to Google Sheets
92
- def update_google_sheet(sheet_id, range_name, data):
93
- credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"])
94
- service = build('sheets', 'v4', credentials=credentials)
95
- sheet = service.spreadsheets()
96
- body = {
97
- 'values': [data.columns.tolist()] + data.values.tolist()
98
- }
99
- sheet.values().update(
100
- spreadsheetId=sheet_id,
101
- range=range_name,
102
- valueInputOption="RAW",
103
- body=body
104
- ).execute()
105
-
106
- # Home Page
107
  if selected == "Home":
108
- st.markdown(
109
- """
110
- ### Welcome to DataScribe
111
- **DataScribe** is an AI-powered tool designed to extract structured information from the web
112
- based on entities in your data file. Start by uploading a CSV or Google Sheet and defining a
113
- custom search query.
114
- """
115
- )
116
- st.image("https://via.placeholder.com/1200x400.png?text=DataScribe+AI+Agent+Dashboard") # Placeholder banner image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- # Upload Data Section
119
  elif selected == "Upload Data":
120
  st.header("Upload or Connect Your Data")
121
-
122
- # CSV Upload
123
- data_source = st.radio("Choose data source:", ["CSV File", "Google Sheets"])
124
-
125
- if data_source == "CSV File":
126
- uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
127
- if uploaded_file:
128
- st.session_state["data"] = pd.read_csv(uploaded_file)
129
- st.write("### Preview of Uploaded Data")
130
- st.dataframe(st.session_state["data"].head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  elif data_source == "Google Sheets":
133
  sheet_id = st.text_input("Enter Google Sheet ID")
134
  range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
 
135
  if st.button("Fetch Data"):
136
- if sheet_id and range_name:
137
- st.session_state["data"] = get_google_sheet_data(sheet_id, range_name)
138
- st.write("### Preview of Google Sheets Data")
139
- st.dataframe(st.session_state["data"].head())
140
- else:
141
- st.warning("Please enter both the Google Sheet ID and range.")
142
-
143
- # Define Query Section
144
  elif selected == "Define Query":
145
  st.header("Define Your Custom Query")
146
-
147
- if st.session_state["data"] is not None:
148
- column_selection = st.selectbox("Select the primary column for entities", options=st.session_state["data"].columns)
149
- query_template = st.text_input("Define your query template", "Get me the email for {company}")
150
- st.session_state["query_template"] = query_template
151
- st.session_state["column_selection"] = column_selection # Store column selection in session state
152
-
153
- st.write("### Example query preview")
154
- if column_selection:
155
- # Convert sample_entity to string to avoid replace errors
156
- sample_entity = str(st.session_state["data"][column_selection].iloc[0])
157
- example_query = query_template.replace("{company}", sample_entity)
158
- st.code(example_query)
159
  else:
160
- st.warning("Please upload data first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Extract Information Section with Progress Bar
163
  elif selected == "Extract Information":
164
  st.header("Extract Information")
165
 
166
- if st.session_state.get("query_template") and st.session_state["data"] is not None and st.session_state["column_selection"] is not None:
167
- st.write("Data extraction is in progress. This may take a few moments.")
 
168
 
169
- # Progress bar initialization
170
- progress_bar = st.progress(0)
171
  column_selection = st.session_state["column_selection"]
172
- progress_step = 1.0 / len(st.session_state["data"][column_selection])
 
 
 
 
 
 
 
173
 
174
- results = []
175
- for i, entity in enumerate(st.session_state["data"][column_selection]):
176
- # Prepare the prompt for the model
177
- user_message = st.session_state["query_template"].replace("{company}", str(entity))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- # Get response and append to results
180
- result_text = get_llm_response(entity, user_message)
181
- results.append({"Entity": entity, "Extracted Information": result_text}) # Consistent key
182
 
183
- # Update the progress bar
184
- progress_bar.progress((i + 1) * progress_step)
 
 
185
 
186
- # Save and display results
187
- st.session_state["results"] = pd.DataFrame(results)
188
- st.write("### Extracted Information")
189
- st.dataframe(st.session_state["results"])
190
 
191
- # View & Download Section with Google Sheets Update
 
 
 
192
  elif selected == "View & Download":
193
- st.header("View and Download Results")
194
-
195
- if st.session_state["results"] is not None:
196
- st.write("### Extracted Data Table")
197
- st.dataframe(st.session_state["results"])
198
-
199
- # Download as CSV
200
- csv_data = st.session_state["results"].to_csv(index=False)
201
- st.download_button("Download as CSV", csv_data, "datascribe_results.csv", "text/csv")
202
-
203
- # Option to update Google Sheet
204
- sheet_id = st.text_input("Enter Google Sheet ID to update with results")
205
- range_name = st.text_input("Enter range (e.g., Sheet1!A1)")
206
- if st.button("Update Google Sheet"):
207
- try:
208
- update_google_sheet(sheet_id, range_name, st.session_state["results"])
209
- st.success("Google Sheet updated successfully!")
210
- except Exception as e:
211
- st.error(f"Failed to update Google Sheet: {e}")
 
 
 
 
 
 
 
 
 
212
  else:
213
- st.warning("No data available to view or download.")
 
6
  from googleapiclient.discovery import build
7
  from streamlit_chat import message as st_message
8
  import plotly.express as px
9
+ import re
10
+ import warnings
11
+ import time
12
  from langchain.schema import HumanMessage, SystemMessage, AIMessage
13
  from langchain.chat_models import ChatOpenAI
14
  from langchain.memory import ConversationBufferWindowMemory
15
  from langchain.prompts import PromptTemplate
16
+ from langchain_community.utilities import GoogleSerperAPIWrapper
17
+ from langchain.agents import initialize_agent, Tool
18
+ from langchain.agents import AgentType
19
  from langchain_groq import ChatGroq
20
  import numpy as np
21
  from dotenv import load_dotenv
 
22
 
23
  warnings.filterwarnings("ignore", category=DeprecationWarning)
24
 
25
+ #environment
26
  load_dotenv()
27
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
28
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
29
  llm = ChatGroq(model="llama-3.1-70b-versatile")
30
 
 
 
 
 
31
 
32
# Initialize the Google Serper API wrapper.
# The wrapper's API-key field is `serper_api_key`; the previous keyword
# `serp_api_key` did not match it, so the key read from the environment
# above was never handed to the wrapper explicitly.
search = GoogleSerperAPIWrapper(serper_api_key=SERPER_API_KEY)
34
 
35
+ # Create the system and human messages for dynamic query processing
36
+ system_message_content = """
37
+ You are a helpful assistant designed to answer questions by extracting information from the web and external sources. Your goal is to provide the most relevant, concise, and accurate response to user queries.
38
+ """
 
39
 
40
+ # Define the tool list
41
+ tools = [
42
+ Tool(
43
+ name="Web Search",
44
+ func=search.run,
45
+ description="Searches the web for information related to the query"
46
+ )
47
+ ]
48
+
49
# Initialize the agent with the tools.
# `initialize_agent` selects the agent through its `agent` parameter; the
# earlier `agent_type=` keyword is not one of its parameters, so the requested
# type was never applied and the library default was used instead.
# The default (zero-shot ReAct) is now requested explicitly, which keeps the
# behaviour the app actually had. SELF_ASK_WITH_SEARCH is not used here because
# it expects exactly one tool named "Intermediate Answer", while the tool list
# in this file registers a tool named "Web Search".
# NOTE(review): confirm against the installed LangChain version.
agent = initialize_agent(
    tools,
    ChatGroq(api_key=GROQ_API_KEY, model="llama-3.1-70b-versatile"),
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=5, return_messages=True),
)
57
+
58
+ # Function to perform the web search and get results
59
def perform_web_search(query):
    """Run `query` through the module-level `search` wrapper and return its output."""
    return search.run(query)
62
+
63
+ # Function to get LLM response for dynamic queries
64
def get_llm_response(entity, query, web_results):
    """Ask the agent to extract the requested information for one entity.

    Args:
        entity: The entity the information is about.
        query: The user's query describing what to extract.
        web_results: Search output to give the agent as context.

    Returns:
        The agent's "output" text with "Thought:" / "Action:" markers removed,
        or "Information not available" when the response has no "output" key.
    """
    # The original also built an unused `prompt` f-string here; it was never
    # sent anywhere, so it has been dropped.
    human_message_content = f"""
    Entity: {entity}
    Query: {query}
    Web Results: {web_results}
    """

    # NOTE(review): the agent is invoked with a two-element list rather than a
    # dict of inputs; confirm the installed LangChain maps this onto the
    # agent's single input as intended.
    response = agent.invoke([system_message_content, human_message_content])
    extracted_info = response.get("output", "Information not available").strip()

    # Strip leftover agent-scratchpad markers (and the punctuation/whitespace
    # that directly follows them) from the answer.
    cleaned_info = re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
    return cleaned_info
82
 
83
+ # Retry logic for multiple web searches if necessary
84
def refine_answer_with_searches(entity, query, max_retries=3):
    """Search the web and extract an answer, retrying when the answer looks empty.

    An answer is retried when it has two words or fewer, or contains
    "not available" (case-insensitive). Each retry waits two seconds, repeats
    the search, and asks the LLM again.

    Args:
        entity: The entity being looked up.
        query: Query template or already-filled query; any literal "{entity}"
            placeholder is replaced with `entity`.
        max_retries: Maximum number of retries after the first attempt.

    Returns:
        A tuple `(extracted_answer, search_results)` from the last attempt.
    """
    # Use plain replacement instead of str.format: the query is user-supplied
    # text, and any other brace in it (e.g. "{company}" or a stray "{") would
    # make str.format raise. This also matches how the caller fills templates.
    search_query = query.replace("{entity}", str(entity))

    search_results = perform_web_search(search_query)
    extracted_answer = get_llm_response(entity, query, search_results)

    for _ in range(max_retries):
        too_short = len(extracted_answer.split()) <= 2
        unavailable = "not available" in extracted_answer.lower()
        if not (too_short or unavailable):
            break
        time.sleep(2)  # brief pause before repeating the search
        search_results = perform_web_search(search_query)
        extracted_answer = get_llm_response(entity, query, search_results)

    return extracted_answer, search_results
 
99
 
100
+ # Setup Google Sheets data fetch
101
def get_google_sheet_data(sheet_id, range_name):
    """Fetch a range from a Google Sheet and return it as a DataFrame.

    Credentials are built from `st.secrets["gcp_service_account"]` with the
    read-only Sheets scope. The first returned row is used as the column
    header and the remaining rows as data.

    Args:
        sheet_id: The spreadsheet ID.
        range_name: The range to read, e.g. "Sheet1!A1:C100".

    Returns:
        A DataFrame of the fetched rows; an empty DataFrame when the range
        contains no values.
    """
    creds = service_account.Credentials.from_service_account_info(
        st.secrets["gcp_service_account"],
        scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
    )
    service = build("sheets", "v4", credentials=creds)
    sheet = service.spreadsheets()
    result = sheet.values().get(spreadsheetId=sheet_id, range=range_name).execute()
    values = result.get("values", [])

    # An empty range has no header row; indexing values[0] would raise
    # IndexError, so return an empty frame instead.
    if not values:
        return pd.DataFrame()

    header, *rows = values
    return pd.DataFrame(rows, columns=header)
111
+
112
+ #streamlitconfiguration
113
  st.set_page_config(page_title="DataScribe", page_icon=":notebook_with_decorative_cover:", layout="wide")
114
 
 
115
  with st.sidebar:
116
  selected = option_menu(
117
  "DataScribe Menu",
 
121
  default_index=0
122
  )
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  if selected == "Home":
125
+
126
+ st.markdown("""
127
+ <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
128
+ <p style="text-align:center; font-size: 18px;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
129
+ """, unsafe_allow_html=True)
130
+
131
+ st.markdown("""---""")
132
def feature_card(title, description, icon, page):
    """Render one feature card: an icon column beside a button and description.

    Clicking the button stores `page` in `st.session_state.selected_page`.
    """
    icon_col, body_col = st.columns([1, 4])

    with icon_col:
        icon_html = f"<div style='font-size: 40px;'>{icon}</div>"
        st.markdown(icon_html, unsafe_allow_html=True)

    with body_col:
        clicked = st.button(f"{title}", key=title)
        if clicked:
            st.session_state.selected_page = page
        st.write(description)
140
+
141
+
142
+
143
+ col1, col2 = st.columns([1, 1])
144
+
145
+ with col1:
146
+ feature_card(
147
+ title="Upload Data",
148
+ description="Upload data from CSV or Google Sheets to get started with your extraction.",
149
+ icon="📄",
150
+ page="Upload Data"
151
+ )
152
+
153
+ with col2:
154
+ feature_card(
155
+ title="Define Custom Queries",
156
+ description="Set custom search queries for each entity in your dataset for specific information retrieval.",
157
+ icon="🔍",
158
+ page="Define Query"
159
+ )
160
+
161
+ col1, col2 = st.columns([1, 1])
162
+
163
+ with col1:
164
+ feature_card(
165
+ title="Run Automated Searches",
166
+ description="Execute automated web searches and extract relevant information using an AI-powered agent.",
167
+ icon="🤖",
168
+ page="Extract Information"
169
+ )
170
+
171
+ with col2:
172
+ feature_card(
173
+ title="View & Download Results",
174
+ description="View extracted data in a structured format and download as a CSV or update Google Sheets.",
175
+ icon="📊",
176
+ page="View & Download"
177
+ )
178
 
 
179
  elif selected == "Upload Data":
180
  st.header("Upload or Connect Your Data")
181
+ data_source = st.radio("Choose data source:", ["CSV Files", "Google Sheets"])
182
+
183
+ if data_source == "CSV Files":
184
+ if "data" in st.session_state:
185
+ st.success("Data uploaded successfully! Here is a preview:")
186
+ st.dataframe(st.session_state["data"])
187
+ else:
188
+ uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
189
+
190
+ if uploaded_files is not None:
191
+ dfs = []
192
+ for uploaded_file in uploaded_files:
193
+ try:
194
+ df = pd.read_csv(uploaded_file)
195
+ dfs.append(df)
196
+ except Exception as e:
197
+ st.error(f"Error reading file {uploaded_file.name}: {e}")
198
+
199
+ if dfs:
200
+ full_data = pd.concat(dfs, ignore_index=True)
201
+ st.session_state["data"] = full_data
202
+ st.success("Data uploaded successfully! Here is a preview:")
203
+ st.dataframe(full_data)
204
+ else:
205
+ st.warning("No valid data found in the uploaded files.")
206
 
207
  elif data_source == "Google Sheets":
208
  sheet_id = st.text_input("Enter Google Sheet ID")
209
  range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
210
+
211
  if st.button("Fetch Data"):
212
+ try:
213
+ data = get_google_sheet_data(sheet_id, range_name)
214
+ st.session_state["data"] = data
215
+ st.write("Data fetched successfully. Here is a preview:")
216
+ st.dataframe(data)
217
+ except Exception as e:
218
+ st.error(f"Error fetching data: {e}")
219
+
220
  elif selected == "Define Query":
221
  st.header("Define Your Custom Query")
222
+
223
+ if "data" not in st.session_state or st.session_state["data"] is None:
224
+ st.warning("Please upload data first!")
 
 
 
 
 
 
 
 
 
 
225
  else:
226
+ column = st.selectbox("Select entity column", st.session_state["data"].columns)
227
+
228
+ st.markdown(f"""
229
+ <style>
230
+ div[data-baseweb="select"] div[data-id="select"] {{
231
+ background-color: #f0f8ff;
232
+ }}
233
+ </style>
234
+ """, unsafe_allow_html=True)
235
+
236
+ st.subheader("Define Fields to Extract")
237
+ num_fields = st.number_input("Number of fields to extract", min_value=1, value=1, step=1)
238
+
239
+ fields = []
240
+ for i in range(num_fields):
241
+ field = st.text_input(f"Field {i+1} name", key=f"field_{i}")
242
+ if field:
243
+ fields.append(field)
244
+
245
+ if fields:
246
+ st.subheader("Query Template")
247
+ query_template = st.text_area(
248
+ "Enter query template (Use '{entity}' to represent each entity)",
249
+ value=f"Find the {', '.join(fields)} for {{entity}}"
250
+ )
251
+
252
+ if "{entity}" in query_template:
253
+ example_entity = str(st.session_state["data"][column].iloc[0])
254
+ example_query = query_template.replace("{entity}", example_entity)
255
+ st.write("### Example Query Preview")
256
+ st.code(example_query)
257
+
258
+ if st.button("Save Query Configuration"):
259
+ st.session_state["column_selection"] = column
260
+ st.session_state["query_template"] = query_template
261
+ st.session_state["extraction_fields"] = fields
262
+ st.success("Query configuration saved!")
263
+
264
 
 
265
  elif selected == "Extract Information":
266
  st.header("Extract Information")
267
 
268
+ if "query_template" in st.session_state and "data" in st.session_state:
269
+ st.write("### Using Query Template:")
270
+ st.code(st.session_state["query_template"])
271
 
 
 
272
  column_selection = st.session_state["column_selection"]
273
+ entities_column = st.session_state["data"][column_selection]
274
+ st.write("### Selected Entity Column:")
275
+ st.dataframe(entities_column)
276
+
277
+ st.write("Data extraction is in progress. This may take a few moments.")
278
+
279
+ # Custom styled progress bar
280
+ progress_bar = st.progress(0)
281
 
282
+ # Custom CSS for a cute progress bar style
283
+ st.markdown("""
284
+ <style>
285
+ .stProgress > div {
286
+ background-color: #FFB6C1; /* Light pink */
287
+ border-radius: 20px;
288
+ height: 15px;
289
+ }
290
+ </style>
291
+ """, unsafe_allow_html=True)
292
+
293
+ try:
294
+ results = []
295
+ for i, selected_entity in enumerate(entities_column):
296
+ user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
297
+ final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
298
+ results.append({
299
+ "Entity": selected_entity,
300
+ "Extracted Information": final_answer,
301
+ "Search Results": search_results
302
+ })
303
+
304
+ # Update progress bar with a smooth and cute animation
305
+ progress_bar.progress(int((i + 1) / len(entities_column) * 100))
306
 
307
+ st.session_state["results"] = results
 
 
308
 
309
+ st.write("### Extracted Information")
310
+ for result in results:
311
+ st.write(f"**Entity:** {result['Entity']}")
312
+ st.write(f"**Extracted Information:** {result['Extracted Information']}")
313
 
314
+ st.write("### Web Results:")
315
+ for result in results:
316
+ st.write(result["Search Results"])
 
317
 
318
+ except Exception as e:
319
+ st.error(f"An error occurred while extracting information: {e}")
320
+ else:
321
+ st.warning("Please upload your data and define the query template.")
322
  elif selected == "View & Download":
323
+ st.header("View & Download Results")
324
+
325
+ if "results" in st.session_state:
326
+ results_df = pd.DataFrame(st.session_state["results"])
327
+ st.write("### Results Preview")
328
+
329
+ st.dataframe(results_df.style.applymap(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
330
+
331
+ st.download_button(
332
+ label="Download all results as CSV",
333
+ data=results_df.to_csv(index=False),
334
+ file_name="extracted_results.csv",
335
+ mime="text/csv"
336
+ )
337
+
338
+ st.download_button(
339
+ label="Download Extracted Information as CSV",
340
+ data=results_df[["Entity", "Extracted Information"]].to_csv(index=False),
341
+ file_name="extracted_information.csv",
342
+ mime="text/csv"
343
+ )
344
+
345
+ st.download_button(
346
+ label="Download Web Results as CSV",
347
+ data=results_df[["Entity", "Search Results"]].to_csv(index=False),
348
+ file_name="web_results.csv",
349
+ mime="text/csv"
350
+ )
351
  else:
352
+ st.warning("No results available to view. Please run the extraction process.")