samiee2213 committed on
Commit
9dcfa9a
·
verified ·
1 Parent(s): eece8d8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -0
app.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_option_menu import option_menu
3
+ import pandas as pd
4
+ import os
5
+ from google.oauth2 import service_account
6
+ from googleapiclient.discovery import build
7
+ from streamlit_chat import message as st_message
8
+ import plotly.express as px
9
+ from langchain.schema import HumanMessage, SystemMessage, AIMessage
10
+ from langchain.chat_models import ChatOpenAI
11
+ from langchain.memory import ConversationBufferWindowMemory
12
+ from langchain.prompts import PromptTemplate
13
+ import warnings
14
+ import time
15
+ from langchain_groq import ChatGroq
16
+ import numpy as np
17
+ from dotenv import load_dotenv
18
+ import re
19
+
20
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load environment variables (from the process environment or a local .env file)
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast with an actionable message: assigning None into os.environ
    # would otherwise raise an opaque "str expected, not NoneType" TypeError.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to the environment or to a .env file."
    )
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
# No key is passed explicitly, so the client presumably reads it from the
# environment variable set above.
llm = ChatGroq(model="llama-3.1-70b-versatile")
27
+
28
# Prompt sent to the model for every entity. It is rendered with
# ``PROMPT_TEMPLATE.format(entity=..., query=...)``, so ``{entity}`` and
# ``{query}`` are the only real replacement fields. The example placeholder
# ``{{extracted_information}}`` is brace-escaped so that ``str.format`` emits
# it literally instead of raising ``KeyError: 'extracted_information'``.
PROMPT_TEMPLATE = """
You are an expert information extraction assistant designed to obtain specific details from the web and external sources.
You’ll be provided with an entity name and a query that specifies the type of information needed about that entity.
Please follow the instructions carefully and return only the most relevant, accurate information.

#### Entity Name: {entity}
#### Query: {query}

Instructions:
1. Extract the information directly related to the entity.
2. If available, include only verified, publicly accessible data.
3. Provide information in a single sentence or a short, structured response.
4. If the requested information isn’t available or verifiable, respond with "Information not available."

#### Example Output Format:
"Company: {entity} | Requested Information: {{extracted_information}}"

Begin extraction.
"""
47
+
48
# Configure the browser tab (title, icon) and use the full page width
st.set_page_config(
    page_title="DataScribe",
    page_icon=":notebook_with_decorative_cover:",
    layout="wide",
)

# Sidebar navigation: `selected` receives the label of the active menu entry
with st.sidebar:
    selected = option_menu(
        "DataScribe Menu",
        ["Home", "Upload Data", "Define Query", "Extract Information", "View & Download"],
        default_index=0,
        menu_icon="cast",
        icons=["house", "cloud-upload", "gear", "search", "table"],
    )

# Main header, shown above every page
st.title("DataScribe: AI-Powered Information Extractor")

# Seed the session-state slots for data and results so later code can index
# them directly; existing values are left untouched across reruns.
for _state_key in ("data", "results", "column_selection"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
71
+
72
# Helper function for Google Sheets API setup
def get_google_sheet_data(sheet_id, range_name):
    """Fetch a range from a Google Sheet and return it as a DataFrame.

    The first returned row is used as the column header and the remaining
    rows become the data.

    Args:
        sheet_id: Spreadsheet ID, passed to the API as ``spreadsheetId``.
        range_name: Range to read, e.g. ``Sheet1!A1:C100``.

    Returns:
        A ``pandas.DataFrame`` built from the returned values. An empty
        DataFrame is returned when the response contains no values.
    """
    credentials = service_account.Credentials.from_service_account_info(
        st.secrets["gcp_service_account"]
    )
    service = build('sheets', 'v4', credentials=credentials)
    result = (
        service.spreadsheets()
        .values()
        .get(spreadsheetId=sheet_id, range=range_name)
        .execute()
    )
    values = result.get('values', [])
    if not values:
        # Nothing came back: previously values[0] raised IndexError here.
        return pd.DataFrame()

    # Rows can come back with different lengths (the Sheets API is documented
    # to omit trailing empty cells), which makes the DataFrame constructor
    # fail when the header and the data disagree on width. Pad everything to
    # the widest row instead.
    width = max(len(row) for row in values)
    header = list(values[0])
    header += [f"Column {i + 1}" for i in range(len(header), width)]
    rows = [list(row) + [""] * (width - len(row)) for row in values[1:]]
    return pd.DataFrame(rows, columns=header)
80
+
81
# Function to write results back to Google Sheets
def update_google_sheet(sheet_id, range_name, data):
    """Write a DataFrame (header row plus data rows) into a Google Sheet.

    Args:
        sheet_id: Spreadsheet ID, passed to the API as ``spreadsheetId``.
        range_name: Target range, e.g. ``Sheet1!A1``.
        data: ``pandas.DataFrame`` whose column labels form the first row
            and whose rows follow.

    Raises:
        Whatever the Google API client raises on failure; errors are not
        handled here so the caller can report them.
    """
    credentials = service_account.Credentials.from_service_account_info(
        st.secrets["gcp_service_account"]
    )
    service = build('sheets', 'v4', credentials=credentials)

    # Missing values (NaN/NaT) would be serialized as the non-standard JSON
    # token ``NaN`` in the request body; map them to None (JSON null) first.
    cells = data.astype(object).where(data.notna(), None)
    body = {
        'values': [data.columns.tolist()] + cells.values.tolist()
    }
    service.spreadsheets().values().update(
        spreadsheetId=sheet_id,
        range=range_name,
        valueInputOption="RAW",
        body=body,
    ).execute()
95
+
96
# Page dispatch: render the page whose label was picked in the sidebar menu.

# Home Page
if selected == "Home":
    st.markdown(
        """
        ### Welcome to DataScribe
        **DataScribe** is an AI-powered tool designed to extract structured information from the web
        based on entities in your data file. Start by uploading a CSV or Google Sheet and defining a
        custom search query.
        """
    )
    st.image("https://via.placeholder.com/1200x400.png?text=DataScribe+AI+Agent+Dashboard")  # Placeholder banner image

# Upload Data Section
elif selected == "Upload Data":
    st.header("Upload or Connect Your Data")

    # Choose between a CSV upload and a Google Sheets range
    data_source = st.radio("Choose data source:", ["CSV File", "Google Sheets"])

    if data_source == "CSV File":
        uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
        if uploaded_file:
            st.session_state["data"] = pd.read_csv(uploaded_file)
            st.write("### Preview of Uploaded Data")
            st.dataframe(st.session_state["data"].head())

    elif data_source == "Google Sheets":
        sheet_id = st.text_input("Enter Google Sheet ID")
        range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
        if st.button("Fetch Data"):
            if sheet_id and range_name:
                # Report API/credential problems in the UI instead of crashing
                # the page, mirroring the error handling of the sheet update.
                try:
                    st.session_state["data"] = get_google_sheet_data(sheet_id, range_name)
                except Exception as e:
                    st.error(f"Failed to fetch Google Sheet: {e}")
                else:
                    st.write("### Preview of Google Sheets Data")
                    st.dataframe(st.session_state["data"].head())
            else:
                st.warning("Please enter both the Google Sheet ID and range.")

# Define Query Section
elif selected == "Define Query":
    st.header("Define Your Custom Query")

    if st.session_state["data"] is not None:
        data = st.session_state["data"]
        column_selection = st.selectbox("Select the primary column for entities", options=data.columns)
        query_template = st.text_input("Define your query template", "Get me the email for {company}")
        st.session_state["query_template"] = query_template
        st.session_state["column_selection"] = column_selection  # Store column selection in session state

        st.write("### Example query preview")
        # `is not None` keeps falsy-but-valid column labels (e.g. 0) working;
        # the emptiness check avoids an IndexError from .iloc[0] on zero rows.
        if column_selection is not None and not data.empty:
            # Convert sample_entity to string to avoid replace errors
            sample_entity = str(data[column_selection].iloc[0])
            example_query = query_template.replace("{company}", sample_entity)
            st.code(example_query)
    else:
        st.warning("Please upload data first.")

# Extract Information Section with Progress Bar
elif selected == "Extract Information":
    st.header("Extract Information")

    data = st.session_state["data"]
    column_selection = st.session_state["column_selection"]
    query_template = st.session_state.get("query_template")

    # The stored column may be stale if a different dataset was uploaded
    # after the query was defined, so confirm it still exists.
    ready = (
        bool(query_template)
        and data is not None
        and column_selection is not None
        and column_selection in data.columns
    )

    if not ready:
        st.warning("Please upload data and define a query first.")
    elif len(data[column_selection]) == 0:
        # Avoids the ZeroDivisionError the progress step would otherwise hit.
        st.warning("The selected column has no rows to process.")
    else:
        st.write("Data extraction is in progress. This may take a few moments.")

        # The message history is appended to below but is not created
        # anywhere else in this script, so make sure it exists.
        if "flowmessages" not in st.session_state:
            st.session_state["flowmessages"] = []

        entities = data[column_selection]
        total = len(entities)

        # Progress bar initialization
        progress_bar = st.progress(0)

        results = []
        for done, entity in enumerate(entities, start=1):
            # Prepare the prompt for the model
            user_message = query_template.replace("{company}", str(entity))
            formatted_prompt = PROMPT_TEMPLATE.format(entity=entity, query=user_message)

            # Append user message to the flow history
            st.session_state["flowmessages"].append(HumanMessage(content=user_message))

            # Generate response from the model. The chat model returns a
            # single message object (not a list), so read `.content` directly.
            # A failure for one entity is reported and does not discard the
            # results already collected for the others.
            try:
                response = llm.invoke([SystemMessage(content=formatted_prompt)])
                result_text = response.content or "Information not available"
            except Exception as e:
                st.warning(f"Extraction failed for {entity}: {e}")
                result_text = "Information not available"

            # Collect the model's response
            results.append({"Entity": entity, "Extracted Information": result_text})

            # Update the progress bar (fraction of entities processed)
            progress_bar.progress(done / total)

        # Save and display results
        st.session_state["results"] = pd.DataFrame(results)
        st.write("### Extracted Information")
        st.dataframe(st.session_state["results"])

# View & Download Section with Google Sheets Update
elif selected == "View & Download":
    st.header("View and Download Results")

    if st.session_state["results"] is not None:
        st.write("### Extracted Data Table")
        st.dataframe(st.session_state["results"])

        # Download as CSV
        csv_data = st.session_state["results"].to_csv(index=False)
        st.download_button("Download as CSV", csv_data, "datascribe_results.csv", "text/csv")

        # Option to update Google Sheet
        sheet_id = st.text_input("Enter Google Sheet ID to update with results")
        range_name = st.text_input("Enter range (e.g., Sheet1!A1)")
        if st.button("Update Google Sheet"):
            if sheet_id and range_name:
                try:
                    update_google_sheet(sheet_id, range_name, st.session_state["results"])
                    st.success("Google Sheet updated successfully!")
                except Exception as e:
                    st.error(f"Failed to update Google Sheet: {e}")
            else:
                # Same validation as the fetch form: don't call the API with
                # blank identifiers.
                st.warning("Please enter both the Google Sheet ID and range.")
    else:
        st.warning("No data available to view or download.")