Spaces:
Sleeping
Sleeping
import streamlit as st | |
from streamlit_option_menu import option_menu | |
import pandas as pd | |
import os | |
from google.oauth2 import service_account | |
from googleapiclient.discovery import build | |
from streamlit_chat import message as st_message | |
import plotly.express as px | |
from langchain.schema import HumanMessage, SystemMessage, AIMessage | |
from langchain.chat_models import ChatOpenAI | |
from langchain.memory import ConversationBufferWindowMemory | |
from langchain.prompts import PromptTemplate | |
import warnings | |
import time | |
from langchain_groq import ChatGroq | |
import numpy as np | |
from dotenv import load_dotenv | |
import re | |
warnings.filterwarnings("ignore", category=DeprecationWarning) | |
# Load environment variables | |
load_dotenv() | |
GROQ_API_KEY = os.getenv("GROQ_API_KEY") | |
os.environ['GROQ_API_KEY'] = GROQ_API_KEY | |
llm = ChatGroq(model="llama-3.1-70b-versatile") | |
PROMPT_TEMPLATE = """ | |
You are an expert information extraction assistant designed to obtain specific details from the web and external sources. | |
You’ll be provided with an entity name and a query that specifies the type of information needed about that entity. | |
Please follow the instructions carefully and return only the most relevant, accurate information. | |
#### Entity Name: {entity} | |
#### Query: {query} | |
Instructions: | |
1. Extract the information directly related to the entity. | |
2. If available, include only verified, publicly accessible data. | |
3. Provide information in a single sentence or a short, structured response. | |
4. If the requested information isn’t available or verifiable, respond with "Information not available." | |
#### Example Output Format: | |
"Company: {entity} | Requested Information: {extracted_information}" | |
Begin extraction. | |
""" | |
# Set up the page | |
st.set_page_config(page_title="DataScribe", page_icon=":notebook_with_decorative_cover:", layout="wide") | |
# Sidebar navigation | |
with st.sidebar: | |
selected = option_menu( | |
"DataScribe Menu", | |
["Home", "Upload Data", "Define Query", "Extract Information", "View & Download"], | |
icons=["house", "cloud-upload", "gear", "search", "table"], | |
menu_icon="cast", | |
default_index=0 | |
) | |
# Main header | |
st.title("DataScribe: AI-Powered Information Extractor") | |
# Initialize session states for data and results | |
if "data" not in st.session_state: | |
st.session_state["data"] = None | |
if "results" not in st.session_state: | |
st.session_state["results"] = None | |
if "column_selection" not in st.session_state: | |
st.session_state["column_selection"] = None | |
# Helper function for Google Sheets API setup | |
def get_google_sheet_data(sheet_id, range_name): | |
credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"]) | |
service = build('sheets', 'v4', credentials=credentials) | |
sheet = service.spreadsheets() | |
result = sheet.values().get(spreadsheetId=sheet_id, range=range_name).execute() | |
values = result.get('values', []) | |
return pd.DataFrame(values[1:], columns=values[0]) | |
# Function to write results back to Google Sheets | |
def update_google_sheet(sheet_id, range_name, data): | |
credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"]) | |
service = build('sheets', 'v4', credentials=credentials) | |
sheet = service.spreadsheets() | |
body = { | |
'values': [data.columns.tolist()] + data.values.tolist() | |
} | |
sheet.values().update( | |
spreadsheetId=sheet_id, | |
range=range_name, | |
valueInputOption="RAW", | |
body=body | |
).execute() | |
# Home Page | |
if selected == "Home": | |
st.markdown( | |
""" | |
### Welcome to DataScribe | |
**DataScribe** is an AI-powered tool designed to extract structured information from the web | |
based on entities in your data file. Start by uploading a CSV or Google Sheet and defining a | |
custom search query. | |
""" | |
) | |
st.image("https://via.placeholder.com/1200x400.png?text=DataScribe+AI+Agent+Dashboard") # Placeholder banner image | |
# Upload Data Section | |
elif selected == "Upload Data": | |
st.header("Upload or Connect Your Data") | |
# CSV Upload | |
data_source = st.radio("Choose data source:", ["CSV File", "Google Sheets"]) | |
if data_source == "CSV File": | |
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"]) | |
if uploaded_file: | |
st.session_state["data"] = pd.read_csv(uploaded_file) | |
st.write("### Preview of Uploaded Data") | |
st.dataframe(st.session_state["data"].head()) | |
elif data_source == "Google Sheets": | |
sheet_id = st.text_input("Enter Google Sheet ID") | |
range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)") | |
if st.button("Fetch Data"): | |
if sheet_id and range_name: | |
st.session_state["data"] = get_google_sheet_data(sheet_id, range_name) | |
st.write("### Preview of Google Sheets Data") | |
st.dataframe(st.session_state["data"].head()) | |
else: | |
st.warning("Please enter both the Google Sheet ID and range.") | |
# Define Query Section | |
elif selected == "Define Query": | |
st.header("Define Your Custom Query") | |
if st.session_state["data"] is not None: | |
column_selection = st.selectbox("Select the primary column for entities", options=st.session_state["data"].columns) | |
query_template = st.text_input("Define your query template", "Get me the email for {company}") | |
st.session_state["query_template"] = query_template | |
st.session_state["column_selection"] = column_selection # Store column selection in session state | |
st.write("### Example query preview") | |
if column_selection: | |
# Convert sample_entity to string to avoid replace errors | |
sample_entity = str(st.session_state["data"][column_selection].iloc[0]) | |
example_query = query_template.replace("{company}", sample_entity) | |
st.code(example_query) | |
else: | |
st.warning("Please upload data first.") | |
# Extract Information Section with Progress Bar | |
elif selected == "Extract Information": | |
st.header("Extract Information") | |
if st.session_state.get("query_template") and st.session_state["data"] is not None and st.session_state["column_selection"] is not None: | |
st.write("Data extraction is in progress. This may take a few moments.") | |
# Progress bar initialization | |
progress_bar = st.progress(0) | |
column_selection = st.session_state["column_selection"] | |
progress_step = 1.0 / len(st.session_state["data"][column_selection]) | |
results = [] | |
for i, entity in enumerate(st.session_state["data"][column_selection]): | |
# Prepare the prompt for the model | |
user_message = st.session_state["query_template"].replace("{company}", str(entity)) | |
formatted_prompt = PROMPT_TEMPLATE.format(entity=entity, query=user_message) | |
# Append user message to the flow history | |
st.session_state.flowmessages.append(HumanMessage(content=user_message)) | |
# Generate response from the model | |
response = llm([SystemMessage(content=formatted_prompt)]) | |
# Collect the model's response | |
result_text = response[0].content if response else "Information not available" | |
results.append({"Entity": entity, "Extracted Information": result_text}) | |
# Update the progress bar | |
progress_bar.progress((i + 1) * progress_step) | |
# Save and display results | |
st.session_state["results"] = pd.DataFrame(results) | |
st.write("### Extracted Information") | |
st.dataframe(st.session_state["results"]) | |
# View & Download Section with Google Sheets Update | |
elif selected == "View & Download": | |
st.header("View and Download Results") | |
if st.session_state["results"] is not None: | |
st.write("### Extracted Data Table") | |
st.dataframe(st.session_state["results"]) | |
# Download as CSV | |
csv_data = st.session_state["results"].to_csv(index=False) | |
st.download_button("Download as CSV", csv_data, "datascribe_results.csv", "text/csv") | |
# Option to update Google Sheet | |
sheet_id = st.text_input("Enter Google Sheet ID to update with results") | |
range_name = st.text_input("Enter range (e.g., Sheet1!A1)") | |
if st.button("Update Google Sheet"): | |
try: | |
update_google_sheet(sheet_id, range_name, st.session_state["results"]) | |
st.success("Google Sheet updated successfully!") | |
except Exception as e: | |
st.error(f"Failed to update Google Sheet: {e}") | |
else: | |
st.warning("No data available to view or download.") | |