Spaces:

samiee2213
/

DataScribe

Sleeping

App Files Files Community

samiee2213 committed on Nov 13, 2024

Commit

98915c7

verified ·

1 Parent(s): d8796fc

Update app.py

Browse files

Files changed (1) hide show

app.py +286 -147

app.py CHANGED Viewed

@@ -6,59 +6,112 @@ from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from streamlit_chat import message as st_message
 import plotly.express as px
 from langchain.schema import HumanMessage, SystemMessage, AIMessage
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.prompts import PromptTemplate
-import warnings
-import time
 from langchain_groq import ChatGroq
 import numpy as np
 from dotenv import load_dotenv
-import re
 warnings.filterwarnings("ignore", category=DeprecationWarning)
-# Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 llm = ChatGroq(model="llama-3.1-70b-versatile")
-PROMPT_TEMPLATE = """
-You are an expert information extraction assistant designed to obtain specific details from the web and external sources.
-You’ll be provided with an entity name and a query that specifies the type of information needed about that entity.
-Please follow the instructions carefully and return only the most relevant, accurate information.
-#### Entity Name: {entity}
-#### Query: {query}
-Instructions:
-1. Extract the information directly related to the entity.
-2. If available, include only verified, publicly accessible data.
-3. Provide information in a single sentence or a short, structured response.
-4. If the requested information isn’t available or verifiable, respond with "Information not available."
-#### Example Output Format:
-"Company: {entity} | Requested Information: {extracted_information}"
-Begin extraction.
-"""
-# Function to get response from LLM
-def get_llm_response(entity, query):
-    # Format the prompt with the entity and query
-    prompt = PROMPT_TEMPLATE.format(entity=entity, query=query)
-    # Request response from the LLM
-    response = llm([SystemMessage(content=prompt)])
-    # Return content or default message
-    return response[0].content if response else "Information not available"
-# Streamlit app setup
 st.set_page_config(page_title="DataScribe", page_icon=":notebook_with_decorative_cover:", layout="wide")
-# Sidebar navigation
 with st.sidebar:
     selected = option_menu(
         "DataScribe Menu",
@@ -68,146 +121,232 @@ with st.sidebar:
         default_index=0
     )
-# Main header
-st.title("DataScribe: AI-Powered Information Extractor")
-# Initialize session states for data and results
-if "data" not in st.session_state:
-    st.session_state["data"] = None
-if "results" not in st.session_state:
-    st.session_state["results"] = None
-if "column_selection" not in st.session_state:
-    st.session_state["column_selection"] = None
-# Helper function for Google Sheets API setup
-def get_google_sheet_data(sheet_id, range_name):
-    credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"])
-    service = build('sheets', 'v4', credentials=credentials)
-    sheet = service.spreadsheets()
-    result = sheet.values().get(spreadsheetId=sheet_id, range=range_name).execute()
-    values = result.get('values', [])
-    return pd.DataFrame(values[1:], columns=values[0])
-# Function to write results back to Google Sheets
-def update_google_sheet(sheet_id, range_name, data):
-    credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"])
-    service = build('sheets', 'v4', credentials=credentials)
-    sheet = service.spreadsheets()
-    body = {
-        'values': [data.columns.tolist()] + data.values.tolist()
-    }
-    sheet.values().update(
-        spreadsheetId=sheet_id,
-        range=range_name,
-        valueInputOption="RAW",
-        body=body
-    ).execute()
-# Home Page
 if selected == "Home":
-    st.markdown(
-        """
-        ### Welcome to DataScribe
-        **DataScribe** is an AI-powered tool designed to extract structured information from the web
-        based on entities in your data file. Start by uploading a CSV or Google Sheet and defining a
-        custom search query.
-        """
-    )
-    st.image("https://via.placeholder.com/1200x400.png?text=DataScribe+AI+Agent+Dashboard")  # Placeholder banner image
-# Upload Data Section
 elif selected == "Upload Data":
     st.header("Upload or Connect Your Data")
-    # CSV Upload
-    data_source = st.radio("Choose data source:", ["CSV File", "Google Sheets"])
-    if data_source == "CSV File":
-        uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
-        if uploaded_file:
-            st.session_state["data"] = pd.read_csv(uploaded_file)
-            st.write("### Preview of Uploaded Data")
-            st.dataframe(st.session_state["data"].head())
     elif data_source == "Google Sheets":
         sheet_id = st.text_input("Enter Google Sheet ID")
         range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
         if st.button("Fetch Data"):
-            if sheet_id and range_name:
-                st.session_state["data"] = get_google_sheet_data(sheet_id, range_name)
-                st.write("### Preview of Google Sheets Data")
-                st.dataframe(st.session_state["data"].head())
-            else:
-                st.warning("Please enter both the Google Sheet ID and range.")
-# Define Query Section
 elif selected == "Define Query":
     st.header("Define Your Custom Query")
-    if st.session_state["data"] is not None:
-        column_selection = st.selectbox("Select the primary column for entities", options=st.session_state["data"].columns)
-        query_template = st.text_input("Define your query template", "Get me the email for {company}")
-        st.session_state["query_template"] = query_template
-        st.session_state["column_selection"] = column_selection  # Store column selection in session state
-        st.write("### Example query preview")
-        if column_selection:
-            # Convert sample_entity to string to avoid replace errors
-            sample_entity = str(st.session_state["data"][column_selection].iloc[0])
-            example_query = query_template.replace("{company}", sample_entity)
-            st.code(example_query)
     else:
-        st.warning("Please upload data first.")
-# Extract Information Section with Progress Bar
 elif selected == "Extract Information":
     st.header("Extract Information")
-    if st.session_state.get("query_template") and st.session_state["data"] is not None and st.session_state["column_selection"] is not None:
-        st.write("Data extraction is in progress. This may take a few moments.")
-        # Progress bar initialization
-        progress_bar = st.progress(0)
         column_selection = st.session_state["column_selection"]
-        progress_step = 1.0 / len(st.session_state["data"][column_selection])
-        results = []
-        for i, entity in enumerate(st.session_state["data"][column_selection]):
-            # Prepare the prompt for the model
-            user_message = st.session_state["query_template"].replace("{company}", str(entity))
-            # Get response and append to results
-            result_text = get_llm_response(entity, user_message)
-            results.append({"Entity": entity, "Extracted Information": result_text})  # Consistent key
-            # Update the progress bar
-            progress_bar.progress((i + 1) * progress_step)
-        # Save and display results
-        st.session_state["results"] = pd.DataFrame(results)
-        st.write("### Extracted Information")
-        st.dataframe(st.session_state["results"])
-# View & Download Section with Google Sheets Update
 elif selected == "View & Download":
-    st.header("View and Download Results")
-    if st.session_state["results"] is not None:
-        st.write("### Extracted Data Table")
-        st.dataframe(st.session_state["results"])
-        # Download as CSV
-        csv_data = st.session_state["results"].to_csv(index=False)
-        st.download_button("Download as CSV", csv_data, "datascribe_results.csv", "text/csv")
-        # Option to update Google Sheet
-        sheet_id = st.text_input("Enter Google Sheet ID to update with results")
-        range_name = st.text_input("Enter range (e.g., Sheet1!A1)")
-        if st.button("Update Google Sheet"):
-            try:
-                update_google_sheet(sheet_id, range_name, st.session_state["results"])
-                st.success("Google Sheet updated successfully!")
-            except Exception as e:
-                st.error(f"Failed to update Google Sheet: {e}")
     else:
-        st.warning("No data available to view or download.")

 from googleapiclient.discovery import build
 from streamlit_chat import message as st_message
 import plotly.express as px
+import re
+import warnings
+import time
 from langchain.schema import HumanMessage, SystemMessage, AIMessage
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.prompts import PromptTemplate
+from langchain_community.utilities import GoogleSerperAPIWrapper
+from langchain.agents import initialize_agent, Tool
+from langchain.agents import AgentType
 from langchain_groq import ChatGroq
 import numpy as np
 from dotenv import load_dotenv
 warnings.filterwarnings("ignore", category=DeprecationWarning)
+#environment
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+SERPER_API_KEY = os.getenv("SERPER_API_KEY")
 llm = ChatGroq(model="llama-3.1-70b-versatile")
+# Initialize the Google Serper API wrapper used for every web search.
+# The wrapper's credential field is named `serper_api_key`; pass the key under
+# that name so the value loaded from the environment above is the one used.
+search = GoogleSerperAPIWrapper(serper_api_key=SERPER_API_KEY)
+# System prompt text describing the assistant's role; get_llm_response sends it
+# to the agent together with the per-entity request.
+system_message_content = """
+You are a helpful assistant designed to answer questions by extracting information from the web and external sources. Your goal is to provide the most relevant, concise, and accurate response to user queries.
+"""
+# Tools exposed to the agent: a single web-search tool backed by `search.run`.
+tools = [
+    Tool(
+        name="Web Search",
+        func=search.run,
+        description="Searches the web for information related to the query"
+    )
+]
+# Build the agent executor over the tools above, with its own ChatGroq client
+# and a windowed conversation memory that keeps the last 5 exchanges (k=5).
+# NOTE(review): initialize_agent documents the agent selector keyword as
+# `agent`, not `agent_type` -- confirm this keyword really selects
+# SELF_ASK_WITH_SEARCH. That agent type also appears to expect exactly one tool
+# named "Intermediate Answer", so renaming the keyword alone may fail -- TODO verify.
+agent = initialize_agent(
+    tools,
+    ChatGroq(api_key=GROQ_API_KEY, model="llama-3.1-70b-versatile"),
+    agent_type=AgentType.SELF_ASK_WITH_SEARCH,
+    verbose=True,
+    memory=ConversationBufferWindowMemory(k=5, return_messages=True)
+)
+# Web search helper
+def perform_web_search(query):
+    """Run *query* through the module-level Serper wrapper and return its result."""
+    return search.run(query)
+# Function to get LLM response for dynamic queries
+def get_llm_response(entity, query, web_results):
+    """Ask the agent to extract the requested information for one entity.
+
+    Args:
+        entity: The entity (e.g. a company name) being looked up.
+        query: Description of the information to extract.
+        web_results: Search output to extract the answer from.
+
+    Returns:
+        The agent's "output" text with "Thought:"/"Action:" markers removed,
+        or "Information not available" when the agent produced no output.
+    """
+    # The agent executor takes one input value, so fold the system text, the
+    # extraction instruction and the search results into a single string.
+    agent_input = (
+        f"{system_message_content.strip()}\n\n"
+        f"Extract relevant {query} (e.g., email, phone number) from the "
+        f"following web results for the entity: {entity}.\n"
+        f"Entity: {entity}\n"
+        f"Query: {query}\n"
+        f"Web Results: {web_results}"
+    )
+    response = agent.invoke({"input": agent_input})
+    # `or` also covers an explicit None/empty output, not just a missing key.
+    extracted_info = str(response.get("output") or "Information not available").strip()
+    # Drop leaked agent scratchpad markers from the final answer.
+    return re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
+# Retry logic for multiple web searches if necessary
+def refine_answer_with_searches(entity, query, max_retries=3):
+    """Search for *entity* and retry while the extracted answer looks empty.
+
+    Args:
+        entity: The entity to look up.
+        query: Search query; any literal "{entity}" placeholder is replaced
+            with the entity, and a query without one is used as-is.
+        max_retries: Maximum number of additional search/extract attempts.
+
+    Returns:
+        A ``(extracted_answer, search_results)`` tuple from the last attempt.
+    """
+    # Substitute the placeholder literally instead of using str.format: the
+    # query may already have the entity filled in, and any other brace in the
+    # query or entity text would make str.format raise.
+    search_query = query.replace("{entity}", str(entity))
+    search_results = perform_web_search(search_query)
+    extracted_answer = get_llm_response(entity, query, search_results)
+    for _ in range(max_retries):
+        # An answer of one or two words, or an explicit "not available", is
+        # treated as a miss worth retrying.
+        looks_empty = (
+            len(extracted_answer.split()) <= 2
+            or "not available" in extracted_answer.lower()
+        )
+        if not looks_empty:
+            break
+        time.sleep(2)  # brief pause before hitting the search API again
+        search_results = perform_web_search(search_query)
+        extracted_answer = get_llm_response(entity, query, search_results)
+    return extracted_answer, search_results
+# Setup Google Sheets data fetch
+def get_google_sheet_data(sheet_id, range_name):
+    """Read a range from a Google Sheet into a DataFrame.
+
+    The first returned row is used as the column header and the remaining rows
+    as data. Credentials come from ``st.secrets["gcp_service_account"]`` with
+    the read-only Sheets scope.
+
+    Args:
+        sheet_id: ID of the spreadsheet to read.
+        range_name: A1-notation range, e.g. ``Sheet1!A1:C100``.
+
+    Returns:
+        A DataFrame of the range; empty when the range holds no values.
+    """
+    creds = service_account.Credentials.from_service_account_info(
+        st.secrets["gcp_service_account"],
+        scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
+    )
+    service = build("sheets", "v4", credentials=creds)
+    sheet = service.spreadsheets()
+    result = sheet.values().get(spreadsheetId=sheet_id, range=range_name).execute()
+    values = result.get("values", [])
+    # An empty range has no header row to index; return an empty frame instead
+    # of raising IndexError.
+    if not values:
+        return pd.DataFrame()
+    header, *rows = values
+    # The API omits trailing empty cells, so rows can be shorter than the
+    # header; pad them so every row matches the column count.
+    width = len(header)
+    padded_rows = [row + [None] * (width - len(row)) for row in rows]
+    return pd.DataFrame(padded_rows, columns=header)
+#streamlitconfiguration
 st.set_page_config(page_title="DataScribe", page_icon=":notebook_with_decorative_cover:", layout="wide")
 with st.sidebar:
     selected = option_menu(
         "DataScribe Menu",
         default_index=0
     )
 if selected == "Home":
+    # Landing banner
+    st.markdown("""
+        <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
+        <p style="text-align:center; font-size: 18px;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
+    """, unsafe_allow_html=True)
+    st.markdown("""---""")
+    def feature_card(title, description, icon, page):
+        """Render one feature tile: icon on the left, button and blurb on the right."""
+        icon_col, text_col = st.columns([1, 4])
+        with icon_col:
+            st.markdown(f"<div style='font-size: 40px;'>{icon}</div>", unsafe_allow_html=True)
+        with text_col:
+            clicked = st.button(f"{title}", key=title)
+            if clicked:
+                # Record which page the clicked tile points at.
+                st.session_state.selected_page = page
+            st.write(description)
+    # (title, description, icon, page) for each tile, laid out two per row.
+    cards = [
+        (
+            "Upload Data",
+            "Upload data from CSV or Google Sheets to get started with your extraction.",
+            "📄",
+            "Upload Data",
+        ),
+        (
+            "Define Custom Queries",
+            "Set custom search queries for each entity in your dataset for specific information retrieval.",
+            "🔍",
+            "Define Query",
+        ),
+        (
+            "Run Automated Searches",
+            "Execute automated web searches and extract relevant information using an AI-powered agent.",
+            "🤖",
+            "Extract Information",
+        ),
+        (
+            "View & Download Results",
+            "View extracted data in a structured format and download as a CSV or update Google Sheets.",
+            "📊",
+            "View & Download",
+        ),
+    ]
+    for row_start in range(0, len(cards), 2):
+        row_columns = st.columns([1, 1])
+        for column, (card_title, card_text, card_icon, card_page) in zip(row_columns, cards[row_start:row_start + 2]):
+            with column:
+                feature_card(
+                    title=card_title,
+                    description=card_text,
+                    icon=card_icon,
+                    page=card_page
+                )
 elif selected == "Upload Data":
     st.header("Upload or Connect Your Data")
+    data_source = st.radio("Choose data source:", ["CSV Files", "Google Sheets"])
+    if data_source == "CSV Files":
+        # Always render the uploader so a new set of files can replace data
+        # that was loaded earlier in the session.
+        uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
+        # With accept_multiple_files=True the widget yields a list that is empty
+        # until something is uploaded, so test truthiness, not `is not None`;
+        # otherwise the "no valid data" warning appears before any upload.
+        if uploaded_files:
+            dfs = []
+            for uploaded_file in uploaded_files:
+                try:
+                    dfs.append(pd.read_csv(uploaded_file))
+                except Exception as e:
+                    # Report the bad file and keep going with the others.
+                    st.error(f"Error reading file {uploaded_file.name}: {e}")
+            if dfs:
+                # Stack all readable files into one table with a fresh index.
+                full_data = pd.concat(dfs, ignore_index=True)
+                st.session_state["data"] = full_data
+                st.success("Data uploaded successfully! Here is a preview:")
+                st.dataframe(full_data)
+            else:
+                st.warning("No valid data found in the uploaded files.")
+        elif st.session_state.get("data") is not None:
+            # Nothing new uploaded: show what is already loaded.
+            st.success("Data uploaded successfully! Here is a preview:")
+            st.dataframe(st.session_state["data"])
     elif data_source == "Google Sheets":
         sheet_id = st.text_input("Enter Google Sheet ID")
         range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
         if st.button("Fetch Data"):
+            # Skip the API round-trip when either field is blank; the request
+            # cannot succeed without both values.
+            if not (sheet_id and range_name):
+                st.warning("Please enter both the Google Sheet ID and range.")
+            else:
+                try:
+                    data = get_google_sheet_data(sheet_id, range_name)
+                    st.session_state["data"] = data
+                    st.write("Data fetched successfully. Here is a preview:")
+                    st.dataframe(data)
+                except Exception as e:
+                    # Surface credential/API/range errors in the UI instead of crashing the page.
+                    st.error(f"Error fetching data: {e}")
 elif selected == "Define Query":
     st.header("Define Your Custom Query")
+    # Guard: this page needs a loaded DataFrame in session state.
+    if "data" not in st.session_state or st.session_state["data"] is None:
+        st.warning("Please upload data first!")
     else:
+        # Column whose values replace {entity} in the query template.
+        column = st.selectbox("Select entity column", st.session_state["data"].columns)
+        # Inject CSS for the select widget; the braces are doubled because the
+        # string is an f-string.
+        st.markdown(f"""
+        <style>
+        div[data-baseweb="select"] div[data-id="select"] {{
+            background-color: #f0f8ff;
+        }}
+        </style>
+        """, unsafe_allow_html=True)
+        st.subheader("Define Fields to Extract")
+        num_fields = st.number_input("Number of fields to extract", min_value=1, value=1, step=1)
+        # Collect only the field names the user has actually filled in.
+        fields = []
+        for i in range(num_fields):
+            field = st.text_input(f"Field {i+1} name", key=f"field_{i}")
+            if field:
+                fields.append(field)
+        # The template editor appears only once at least one field is named.
+        if fields:
+            st.subheader("Query Template")
+            # Default template lists the named fields; {{entity}} renders as the
+            # literal placeholder {entity}.
+            query_template = st.text_area(
+                "Enter query template (Use '{entity}' to represent each entity)",
+                value=f"Find the {', '.join(fields)} for {{entity}}"
+            )
+            # Preview the template using the first row of the chosen column.
+            # NOTE(review): .iloc[0] would raise on an empty DataFrame -- confirm
+            # whether empty uploads can reach this page.
+            if "{entity}" in query_template:
+                example_entity = str(st.session_state["data"][column].iloc[0])
+                example_query = query_template.replace("{entity}", example_entity)
+                st.write("### Example Query Preview")
+                st.code(example_query)
+            # Persist the configuration for the "Extract Information" page.
+            if st.button("Save Query Configuration"):
+                st.session_state["column_selection"] = column
+                st.session_state["query_template"] = query_template
+                st.session_state["extraction_fields"] = fields
+                st.success("Query configuration saved!")
 elif selected == "Extract Information":
     st.header("Extract Information")
+    # Run only when both a saved query template and loaded data exist.
+    # NOTE(review): nothing gates the loop below behind a button, so it
+    # presumably runs each time this page is rendered -- confirm that is intended,
+    # since every run issues web searches and LLM calls per entity.
+    if "query_template" in st.session_state and "data" in st.session_state:
+        st.write("### Using Query Template:")
+        st.code(st.session_state["query_template"])
         column_selection = st.session_state["column_selection"]
+        # Series of entity values taken from the column saved on "Define Query".
+        entities_column = st.session_state["data"][column_selection]
+        st.write("### Selected Entity Column:")
+        st.dataframe(entities_column)
+        st.write("Data extraction is in progress. This may take a few moments.")
+        # Progress bar, updated once per processed entity
+        progress_bar = st.progress(0)
+        # Custom CSS restyling the progress bar (pink, rounded, 15px tall)
+        st.markdown("""
+        <style>
+        .stProgress > div {
+            background-color: #FFB6C1;  /* Light pink */
+            border-radius: 20px;
+            height: 15px;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+        # The whole loop sits in one try: an exception for any entity aborts the
+        # run before results are stored in session state.
+        try:
+            results = []
+            for i, selected_entity in enumerate(entities_column):
+                # Fill the placeholder with this entity to form the search query.
+                user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
+                final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
+                results.append({
+                    "Entity": selected_entity,
+                    "Extracted Information": final_answer,
+                    "Search Results": search_results
+                })
+                # Advance the bar to the integer percentage of entities processed
+                progress_bar.progress(int((i + 1) / len(entities_column) * 100))
+            # Keep the results for the "View & Download" page.
+            st.session_state["results"] = results
+            st.write("### Extracted Information")
+            for result in results:
+                st.write(f"**Entity:** {result['Entity']}")
+                st.write(f"**Extracted Information:** {result['Extracted Information']}")
+            st.write("### Web Results:")
+            for result in results:
+                st.write(result["Search Results"])
+        except Exception as e:
+            st.error(f"An error occurred while extracting information: {e}")
+    else:
+        st.warning("Please upload your data and define the query template.")
 elif selected == "View & Download":
+    st.header("View & Download Results")
+    if "results" in st.session_state:
+        results_df = pd.DataFrame(st.session_state["results"])
+        st.write("### Results Preview")
+        # Tint string cells in the two text columns light blue.
+        def tint_text_cell(val):
+            if isinstance(val, str):
+                return 'background-color: #d3f4ff'
+            return ''
+        styled_results = results_df.style.applymap(tint_text_cell, subset=["Extracted Information", "Search Results"])
+        st.dataframe(styled_results)
+        # (button label, columns to export or None for all, file name)
+        download_specs = [
+            ("Download all results as CSV", None, "extracted_results.csv"),
+            ("Download Extracted Information as CSV", ["Entity", "Extracted Information"], "extracted_information.csv"),
+            ("Download Web Results as CSV", ["Entity", "Search Results"], "web_results.csv"),
+        ]
+        for button_label, export_columns, export_name in download_specs:
+            export_frame = results_df if export_columns is None else results_df[export_columns]
+            st.download_button(
+                label=button_label,
+                data=export_frame.to_csv(index=False),
+                file_name=export_name,
+                mime="text/csv"
+            )
     else:
+        st.warning("No results available to view. Please run the extraction process.")