Spaces:
Sleeping
Sleeping
import os | |
import streamlit as st | |
import pandas as pd | |
import sqlite3 | |
import logging | |
from langchain.agents import create_sql_agent | |
from langchain.agents.agent_toolkits import SQLDatabaseToolkit | |
from langchain.llms import OpenAI | |
from langchain.sql_database import SQLDatabase | |
from langchain.prompts import ( | |
ChatPromptTemplate, | |
FewShotPromptTemplate, | |
PromptTemplate, | |
SystemMessagePromptTemplate, | |
HumanMessagePromptTemplate, | |
MessagesPlaceholder | |
) | |
from langchain.schema import HumanMessage | |
from langchain.chat_models import ChatOpenAI | |
from langchain.evaluation import load_evaluator | |
# Initialize logging | |
logging.basicConfig(level=logging.INFO) | |
# Initialize conversation history | |
if 'history' not in st.session_state: | |
st.session_state.history = [] | |
# OpenAI API key (ensure it is securely stored) | |
openai_api_key = os.getenv("OPENAI_API_KEY") | |
# Check if the API key is set | |
if not openai_api_key: | |
st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.") | |
st.stop() | |
# Step 1: Upload CSV data file (or use default) | |
st.title("Enhanced Natural Language to SQL Query App") | |
st.write("Upload a CSV file to get started, or use the default dataset.") | |
csv_file = st.file_uploader("Upload your CSV file", type=["csv"]) | |
if csv_file is None: | |
data = pd.read_csv("default_data.csv") # Ensure this file exists in your working directory | |
st.write("Using default_data.csv file.") | |
table_name = "default_table" | |
else: | |
data = pd.read_csv(csv_file) | |
table_name = csv_file.name.split('.')[0] | |
st.write(f"Data Preview ({csv_file.name}):") | |
st.dataframe(data.head()) | |
# Step 2: Load CSV data into a persistent SQLite database | |
db_file = 'my_database.db' | |
conn = sqlite3.connect(db_file) | |
data.to_sql(table_name, conn, index=False, if_exists='replace') | |
# SQL table metadata (for validation and schema) | |
valid_columns = list(data.columns) | |
st.write(f"Valid columns: {valid_columns}") | |
# Create SQLDatabase instance with custom table info | |
engine = SQLDatabase.from_uri(f"sqlite:///{db_file}", include_tables=[table_name]) | |
# Step 3: Define the few-shot examples for the prompt | |
few_shot_examples = [ | |
{ | |
"input": "What is the total revenue for each category?", | |
"query": f"SELECT category, SUM(revenue) FROM {table_name} GROUP BY category;" | |
}, | |
{ | |
"input": "Show the top 5 products by sales.", | |
"query": f"SELECT product_name, sales FROM {table_name} ORDER BY sales DESC LIMIT 5;" | |
}, | |
{ | |
"input": "How many orders were placed in the last month?", | |
"query": f"SELECT COUNT(*) FROM {table_name} WHERE order_date >= DATE('now', '-1 month');" | |
} | |
] | |
# Step 4: Define the prompt templates | |
system_prefix = """ | |
You are an expert data analyst who can convert natural language questions into SQL queries. | |
Follow these guidelines: | |
1. Only use the columns and tables provided. | |
2. Use appropriate SQL syntax for SQLite. | |
3. Ensure string comparisons are case-insensitive. | |
4. Do not execute queries that could be harmful or unethical. | |
5. Provide clear and concise SQL queries. | |
""" | |
few_shot_prompt = FewShotPromptTemplate( | |
example_prompt=PromptTemplate.from_template("Question: {input}\nSQL Query: {query}"), | |
examples=few_shot_examples, | |
prefix=system_prefix, | |
suffix="Question: {input}\nSQL Query:", | |
input_variables=["input"] | |
) | |
# Step 5: Initialize the LLM and toolkit | |
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key) | |
toolkit = SQLDatabaseToolkit(db=engine, llm=llm) | |
# Step 6: Create the agent | |
agent_prompt = ChatPromptTemplate.from_messages([ | |
SystemMessagePromptTemplate(prompt=few_shot_prompt), | |
HumanMessagePromptTemplate.from_template("{input}") | |
]) | |
sql_agent = create_sql_agent( | |
llm=llm, | |
toolkit=toolkit, | |
prompt=agent_prompt, | |
verbose=True, | |
agent_type="openai-functions", | |
max_iterations=5 | |
) | |
# Step 7: Define the callback function | |
def process_input(): | |
user_prompt = st.session_state['user_input'] | |
if user_prompt: | |
try: | |
# Append user message to history | |
st.session_state.history.append({"role": "user", "content": user_prompt}) | |
# Use the agent to generate the SQL query | |
with st.spinner("Generating SQL query..."): | |
response = sql_agent.run(user_prompt) | |
# Check if the response contains SQL code | |
if "SELECT" in response.upper(): | |
sql_query = response.strip() | |
logging.info(f"Generated SQL Query: {sql_query}") | |
# Attempt to execute SQL query and handle exceptions | |
try: | |
result = pd.read_sql_query(sql_query, conn) | |
if result.empty: | |
assistant_response = "The query returned no results. Please try a different question." | |
st.session_state.history.append({"role": "assistant", "content": assistant_response}) | |
else: | |
# Limit the result to first 10 rows for display | |
result_display = result.head(10) | |
st.session_state.history.append({"role": "assistant", "content": "Here are the results:"}) | |
st.session_state.history.append({"role": "assistant", "content": result_display}) | |
# Generate insights based on the query result | |
insights_template = """ | |
You are an expert data analyst. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words. | |
User's Question: {question} | |
SQL Query Result: | |
{result} | |
Concise Analysis: | |
""" | |
insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result']) | |
insights_chain = LLMChain(llm=llm, prompt=insights_prompt) | |
result_str = result_display.to_string(index=False) | |
insights = insights_chain.run({'question': user_prompt, 'result': result_str}) | |
# Append the assistant's insights to the history | |
st.session_state.history.append({"role": "assistant", "content": insights}) | |
except Exception as e: | |
logging.error(f"An error occurred during SQL execution: {e}") | |
assistant_response = f"Error executing SQL query: {e}" | |
st.session_state.history.append({"role": "assistant", "content": assistant_response}) | |
else: | |
# Handle responses that do not contain SQL queries | |
assistant_response = response | |
st.session_state.history.append({"role": "assistant", "content": assistant_response}) | |
# Evaluate the response for harmful content | |
try: | |
evaluator = load_evaluator("harmful_content", llm=llm) | |
eval_result = evaluator.evaluate_strings( | |
input=user_prompt, | |
prediction=response | |
) | |
if eval_result['flagged']: | |
st.warning("The assistant's response may not be appropriate.") | |
else: | |
logging.info("Response evaluated as appropriate.") | |
except Exception as e: | |
logging.error(f"An error occurred during evaluation: {e}") | |
except Exception as e: | |
logging.error(f"An error occurred: {e}") | |
assistant_response = f"Error: {e}" | |
st.session_state.history.append({"role": "assistant", "content": assistant_response}) | |
# Reset the user_input in session state | |
st.session_state['user_input'] = '' | |
# Step 8: Display the conversation history | |
for message in st.session_state.history: | |
if message['role'] == 'user': | |
st.markdown(f"**User:** {message['content']}") | |
elif message['role'] == 'assistant': | |
if isinstance(message['content'], pd.DataFrame): | |
st.markdown("**Assistant:** Query Results:") | |
st.dataframe(message['content']) | |
else: | |
st.markdown(f"**Assistant:** {message['content']}") | |
# Place the input field at the bottom with the callback | |
st.text_input("Enter your message:", key='user_input', on_change=process_input) | |