GenBIChatbotfree

Sleeping

Ari

Update app.py

bb31796 verified 4 months ago

9.56 kB

	import os
	import streamlit as st
	import pandas as pd
	import sqlite3
	from transformers import pipeline
	import sqlparse
	import logging

	# Initialize conversation history
	if 'history' not in st.session_state:
	    st.session_state.history = []

	# Load a pre-trained GPT-2 model from Hugging Face.
	# Streamlit re-executes this script on every interaction, so the pipeline
	# is built once per session and kept in session state instead of being
	# reloaded on each rerun.
	if 'llm' not in st.session_state:
	    st.session_state.llm = pipeline('text-generation', model='gpt2')
	llm = st.session_state.llm

	# Step 1: Upload CSV data file (or use default)
	st.title("Natural Language to SQL Query App with Enhanced Insights")
	st.write("Upload a CSV file to get started, or use the default dataset.")

	csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
	if csv_file is None:
	    # Fall back to the bundled dataset; stop cleanly if it is not deployed
	    # alongside the app instead of crashing with a traceback.
	    try:
	        data = pd.read_csv("default_data.csv")
	    except FileNotFoundError:
	        st.error("default_data.csv was not found. Please upload a CSV file to continue.")
	        st.stop()
	    st.write("Using default_data.csv file.")
	    table_name = "default_table"
	else:
	    try:
	        data = pd.read_csv(csv_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
	        st.error(f"Could not read {csv_file.name}: {exc}")
	        st.stop()
	    # The table name is handed to the language model and ends up unquoted
	    # in generated SQL, so reduce the file stem to a plain identifier
	    # (ASCII letters, digits and underscores, not starting with a digit).
	    stem = csv_file.name.split('.')[0]
	    table_name = ''.join(
	        ch if ch.isascii() and ch.isalnum() else '_' for ch in stem
	    )
	    if not table_name or table_name[0].isdigit():
	        table_name = f"t_{table_name}"
	    st.write(f"Data Preview ({csv_file.name}):")
	    st.dataframe(data.head())

	# Step 2: Load CSV data into a persistent SQLite database
	db_file = 'my_database.db'
	# The connection is later used from the text-input callback, which Streamlit
	# may run on a different thread than the one that created it; sqlite3
	# rejects cross-thread use unless check_same_thread is disabled.
	conn = sqlite3.connect(db_file, check_same_thread=False)
	data.to_sql(table_name, conn, index=False, if_exists='replace')

	# SQL table metadata (for validation and schema)
	valid_columns = list(data.columns)
	st.write(f"Valid columns: {valid_columns}")

	# Function to generate SQL query using Hugging Face model
	def generate_sql_query(question, table_name, columns):
	    """Ask the language model to turn a natural-language question into SQL.

	    Args:
	        question: The user's question.
	        table_name: Name of the table the query should target.
	        columns: The valid column names, already formatted for the prompt.

	    Returns:
	        The text the model generated after the prompt, stripped of
	        surrounding whitespace. The prompt instructs the model to answer
	        "NO_SQL" when no query should be produced.
	    """
	    prompt = f"""
	You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
	Ensure that:
	- You only use the columns provided.
	- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
	- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
	- Do not apply 'COLLATE NOCASE' to numeric columns.
	If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
	Question: {question}
	Table name: {table_name}
	Valid columns: {columns}
	SQL Query:
	"""
	    # max_length would count the prompt's own tokens (already longer than
	    # 180), so budget the continuation with max_new_tokens instead.
	    # return_full_text=False keeps the echoed prompt out of the result so
	    # callers receive only the generated query.
	    response = llm(prompt, max_new_tokens=180, return_full_text=False)
	    return response[0]['generated_text'].strip()

	# Function to generate insights using Hugging Face model
	def generate_insights(question, result):
	    """Ask the language model for a short analysis of a query result.

	    Args:
	        question: The user's original question.
	        result: Text to analyse (a stringified query result or a dataset
	            summary), inserted verbatim into the prompt.

	    Returns:
	        The text the model generated after the prompt, stripped of
	        surrounding whitespace.
	    """
	    # The closing cue now states the same 150-word limit as the instruction
	    # above it; the two previously contradicted each other (150 vs 200).
	    prompt = f"""
	You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
	User's Question: {question}
	SQL Query Result:
	{result}
	Concise Analysis (max 150 words):
	"""
	    # max_new_tokens limits only the continuation (max_length also counts
	    # the prompt); return_full_text=False drops the echoed prompt so only
	    # the analysis is returned.
	    response = llm(prompt, max_new_tokens=150, return_full_text=False)
	    return response[0]['generated_text'].strip()

	# Function to classify user query as SQL or Insights
	def classify_query(question):
	    """Classify a user question as a data-retrieval or an analysis request.

	    Args:
	        question: The user's question.

	    Returns:
	        'SQL' if the model's answer contains "SQL" (case-insensitive),
	        otherwise 'INSIGHTS'.
	    """
	    prompt = f"""
	You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
	Determine the appropriate category for the following user question.
	Question: "{question}"
	Category (SQL/INSIGHTS):
	"""
	    # The prompt itself contains the word "SQL", so the echoed prompt must
	    # be excluded (return_full_text=False) or every question would be
	    # classified as 'SQL'. max_new_tokens budgets the answer only, whereas
	    # max_length=10 was shorter than the prompt.
	    response = llm(prompt, max_new_tokens=10, return_full_text=False)
	    category = response[0]['generated_text'].strip().upper()
	    return 'SQL' if 'SQL' in category else 'INSIGHTS'

	# Function to generate dataset summary
	def generate_dataset_summary(data):
	    """Ask the language model for a short description of the dataset.

	    Args:
	        data: The pandas DataFrame holding the loaded dataset.

	    Returns:
	        The text the model generated after the prompt, stripped of
	        surrounding whitespace.
	    """
	    # The prompt asks for record count, column count and data types, none of
	    # which can be read off the five preview rows, so state them explicitly.
	    dtype_summary = ', '.join(
	        f"{column} ({dtype})" for column, dtype in data.dtypes.items()
	    )
	    summary_template = f"""
	You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
	Number of records: {len(data)}
	Number of columns: {len(data.columns)}
	Column data types: {dtype_summary}
	Dataset:
	{data.head().to_string(index=False)}
	Dataset Summary:
	"""
	    # max_new_tokens limits only the continuation (max_length also counts
	    # the prompt); return_full_text=False drops the echoed prompt.
	    response = llm(summary_template, max_new_tokens=150, return_full_text=False)
	    return response[0]['generated_text'].strip()

	# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
	def clean_sql_query(query):
	    """Strip ``COLLATE NOCASE`` sequences from the top level of a SQL string.

	    Each parsed statement's direct child tokens are scanned. A ``COLLATE``
	    keyword whose second-next sibling reads ``NOCASE`` is dropped together
	    with that sibling and the token between them. Every other direct child
	    is kept as-is, so anything sqlparse has nested inside one is not
	    examined. The rebuilt statements are joined with single spaces.
	    """
	    rebuilt = []
	    for statement in sqlparse.parse(query):
	        children = statement.tokens
	        count = len(children)
	        kept = []
	        pos = 0
	        while pos < count:
	            current = children[pos]
	            is_collate = (
	                current.ttype is sqlparse.tokens.Keyword
	                and current.value.upper() == 'COLLATE'
	            )
	            if is_collate:
	                # Look two siblings ahead: keyword, separator, collation name.
	                follower = children[pos + 2] if pos + 2 < count else None
	                if follower and follower.value.upper() == 'NOCASE':
	                    pos += 3  # drop 'COLLATE', the separator and 'NOCASE'
	                    continue
	            kept.append(current)
	            pos += 1
	        rebuilt.append(''.join(str(piece) for piece in kept))
	    return ' '.join(rebuilt)

	# Define the callback function
	def process_input():
	    """Handle a submitted chat message (``on_change`` callback of the input box).

	    Reads the text from ``st.session_state['user_input']``, appends it to the
	    conversation history, and appends one or more assistant replies:

	    * questions mentioning "columns" get the list of valid columns;
	    * questions classified as 'SQL' are translated to a query, which is
	      cleaned, executed against ``conn`` and followed by generated insights
	      and the result table;
	    * everything else (or a "NO_SQL" answer from the model) gets general
	      insights based on a dataset summary.

	    Any failure is reported in the conversation rather than raised. The
	    input box is cleared afterwards.
	    """
	    user_prompt = st.session_state['user_input']
	    if not user_prompt:
	        return

	    history = st.session_state.history

	    def reply(content):
	        # Record one assistant message (text or DataFrame) in the history.
	        history.append({"role": "assistant", "content": content})

	    def reply_with_general_insights():
	        # Shared fallback for open-ended questions and NO_SQL answers.
	        dataset_summary = generate_dataset_summary(data)
	        reply(generate_insights(user_prompt, dataset_summary))

	    def answer_sql_question():
	        # Translate the question to SQL, run it, and report the outcome.
	        columns = ', '.join(valid_columns)
	        generated_sql = generate_sql_query(user_prompt, table_name, columns)

	        # Tolerate quotes or trailing text around the NO_SQL marker.
	        if generated_sql.strip('"\' ').upper().startswith("NO_SQL"):
	            reply_with_general_insights()
	            return

	        cleaned_sql = clean_sql_query(generated_sql)
	        logging.info("Generated SQL Query: %s", cleaned_sql)

	        # The statement comes from a language model driven by user input;
	        # only run read-only queries so it cannot alter or drop tables.
	        if not cleaned_sql.lstrip().upper().startswith(("SELECT", "WITH")):
	            reply("Error executing SQL query: only read-only SELECT queries are supported.")
	            return

	        try:
	            result = pd.read_sql_query(cleaned_sql, conn)
	        except Exception as e:
	            logging.exception("An error occurred during SQL execution")
	            reply(f"Error executing SQL query: {e}")
	            return

	        if result.empty:
	            reply("The query returned no results. Please try a different question.")
	            return

	        # Only the first 10 rows are shown to the model to keep the prompt short.
	        result_str = result.head(10).to_string(index=False)
	        reply(generate_insights(user_prompt, result_str))
	        # The full result is stored as a DataFrame so it renders as a table.
	        reply(result)

	    try:
	        history.append({"role": "user", "content": user_prompt})

	        if "COLUMNS" in user_prompt.upper():
	            # Answered directly; no need to spend a model call on classification.
	            reply(f"The columns are: {', '.join(valid_columns)}")
	        else:
	            category = classify_query(user_prompt)
	            logging.info("User query classified as: %s", category)
	            if category == 'SQL':
	                answer_sql_question()
	            else:  # INSIGHTS category
	                reply_with_general_insights()
	    except Exception as e:
	        # Top-level boundary of the callback: surface the problem in the chat.
	        logging.exception("An error occurred")
	        reply(f"Error: {e}")

	    # Reset the user_input in session state
	    st.session_state['user_input'] = ''

	# Display the conversation history
	for message in st.session_state.history:
	if message['role'] == 'user':
	st.markdown(f"User: {message['content']}")
	elif message['role'] == 'assistant':
	if isinstance(message['content'], pd.DataFrame):
	st.markdown("Assistant: Query Results:")
	st.dataframe(message['content'])
	else:
	st.markdown(f"Assistant: {message['content']}")

	# Place the input field at the bottom with the callback
	st.text_input("Enter your message:", key='user_input', on_change=process_input)