GenBIChatbotfree

Sleeping

App Files Files Community

Ari committed on Sep 25, 2024

Commit

0bb1965

verified ·

1 Parent(s): 82bfc51

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -94

app.py CHANGED Viewed

@@ -2,33 +2,30 @@ import os
 import streamlit as st
 import pandas as pd
 import sqlite3
-import numpy as np  # For numerical operations
 from langchain import OpenAI, LLMChain, PromptTemplate
 import sqlparse
 import logging
-from sklearn.linear_model import LinearRegression  # For machine learning tasks
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, r2_score
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
-# Set up logging
-logging.basicConfig(level=logging.ERROR)
 # OpenAI API key (ensure it is securely stored)
 openai_api_key = os.getenv("OPENAI_API_KEY")
-# Set OpenAI API key for langchain
-from langchain.llms import OpenAI as LangchainOpenAI
-LangchainOpenAI.api_key = openai_api_key
 # Step 1: Upload CSV data file (or use default)
-st.title("Data Science Chatbot")
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 if csv_file is None:
-    data = pd.read_csv("default_data.csv")  # Use default CSV if no file is uploaded
     st.write("Using default_data.csv file.")
 else:
     data = pd.read_csv(csv_file)
@@ -45,30 +42,16 @@ data.to_sql(table_name, conn, index=False, if_exists='replace')
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
-# Step 3: Define helper functions
-def extract_code(response):
-    """Extract the code enclosed between <CODE> and </CODE> tags.
-
-    Args:
-        response: Text to search for a <CODE>...</CODE> pair.
-
-    Returns:
-        The text captured between the first <CODE> tag and the nearest
-        following </CODE> tag, with leading and trailing whitespace
-        stripped, or None when no such pair is found.
-    """
-    import re
-    pattern = r"<CODE>(.*?)</CODE>"  # non-greedy group: stops at the first closing tag
-    match = re.search(pattern, response, re.DOTALL)  # DOTALL lets the captured text span several lines
-    if match:
-        return match.group(1).strip()
-    else:
-        return None
-# Step 4: Set up the LLM Chain to generate SQL queries or Python code
 template = """
-You are an expert data scientist assistant. Given a natural language question, the name of the table, and a list of valid columns, decide whether to generate a SQL query to retrieve data, perform statistical analysis, or build a simple machine learning model.
-Instructions:
-- If the question involves data retrieval or simple aggregations, generate a SQL query.
-- If the question requires statistical analysis, generate a Python code snippet using pandas and numpy.
-- If the question involves predictions or modeling, generate a Python code snippet using scikit-learn.
-- Ensure that you only use the columns provided.
-- Do not include any import statements in the code.
-- For case-insensitive string comparisons in SQL, use either 'LOWER(column) = LOWER(value)' or 'column = value COLLATE NOCASE', but do not use both together.
-- Provide the code between <CODE> and </CODE> tags.
 Question: {question}
@@ -76,12 +59,34 @@ Table name: {table_name}
 Valid columns: {columns}
-Response:
 """
 prompt = PromptTemplate(template=template, input_variables=['question', 'table_name', 'columns'])
-llm = LangchainOpenAI(temperature=0)
 sql_generation_chain = LLMChain(llm=llm, prompt=prompt)
 # Define the callback function
 def process_input():
     user_prompt = st.session_state['user_input']
@@ -96,59 +101,27 @@ def process_input():
                 st.session_state.history.append({"role": "assistant", "content": assistant_response})
             else:
                 columns = ', '.join(valid_columns)
-                response = sql_generation_chain.run({
                     'question': user_prompt,
                     'table_name': table_name,
                     'columns': columns
                 })
-                # Extract code from response
-                code = extract_code(response)
-                if code:
-                    # Determine if the code is SQL or Python
-                    if code.strip().lower().startswith('select'):
-                        # It's a SQL query
-                        st.write(f"Generated SQL Query:\n{code}")
-                        try:
-                            # Execute the SQL query
-                            result = pd.read_sql_query(code, conn)
-                            assistant_response = f"Generated SQL Query:\n{code}"
-                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                            st.session_state.history.append({"role": "assistant", "content": result})
-                        except Exception as e:
-                            logging.error(f"An error occurred during SQL execution: {e}")
-                            assistant_response = f"Error executing SQL query: {e}"
-                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                    else:
-                        # It's Python code
-                        st.write(f"Generated Python Code:\n{code}")
-                        try:
-                            # Prepare the local namespace
-                            local_vars = {
-                                'pd': pd,
-                                'np': np,
-                                'data': data.copy(),
-                                'result': None,
-                                'LinearRegression': LinearRegression,
-                                'train_test_split': train_test_split,
-                                'mean_squared_error': mean_squared_error,
-                                'r2_score': r2_score
-                            }
-                            exec(code, {}, local_vars)
-                            result = local_vars.get('result')
-                            if result is not None:
-                                assistant_response = "Result:"
-                                st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                                st.session_state.history.append({"role": "assistant", "content": result})
-                            else:
-                                assistant_response = "Code executed successfully."
-                                st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                        except Exception as e:
-                            logging.error(f"An error occurred during code execution: {e}")
-                            assistant_response = f"Error executing code: {e}"
-                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                else:
-                    assistant_response = response.strip()
                     st.session_state.history.append({"role": "assistant", "content": assistant_response})
         except Exception as e:
@@ -164,17 +137,11 @@ for message in st.session_state.history:
     if message['role'] == 'user':
         st.markdown(f"**User:** {message['content']}")
     elif message['role'] == 'assistant':
-        content = message['content']
-        if isinstance(content, pd.DataFrame):
-            st.markdown("**Assistant:** Here are the results:")
-            st.dataframe(content)
-        elif isinstance(content, (int, float)):
-            st.markdown(f"**Assistant:** {content}")
-        elif isinstance(content, dict):
-            st.markdown("**Assistant:** Here are the results:")
-            st.json(content)
         else:
-            st.markdown(f"**Assistant:** {content}")
 # Place the input field at the bottom with the callback
 st.text_input("Enter your message:", key='user_input', on_change=process_input)

 import streamlit as st
 import pandas as pd
 import sqlite3
 from langchain import OpenAI, LLMChain, PromptTemplate
 import sqlparse
 import logging
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
 # OpenAI API key (ensure it is securely stored)
+# You can set the API key in your environment variables or a .env file
 openai_api_key = os.getenv("OPENAI_API_KEY")
+# Check if the API key is set
+if not openai_api_key:
+    st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
+    st.stop()
 # Step 1: Upload CSV data file (or use default)
+st.title("Natural Language to SQL Query App")
+st.write("Upload a CSV file to get started, or use the default dataset.")
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 if csv_file is None:
+    data = pd.read_csv("default_data.csv")  # Ensure this file exists in your working directory
     st.write("Using default_data.csv file.")
 else:
     data = pd.read_csv(csv_file)
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
+# Step 3: Set up the LLM Chain to generate SQL queries
 template = """
+You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
+Ensure that:
+- You only use the columns provided.
+- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
+- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
+- Do not apply 'COLLATE NOCASE' to numeric columns.
 Question: {question}
 Valid columns: {columns}
+SQL Query:
 """
 prompt = PromptTemplate(template=template, input_variables=['question', 'table_name', 'columns'])
+llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
 sql_generation_chain = LLMChain(llm=llm, prompt=prompt)
+# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
+def clean_sql_query(query):
+    """Strip COLLATE NOCASE pairs found among each statement's direct tokens.
+
+    The query is parsed with sqlparse. For every parsed statement, the
+    entries of ``stmt.tokens`` are scanned in order; a keyword token reading
+    COLLATE whose second-next token reads NOCASE is dropped together with
+    the two tokens that follow it. All other tokens are kept and rendered
+    back to text.
+
+    Every matching pair is removed: the function does not itself decide
+    whether a particular COLLATE NOCASE was used correctly.
+
+    NOTE(review): only ``stmt.tokens`` is scanned, not the children of
+    nested token groups. sqlparse presumably nests clauses such as WHERE in
+    their own group, in which case COLLATE NOCASE inside them is left
+    untouched -- confirm against the sqlparse version in use.
+
+    Args:
+        query: SQL text to clean.
+
+    Returns:
+        The rebuilt SQL string; when the input parses into several
+        statements they are joined with a single space.
+    """
+    parsed = sqlparse.parse(query)
+    statements = []
+    for stmt in parsed:
+        tokens = []
+        idx = 0
+        while idx < len(stmt.tokens):
+            token = stmt.tokens[idx]
+            if (token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE'):
+                # Look two positions ahead, past the token directly after COLLATE; None when out of range
+                next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
+                if next_token and next_token.value.upper() == 'NOCASE':
+                    # Drop the whole pair by advancing without appending anything
+                    idx += 3  # Skip 'COLLATE', the token in between (assumed to be whitespace), and 'NOCASE'
+                    continue
+            tokens.append(token)
+            idx += 1
+        statements.append(''.join([str(t) for t in tokens]))
+    return ' '.join(statements)
 # Define the callback function
 def process_input():
     user_prompt = st.session_state['user_input']
                 st.session_state.history.append({"role": "assistant", "content": assistant_response})
             else:
                 columns = ', '.join(valid_columns)
+                generated_sql = sql_generation_chain.run({
                     'question': user_prompt,
                     'table_name': table_name,
                     'columns': columns
                 })
+                # Debug: Display generated SQL query for inspection
+                st.write(f"Generated SQL Query:\n{generated_sql}")
+                # Clean the SQL query
+                generated_sql = clean_sql_query(generated_sql)
+                # Attempt to execute SQL query and handle exceptions
+                try:
+                    result = pd.read_sql_query(generated_sql, conn)
+                    assistant_response = f"Generated SQL Query:\n{generated_sql}"
+                    st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    st.session_state.history.append({"role": "assistant", "content": result})
+                except Exception as e:
+                    logging.error(f"An error occurred during SQL execution: {e}")
+                    assistant_response = f"Error executing SQL query: {e}"
                     st.session_state.history.append({"role": "assistant", "content": assistant_response})
         except Exception as e:
     if message['role'] == 'user':
         st.markdown(f"**User:** {message['content']}")
     elif message['role'] == 'assistant':
+        if isinstance(message['content'], pd.DataFrame):
+            st.markdown("**Assistant:** Query Results:")
+            st.dataframe(message['content'])
         else:
+            st.markdown(f"**Assistant:** {message['content']}")
 # Place the input field at the bottom with the callback
 st.text_input("Enter your message:", key='user_input', on_change=process_input)