arithescientist committed on
Commit 1c40c30 · verified · 1 Parent(s): f69e09c

Update app.py

Files changed (1)
  1. app.py +40 -75
app.py CHANGED
@@ -30,7 +30,7 @@ else:
 
     # Step 2: Load CSV data into a persistent SQLite database
     db_file = 'my_database.db'
-    conn = sqlite3.connect(db_file)
+    conn = sqlite3.connect(db_file, check_same_thread=False)  # Allow connection across threads
    data.to_sql(table_name, conn, index=False, if_exists='replace')
 
    # SQL table metadata (for validation and schema)
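Note on check_same_thread=False: Streamlit can run the script and its callbacks on different threads, and SQLite connections reject cross-thread use by default, so the flag silences that ProgrammingError; it does not make the shared connection thread-safe on its own. A minimal alternative sketch (not part of this commit; run_query is a hypothetical helper) that avoids sharing a connection at all:

    from contextlib import closing
    import sqlite3
    import pandas as pd

    def run_query(db_file: str, sql: str) -> pd.DataFrame:
        # A fresh, short-lived connection per call never crosses threads;
        # closing() releases the handle (sqlite3's own context manager
        # only commits, it does not close the connection).
        with closing(sqlite3.connect(db_file)) as conn:
            return pd.read_sql_query(sql, conn)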
@@ -40,80 +40,45 @@ st.write(f"Valid columns: {valid_columns}")
 # Function to generate SQL query using Hugging Face model
 def generate_sql_query(question, table_name, columns):
     prompt = f"""
-    You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
-    Ensure that:
-    - You only use the columns provided.
-    - When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
-    - Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
-    - Do not apply 'COLLATE NOCASE' to numeric columns.
-    If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
+    Generate a valid SQL query using the following columns:
+    {columns}.
     Question: {question}
-    Table name: {table_name}
-    Valid columns: {columns}
-    SQL Query:
     """
-    response = llm(prompt, max_new_tokens=150)  # Changed max_length to max_new_tokens
+    response = llm(prompt, max_new_tokens=50, truncation=True)  # Ensure max tokens are reasonable
     return response[0]['generated_text'].strip()
 
-
 # Function to generate insights using Hugging Face model
 def generate_insights(question, result):
     prompt = f"""
-    You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
-    User's Question: {question}
-    SQL Query Result:
+    Based on the user's question and the SQL query result below, generate concise data insights:
     {result}
-    Concise Analysis (max 200 words):
     """
-    response = llm(prompt, max_new_tokens=100)  # Changed max_length to max_new_tokens
+    response = llm(prompt, max_new_tokens=100, truncation=True)
     return response[0]['generated_text'].strip()
 
-
 # Function to classify user query as SQL or Insights
 def classify_query(question):
     prompt = f"""
-    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
-    Determine the appropriate category for the following user question.
-    Question: "{question}"
-    Category (SQL/INSIGHTS):
+    Classify the following question as 'SQL' or 'INSIGHTS':
+    "{question}"
     """
-    response = llm(prompt, max_length=100)
+    response = llm(prompt, max_new_tokens=10, truncation=True)
     category = response[0]['generated_text'].strip().upper()
     return 'SQL' if 'SQL' in category else 'INSIGHTS'
 
 # Function to generate dataset summary
 def generate_dataset_summary(data):
     summary_template = f"""
-    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
-    Dataset:
+    Provide a brief summary of the dataset:
     {data.head().to_string(index=False)}
-    Dataset Summary:
     """
-    response = llm(summary_template, max_new_tokens=100)  # Changed max_length to max_new_tokens
+    response = llm(summary_template, max_new_tokens=100, truncation=True)
     return response[0]['generated_text'].strip()
 
-
-# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
-def clean_sql_query(query):
-    """Removes incorrect usage of COLLATE NOCASE from the SQL query."""
-    parsed = sqlparse.parse(query)
-    statements = []
-    for stmt in parsed:
-        tokens = []
-        idx = 0
-        while idx < len(stmt.tokens):
-            token = stmt.tokens[idx]
-            if (token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE'):
-                # Check if the next token is 'NOCASE'
-                next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
-                if next_token and next_token.value.upper() == 'NOCASE':
-                    # Skip 'COLLATE' and 'NOCASE' tokens
-                    idx += 3  # Skip 'COLLATE', whitespace, 'NOCASE'
-                    continue
-            tokens.append(token)
-            idx += 1
-        statements.append(''.join([str(t) for t in tokens]))
-    return ' '.join(statements)
+# Function to validate if the generated SQL query is valid
+def is_valid_sql(query):
+    sql_keywords = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP", "ALTER"]
+    return any(query.strip().upper().startswith(keyword) for keyword in sql_keywords)
 
 # Define the callback function
 def process_input():
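Note on the llm(...) call sites: each one now passes max_new_tokens together with truncation=True. The pipeline itself is created elsewhere in app.py; a hypothetical setup consistent with these calls (the model name is a placeholder, not taken from this commit):

    from transformers import pipeline

    llm = pipeline("text-generation", model="gpt2")  # placeholder model

    prompt = "Classify the following question as 'SQL' or 'INSIGHTS':\n\"Top five products by revenue?\""
    # return_full_text=False strips the echoed prompt from the output.
    # Without it, generated_text starts with the prompt itself, and since
    # the prompt contains the word 'SQL', classify_query() would answer
    # 'SQL' for every question.
    response = llm(prompt, max_new_tokens=10, truncation=True, return_full_text=False)
    print(response[0]["generated_text"].strip().upper())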
@@ -148,32 +113,32 @@ def process_input():
             # Append the assistant's insights to the history
             st.session_state.history.append({"role": "assistant", "content": general_insights})
         else:
-            # Clean the SQL query
-            cleaned_sql = clean_sql_query(generated_sql)
-            logging.info(f"Generated SQL Query: {cleaned_sql}")
-
-            # Attempt to execute SQL query and handle exceptions
-            try:
-                result = pd.read_sql_query(cleaned_sql, conn)
-
-                if result.empty:
-                    assistant_response = "The query returned no results. Please try a different question."
+            # Validate the SQL query
+            if is_valid_sql(generated_sql):
+                # Attempt to execute SQL query and handle exceptions
+                try:
+                    result = pd.read_sql_query(generated_sql, conn)
+
+                    if result.empty:
+                        assistant_response = "The query returned no results. Please try a different question."
+                        st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    else:
+                        # Convert the result to a string for the insights prompt
+                        result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
+
+                        # Generate insights and recommendations based on the query result
+                        insights = generate_insights(user_prompt, result_str)
+
+                        # Append the assistant's insights to the history
+                        st.session_state.history.append({"role": "assistant", "content": insights})
+                        # Append the result DataFrame to the history
+                        st.session_state.history.append({"role": "assistant", "content": result})
+                except Exception as e:
+                    logging.error(f"An error occurred during SQL execution: {e}")
+                    assistant_response = f"Error executing SQL query: {e}"
                     st.session_state.history.append({"role": "assistant", "content": assistant_response})
-            else:
-                # Convert the result to a string for the insights prompt
-                result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
-
-                # Generate insights and recommendations based on the query result
-                insights = generate_insights(user_prompt, result_str)
-
-                # Append the assistant's insights to the history
-                st.session_state.history.append({"role": "assistant", "content": insights})
-                # Append the result DataFrame to the history
-                st.session_state.history.append({"role": "assistant", "content": result})
-            except Exception as e:
-                logging.error(f"An error occurred during SQL execution: {e}")
-                assistant_response = f"Error executing SQL query: {e}"
-                st.session_state.history.append({"role": "assistant", "content": assistant_response})
+            else:
+                st.session_state.history.append({"role": "assistant", "content": "Generated text is not a valid SQL query."})
         else:  # INSIGHTS category
             # Generate dataset summary
             dataset_summary = generate_dataset_summary(data)
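Note on the new validation branch: is_valid_sql accepts any statement that begins with a common SQL keyword, including INSERT, DROP, and ALTER, and pd.read_sql_query passes whatever it receives straight to the connection. If the intent is read-only analytics, a stricter guard is a one-line change (a sketch, not part of this commit):

    def is_read_only_sql(query):
        # Accept only plain SELECT statements so generated text can
        # never modify the database.
        return query.strip().upper().startswith("SELECT")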
 
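Quick behavioral check of the guard added here (illustrative; the function is copied from the new app.py so the snippet runs on its own):

    def is_valid_sql(query):
        sql_keywords = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP", "ALTER"]
        return any(query.strip().upper().startswith(keyword) for keyword in sql_keywords)

    print(is_valid_sql("SELECT name FROM my_table"))   # True
    print(is_valid_sql("drop table my_table"))         # True: write statements also pass
    print(is_valid_sql("I'm not sure how to answer"))  # False: model chatter is rejected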