arithescientist committed on
Commit 1c40c30 · verified · 1 Parent(s): f69e09c

Update app.py

Files changed (1)
  1. app.py +40 -75
app.py CHANGED
@@ -30,7 +30,7 @@ else:
 
     # Step 2: Load CSV data into a persistent SQLite database
     db_file = 'my_database.db'
-    conn = sqlite3.connect(db_file)
+    conn = sqlite3.connect(db_file, check_same_thread=False)  # Allow connection across threads
    data.to_sql(table_name, conn, index=False, if_exists='replace')
 
    # SQL table metadata (for validation and schema)
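Note on check_same_thread=False: Streamlit can run the script and its callbacks on different threads, and SQLite connections reject cross-thread use by default, so the flag silences that ProgrammingError; it does not make the shared connection thread-safe on its own. A minimal alternative sketch (not part of this commit; run_query is a hypothetical helper) that avoids sharing a connection at all:

    from contextlib import closing
    import sqlite3
    import pandas as pd

    def run_query(db_file: str, sql: str) -> pd.DataFrame:
        # A fresh, short-lived connection per call never crosses threads;
        # closing() releases the handle (sqlite3's own context manager
        # only commits, it does not close the connection).
        with closing(sqlite3.connect(db_file)) as conn:
            return pd.read_sql_query(sql, conn)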
@@ -40,80 +40,45 @@ st.write(f"Valid columns: {valid_columns}")
 # Function to generate SQL query using Hugging Face model
 def generate_sql_query(question, table_name, columns):
     prompt = f"""
-    You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
-    Ensure that:
-    - You only use the columns provided.
-    - When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
-    - Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
-    - Do not apply 'COLLATE NOCASE' to numeric columns.
-    If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
+    Generate a valid SQL query using the following columns:
+    {columns}.
     Question: {question}
-    Table name: {table_name}
-    Valid columns: {columns}
-    SQL Query:
     """
-    response = llm(prompt, max_new_tokens=150)  # Changed max_length to max_new_tokens
+    response = llm(prompt, max_new_tokens=50, truncation=True)  # Ensure max tokens are reasonable
     return response[0]['generated_text'].strip()
 
-
 # Function to generate insights using Hugging Face model
 def generate_insights(question, result):
     prompt = f"""
-    You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
-    User's Question: {question}
-    SQL Query Result:
+    Based on the user's question and the SQL query result below, generate concise data insights:
     {result}
-    Concise Analysis (max 200 words):
     """
-    response = llm(prompt, max_new_tokens=100)  # Changed max_length to max_new_tokens
+    response = llm(prompt, max_new_tokens=100, truncation=True)
     return response[0]['generated_text'].strip()
 
-
 # Function to classify user query as SQL or Insights
 def classify_query(question):
     prompt = f"""
-    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
-    Determine the appropriate category for the following user question.
-    Question: "{question}"
-    Category (SQL/INSIGHTS):
+    Classify the following question as 'SQL' or 'INSIGHTS':
+    "{question}"
     """
-    response = llm(prompt, max_length=100)
+    response = llm(prompt, max_new_tokens=10, truncation=True)
     category = response[0]['generated_text'].strip().upper()
     return 'SQL' if 'SQL' in category else 'INSIGHTS'
 
 # Function to generate dataset summary
 def generate_dataset_summary(data):
     summary_template = f"""
-    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
-    Dataset:
+    Provide a brief summary of the dataset:
     {data.head().to_string(index=False)}
-    Dataset Summary:
     """
-    response = llm(summary_template, max_new_tokens=100)  # Changed max_length to max_new_tokens
+    response = llm(summary_template, max_new_tokens=100, truncation=True)
     return response[0]['generated_text'].strip()
 
-
-# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
-def clean_sql_query(query):
-    """Removes incorrect usage of COLLATE NOCASE from the SQL query."""
-    parsed = sqlparse.parse(query)
-    statements = []
-    for stmt in parsed:
-        tokens = []
-        idx = 0
-        while idx < len(stmt.tokens):
-            token = stmt.tokens[idx]
-            if (token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE'):
-                # Check if the next token is 'NOCASE'
-                next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
-                if next_token and next_token.value.upper() == 'NOCASE':
-                    # Skip 'COLLATE' and 'NOCASE' tokens
-                    idx += 3  # Skip 'COLLATE', whitespace, 'NOCASE'
-                    continue
-            tokens.append(token)
-            idx += 1
-        statements.append(''.join([str(t) for t in tokens]))
-    return ' '.join(statements)
+# Function to validate if the generated SQL query is valid
+def is_valid_sql(query):
+    sql_keywords = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP", "ALTER"]
+    return any(query.strip().upper().startswith(keyword) for keyword in sql_keywords)
 
 # Define the callback function
 def process_input():
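Note on the llm(...) call sites: each one now passes max_new_tokens together with truncation=True. The pipeline itself is created elsewhere in app.py; a hypothetical setup consistent with these calls (the model name is a placeholder, not taken from this commit):

    from transformers import pipeline

    llm = pipeline("text-generation", model="gpt2")  # placeholder model

    prompt = "Classify the following question as 'SQL' or 'INSIGHTS':\n\"Top five products by revenue?\""
    # return_full_text=False strips the echoed prompt from the output.
    # Without it, generated_text starts with the prompt itself, and since
    # the prompt contains the word 'SQL', classify_query() would answer
    # 'SQL' for every question.
    response = llm(prompt, max_new_tokens=10, truncation=True, return_full_text=False)
    print(response[0]["generated_text"].strip().upper())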
@@ -148,32 +113,32 @@ def process_input():
             # Append the assistant's insights to the history
             st.session_state.history.append({"role": "assistant", "content": general_insights})
         else:
-            # Clean the SQL query
-            cleaned_sql = clean_sql_query(generated_sql)
-            logging.info(f"Generated SQL Query: {cleaned_sql}")
-
-            # Attempt to execute SQL query and handle exceptions
-            try:
-                result = pd.read_sql_query(cleaned_sql, conn)
-
-                if result.empty:
-                    assistant_response = "The query returned no results. Please try a different question."
+            # Validate the SQL query
+            if is_valid_sql(generated_sql):
+                # Attempt to execute SQL query and handle exceptions
+                try:
+                    result = pd.read_sql_query(generated_sql, conn)
+
+                    if result.empty:
+                        assistant_response = "The query returned no results. Please try a different question."
+                        st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    else:
+                        # Convert the result to a string for the insights prompt
+                        result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
+
+                        # Generate insights and recommendations based on the query result
+                        insights = generate_insights(user_prompt, result_str)
+
+                        # Append the assistant's insights to the history
+                        st.session_state.history.append({"role": "assistant", "content": insights})
+                        # Append the result DataFrame to the history
+                        st.session_state.history.append({"role": "assistant", "content": result})
+                except Exception as e:
+                    logging.error(f"An error occurred during SQL execution: {e}")
+                    assistant_response = f"Error executing SQL query: {e}"
                     st.session_state.history.append({"role": "assistant", "content": assistant_response})
-            else:
-                # Convert the result to a string for the insights prompt
-                result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
-
-                # Generate insights and recommendations based on the query result
-                insights = generate_insights(user_prompt, result_str)
-
-                # Append the assistant's insights to the history
-                st.session_state.history.append({"role": "assistant", "content": insights})
-                # Append the result DataFrame to the history
-                st.session_state.history.append({"role": "assistant", "content": result})
-            except Exception as e:
-                logging.error(f"An error occurred during SQL execution: {e}")
-                assistant_response = f"Error executing SQL query: {e}"
-                st.session_state.history.append({"role": "assistant", "content": assistant_response})
+            else:
+                st.session_state.history.append({"role": "assistant", "content": "Generated text is not a valid SQL query."})
         else:  # INSIGHTS category
             # Generate dataset summary
             dataset_summary = generate_dataset_summary(data)
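Note on the new validation branch: is_valid_sql accepts any statement that begins with a common SQL keyword, including INSERT, DROP, and ALTER, and pd.read_sql_query passes whatever it receives straight to the connection. If the intent is read-only analytics, a stricter guard is a one-line change (a sketch, not part of this commit):

    def is_read_only_sql(query):
        # Accept only plain SELECT statements so generated text can
        # never modify the database.
        return query.strip().upper().startswith("SELECT")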
 
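Quick behavioral check of the guard added here (illustrative; the function is copied from the new app.py so the snippet runs on its own):

    def is_valid_sql(query):
        sql_keywords = ["SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP", "ALTER"]
        return any(query.strip().upper().startswith(keyword) for keyword in sql_keywords)

    print(is_valid_sql("SELECT name FROM my_table"))   # True
    print(is_valid_sql("drop table my_table"))         # True: write statements also pass
    print(is_valid_sql("I'm not sure how to answer"))  # False: model chatter is rejected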