Spaces:
Sleeping
Sleeping
File size: 17,999 Bytes
5975965 0a6aa1c 5975965 5870b59 f1be059 5870b59 5975965 0a6aa1c 5975965 a0d222a 5870b59 5975965 5ec66d0 5870b59 5975965 f1be059 5870b59 f1be059 5870b59 fb9cae8 5870b59 fb9cae8 416ee2d 5975965 5870b59 5975965 5870b59 5975965 5870b59 5975965 416ee2d 5870b59 5975965 f81145c 7dc0ac4 64a2db8 7dc0ac4 64a2db8 5975965 5870b59 5975965 819dc57 5870b59 a6e3784 819dc57 a6e3784 819dc57 5870b59 5975965 5870b59 5975965 7dc0ac4 a6e3784 7dc0ac4 e0721fb 7dc0ac4 29bcfc5 a3682f0 29bcfc5 a3682f0 29bcfc5 7dc0ac4 746fb67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 |
import os
import re
from langchain.agents import initialize_agent, Tool
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
# Initialize a blank DataFrame as a global variable
global_df = pd.DataFrame()
class ChatHandler:
def __init__(self, vector_db_path, open_api_key, grok_api_key,db_final):
self.vector_db_path = vector_db_path
self.openai_embeddings = OpenAIEmbeddings(api_key=open_api_key)
self.llm_openai = ChatOpenAI(model_name="gpt-4o-mini", api_key=open_api_key, max_tokens=500, temperature=0.2)
self.grok_api_key = grok_api_key
self.openai_api_key = open_api_key
self.sql_db = db_final
def _load_documents_from_vector_db(self, query):
"""Fetch relevant documents from the vector database."""
results = []
# Debug: Print the query being processed
print(f"Processing query: {query}")
for root, dirs, files in os.walk(self.vector_db_path):
print(f"Searching in directory: {root}") # Debug: Current directory being processed
for dir in dirs:
index_path = os.path.join(root, dir, "index.faiss")
# Debug: Check if FAISS index exists
if os.path.exists(index_path):
print(f"Found FAISS index at: {index_path}")
# Load the FAISS vector store
try:
vector_store = FAISS.load_local(
os.path.join(root, dir),
self.openai_embeddings,
allow_dangerous_deserialization=True
)
print(f"Loaded FAISS vector store from: {os.path.join(root, dir)}")
except Exception as e:
print(f"Error loading FAISS store: {e}")
continue
# Perform similarity search
try:
response_with_scores = vector_store.similarity_search_with_relevance_scores(query, k=100)
#print(response_with_scores)
print(f"Similarity search returned {len(response_with_scores)} results.")
filtered_results = [
(doc, score) for doc, score in response_with_scores
if score is not None and score > 0.7 #and material_name.lower() in doc.page_content.lower() # Check material name in document
]
print(f"Filtered results: {filtered_results}")
response_with_scores = filtered_results
# Debug: Print each document and score
for doc, score in response_with_scores:
print(f"Document: {doc.page_content[:100]}... Score: {score}")
results.extend([(doc.page_content, score) for doc, score in response_with_scores])
except Exception as e:
print(f"Error during similarity search: {e}")
# Sort and return results
sorted_results = [doc for doc, score in sorted(results, key=lambda x: -x[1])]
print(f"Total results after sorting: {len(sorted_results)}")
return sorted_results
def _load_schema_from_database(self, query):
"""
Fetch database schema, generate a SQL query from the user's question, and execute it.
"""
try:
# Fetch the schema
schema = self.sql_db.get_table_info()
# Define the prompt template
template_query_generation = """
Based on the table schema below, write a mySQL query with correct syntax that would answer the user's question.
Only write the SQL query without explanations and without string 'sql'and '''.
Schema:
{schema}
Question: {question}
SQL Query:
"""
prompt = PromptTemplate(
input_variables=["schema", "question"],
template=template_query_generation
)
# Initialize the language modelgpt-4o-mini
llm = ChatOpenAI(model_name="gpt-4o-mini", api_key=self.openai_api_key, max_tokens=500, temperature=0.2)
# Create the runnable sequence
chain = prompt | llm | StrOutputParser()
# Generate the SQL query
sql_query = chain.invoke({"schema": schema, "question": query}).strip()
if not sql_query:
return "Could not generate an SQL query for your question."
# Execute the SQL query
try:
result = self.sql_db.run(sql_query)
print(f"SQL query executed successfully. Result: {result}")
except Exception as e:
print(f"Error executing SQL query: {str(e)}")
return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"
# If no result, return an appropriate message
if not result:
return "Query executed, but no results were returned."
# Return the result
return result
except Exception as e:
print( f"Error fetching schema details or processing query: {str(e)}")
return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"
def answer_question(self, query, visual_query):
global global_df
"""Determine whether to use vector database or SQL database for the query."""
tools = [
# {
# "name": "Document Vector Store",
# "function": lambda q: "\n".join(self._load_documents_from_vector_db(q)),
# "description": """Search within the uploaded documents stored in the vector database.
# Display the response as a combination of response summary and the response data in the form of table.
# If the user requested comparison between two or more years, data should be shown for all the years. (For example, if the user requested from 2020 to 2024, then display the output table with the columns [Month, Material value in 2020, Material value in 2021, Material value in 2022, Material value in 2023, Material value in 2024]) so that the records will be displayed for all the months from Jaunary to December across the years.
# display the material quantity in blue colour if it the 'Type' column value is 'actual'.
# display the Material Quanity in red colour if its value is 'predicted'.
# include the table data in the Final answer of agent executor invoke.""",
# },
{
"name": "Database Schema",
"function": lambda q: self._load_schema_from_database(q),
"description": """Search within the mysql database schema and generate SQL-based responses.
The database has single table 'tp_material_forecast' which contains the columns 'date', 'material_name', 'material_quantity', and 'type'. Frame the query only with these four columns.
If the material name is given, frame the query in such a way that the material_name is not case-sensitive.
If the material name is not present in the table, return the proper message as "This material name is not in the database". Do not give any false values if the material name is not available in database.
If the response has month column, display the month as name For example, January instead of displaying as 1.
If the user requested comparison between two or more years or the user asks for the data for all years, data should be shown for all the years with month as first column and the years like 2020, 2021 etc as the adjacent columns.
Do not show everything in the same column. (For example, if the user requested from 2020 to 2024, then display the output table with the columns [Month, Material value in 2020, Material value in 2020, Material value in 2021, Material value in 2022, Material value in 2023, Material value in 2024]) so that the records will be displayed for all the months from Jaunary to December across the years.
If there is any error while executing the user question, kindly display the error message as 'As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!
display the response as a combination of response summary and the response data in the form of table. Display the table properly in a professional manner. """,
},
]
agent_prompt = PromptTemplate(
input_variables=["input", "agent_scratchpad"],
template="""
You are a highly skilled AI assistant specializing in mysql database.
I have a mysql database for material demand forecasts with columns as 'date', 'material_name', 'material_quantity', and 'type'.
The data includes historical demand information for various items.
1. The uploaded document includes:
- **Date:** The date of demand entry.
- **Material Name:** The name of the material or equipment.
- **Material Quantity:** The number of units actual or predicted.
- **Type:** Type contains actual or forecasted, actual represents the actual material utilized and forecasted represents the prediction by ai model.
2. I may ask questions such as:
- Forecasting future demand for specific items.
- Analyzing trends or patterns for materials over time.
- Summarizing the highest or lowest demands within a specific date range.
- Comparing demand values between two or more items.
Your task:
- If the query relates to forecasting, extract the necessary information from it
and provide precise, professional, and data-driven responses.
Make sure your answers are aligned with the uploaded document, depending on the context of the query.
display the response in the format as mentioned in the tool description.
include the table in the Final answer whereever it is required.
Do not display the first line and the last line of the table as '''
Tools available to you:
{tools}
Input Question:
{input}
{agent_scratchpad}
""",
)
# Initialize the agent
agent = initialize_agent(
tools=[Tool(name=t["name"], func=t["function"], description=t["description"]) for t in tools],
llm=self.llm_openai,
agent="zero-shot-react-description",
verbose=True,
prompt=agent_prompt
)
try:
response = agent.invoke(query, handle_parsing_errors=True)
print(f"response:{response}")
if isinstance(response, dict) and "output" in response:
response = response["output"] # Extract and return only the output field
else:
response = response # Fallback if output field is not present
if visual_query is not None:
# Check if the response contains table-like formatting
if "|" in response and "---" in response:
print("Table data is present in the response.")
#convert table data into dataframe
# Extract table rows
table_pattern = r"\|.*\|"
import re
table_data = re.findall(table_pattern, response)
# Remove separator lines (like |---|---|)
filtered_data = [row for row in table_data if not re.match(r"\|\-+\|", row)]
# Split rows into columns
split_data = [row.strip('|').split('|') for row in filtered_data]
# Create DataFrame
columns = [col.strip() for col in split_data[0]] # First row is the header
data = [list(map(str.strip, row)) for row in split_data[1:]] # Remaining rows are data
global_df = pd.DataFrame(data, columns=columns)
# Function to convert datatypes
global_df = convert_column_types(global_df)
print(f"Dataframe created from response:\n{global_df}")
visual_response = create_visualization_csv(visual_query)
else:
print("No table data found in the response.")
global_df = unstructured_text_to_df(response)
print(global_df)
visual_response = create_visualization_csv(visual_query)
print(visual_response)
else:
visual_response = None
return response, visual_response
except Exception as e:
print(f"Error while processing your query: {str(e)}")
return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!" , None
def create_visualization_csv(visual_query):
global_df
#import matplotlib
#matplotlib.use('TkAgg') # Replace with 'QtAgg' or 'MacOSX' if on macOS
visual_query = visual_query + """ use proper axis scale so that all values can be plotted and shown properly.
mention axis values properly.
Do not miss any values.
Mention only month name in date axis and not the numbers or the date.
Do not place legend in the middle of the chart.
Increase the size of chart to make sure than the values are not trucated and the legend text is not truncated.
Place the legend in such a way that the plotted chart is not hidden.
Return the image path only after plotting all the values."""
llm_chart = OpenAI()
#from pandasai import PandasAI
#pandas_ai = PandasAI(llm_chart, show_plots=False)
#pandas_ai = PandasAI(show_plots=False) # Avoids attempting to show plots
sdf = SmartDataframe(global_df, config={"llm": llm_chart})
llm_response = sdf.chat(visual_query)
if "no result" in llm_response:
return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"
return llm_response
def convert_column_types(df):
for col in df.columns:
# Try to convert to integer
if all(df[col].str.isdigit()):
df[col] = df[col].astype(int)
# Try to convert to datetime
else:
try:
df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='raise')
except ValueError:
# Leave as string if neither integer nor date
pass
return df
def unstructured_text_to_df(text):
import openai
import pandas as pd
import os
import json
# Your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY", "")
# OpenAI prompt to structure the data
prompt = f"""
Extract the materials and their quantities from the following text and format them as a structured JSON:
{text}
"""
# Call OpenAI API
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt}
],
temperature=0
)
print(f"response: {response}")
# Extract the response content
response_content = response.choices[0].message.content.strip()
# Debugging: Print raw response to check its format
print("Raw Response:", response_content)
# Step 1: Extract the JSON part from the markdown
# Split the response content to isolate the JSON part
json_part = response_content.split("```json\n")[1].split("\n```")[0]
# Step 2: Parse the JSON content
try:
structured_data = json.loads(json_part) # Parse the JSON content
print("Parsed JSON:", structured_data)
except json.JSONDecodeError:
print("Error: Response content is not valid JSON.")
# Convert the structured data into a DataFrame
df = pd.DataFrame(structured_data["materials"])
# Rename columns to desired format
df.columns = ["material_name", "material_quantity"]
# Print the DataFrame
print(df)
return df |