File size: 17,999 Bytes
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a6aa1c
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5870b59
 
f1be059
5870b59
5975965
 
 
 
 
 
 
 
 
 
 
 
0a6aa1c
 
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0d222a
5870b59
5975965
 
 
 
 
 
 
 
 
5ec66d0
5870b59
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1be059
5870b59
f1be059
5870b59
fb9cae8
5870b59
 
fb9cae8
416ee2d
5975965
 
 
 
 
 
5870b59
 
5975965
 
 
 
 
 
5870b59
5975965
 
 
 
 
 
 
 
 
5870b59
5975965
 
 
416ee2d
 
5870b59
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f81145c
 
7dc0ac4
64a2db8
7dc0ac4
64a2db8
5975965
 
 
 
5870b59
 
 
5975965
 
 
 
 
 
 
 
 
819dc57
 
5870b59
 
a6e3784
819dc57
a6e3784
819dc57
5870b59
5975965
 
 
 
 
 
 
 
 
 
5870b59
5975965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dc0ac4
 
 
a6e3784
7dc0ac4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0721fb
7dc0ac4
 
 
 
 
 
29bcfc5
 
 
a3682f0
29bcfc5
 
 
a3682f0
29bcfc5
 
 
7dc0ac4
 
 
 
 
 
 
 
 
 
746fb67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
import os
import re
from langchain.agents import initialize_agent, Tool
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
    
# Initialize a blank DataFrame as a global variable.
# Populated by ChatHandler.answer_question (from an agent's markdown table or
# LLM-extracted text) and read by create_visualization_csv for charting.
global_df = pd.DataFrame()


class ChatHandler:
    """Answer user questions against a MySQL forecast database (and, optionally,
    a FAISS vector store) via a LangChain zero-shot agent, and drive chart
    generation over the extracted tabular results.
    """

    def __init__(self, vector_db_path, open_api_key, grok_api_key,db_final):
        """Store connection/configuration state.

        vector_db_path: root directory scanned for per-folder FAISS indexes.
        open_api_key:   OpenAI API key used for embeddings and chat models.
        grok_api_key:   stored but not used anywhere in this class.
        db_final:       LangChain SQL database wrapper (provides
                        get_table_info() and run()).
        """
        self.vector_db_path = vector_db_path
        self.openai_embeddings = OpenAIEmbeddings(api_key=open_api_key)
        self.llm_openai = ChatOpenAI(model_name="gpt-4o-mini", api_key=open_api_key, max_tokens=500, temperature=0.2)
        self.grok_api_key = grok_api_key
        self.openai_api_key = open_api_key
        self.sql_db = db_final

    def _load_documents_from_vector_db(self, query):
        """Fetch relevant documents from the vector database.

        Walks every subdirectory of vector_db_path, loads each FAISS index
        found, runs a relevance-scored similarity search, keeps documents
        scoring above 0.7, and returns their page contents sorted by
        descending score. Errors for an individual index are logged and
        skipped so one bad store does not abort the whole scan.
        """
        results = []

        # Debug: Print the query being processed
        print(f"Processing query: {query}")

        for root, dirs, files in os.walk(self.vector_db_path):
            print(f"Searching in directory: {root}")  # Debug: Current directory being processed
            for dir in dirs:
                index_path = os.path.join(root, dir, "index.faiss")

                # Debug: Check if FAISS index exists
                if os.path.exists(index_path):
                    print(f"Found FAISS index at: {index_path}")

                    # Load the FAISS vector store
                    try:
                        vector_store = FAISS.load_local(
                            os.path.join(root, dir),
                            self.openai_embeddings,
                            # Required by langchain to unpickle local indexes;
                            # safe only because these indexes are created locally.
                            allow_dangerous_deserialization=True
                        )
                        print(f"Loaded FAISS vector store from: {os.path.join(root, dir)}")
                    except Exception as e:
                        print(f"Error loading FAISS store: {e}")
                        continue

                    # Perform similarity search
                    try:
                        
                        response_with_scores = vector_store.similarity_search_with_relevance_scores(query, k=100)
                        #print(response_with_scores)
                        print(f"Similarity search returned {len(response_with_scores)} results.")
                        
                        # Keep only confidently-relevant hits (relevance > 0.7).
                        filtered_results = [
                            (doc, score) for doc, score in response_with_scores                            
                            if score is not None and score > 0.7 #and material_name.lower() in doc.page_content.lower()  # Check material name in document
                        ]
                        print(f"Filtered results: {filtered_results}")
                        response_with_scores = filtered_results
                        # Debug: Print each document and score
                        for doc, score in response_with_scores:
                            print(f"Document: {doc.page_content[:100]}... Score: {score}")

                        results.extend([(doc.page_content, score) for doc, score in response_with_scores])
                    except Exception as e:
                        print(f"Error during similarity search: {e}")

        # Sort and return results
        # Highest relevance first; scores are dropped from the return value.
        sorted_results = [doc for doc, score in sorted(results, key=lambda x: -x[1])]
        print(f"Total results after sorting: {len(sorted_results)}")
       
        return sorted_results

    def _load_schema_from_database(self, query):
        
        """
        Fetch database schema, generate a SQL query from the user's question, and execute it.

        Returns the raw result of sql_db.run(), or a user-facing fallback
        string on generation/execution failure (errors are printed, never
        raised to the caller).
        """
        try:
            # Fetch the schema
            schema = self.sql_db.get_table_info()

            # Define the prompt template
            template_query_generation = """
            
            Based on the table schema below, write a mySQL query with correct syntax that would answer the user's question.
            Only write the SQL query without explanations and without string 'sql'and '''.
            
            Schema:
            {schema}

            Question: {question}

            SQL Query:
            """
            prompt = PromptTemplate(
                input_variables=["schema", "question"],
                template=template_query_generation
            )

            # Initialize the language modelgpt-4o-mini
            llm = ChatOpenAI(model_name="gpt-4o-mini", api_key=self.openai_api_key, max_tokens=500, temperature=0.2)

            # Create the runnable sequence
            chain = prompt | llm | StrOutputParser()

            # Generate the SQL query
            sql_query = chain.invoke({"schema": schema, "question": query}).strip()

            if not sql_query:
                return "Could not generate an SQL query for your question."

            # Execute the SQL query
            # NOTE(review): the LLM-generated SQL is executed verbatim; the
            # schema prompt constrains it but this is not a hard safety
            # boundary against destructive statements.
            try:
                result = self.sql_db.run(sql_query)
                print(f"SQL query executed successfully. Result: {result}")
                
            except Exception as e:
                print(f"Error executing SQL query: {str(e)}")
                return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"

            # If no result, return an appropriate message
            if not result:
                return "Query executed, but no results were returned."
           
            # Return the result
            return result

        except Exception as e:
            print( f"Error fetching schema details or processing query: {str(e)}")
            return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"

    def answer_question(self, query, visual_query):
        # Answer `query` through a zero-shot agent backed by the SQL-schema
        # tool, then (if visual_query is given) parse any markdown table out
        # of the answer into the module-level global_df and produce a chart.
        # Returns (text_response, visual_response_or_None).
        global global_df
        """Determine whether to use vector database or SQL database for the query."""
        tools = [
            # {
            #     "name": "Document Vector Store",
            #     "function": lambda q: "\n".join(self._load_documents_from_vector_db(q)),
            #     "description": """Search within the uploaded documents stored in the vector database. 
            #     Display the response as a combination of response summary and the response data in the form of table.
            #     If the user requested comparison between two or more years, data should be shown for all the years. (For example, if the user requested from 2020 to 2024, then display the output table with the columns [Month, Material value in 2020, Material value in 2021, Material value in 2022, Material value in 2023, Material value in 2024]) so that the records will be displayed for all the months from Jaunary to December across the years. 
            #      display the material quantity in blue colour if it the 'Type' column value is 'actual'. 
            #      display the Material Quanity in red colour if its value is 'predicted'. 
            #      include the table data in the Final answer of agent executor invoke.""",
            # },
              {
                  "name": "Database Schema",
                  "function": lambda q: self._load_schema_from_database(q),
                  "description": """Search within the mysql database schema and generate SQL-based responses.
                  The database has single table 'tp_material_forecast' which contains the columns 'date', 'material_name', 'material_quantity', and 'type'. Frame the query only with these four columns.
                  If the material name is given, frame the query in such a way that the material_name is not case-sensitive. 
                  If the material name is not present in the table, return the proper message as "This material name is not in the database". Do not give any false values if the material name is not available in database.
                  If the response has month column, display the month as name For example, January instead of displaying as 1.
                  If the user requested comparison between two or more years or the user asks for the data for all years, data should be shown for all the years with month as first column and the years like 2020, 2021 etc as the adjacent columns. 
                  Do not show everything in the same column. (For example, if the user requested from 2020 to 2024, then display the output table with the columns [Month, Material value in 2020, Material value in 2020, Material value in 2021, Material value in 2022, Material value in 2023, Material value in 2024]) so that the records will be displayed for all the months from Jaunary to December across the years. 
                  If there is any error while executing the user question, kindly display the error message as 'As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!
                  display the response as a combination of response summary and the response data in the form of table. Display the table properly in a professional manner. """,
              },
        ]

        agent_prompt = PromptTemplate(
            input_variables=["input", "agent_scratchpad"],
            template="""
                        You are a highly skilled AI assistant specializing in mysql database.
                        I have a mysql database for material demand forecasts with columns as 'date', 'material_name', 'material_quantity', and 'type'.
                        
                        The data includes historical demand information for various items.

                        1. The uploaded document includes:
                           - **Date:** The date of demand entry.
                           - **Material Name:** The name of the material or equipment.
                           - **Material Quantity:** The number of units actual or predicted.
                           - **Type:** Type contains actual or forecasted, actual represents the actual material utilized and forecasted represents the prediction by ai model.

                        2. I may ask questions such as:
                           - Forecasting future demand for specific items.
                           - Analyzing trends or patterns for materials over time.
                           - Summarizing the highest or lowest demands within a specific date range.
                           - Comparing demand values between two or more items.

                        Your task:
                        - If the query relates to forecasting, extract the necessary information from it 
                          and provide precise, professional, and data-driven responses.
                        
                        Make sure your answers are aligned with the uploaded document, depending on the context of the query.
                         display the response in the format as mentioned in the tool description.
                         include the table in the Final answer whereever it is required.
                         Do not display the first line and the last line of the table as '''
                        Tools available to you:
                        {tools}

                        Input Question:
                        {input}

                        {agent_scratchpad}
                    """,
        )

        # Initialize the agent
        # NOTE(review): initialize_agent with "zero-shot-react-description" is
        # deprecated in recent langchain releases — confirm the pinned version
        # still supports it. The custom `prompt` kwarg may be ignored by this
        # agent type; verify it actually reaches the executor.
        agent = initialize_agent(
            tools=[Tool(name=t["name"], func=t["function"], description=t["description"]) for t in tools],
            llm=self.llm_openai,
            agent="zero-shot-react-description",
            verbose=True,
            prompt=agent_prompt
        )

        try:
            response = agent.invoke(query, handle_parsing_errors=True)
            print(f"response:{response}")
                   

           
            # Agent executors usually return {"input": ..., "output": ...};
            # unwrap the output field when present.
            if isinstance(response, dict) and "output" in response:
                response = response["output"]  # Extract and return only the output field
            else:
                response = response  # Fallback if output field is not present (no-op; kept as-is)
            if visual_query is not None:
                # Check if the response contains table-like formatting
                # (markdown tables contain both '|' cells and a '---' separator row).
                if "|" in response and "---" in response:
                    print("Table data is present in the response.")
                    #convert table data into dataframe
                    # Extract table rows
                    table_pattern = r"\|.*\|"
                    import re  # NOTE(review): redundant — `re` is already imported at module level
                    table_data = re.findall(table_pattern, response)
                    # Remove separator lines (like |---|---|)
                    filtered_data = [row for row in table_data if not re.match(r"\|\-+\|", row)]

                    # Split rows into columns
                    split_data = [row.strip('|').split('|') for row in filtered_data]

                    # Create DataFrame
                    columns = [col.strip() for col in split_data[0]]  # First row is the header
                    data = [list(map(str.strip, row)) for row in split_data[1:]]  # Remaining rows are data
                    global_df = pd.DataFrame(data, columns=columns)
                    # Function to convert datatypes
                    global_df = convert_column_types(global_df)
                    print(f"Dataframe created from response:\n{global_df}")
                    visual_response = create_visualization_csv(visual_query)
                    
                    
                else:
                    print("No table data found in the response.")
                    # No markdown table: fall back to LLM-based extraction of
                    # materials/quantities from the free-text answer.
                    global_df = unstructured_text_to_df(response)
                    print(global_df)
                    visual_response = create_visualization_csv(visual_query)
                    print(visual_response)
            else:
                visual_response = None
            return response, visual_response
        except Exception as e:
            print(f"Error while processing your query: {str(e)}")
            return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!" , None
            



def create_visualization_csv(visual_query):
    """Render a visualization of the module-level ``global_df`` via pandasai.

    Appends chart-formatting instructions to ``visual_query``, hands the
    current ``global_df`` to a SmartDataframe backed by the pandasai OpenAI
    LLM, and returns whatever ``sdf.chat`` produces (typically an image path).
    Returns a fixed apology string when the LLM reports "no result".
    """
    # global_df is only read here, so no `global` declaration is needed.
    # (The previous bare `global_df` expression statement was a no-op.)

    visual_query = visual_query + """   use proper axis scale so that all values can be plotted and shown properly.
                                        mention axis values properly. 
                                        Do not miss any values. 
                                        Mention only month name in date axis and not the numbers or the date.
                                        Do not place legend in the middle of the chart. 
                                        Increase the size of chart to make sure than the values are not trucated and the legend text is not truncated.
                                        Place the legend in such a way that the plotted chart is not hidden.
                                        
                                        Return the image path only after plotting all the values."""

    llm_chart = OpenAI()
    sdf = SmartDataframe(global_df, config={"llm": llm_chart})
    llm_response = sdf.chat(visual_query)

    # pandasai may return non-string results (DataFrame, number, ...);
    # the "no result" marker check only applies to string responses.
    if isinstance(llm_response, str) and "no result" in llm_response:
        return f"As you know I am still learning at this moment I am not able to respond to your question.\nThank you for your patience!"
    return llm_response


def convert_column_types(df):
    """Best-effort in-place dtype conversion for a DataFrame of strings.

    For each column: convert to int when every value is a string of digits;
    otherwise try parsing as '%Y-%m-%d' dates; otherwise leave unchanged.
    Returns the same (mutated) DataFrame.
    """
    for col in df.columns:
        series = df[col]
        # Guard with isinstance so non-string columns don't raise
        # AttributeError, and require a non-empty column so that all() is
        # not vacuously True for empty columns.
        is_all_digits = (
            len(series) > 0
            and series.map(lambda v: isinstance(v, str) and v.isdigit()).all()
        )
        if is_all_digits:
            df[col] = series.astype(int)
        else:
            try:
                df[col] = pd.to_datetime(series, format='%Y-%m-%d', errors='raise')
            except (ValueError, TypeError):
                # Leave as-is if neither integer nor date
                pass
    return df

def unstructured_text_to_df(text):
    """Extract materials and quantities from free text into a DataFrame.

    Asks gpt-4o-mini to emit structured JSON for the materials mentioned in
    ``text``, parses the (possibly markdown-fenced) JSON from the reply, and
    returns a DataFrame with columns ['material_name', 'material_quantity'].
    Returns an empty two-column DataFrame when the reply cannot be parsed.
    """
    import openai
    import json

    # API key is read from the environment; empty string if unset.
    openai.api_key = os.getenv("OPENAI_API_KEY", "")

    # OpenAI prompt to structure the data
    prompt = f"""
    Extract the materials and their quantities from the following text and format them as a structured JSON:
    {text}
    """

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    print(f"response: {response}")
    response_content = response.choices[0].message.content.strip()

    # Debugging: Print raw response to check its format
    print("Raw Response:", response_content)

    # Prefer JSON inside a ```json fenced block; otherwise assume the whole
    # reply is JSON. (The previous split() raised IndexError on un-fenced
    # replies.)
    match = re.search(r"```json\s*(.*?)\s*```", response_content, re.DOTALL)
    json_part = match.group(1) if match else response_content

    try:
        structured_data = json.loads(json_part)
        print("Parsed JSON:", structured_data)
    except json.JSONDecodeError:
        # Previously structured_data stayed undefined here and the next line
        # raised NameError; return an empty frame instead.
        print("Error: Response content is not valid JSON.")
        return pd.DataFrame(columns=["material_name", "material_quantity"])

    # Convert the structured data into a DataFrame
    df = pd.DataFrame(structured_data["materials"])

    # Rename columns to desired format
    df.columns = ["material_name", "material_quantity"]

    # Print the DataFrame
    print(df)
    return df