hmrizal committed on
Commit
81f0d23
·
verified ·
1 Parent(s): 2fa763f

microsoft/phi4-mini-instruct

Browse files
Files changed (1) hide show
  1. app.py +185 -84
app.py CHANGED
@@ -3,15 +3,14 @@ import os
3
  import uuid
4
  import threading
5
  import pandas as pd
6
- from langchain.document_loaders.csv_loader import CSVLoader
7
- from langchain.embeddings import HuggingFaceEmbeddings
8
- from langchain.vectorstores import FAISS
9
- from langchain.llms import CTransformers
10
- from langchain.chains import ConversationalRetrievalChain
11
 
12
  # Global model cache
13
  MODEL_CACHE = {
14
  "model": None,
 
15
  "init_lock": threading.Lock()
16
  }
17
 
@@ -19,27 +18,52 @@ MODEL_CACHE = {
19
  os.makedirs("user_data", exist_ok=True)
20
 
21
  def initialize_model_once():
22
- """Initialize model once using CTransformers API"""
23
  with MODEL_CACHE["init_lock"]:
24
  if MODEL_CACHE["model"] is None:
25
- # Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
26
- MODEL_CACHE["model"] = CTransformers(
27
- model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
28
- model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
29
- model_type="mistral",
30
- max_new_tokens=512,
31
- temperature=0.2,
32
- top_p=0.9,
33
- repetition_penalty=1.2
34
  )
35
 
36
- return MODEL_CACHE["model"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  class ChatBot:
39
  def __init__(self, session_id):
40
  self.session_id = session_id
 
 
41
  self.chat_history = []
42
- self.chain = None
43
  self.user_dir = f"user_data/{session_id}"
44
  os.makedirs(self.user_dir, exist_ok=True)
45
 
@@ -50,95 +74,173 @@ class ChatBot:
50
  try:
51
  # Handle file from Gradio
52
  file_path = file.name if hasattr(file, 'name') else str(file)
 
53
 
54
- # Verify and save CSV
55
  try:
56
- df = pd.read_csv(file_path)
57
  user_file_path = f"{self.user_dir}/uploaded.csv"
58
- df.to_csv(user_file_path, index=False)
59
- print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
60
- except Exception as e:
61
- return f"Error membaca CSV: {str(e)}"
62
-
63
- # Load document
64
- try:
65
- loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
66
- data = loader.load()
67
- print(f"Documents loaded: {len(data)}")
68
- except Exception as e:
69
- return f"Error loading documents: {str(e)}"
70
-
71
- # Create vector database
72
- try:
73
- db_path = f"{self.user_dir}/db_faiss"
74
- embeddings = HuggingFaceEmbeddings(
75
- model_name='sentence-transformers/all-MiniLM-L6-v2',
76
- model_kwargs={'device': 'cpu'} # Explicitly set to CPU
77
- )
78
 
79
- db = FAISS.from_documents(data, embeddings)
80
- db.save_local(db_path)
81
- print(f"Vector database created at {db_path}")
82
- except Exception as e:
83
- return f"Error creating vector database: {str(e)}"
84
-
85
- # Create LLM and chain
86
- try:
87
- llm = initialize_model_once()
88
- self.chain = ConversationalRetrievalChain.from_llm(
89
- llm=llm,
90
- retriever=db.as_retriever(search_kwargs={"k": 4}),
91
- return_source_documents=True
92
- )
93
- print("Chain created successfully")
94
  except Exception as e:
95
- return f"Error creating chain: {str(e)}"
96
 
97
  # Add file info to chat history
98
- file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
99
  self.chat_history.append(("System", file_info))
100
 
101
- return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
102
  except Exception as e:
103
  import traceback
104
  print(traceback.format_exc())
105
  return f"Error pemrosesan file: {str(e)}"
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def chat(self, message, history):
108
- if self.chain is None:
109
  return "Mohon upload file CSV terlebih dahulu."
110
 
111
  try:
112
- # Process with the chain
113
- result = self.chain({"question": message, "chat_history": self.chat_history})
 
 
 
 
 
 
 
 
114
 
115
- # Update chat history
116
- answer = result["answer"]
 
 
117
 
118
- # Optional: Add source info to answer
119
- sources = result.get("source_documents", [])
120
- if sources:
121
- source_text = "\n\nSumber:\n"
122
- for i, doc in enumerate(sources[:2], 1): # Limit to top 2 sources
123
- source_text += f"{i}. {doc.page_content[:100]}...\n"
124
- answer += source_text
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- self.chat_history.append((message, answer))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  except Exception as e:
130
  import traceback
131
  print(traceback.format_exc())
132
  return f"Error: {str(e)}"
133
 
134
- # UI Code dan handler functions sama seperti sebelumnya
135
  def create_gradio_interface():
136
- with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
137
  session_id = gr.State(lambda: str(uuid.uuid4()))
138
  chatbot_state = gr.State(lambda: None)
139
 
140
- gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
141
- gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
142
 
143
  with gr.Row():
144
  with gr.Column(scale=1):
@@ -148,14 +250,13 @@ def create_gradio_interface():
148
  )
149
  process_button = gr.Button("Proses CSV")
150
 
151
- with gr.Accordion("Informasi Model", open=False):
152
  gr.Markdown("""
153
- **Model**: Mistral-7B-Instruct-v0.2-GGUF
154
-
155
- **Fitur**:
156
- - GGUF model yang dioptimalkan untuk CPU
157
- - Efisien untuk analisis data dan percakapan
158
- - Manajemen sesi per pengguna
159
  """)
160
 
161
  with gr.Column(scale=2):
@@ -164,8 +265,8 @@ def create_gradio_interface():
164
  height=400
165
  )
166
  message_input = gr.Textbox(
167
- label="Ketik pesan Anda",
168
- placeholder="Tanyakan tentang data CSV Anda...",
169
  lines=2
170
  )
171
  submit_button = gr.Button("Kirim")
 
3
  import uuid
4
  import threading
5
  import pandas as pd
6
+ import numpy as np
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+ import torch
 
 
9
 
10
  # Global model cache
11
  MODEL_CACHE = {
12
  "model": None,
13
+ "tokenizer": None,
14
  "init_lock": threading.Lock()
15
  }
16
 
 
18
  os.makedirs("user_data", exist_ok=True)
19
 
20
def initialize_model_once():
    """Load the Phi-4-mini model and tokenizer on first use and cache them.

    Initialisation happens while holding ``MODEL_CACHE["init_lock"]`` and
    only when no model is cached yet, so later calls just return the
    cached objects.

    Returns:
        tuple: ``(model, tokenizer)`` as stored in ``MODEL_CACHE``.
    """
    cache = MODEL_CACHE
    with cache["init_lock"]:
        if cache["model"] is None:
            # The tokenizer is stored before the weights are loaded,
            # mirroring the order the cache entries are filled in.
            cache["tokenizer"] = AutoTokenizer.from_pretrained(
                "microsoft/Phi-4-mini-instruct"
            )
            load_options = {
                "torch_dtype": torch.float16,
                "device_map": "auto",
            }
            cache["model"] = AutoModelForCausalLM.from_pretrained(
                "microsoft/Phi-4-mini-instruct",
                **load_options,
            )

    return cache["model"], cache["tokenizer"]
33
+
34
def generate_pandas_code(prompt, max_new_tokens=512):
    """Generate Python/pandas code for *prompt* with the cached Phi-4-mini model.

    Args:
        prompt: Full text prompt sent to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        str: The extracted code. When the prompt itself ends with an opening
        ```` ```python ```` fence, this is the completion up to the first
        closing fence. Otherwise it is the content of the first
        ```` ```python ... ``` ```` block, or the stripped completion if no
        fenced block is present.
    """
    model, tokenizer = initialize_model_once()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )

    # Drop the prompt at the token level. Slicing the decoded string by the
    # length of the re-decoded prompt is fragile: decoding does not always
    # round-trip the prompt text byte-for-byte, which shifts the cut point.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    return _extract_code(prompt, generated_text)


def _extract_code(prompt, generated_text):
    """Pull the code portion out of a model completion."""
    import re

    # If the prompt already opened the fence, the completion is
    # "<code>\n```<optional chatter>": keep only what precedes the first
    # closing fence.
    if prompt.rstrip().endswith("```python"):
        head = generated_text.split("```", 1)[0].strip()
        if head:
            return head

    # Extract code between ```python and ``` if present
    code_match = re.search(r'```python\s*(.*?)\s*```', generated_text, re.DOTALL)
    if code_match:
        return code_match.group(1).strip()

    # Return the raw generated text as fallback
    return generated_text.strip()
60
 
61
  class ChatBot:
62
  def __init__(self, session_id):
63
  self.session_id = session_id
64
+ self.csv_info = None
65
+ self.df = None
66
  self.chat_history = []
 
67
  self.user_dir = f"user_data/{session_id}"
68
  os.makedirs(self.user_dir, exist_ok=True)
69
 
 
74
  try:
75
  # Handle file from Gradio
76
  file_path = file.name if hasattr(file, 'name') else str(file)
77
+ file_name = os.path.basename(file_path)
78
 
79
+ # Load and save CSV directly with pandas
80
  try:
81
+ self.df = pd.read_csv(file_path)
82
  user_file_path = f"{self.user_dir}/uploaded.csv"
83
+ self.df.to_csv(user_file_path, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ # Store CSV info
86
+ self.csv_info = {
87
+ "filename": file_name,
88
+ "rows": self.df.shape[0],
89
+ "columns": self.df.shape[1],
90
+ "column_names": self.df.columns.tolist(),
91
+ }
92
+
93
+ print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
 
 
 
 
 
 
94
  except Exception as e:
95
+ return f"Error membaca CSV: {str(e)}"
96
 
97
  # Add file info to chat history
98
+ file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
99
  self.chat_history.append(("System", file_info))
100
 
101
+ return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
102
  except Exception as e:
103
  import traceback
104
  print(traceback.format_exc())
105
  return f"Error pemrosesan file: {str(e)}"
106
 
107
+ def execute_query(self, code):
108
+ """Safely execute pandas code"""
109
+ try:
110
+ # Create local context with the dataframe
111
+ local_vars = {"df": self.df, "pd": pd, "np": np}
112
+
113
+ # Execute code with timeout
114
+ exec(code, {"pd": pd, "np": np}, local_vars)
115
+
116
+ # Get result
117
+ if "result" in local_vars:
118
+ return local_vars["result"]
119
+ else:
120
+ # If no result variable, find the last variable created
121
+ last_var = None
122
+ for var_name, var_value in local_vars.items():
123
+ if var_name not in ["df", "pd", "np"] and var_name != "__builtins__":
124
+ last_var = var_value
125
+
126
+ if last_var is not None:
127
+ return last_var
128
+ else:
129
+ return self.df # Return the dataframe as default
130
+ except Exception as e:
131
+ raise Exception(f"Gagal menjalankan kode: {str(e)}")
132
+
133
  def chat(self, message, history):
134
+ if self.df is None:
135
  return "Mohon upload file CSV terlebih dahulu."
136
 
137
  try:
138
+ # Handle common metadata questions directly to save resources
139
+ message_lower = message.lower()
140
+ if "nama file" in message_lower:
141
+ return f"Nama file CSV adalah: {self.csv_info['filename']}"
142
+ elif "nama kolom" in message_lower:
143
+ return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
144
+ elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
145
+ return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
146
+ elif "jumlah kolom" in message_lower or "berapa kolom" in message_lower:
147
+ return f"Jumlah kolom dalam CSV: {self.csv_info['columns']}"
148
 
149
+ # Get sample data for context
150
+ sample_df = self.df.head(5)
151
+ sample_str = sample_df.to_string()
152
+ data_types = {col: str(dtype) for col, dtype in self.df.dtypes.items()}
153
 
154
+ # Create prompt for LLM
155
+ prompt = f"""
156
+ You are a data analyst that translates natural language questions into Python pandas code.
157
+
158
+ DataFrame information:
159
+ - Column names: {', '.join(self.csv_info['column_names'])}
160
+ - Data types: {data_types}
161
+ - Number of rows: {self.csv_info['rows']}
162
+ - Sample data:
163
+ {sample_str}
164
+
165
+ User question: {message}
166
+
167
+ Write a short Python code using pandas to answer the user's question.
168
+ The code must use the 'df' variable as the DataFrame name.
169
+ The code should assign the final result to a variable named 'result'.
170
+ Only return the Python code without any explanation.
171
+
172
+ ```python
173
+ """
174
 
175
+ # Generate code with Phi-4
176
+ try:
177
+ code = generate_pandas_code(prompt)
178
+
179
+ # Add result variable if not present
180
+ if not any(line.strip().startswith("result =") for line in code.split("\n")):
181
+ if code.startswith("df."):
182
+ code = "result = " + code
183
+ elif not "result" in code:
184
+ code = "result = " + code
185
+ except Exception as e:
186
+ print(f"Error generating code: {str(e)}")
187
+ # Fallback for basic questions
188
+ if "rata-rata" in message_lower or "mean" in message_lower:
189
+ code = "result = df.describe()"
190
+ elif "jumlah" in message_lower or "count" in message_lower:
191
+ code = "result = df.count()"
192
+ else:
193
+ return f"Maaf, saya tidak dapat menghasilkan kode untuk pertanyaan ini. Error: {str(e)}"
194
 
195
+ # Execute the code and get result
196
+ try:
197
+ print(f"Executing code: {code}")
198
+ result = self.execute_query(code)
199
+
200
+ # Check if result is relevant to the question
201
+ if result is None or (isinstance(result, pd.DataFrame) and result.empty):
202
+ return "Maaf, kita tidak bisa mendapatkan informasi terkait pertanyaan anda di dalam file CSV anda."
203
+
204
+ # Format result based on its type
205
+ if isinstance(result, pd.DataFrame):
206
+ if len(result) > 5:
207
+ result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
208
+ else:
209
+ result_str = result.to_string()
210
+ elif isinstance(result, (pd.Series, np.ndarray)):
211
+ if len(result) > 10:
212
+ result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
213
+ else:
214
+ result_str = str(result)
215
+ elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
216
+ result_str = str(result)
217
+ if len(result) > 0:
218
+ result_str += f"\n\n[Total {len(result)} item]"
219
+ else:
220
+ result_str = str(result)
221
+
222
+ # Format response
223
+ response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
224
+
225
+ self.chat_history.append((message, response))
226
+ return response
227
+
228
+ except Exception as e:
229
+ return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
230
+
231
  except Exception as e:
232
  import traceback
233
  print(traceback.format_exc())
234
  return f"Error: {str(e)}"
235
 
236
+ # UI code (same as before)
237
  def create_gradio_interface():
238
+ with gr.Blocks(title="CSV Data Analyzer") as interface:
239
  session_id = gr.State(lambda: str(uuid.uuid4()))
240
  chatbot_state = gr.State(lambda: None)
241
 
242
+ gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
243
+ gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
244
 
245
  with gr.Row():
246
  with gr.Column(scale=1):
 
250
  )
251
  process_button = gr.Button("Proses CSV")
252
 
253
+ with gr.Accordion("Contoh Pertanyaan", open=False):
254
  gr.Markdown("""
255
+ - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
256
+ - "Hitung nilai rata-rata setiap kolom numerik"
257
+ - "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
258
+ - "Berapa jumlah baris dalam dataset ini?"
259
+ - "Berapa jumlah kolom dalam dataset ini?"
 
260
  """)
261
 
262
  with gr.Column(scale=2):
 
265
  height=400
266
  )
267
  message_input = gr.Textbox(
268
+ label="Ketik pertanyaan Anda",
269
+ placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
270
  lines=2
271
  )
272
  submit_button = gr.Button("Kirim")