hmrizal commited on
Commit
df0d042
·
verified ·
1 Parent(s): fdad3c6

kembali ke mistral 7b instruct gguf

Browse files
Files changed (1) hide show
  1. app.py +76 -174
app.py CHANGED
@@ -3,10 +3,11 @@ import os
3
  import uuid
4
  import threading
5
  import pandas as pd
6
- import numpy as np
 
 
7
  from langchain.llms import CTransformers
8
- from langchain.chains import LLMChain
9
- from langchain.prompts import PromptTemplate
10
 
11
  # Global model cache
12
  MODEL_CACHE = {
@@ -21,15 +22,15 @@ def initialize_model_once():
21
  """Initialize model once using CTransformers API"""
22
  with MODEL_CACHE["init_lock"]:
23
  if MODEL_CACHE["model"] is None:
24
- # Load TinyLlama model
25
  MODEL_CACHE["model"] = CTransformers(
26
- model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
27
- model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
28
- model_type="llama",
29
  max_new_tokens=512,
30
- temperature=0.1,
31
- repetition_penalty=1.1,
32
- context_length=2048
33
  )
34
 
35
  return MODEL_CACHE["model"]
@@ -37,9 +38,8 @@ def initialize_model_once():
37
  class ChatBot:
38
  def __init__(self, session_id):
39
  self.session_id = session_id
40
- self.csv_info = None
41
- self.df = None
42
  self.chat_history = []
 
43
  self.user_dir = f"user_data/{session_id}"
44
  os.makedirs(self.user_dir, exist_ok=True)
45
 
@@ -50,195 +50,95 @@ class ChatBot:
50
  try:
51
  # Handle file from Gradio
52
  file_path = file.name if hasattr(file, 'name') else str(file)
53
- file_name = os.path.basename(file_path)
54
 
55
- # Load and save CSV directly with pandas
56
  try:
57
- self.df = pd.read_csv(file_path)
58
  user_file_path = f"{self.user_dir}/uploaded.csv"
59
- self.df.to_csv(user_file_path, index=False)
60
-
61
- # Store CSV info
62
- self.csv_info = {
63
- "filename": file_name,
64
- "rows": self.df.shape[0],
65
- "columns": self.df.shape[1],
66
- "column_names": self.df.columns.tolist(),
67
- }
68
-
69
- print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
70
  except Exception as e:
71
  return f"Error membaca CSV: {str(e)}"
72
 
73
- # Create query translator
74
  try:
75
- llm = initialize_model_once()
76
-
77
- query_template = """
78
- Kamu adalah asisten data yang mengubah pertanyaan bahasa natural menjadi kode Python dengan Pandas.
79
-
80
- Informasi tentang DataFrame:
81
- - Nama kolom: {column_names}
82
- - Jumlah baris: {num_rows}
83
- - Sampel data:
84
- {sample_data}
85
-
86
- Pertanyaan pengguna: {question}
87
-
88
- Buat kode Python menggunakan pandas untuk menjawab pertanyaan tersebut.
89
- Berikan HANYA kode Python saja, tanpa penjelasan atau apapun.
90
- Kode harus menggunakan variabel 'df' sebagai nama DataFrame.
91
-
92
- Kode:
93
- """
94
-
95
- self.query_chain = LLMChain(
96
- llm=llm,
97
- prompt=PromptTemplate(
98
- input_variables=["column_names", "num_rows", "sample_data", "question"],
99
- template=query_template
100
- )
101
  )
102
 
103
- print("Query translator created successfully")
 
 
104
  except Exception as e:
105
- return f"Error creating query translator: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # Add file info to chat history
108
- file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
109
  self.chat_history.append(("System", file_info))
110
 
111
- return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
112
  except Exception as e:
113
  import traceback
114
  print(traceback.format_exc())
115
  return f"Error pemrosesan file: {str(e)}"
116
 
117
- def execute_query(self, code):
118
- """Safely execute pandas code"""
119
- try:
120
- # Create local context with the dataframe
121
- local_vars = {"df": self.df, "pd": pd, "np": np}
122
-
123
- # Execute code with timeout
124
- exec(code, {"pd": pd, "np": np}, local_vars)
125
-
126
- # Get result
127
- if "result" in local_vars:
128
- return local_vars["result"]
129
- else:
130
- # If no result variable, find the last variable created
131
- last_var = None
132
- for var_name, var_value in local_vars.items():
133
- if var_name not in ["df", "pd", "np"] and var_name != "__builtins__":
134
- last_var = var_value
135
-
136
- if last_var is not None:
137
- return last_var
138
- else:
139
- return self.df # Return the dataframe as default
140
- except Exception as e:
141
- raise Exception(f"Gagal menjalankan kode: {str(e)}")
142
-
143
  def chat(self, message, history):
144
- if self.df is None:
145
  return "Mohon upload file CSV terlebih dahulu."
146
 
147
  try:
148
- # Handle metadata questions directly
149
- message_lower = message.lower()
150
- if "nama file" in message_lower:
151
- return f"Nama file CSV adalah: {self.csv_info['filename']}"
152
- elif "nama kolom" in message_lower:
153
- return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
154
- elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
155
- return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
156
 
157
- # Handle pre-defined analysis questions
158
- if "glucose di atas 150" in message_lower:
159
- code = "result = len(df[df['Glucose'] > 150])"
160
- else:
161
- # Get sample data for context
162
- sample_str = self.df.head(3).to_string()
163
-
164
- # Translate question to pandas code using LLM
165
- try:
166
- code_response = self.query_chain.run(
167
- column_names=str(self.csv_info["column_names"]),
168
- num_rows=self.csv_info["rows"],
169
- sample_data=sample_str,
170
- question=message
171
- )
172
-
173
- # Clean the code
174
- code = code_response.strip().replace("```python", "").replace("```", "").strip()
175
-
176
- # Add result variable if not present
177
- if not any(line.strip().startswith("result =") for line in code.split("\n")):
178
- if code.startswith("df."):
179
- code = "result = " + code
180
- else:
181
- code = "result = df." + code
182
- except Exception as e:
183
- # Fallback for common queries if LLM fails
184
- if "rata-rata" in message_lower or "mean" in message_lower:
185
- code = "result = df.describe()"
186
- elif "jumlah" in message_lower or "count" in message_lower:
187
- code = "result = df.count()"
188
- elif "distribusi" in message_lower:
189
- col = next((c for c in self.csv_info["column_names"] if c.lower() in message_lower), None)
190
- if col:
191
- code = f"result = df['{col}'].value_counts()"
192
- else:
193
- code = "result = df.describe()"
194
- else:
195
- return f"Maaf, saya tidak dapat memproses pertanyaan ini. Error: {str(e)}"
196
 
197
- # Execute the code and get result
198
- try:
199
- print(f"Executing code: {code}")
200
- result = self.execute_query(code)
201
-
202
- # Format result based on its type
203
- if isinstance(result, pd.DataFrame):
204
- if len(result) > 5:
205
- result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
206
- else:
207
- result_str = result.to_string()
208
- elif isinstance(result, (pd.Series, np.ndarray)):
209
- if len(result) > 10:
210
- result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
211
- else:
212
- result_str = str(result)
213
- elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
214
- result_str = str(result)
215
- if len(result) > 0:
216
- result_str += f"\n\n[Total {len(result)} item]"
217
- else:
218
- result_str = str(result)
219
-
220
- # Format response
221
- response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
222
-
223
- self.chat_history.append((message, response))
224
- return response
225
-
226
- except Exception as e:
227
- return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
228
-
229
  except Exception as e:
230
  import traceback
231
  print(traceback.format_exc())
232
  return f"Error: {str(e)}"
233
 
234
- # UI Code (tidak berubah dari sebelumnya)
235
  def create_gradio_interface():
236
- with gr.Blocks(title="CSV Data Analyzer") as interface:
237
  session_id = gr.State(lambda: str(uuid.uuid4()))
238
  chatbot_state = gr.State(lambda: None)
239
 
240
- gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
241
- gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
242
 
243
  with gr.Row():
244
  with gr.Column(scale=1):
@@ -248,12 +148,14 @@ def create_gradio_interface():
248
  )
249
  process_button = gr.Button("Proses CSV")
250
 
251
- with gr.Accordion("Contoh Pertanyaan", open=False):
252
  gr.Markdown("""
253
- - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
254
- - "Hitung nilai rata-rata setiap kolom numerik"
255
- - "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
256
- - "Berapa jumlah baris dalam dataset ini?"
 
 
257
  """)
258
 
259
  with gr.Column(scale=2):
@@ -262,8 +164,8 @@ def create_gradio_interface():
262
  height=400
263
  )
264
  message_input = gr.Textbox(
265
- label="Ketik pertanyaan Anda",
266
- placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
267
  lines=2
268
  )
269
  submit_button = gr.Button("Kirim")
 
3
  import uuid
4
  import threading
5
  import pandas as pd
6
+ from langchain.document_loaders.csv_loader import CSVLoader
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
  from langchain.llms import CTransformers
10
+ from langchain.chains import ConversationalRetrievalChain
 
11
 
12
  # Global model cache
13
  MODEL_CACHE = {
 
22
  """Initialize model once using CTransformers API"""
23
  with MODEL_CACHE["init_lock"]:
24
  if MODEL_CACHE["model"] is None:
25
+ # Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
26
  MODEL_CACHE["model"] = CTransformers(
27
+ model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
28
+ model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
29
+ model_type="mistral",
30
  max_new_tokens=512,
31
+ temperature=0.2,
32
+ top_p=0.9,
33
+ repetition_penalty=1.2
34
  )
35
 
36
  return MODEL_CACHE["model"]
 
38
  class ChatBot:
39
  def __init__(self, session_id):
40
  self.session_id = session_id
 
 
41
  self.chat_history = []
42
+ self.chain = None
43
  self.user_dir = f"user_data/{session_id}"
44
  os.makedirs(self.user_dir, exist_ok=True)
45
 
 
50
  try:
51
  # Handle file from Gradio
52
  file_path = file.name if hasattr(file, 'name') else str(file)
 
53
 
54
+ # Verify and save CSV
55
  try:
56
+ df = pd.read_csv(file_path)
57
  user_file_path = f"{self.user_dir}/uploaded.csv"
58
+ df.to_csv(user_file_path, index=False)
59
+ print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
 
 
 
 
 
 
 
 
 
60
  except Exception as e:
61
  return f"Error membaca CSV: {str(e)}"
62
 
63
+ # Load document
64
  try:
65
+ loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
66
+ data = loader.load()
67
+ print(f"Documents loaded: {len(data)}")
68
+ except Exception as e:
69
+ return f"Error loading documents: {str(e)}"
70
+
71
+ # Create vector database
72
+ try:
73
+ db_path = f"{self.user_dir}/db_faiss"
74
+ embeddings = HuggingFaceEmbeddings(
75
+ model_name='sentence-transformers/all-MiniLM-L6-v2',
76
+ model_kwargs={'device': 'cpu'} # Explicitly set to CPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  )
78
 
79
+ db = FAISS.from_documents(data, embeddings)
80
+ db.save_local(db_path)
81
+ print(f"Vector database created at {db_path}")
82
  except Exception as e:
83
+ return f"Error creating vector database: {str(e)}"
84
+
85
+ # Create LLM and chain
86
+ try:
87
+ llm = initialize_model_once()
88
+ self.chain = ConversationalRetrievalChain.from_llm(
89
+ llm=llm,
90
+ retriever=db.as_retriever(search_kwargs={"k": 4}),
91
+ return_source_documents=True
92
+ )
93
+ print("Chain created successfully")
94
+ except Exception as e:
95
+ return f"Error creating chain: {str(e)}"
96
 
97
  # Add file info to chat history
98
+ file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
99
  self.chat_history.append(("System", file_info))
100
 
101
+ return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
102
  except Exception as e:
103
  import traceback
104
  print(traceback.format_exc())
105
  return f"Error pemrosesan file: {str(e)}"
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def chat(self, message, history):
108
+ if self.chain is None:
109
  return "Mohon upload file CSV terlebih dahulu."
110
 
111
  try:
112
+ # Process with the chain
113
+ result = self.chain({"question": message, "chat_history": self.chat_history})
 
 
 
 
 
 
114
 
115
+ # Update chat history
116
+ answer = result["answer"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ # Optional: Add source info to answer
119
+ sources = result.get("source_documents", [])
120
+ if sources:
121
+ source_text = "\n\nSumber:\n"
122
+ for i, doc in enumerate(sources[:2], 1): # Limit to top 2 sources
123
+ source_text += f"{i}. {doc.page_content[:100]}...\n"
124
+ answer += source_text
125
+
126
+ self.chat_history.append((message, answer))
127
+
128
+ return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  except Exception as e:
130
  import traceback
131
  print(traceback.format_exc())
132
  return f"Error: {str(e)}"
133
 
134
+ # UI Code dan handler functions sama seperti sebelumnya
135
  def create_gradio_interface():
136
+ with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
137
  session_id = gr.State(lambda: str(uuid.uuid4()))
138
  chatbot_state = gr.State(lambda: None)
139
 
140
+ gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
141
+ gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
142
 
143
  with gr.Row():
144
  with gr.Column(scale=1):
 
148
  )
149
  process_button = gr.Button("Proses CSV")
150
 
151
+ with gr.Accordion("Informasi Model", open=False):
152
  gr.Markdown("""
153
+ **Model**: Mistral-7B-Instruct-v0.2-GGUF
154
+
155
+ **Fitur**:
156
+ - GGUF model yang dioptimalkan untuk CPU
157
+ - Efisien untuk analisis data dan percakapan
158
+ - Manajemen sesi per pengguna
159
  """)
160
 
161
  with gr.Column(scale=2):
 
164
  height=400
165
  )
166
  message_input = gr.Textbox(
167
+ label="Ketik pesan Anda",
168
+ placeholder="Tanyakan tentang data CSV Anda...",
169
  lines=2
170
  )
171
  submit_button = gr.Button("Kirim")