Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,11 +4,7 @@ import uuid
|
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
import numpy as np
|
7 |
-
from langchain.document_loaders.csv_loader import CSVLoader
|
8 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
9 |
-
from langchain.vectorstores import FAISS
|
10 |
from langchain.llms import CTransformers
|
11 |
-
from langchain_experimental.agents import create_pandas_dataframe_agent
|
12 |
from langchain.chains import LLMChain
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
|
@@ -25,14 +21,13 @@ def initialize_model_once():
|
|
25 |
"""Initialize model once using CTransformers API"""
|
26 |
with MODEL_CACHE["init_lock"]:
|
27 |
if MODEL_CACHE["model"] is None:
|
28 |
-
# Load
|
29 |
MODEL_CACHE["model"] = CTransformers(
|
30 |
model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
|
31 |
model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
32 |
model_type="llama",
|
33 |
max_new_tokens=512,
|
34 |
temperature=0.1,
|
35 |
-
top_p=0.9,
|
36 |
repetition_penalty=1.1,
|
37 |
context_length=2048
|
38 |
)
|
@@ -80,18 +75,19 @@ class ChatBot:
|
|
80 |
llm = initialize_model_once()
|
81 |
|
82 |
query_template = """
|
83 |
-
Kamu adalah asisten yang mengubah pertanyaan natural
|
84 |
|
85 |
Informasi tentang DataFrame:
|
86 |
- Nama kolom: {column_names}
|
87 |
- Jumlah baris: {num_rows}
|
88 |
-
-
|
89 |
{sample_data}
|
90 |
|
91 |
Pertanyaan pengguna: {question}
|
92 |
|
93 |
-
|
94 |
-
Berikan HANYA kode
|
|
|
95 |
|
96 |
Kode:
|
97 |
"""
|
@@ -118,8 +114,34 @@ class ChatBot:
|
|
118 |
print(traceback.format_exc())
|
119 |
return f"Error pemrosesan file: {str(e)}"
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
def chat(self, message, history):
|
122 |
-
if self.df is None
|
123 |
return "Mohon upload file CSV terlebih dahulu."
|
124 |
|
125 |
try:
|
@@ -132,64 +154,84 @@ class ChatBot:
|
|
132 |
elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
|
133 |
return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
|
134 |
|
135 |
-
#
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
-
#
|
147 |
try:
|
148 |
-
code = code_response.strip()
|
149 |
-
# Add safety prefix to prevent malicious code
|
150 |
-
if not code.startswith("df"):
|
151 |
-
code = "result = " + code
|
152 |
-
else:
|
153 |
-
code = "result = " + code
|
154 |
-
|
155 |
-
# Create local context with the dataframe
|
156 |
-
locals_dict = {"df": self.df, "pd": pd, "np": np}
|
157 |
-
|
158 |
-
# Execute the code
|
159 |
print(f"Executing code: {code}")
|
160 |
-
|
161 |
-
result = locals_dict.get("result", "No result returned")
|
162 |
|
163 |
-
# Format
|
164 |
if isinstance(result, pd.DataFrame):
|
165 |
if len(result) > 5:
|
166 |
-
result_str = result.head(5).to_string() + f"\n\n[{len(result)} baris
|
167 |
else:
|
168 |
result_str = result.to_string()
|
169 |
elif isinstance(result, (pd.Series, np.ndarray)):
|
|
|
|
|
|
|
|
|
|
|
170 |
result_str = str(result)
|
|
|
|
|
171 |
else:
|
172 |
result_str = str(result)
|
173 |
|
174 |
-
#
|
175 |
-
response = f"Hasil analisis
|
176 |
-
response += f"Kode yang digunakan:\n```python\n{code}\n```\n\n"
|
177 |
-
response += f"Output:\n{result_str}"
|
178 |
|
179 |
self.chat_history.append((message, response))
|
180 |
return response
|
181 |
|
182 |
except Exception as e:
|
183 |
-
|
184 |
-
print(error_msg)
|
185 |
-
return error_msg
|
186 |
|
187 |
except Exception as e:
|
188 |
import traceback
|
189 |
print(traceback.format_exc())
|
190 |
return f"Error: {str(e)}"
|
191 |
|
192 |
-
# UI Code
|
193 |
def create_gradio_interface():
|
194 |
with gr.Blocks(title="CSV Data Analyzer") as interface:
|
195 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
@@ -209,9 +251,9 @@ def create_gradio_interface():
|
|
209 |
with gr.Accordion("Contoh Pertanyaan", open=False):
|
210 |
gr.Markdown("""
|
211 |
- "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
|
212 |
-
- "
|
213 |
-
- "
|
214 |
-
- "
|
215 |
""")
|
216 |
|
217 |
with gr.Column(scale=2):
|
|
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
import numpy as np
|
|
|
|
|
|
|
7 |
from langchain.llms import CTransformers
|
|
|
8 |
from langchain.chains import LLMChain
|
9 |
from langchain.prompts import PromptTemplate
|
10 |
|
|
|
21 |
"""Initialize model once using CTransformers API"""
|
22 |
with MODEL_CACHE["init_lock"]:
|
23 |
if MODEL_CACHE["model"] is None:
|
24 |
+
# Load TinyLlama model
|
25 |
MODEL_CACHE["model"] = CTransformers(
|
26 |
model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
|
27 |
model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
28 |
model_type="llama",
|
29 |
max_new_tokens=512,
|
30 |
temperature=0.1,
|
|
|
31 |
repetition_penalty=1.1,
|
32 |
context_length=2048
|
33 |
)
|
|
|
75 |
llm = initialize_model_once()
|
76 |
|
77 |
query_template = """
|
78 |
+
Kamu adalah asisten data yang mengubah pertanyaan bahasa natural menjadi kode Python dengan Pandas.
|
79 |
|
80 |
Informasi tentang DataFrame:
|
81 |
- Nama kolom: {column_names}
|
82 |
- Jumlah baris: {num_rows}
|
83 |
+
- Sampel data:
|
84 |
{sample_data}
|
85 |
|
86 |
Pertanyaan pengguna: {question}
|
87 |
|
88 |
+
Buat kode Python menggunakan pandas untuk menjawab pertanyaan tersebut.
|
89 |
+
Berikan HANYA kode Python saja, tanpa penjelasan atau apapun.
|
90 |
+
Kode harus menggunakan variabel 'df' sebagai nama DataFrame.
|
91 |
|
92 |
Kode:
|
93 |
"""
|
|
|
114 |
print(traceback.format_exc())
|
115 |
return f"Error pemrosesan file: {str(e)}"
|
116 |
|
117 |
+
def execute_query(self, code):
    """Execute generated pandas code against the loaded DataFrame.

    The code runs with ``df`` (the DataFrame held in ``self.df``), ``pd`` and
    ``np`` pre-bound. The value returned is, in order of preference:

    1. the variable ``result`` if the code assigned one;
    2. otherwise the most recently created variable that is not ``None``;
    3. otherwise ``self.df`` itself.

    Args:
        code: Python source text to execute.

    Returns:
        The object selected by the rules above.

    Raises:
        Exception: if the code fails to compile or raises while running. The
            original error is attached as ``__cause__``.
    """
    # SECURITY NOTE(review): `code` is produced by an LLM from user input and
    # is executed with full builtins. exec() is NOT a sandbox, and no timeout
    # is enforced here. Restrict or isolate execution before exposing this
    # to untrusted users.
    reserved_names = ("df", "pd", "np", "__builtins__")

    # A single namespace serves as both globals and locals. With separate
    # dicts, names such as `df` live only in the locals mapping and are
    # invisible to nested scopes (lambdas, defs, generator expressions) in the
    # executed code, which then fails with NameError.
    namespace = {"df": self.df, "pd": pd, "np": np}

    try:
        exec(code, namespace)
    except Exception as e:
        # Keep the message callers already display, but preserve the cause.
        raise Exception(f"Gagal menjalankan kode: {str(e)}") from e

    if "result" in namespace:
        return namespace["result"]

    # No explicit `result`: fall back to the last variable the code created
    # (dicts preserve insertion order), skipping the pre-bound names.
    last_var = None
    for var_name, var_value in namespace.items():
        if var_name not in reserved_names:
            last_var = var_value

    if last_var is not None:
        return last_var
    return self.df  # Nothing usable was produced; return the DataFrame.
|
142 |
+
|
143 |
def chat(self, message, history):
|
144 |
+
if self.df is None:
|
145 |
return "Mohon upload file CSV terlebih dahulu."
|
146 |
|
147 |
try:
|
|
|
154 |
elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
|
155 |
return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
|
156 |
|
157 |
+
# Handle pre-defined analysis questions
|
158 |
+
if "glucose di atas 150" in message_lower:
|
159 |
+
code = "result = len(df[df['Glucose'] > 150])"
|
160 |
+
else:
|
161 |
+
# Get sample data for context
|
162 |
+
sample_str = self.df.head(3).to_string()
|
163 |
+
|
164 |
+
# Translate question to pandas code using LLM
|
165 |
+
try:
|
166 |
+
code_response = self.query_chain.run(
|
167 |
+
column_names=str(self.csv_info["column_names"]),
|
168 |
+
num_rows=self.csv_info["rows"],
|
169 |
+
sample_data=sample_str,
|
170 |
+
question=message
|
171 |
+
)
|
172 |
+
|
173 |
+
# Clean the code
|
174 |
+
code = code_response.strip().replace("```python", "").replace("```", "").strip()
|
175 |
+
|
176 |
+
# Add result variable if not present
|
177 |
+
if not any(line.strip().startswith("result =") for line in code.split("\n")):
|
178 |
+
if code.startswith("df."):
|
179 |
+
code = "result = " + code
|
180 |
+
else:
|
181 |
+
code = "result = df." + code
|
182 |
+
except Exception as e:
|
183 |
+
# Fallback for common queries if LLM fails
|
184 |
+
if "rata-rata" in message_lower or "mean" in message_lower:
|
185 |
+
code = "result = df.describe()"
|
186 |
+
elif "jumlah" in message_lower or "count" in message_lower:
|
187 |
+
code = "result = df.count()"
|
188 |
+
elif "distribusi" in message_lower:
|
189 |
+
col = next((c for c in self.csv_info["column_names"] if c.lower() in message_lower), None)
|
190 |
+
if col:
|
191 |
+
code = f"result = df['{col}'].value_counts()"
|
192 |
+
else:
|
193 |
+
code = "result = df.describe()"
|
194 |
+
else:
|
195 |
+
return f"Maaf, saya tidak dapat memproses pertanyaan ini. Error: {str(e)}"
|
196 |
|
197 |
+
# Execute the code and get result
|
198 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
print(f"Executing code: {code}")
|
200 |
+
result = self.execute_query(code)
|
|
|
201 |
|
202 |
+
# Format result based on its type
|
203 |
if isinstance(result, pd.DataFrame):
|
204 |
if len(result) > 5:
|
205 |
+
result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
|
206 |
else:
|
207 |
result_str = result.to_string()
|
208 |
elif isinstance(result, (pd.Series, np.ndarray)):
|
209 |
+
if len(result) > 10:
|
210 |
+
result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
|
211 |
+
else:
|
212 |
+
result_str = str(result)
|
213 |
+
elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
|
214 |
result_str = str(result)
|
215 |
+
if len(result) > 0:
|
216 |
+
result_str += f"\n\n[Total {len(result)} item]"
|
217 |
else:
|
218 |
result_str = str(result)
|
219 |
|
220 |
+
# Format response
|
221 |
+
response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
|
|
|
|
|
222 |
|
223 |
self.chat_history.append((message, response))
|
224 |
return response
|
225 |
|
226 |
except Exception as e:
|
227 |
+
return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
|
|
|
|
|
228 |
|
229 |
except Exception as e:
|
230 |
import traceback
|
231 |
print(traceback.format_exc())
|
232 |
return f"Error: {str(e)}"
|
233 |
|
234 |
+
# UI Code (tidak berubah dari sebelumnya)
|
235 |
def create_gradio_interface():
|
236 |
with gr.Blocks(title="CSV Data Analyzer") as interface:
|
237 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
|
|
251 |
with gr.Accordion("Contoh Pertanyaan", open=False):
|
252 |
gr.Markdown("""
|
253 |
- "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
|
254 |
+
- "Hitung nilai rata-rata setiap kolom numerik"
|
255 |
+
- "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
|
256 |
+
- "Berapa jumlah baris dalam dataset ini?"
|
257 |
""")
|
258 |
|
259 |
with gr.Column(scale=2):
|