Update app.py
app.py CHANGED
@@ -3,11 +3,14 @@ import os
 import uuid
 import threading
 import pandas as pd
+import numpy as np
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import CTransformers
-from langchain.
+from langchain.agents import create_pandas_dataframe_agent
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
 
 # Global model cache
 MODEL_CACHE = {
@@ -22,15 +25,16 @@ def initialize_model_once():
     """Initialize model once using CTransformers API"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
-            # Load
+            # Load Phi-2 model (smaller than Mistral)
             MODEL_CACHE["model"] = CTransformers(
-                model="TheBloke/
-                model_file="
-                model_type="
+                model="TheBloke/phi-2-GGUF",
+                model_file="phi-2.Q4_K_M.gguf",
+                model_type="phi2",
                 max_new_tokens=512,
-                temperature=0.
+                temperature=0.1,
                 top_p=0.9,
-                repetition_penalty=1.
+                repetition_penalty=1.1,
+                context_length=2048
             )
 
     return MODEL_CACHE["model"]
@@ -38,8 +42,9 @@ def initialize_model_once():
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
+        self.csv_info = None
+        self.df = None
         self.chat_history = []
-        self.chain = None
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
 
@@ -50,95 +55,148 @@ class ChatBot:
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
+            file_name = os.path.basename(file_path)
 
-            #
+            # Load and save CSV directly with pandas
             try:
-                df = pd.read_csv(file_path)
+                self.df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
-                df.to_csv(user_file_path, index=False)
-                print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
-            except Exception as e:
-                return f"Error membaca CSV: {str(e)}"
-
-            # Load document
-            try:
-                loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
-                data = loader.load()
-                print(f"Documents loaded: {len(data)}")
-            except Exception as e:
-                return f"Error loading documents: {str(e)}"
-
-            # Create vector database
-            try:
-                db_path = f"{self.user_dir}/db_faiss"
-                embeddings = HuggingFaceEmbeddings(
-                    model_name='sentence-transformers/all-MiniLM-L6-v2',
-                    model_kwargs={'device': 'cpu'} # Explicitly set to CPU
-                )
+                self.df.to_csv(user_file_path, index=False)
 
-
-
-
+                # Store CSV info
+                self.csv_info = {
+                    "filename": file_name,
+                    "rows": self.df.shape[0],
+                    "columns": self.df.shape[1],
+                    "column_names": self.df.columns.tolist(),
+                }
+
+                print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
             except Exception as e:
-                return f"Error
+                return f"Error membaca CSV: {str(e)}"
 
-            # Create
+            # Create query translator
            try:
                 llm = initialize_model_once()
-
-
-
-
+
+                query_template = """
+                Kamu adalah asisten yang mengubah pertanyaan natural language menjadi kode Python dengan pandas.
+
+                Informasi tentang DataFrame:
+                - Nama kolom: {column_names}
+                - Jumlah baris: {num_rows}
+                - Sample data:
+                {sample_data}
+
+                Pertanyaan pengguna: {question}
+
+                Ubah pertanyaan tersebut menjadi kode pandas yang bisa dijalankan. Kode harus ringkas, efisien, dan menggunakan variabel 'df'.
+                Berikan HANYA kode python saja, tanpa backtick, tanpa penjelasan.
+
+                Kode:
+                """
+
+                self.query_chain = LLMChain(
+                    llm=llm,
+                    prompt=PromptTemplate(
+                        input_variables=["column_names", "num_rows", "sample_data", "question"],
+                        template=query_template
+                    )
                 )
-
+
+                print("Query translator created successfully")
             except Exception as e:
-                return f"Error creating
+                return f"Error creating query translator: {str(e)}"
 
             # Add file info to chat history
-            file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
+            file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
 
-            return "File CSV berhasil diproses! Anda dapat mulai
+            return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error pemrosesan file: {str(e)}"
 
     def chat(self, message, history):
-        if self.
+        if self.df is None or self.query_chain is None:
             return "Mohon upload file CSV terlebih dahulu."
 
         try:
-            #
-
+            # Handle metadata questions directly
+            message_lower = message.lower()
+            if "nama file" in message_lower:
+                return f"Nama file CSV adalah: {self.csv_info['filename']}"
+            elif "nama kolom" in message_lower:
+                return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
+            elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
+                return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
 
-            #
-
+            # Get sample data for context
+            sample_str = self.df.head(3).to_string()
 
-            #
-
-
-
-
-
-
-
-            self.chat_history.append((message, answer))
+            # Translate question to pandas code
+            code_response = self.query_chain.run(
+                column_names=str(self.csv_info["column_names"]),
+                num_rows=self.csv_info["rows"],
+                sample_data=sample_str,
+                question=message
+            )
 
-
+            # Clean and execute the code
+            try:
+                code = code_response.strip()
+                # Add safety prefix to prevent malicious code
+                if not code.startswith("df"):
+                    code = "result = " + code
+                else:
+                    code = "result = " + code
+
+                # Create local context with the dataframe
+                locals_dict = {"df": self.df, "pd": pd, "np": np}
+
+                # Execute the code
+                print(f"Executing code: {code}")
+                exec(code, {"pd": pd, "np": np}, locals_dict)
+                result = locals_dict.get("result", "No result returned")
+
+                # Format the result
+                if isinstance(result, pd.DataFrame):
+                    if len(result) > 5:
+                        result_str = result.head(5).to_string() + f"\n\n[{len(result)} baris ditemukan]"
+                    else:
+                        result_str = result.to_string()
+                elif isinstance(result, (pd.Series, np.ndarray)):
+                    result_str = str(result)
+                else:
+                    result_str = str(result)
+
+                # Build the response
+                response = f"Hasil analisis untuk pertanyaan: '{message}'\n\n"
+                response += f"Kode yang digunakan:\n```python\n{code}\n```\n\n"
+                response += f"Output:\n{result_str}"
+
+                self.chat_history.append((message, response))
+                return response
+
+            except Exception as e:
+                error_msg = f"Error mengeksekusi kode: {str(e)}\nKode yang dihasilkan:\n```python\n{code}\n```"
+                print(error_msg)
+                return error_msg
+
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
 
-# UI Code
+# UI Code
 def create_gradio_interface():
-    with gr.Blocks(title="
+    with gr.Blocks(title="CSV Data Analyzer") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
 
-        gr.HTML("<h1 style='text-align: center;'>
-        gr.HTML("<h3 style='text-align: center;'>
+        gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
+        gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -148,14 +206,12 @@ def create_gradio_interface():
                 )
                 process_button = gr.Button("Proses CSV")
 
-                with gr.Accordion("
+                with gr.Accordion("Contoh Pertanyaan", open=False):
                     gr.Markdown("""
-
-
-
-                    -
-                    - Efisien untuk analisis data dan percakapan
-                    - Manajemen sesi per pengguna
+                    - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
+                    - "Bagaimana distribusi kolom Age?"
+                    - "Hitung nilai rata-rata dan standar deviasi untuk setiap kolom numerik"
+                    - "Buat tabel frekuensi untuk kolom Outcome"
                     """)
 
             with gr.Column(scale=2):
@@ -164,8 +220,8 @@ def create_gradio_interface():
                     height=400
                 )
                 message_input = gr.Textbox(
-                    label="Ketik
-                    placeholder="
+                    label="Ketik pertanyaan Anda",
+                    placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")
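The core of this change is in `chat()`: instead of querying a FAISS retrieval chain, the app asks the LLM for a single pandas expression and executes it against the uploaded DataFrame. A minimal, self-contained sketch of that execute-and-format step is shown below; the `generated_code` string and the sample DataFrame are stand-ins for the chain output and the uploaded CSV, and this is an illustration of the pattern rather than the app's exact code.

```python
import pandas as pd
import numpy as np

# Stand-ins: in app.py, `df` is the uploaded CSV and `generated_code`
# comes from self.query_chain.run(...).
df = pd.DataFrame({"Glucose": [120, 160, 155], "Age": [30, 45, 52]})
generated_code = "df[df['Glucose'] > 150].shape[0]"

# Mirror the chat() logic: capture the expression's value in `result`,
# then execute it with only df/pd/np in scope.
code = "result = " + generated_code.strip()
namespace = {"df": df, "pd": pd, "np": np}
exec(code, {"pd": pd, "np": np}, namespace)  # exec still runs arbitrary Python
result = namespace.get("result", "No result returned")

# Format DataFrames compactly, everything else via str(), as chat() does.
if isinstance(result, pd.DataFrame):
    print(result.head(5).to_string())
else:
    print(result)  # -> 2
```

Limiting the globals passed to `exec` keeps the namespace small, but it is not a security boundary; whatever the model emits still runs with the process's privileges.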
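To check the `PromptTemplate`/`LLMChain` wiring from `process_file()` without downloading the GGUF model, the chain can be driven by a stub LLM instead of CTransformers. A sketch under that assumption: `FakeListLLM` is LangChain's list-backed test LLM (its import path varies across LangChain releases), and the shortened template here only mirrors the shape of the one in app.py.

```python
from langchain.llms.fake import FakeListLLM
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Stub LLM that always "translates" the question into this pandas expression.
fake_llm = FakeListLLM(responses=["df['Age'].mean()"])

# Same input variables as the query translator in app.py, shortened template.
prompt = PromptTemplate(
    input_variables=["column_names", "num_rows", "sample_data", "question"],
    template="Kolom: {column_names}\nBaris: {num_rows}\nSample:\n{sample_data}\nPertanyaan: {question}\nKode:",
)
chain = LLMChain(llm=fake_llm, prompt=prompt)

code = chain.run(
    column_names="['Glucose', 'Age']",
    num_rows=3,
    sample_data="Glucose  Age\n120  30",
    question="Berapa rata-rata Age?",
)
print(code)  # -> df['Age'].mean()
```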