Spaces:
Sleeping
Sleeping
kembali ke mistral 7b instruct gguf
Browse files
app.py
CHANGED
@@ -3,10 +3,11 @@ import os
|
|
3 |
import uuid
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
-
|
|
|
|
|
7 |
from langchain.llms import CTransformers
|
8 |
-
from langchain.chains import
|
9 |
-
from langchain.prompts import PromptTemplate
|
10 |
|
11 |
# Global model cache
|
12 |
MODEL_CACHE = {
|
@@ -21,15 +22,15 @@ def initialize_model_once():
|
|
21 |
"""Initialize model once using CTransformers API"""
|
22 |
with MODEL_CACHE["init_lock"]:
|
23 |
if MODEL_CACHE["model"] is None:
|
24 |
-
# Load
|
25 |
MODEL_CACHE["model"] = CTransformers(
|
26 |
-
model="TheBloke/
|
27 |
-
model_file="
|
28 |
-
model_type="
|
29 |
max_new_tokens=512,
|
30 |
-
temperature=0.
|
31 |
-
|
32 |
-
|
33 |
)
|
34 |
|
35 |
return MODEL_CACHE["model"]
|
@@ -37,9 +38,8 @@ def initialize_model_once():
|
|
37 |
class ChatBot:
|
38 |
def __init__(self, session_id):
|
39 |
self.session_id = session_id
|
40 |
-
self.csv_info = None
|
41 |
-
self.df = None
|
42 |
self.chat_history = []
|
|
|
43 |
self.user_dir = f"user_data/{session_id}"
|
44 |
os.makedirs(self.user_dir, exist_ok=True)
|
45 |
|
@@ -50,195 +50,95 @@ class ChatBot:
|
|
50 |
try:
|
51 |
# Handle file from Gradio
|
52 |
file_path = file.name if hasattr(file, 'name') else str(file)
|
53 |
-
file_name = os.path.basename(file_path)
|
54 |
|
55 |
-
#
|
56 |
try:
|
57 |
-
|
58 |
user_file_path = f"{self.user_dir}/uploaded.csv"
|
59 |
-
|
60 |
-
|
61 |
-
# Store CSV info
|
62 |
-
self.csv_info = {
|
63 |
-
"filename": file_name,
|
64 |
-
"rows": self.df.shape[0],
|
65 |
-
"columns": self.df.shape[1],
|
66 |
-
"column_names": self.df.columns.tolist(),
|
67 |
-
}
|
68 |
-
|
69 |
-
print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
|
70 |
except Exception as e:
|
71 |
return f"Error membaca CSV: {str(e)}"
|
72 |
|
73 |
-
#
|
74 |
try:
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
Buat kode Python menggunakan pandas untuk menjawab pertanyaan tersebut.
|
89 |
-
Berikan HANYA kode Python saja, tanpa penjelasan atau apapun.
|
90 |
-
Kode harus menggunakan variabel 'df' sebagai nama DataFrame.
|
91 |
-
|
92 |
-
Kode:
|
93 |
-
"""
|
94 |
-
|
95 |
-
self.query_chain = LLMChain(
|
96 |
-
llm=llm,
|
97 |
-
prompt=PromptTemplate(
|
98 |
-
input_variables=["column_names", "num_rows", "sample_data", "question"],
|
99 |
-
template=query_template
|
100 |
-
)
|
101 |
)
|
102 |
|
103 |
-
|
|
|
|
|
104 |
except Exception as e:
|
105 |
-
return f"Error creating
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
# Add file info to chat history
|
108 |
-
file_info = f"CSV berhasil dimuat
|
109 |
self.chat_history.append(("System", file_info))
|
110 |
|
111 |
-
return
|
112 |
except Exception as e:
|
113 |
import traceback
|
114 |
print(traceback.format_exc())
|
115 |
return f"Error pemrosesan file: {str(e)}"
|
116 |
|
117 |
-
def execute_query(self, code):
|
118 |
-
"""Safely execute pandas code"""
|
119 |
-
try:
|
120 |
-
# Create local context with the dataframe
|
121 |
-
local_vars = {"df": self.df, "pd": pd, "np": np}
|
122 |
-
|
123 |
-
# Execute code with timeout
|
124 |
-
exec(code, {"pd": pd, "np": np}, local_vars)
|
125 |
-
|
126 |
-
# Get result
|
127 |
-
if "result" in local_vars:
|
128 |
-
return local_vars["result"]
|
129 |
-
else:
|
130 |
-
# If no result variable, find the last variable created
|
131 |
-
last_var = None
|
132 |
-
for var_name, var_value in local_vars.items():
|
133 |
-
if var_name not in ["df", "pd", "np"] and var_name != "__builtins__":
|
134 |
-
last_var = var_value
|
135 |
-
|
136 |
-
if last_var is not None:
|
137 |
-
return last_var
|
138 |
-
else:
|
139 |
-
return self.df # Return the dataframe as default
|
140 |
-
except Exception as e:
|
141 |
-
raise Exception(f"Gagal menjalankan kode: {str(e)}")
|
142 |
-
|
143 |
def chat(self, message, history):
|
144 |
-
if self.
|
145 |
return "Mohon upload file CSV terlebih dahulu."
|
146 |
|
147 |
try:
|
148 |
-
#
|
149 |
-
|
150 |
-
if "nama file" in message_lower:
|
151 |
-
return f"Nama file CSV adalah: {self.csv_info['filename']}"
|
152 |
-
elif "nama kolom" in message_lower:
|
153 |
-
return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
|
154 |
-
elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
|
155 |
-
return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
|
156 |
|
157 |
-
#
|
158 |
-
|
159 |
-
code = "result = len(df[df['Glucose'] > 150])"
|
160 |
-
else:
|
161 |
-
# Get sample data for context
|
162 |
-
sample_str = self.df.head(3).to_string()
|
163 |
-
|
164 |
-
# Translate question to pandas code using LLM
|
165 |
-
try:
|
166 |
-
code_response = self.query_chain.run(
|
167 |
-
column_names=str(self.csv_info["column_names"]),
|
168 |
-
num_rows=self.csv_info["rows"],
|
169 |
-
sample_data=sample_str,
|
170 |
-
question=message
|
171 |
-
)
|
172 |
-
|
173 |
-
# Clean the code
|
174 |
-
code = code_response.strip().replace("```python", "").replace("```", "").strip()
|
175 |
-
|
176 |
-
# Add result variable if not present
|
177 |
-
if not any(line.strip().startswith("result =") for line in code.split("\n")):
|
178 |
-
if code.startswith("df."):
|
179 |
-
code = "result = " + code
|
180 |
-
else:
|
181 |
-
code = "result = df." + code
|
182 |
-
except Exception as e:
|
183 |
-
# Fallback for common queries if LLM fails
|
184 |
-
if "rata-rata" in message_lower or "mean" in message_lower:
|
185 |
-
code = "result = df.describe()"
|
186 |
-
elif "jumlah" in message_lower or "count" in message_lower:
|
187 |
-
code = "result = df.count()"
|
188 |
-
elif "distribusi" in message_lower:
|
189 |
-
col = next((c for c in self.csv_info["column_names"] if c.lower() in message_lower), None)
|
190 |
-
if col:
|
191 |
-
code = f"result = df['{col}'].value_counts()"
|
192 |
-
else:
|
193 |
-
code = "result = df.describe()"
|
194 |
-
else:
|
195 |
-
return f"Maaf, saya tidak dapat memproses pertanyaan ini. Error: {str(e)}"
|
196 |
|
197 |
-
#
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
elif isinstance(result, (pd.Series, np.ndarray)):
|
209 |
-
if len(result) > 10:
|
210 |
-
result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
|
211 |
-
else:
|
212 |
-
result_str = str(result)
|
213 |
-
elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
|
214 |
-
result_str = str(result)
|
215 |
-
if len(result) > 0:
|
216 |
-
result_str += f"\n\n[Total {len(result)} item]"
|
217 |
-
else:
|
218 |
-
result_str = str(result)
|
219 |
-
|
220 |
-
# Format response
|
221 |
-
response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
|
222 |
-
|
223 |
-
self.chat_history.append((message, response))
|
224 |
-
return response
|
225 |
-
|
226 |
-
except Exception as e:
|
227 |
-
return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
|
228 |
-
|
229 |
except Exception as e:
|
230 |
import traceback
|
231 |
print(traceback.format_exc())
|
232 |
return f"Error: {str(e)}"
|
233 |
|
234 |
-
# UI Code
|
235 |
def create_gradio_interface():
|
236 |
-
with gr.Blocks(title="CSV
|
237 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
238 |
chatbot_state = gr.State(lambda: None)
|
239 |
|
240 |
-
gr.HTML("<h1 style='text-align: center;'>CSV
|
241 |
-
gr.HTML("<h3 style='text-align: center;'>
|
242 |
|
243 |
with gr.Row():
|
244 |
with gr.Column(scale=1):
|
@@ -248,12 +148,14 @@ def create_gradio_interface():
|
|
248 |
)
|
249 |
process_button = gr.Button("Proses CSV")
|
250 |
|
251 |
-
with gr.Accordion("
|
252 |
gr.Markdown("""
|
253 |
-
-
|
254 |
-
|
255 |
-
|
256 |
-
-
|
|
|
|
|
257 |
""")
|
258 |
|
259 |
with gr.Column(scale=2):
|
@@ -262,8 +164,8 @@ def create_gradio_interface():
|
|
262 |
height=400
|
263 |
)
|
264 |
message_input = gr.Textbox(
|
265 |
-
label="Ketik
|
266 |
-
placeholder="
|
267 |
lines=2
|
268 |
)
|
269 |
submit_button = gr.Button("Kirim")
|
|
|
3 |
import uuid
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
+
from langchain.document_loaders.csv_loader import CSVLoader
|
7 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
8 |
+
from langchain.vectorstores import FAISS
|
9 |
from langchain.llms import CTransformers
|
10 |
+
from langchain.chains import ConversationalRetrievalChain
|
|
|
11 |
|
12 |
# Global model cache
|
13 |
MODEL_CACHE = {
|
|
|
22 |
"""Initialize model once using CTransformers API"""
|
23 |
with MODEL_CACHE["init_lock"]:
|
24 |
if MODEL_CACHE["model"] is None:
|
25 |
+
# Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
|
26 |
MODEL_CACHE["model"] = CTransformers(
|
27 |
+
model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
|
28 |
+
model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
|
29 |
+
model_type="mistral",
|
30 |
max_new_tokens=512,
|
31 |
+
temperature=0.2,
|
32 |
+
top_p=0.9,
|
33 |
+
repetition_penalty=1.2
|
34 |
)
|
35 |
|
36 |
return MODEL_CACHE["model"]
|
|
|
38 |
class ChatBot:
|
39 |
def __init__(self, session_id):
|
40 |
self.session_id = session_id
|
|
|
|
|
41 |
self.chat_history = []
|
42 |
+
self.chain = None
|
43 |
self.user_dir = f"user_data/{session_id}"
|
44 |
os.makedirs(self.user_dir, exist_ok=True)
|
45 |
|
|
|
50 |
try:
|
51 |
# Handle file from Gradio
|
52 |
file_path = file.name if hasattr(file, 'name') else str(file)
|
|
|
53 |
|
54 |
+
# Verify and save CSV
|
55 |
try:
|
56 |
+
df = pd.read_csv(file_path)
|
57 |
user_file_path = f"{self.user_dir}/uploaded.csv"
|
58 |
+
df.to_csv(user_file_path, index=False)
|
59 |
+
print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
except Exception as e:
|
61 |
return f"Error membaca CSV: {str(e)}"
|
62 |
|
63 |
+
# Load document
|
64 |
try:
|
65 |
+
loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
|
66 |
+
data = loader.load()
|
67 |
+
print(f"Documents loaded: {len(data)}")
|
68 |
+
except Exception as e:
|
69 |
+
return f"Error loading documents: {str(e)}"
|
70 |
+
|
71 |
+
# Create vector database
|
72 |
+
try:
|
73 |
+
db_path = f"{self.user_dir}/db_faiss"
|
74 |
+
embeddings = HuggingFaceEmbeddings(
|
75 |
+
model_name='sentence-transformers/all-MiniLM-L6-v2',
|
76 |
+
model_kwargs={'device': 'cpu'} # Explicitly set to CPU
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
)
|
78 |
|
79 |
+
db = FAISS.from_documents(data, embeddings)
|
80 |
+
db.save_local(db_path)
|
81 |
+
print(f"Vector database created at {db_path}")
|
82 |
except Exception as e:
|
83 |
+
return f"Error creating vector database: {str(e)}"
|
84 |
+
|
85 |
+
# Create LLM and chain
|
86 |
+
try:
|
87 |
+
llm = initialize_model_once()
|
88 |
+
self.chain = ConversationalRetrievalChain.from_llm(
|
89 |
+
llm=llm,
|
90 |
+
retriever=db.as_retriever(search_kwargs={"k": 4}),
|
91 |
+
return_source_documents=True
|
92 |
+
)
|
93 |
+
print("Chain created successfully")
|
94 |
+
except Exception as e:
|
95 |
+
return f"Error creating chain: {str(e)}"
|
96 |
|
97 |
# Add file info to chat history
|
98 |
+
file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
|
99 |
self.chat_history.append(("System", file_info))
|
100 |
|
101 |
+
return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
|
102 |
except Exception as e:
|
103 |
import traceback
|
104 |
print(traceback.format_exc())
|
105 |
return f"Error pemrosesan file: {str(e)}"
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def chat(self, message, history):
|
108 |
+
if self.chain is None:
|
109 |
return "Mohon upload file CSV terlebih dahulu."
|
110 |
|
111 |
try:
|
112 |
+
# Process with the chain
|
113 |
+
result = self.chain({"question": message, "chat_history": self.chat_history})
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
+
# Update chat history
|
116 |
+
answer = result["answer"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
+
# Optional: Add source info to answer
|
119 |
+
sources = result.get("source_documents", [])
|
120 |
+
if sources:
|
121 |
+
source_text = "\n\nSumber:\n"
|
122 |
+
for i, doc in enumerate(sources[:2], 1): # Limit to top 2 sources
|
123 |
+
source_text += f"{i}. {doc.page_content[:100]}...\n"
|
124 |
+
answer += source_text
|
125 |
+
|
126 |
+
self.chat_history.append((message, answer))
|
127 |
+
|
128 |
+
return answer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
except Exception as e:
|
130 |
import traceback
|
131 |
print(traceback.format_exc())
|
132 |
return f"Error: {str(e)}"
|
133 |
|
134 |
+
# UI Code dan handler functions sama seperti sebelumnya
|
135 |
def create_gradio_interface():
|
136 |
+
with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
|
137 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
138 |
chatbot_state = gr.State(lambda: None)
|
139 |
|
140 |
+
gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
|
141 |
+
gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
|
142 |
|
143 |
with gr.Row():
|
144 |
with gr.Column(scale=1):
|
|
|
148 |
)
|
149 |
process_button = gr.Button("Proses CSV")
|
150 |
|
151 |
+
with gr.Accordion("Informasi Model", open=False):
|
152 |
gr.Markdown("""
|
153 |
+
**Model**: Mistral-7B-Instruct-v0.2-GGUF
|
154 |
+
|
155 |
+
**Fitur**:
|
156 |
+
- GGUF model yang dioptimalkan untuk CPU
|
157 |
+
- Efisien untuk analisis data dan percakapan
|
158 |
+
- Manajemen sesi per pengguna
|
159 |
""")
|
160 |
|
161 |
with gr.Column(scale=2):
|
|
|
164 |
height=400
|
165 |
)
|
166 |
message_input = gr.Textbox(
|
167 |
+
label="Ketik pesan Anda",
|
168 |
+
placeholder="Tanyakan tentang data CSV Anda...",
|
169 |
lines=2
|
170 |
)
|
171 |
submit_button = gr.Button("Kirim")
|