Spaces:
Sleeping
Sleeping
microsoft/phi4-mini-instruct
Browse files
app.py
CHANGED
@@ -3,15 +3,14 @@ import os
|
|
3 |
import uuid
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
-
|
7 |
-
from
|
8 |
-
|
9 |
-
from langchain.llms import CTransformers
|
10 |
-
from langchain.chains import ConversationalRetrievalChain
|
11 |
|
12 |
# Global model cache
|
13 |
MODEL_CACHE = {
|
14 |
"model": None,
|
|
|
15 |
"init_lock": threading.Lock()
|
16 |
}
|
17 |
|
@@ -19,27 +18,52 @@ MODEL_CACHE = {
|
|
19 |
os.makedirs("user_data", exist_ok=True)
|
20 |
|
21 |
def initialize_model_once():
|
22 |
-
"""Initialize model once
|
23 |
with MODEL_CACHE["init_lock"]:
|
24 |
if MODEL_CACHE["model"] is None:
|
25 |
-
# Load
|
26 |
-
MODEL_CACHE["
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
temperature=0.2,
|
32 |
-
top_p=0.9,
|
33 |
-
repetition_penalty=1.2
|
34 |
)
|
35 |
|
36 |
-
return MODEL_CACHE["model"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
class ChatBot:
|
39 |
def __init__(self, session_id):
|
40 |
self.session_id = session_id
|
|
|
|
|
41 |
self.chat_history = []
|
42 |
-
self.chain = None
|
43 |
self.user_dir = f"user_data/{session_id}"
|
44 |
os.makedirs(self.user_dir, exist_ok=True)
|
45 |
|
@@ -50,95 +74,173 @@ class ChatBot:
|
|
50 |
try:
|
51 |
# Handle file from Gradio
|
52 |
file_path = file.name if hasattr(file, 'name') else str(file)
|
|
|
53 |
|
54 |
-
#
|
55 |
try:
|
56 |
-
df = pd.read_csv(file_path)
|
57 |
user_file_path = f"{self.user_dir}/uploaded.csv"
|
58 |
-
df.to_csv(user_file_path, index=False)
|
59 |
-
print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
|
60 |
-
except Exception as e:
|
61 |
-
return f"Error membaca CSV: {str(e)}"
|
62 |
-
|
63 |
-
# Load document
|
64 |
-
try:
|
65 |
-
loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
|
66 |
-
data = loader.load()
|
67 |
-
print(f"Documents loaded: {len(data)}")
|
68 |
-
except Exception as e:
|
69 |
-
return f"Error loading documents: {str(e)}"
|
70 |
-
|
71 |
-
# Create vector database
|
72 |
-
try:
|
73 |
-
db_path = f"{self.user_dir}/db_faiss"
|
74 |
-
embeddings = HuggingFaceEmbeddings(
|
75 |
-
model_name='sentence-transformers/all-MiniLM-L6-v2',
|
76 |
-
model_kwargs={'device': 'cpu'} # Explicitly set to CPU
|
77 |
-
)
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
self.chain = ConversationalRetrievalChain.from_llm(
|
89 |
-
llm=llm,
|
90 |
-
retriever=db.as_retriever(search_kwargs={"k": 4}),
|
91 |
-
return_source_documents=True
|
92 |
-
)
|
93 |
-
print("Chain created successfully")
|
94 |
except Exception as e:
|
95 |
-
return f"Error
|
96 |
|
97 |
# Add file info to chat history
|
98 |
-
file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
|
99 |
self.chat_history.append(("System", file_info))
|
100 |
|
101 |
-
return "File CSV berhasil diproses! Anda dapat mulai
|
102 |
except Exception as e:
|
103 |
import traceback
|
104 |
print(traceback.format_exc())
|
105 |
return f"Error pemrosesan file: {str(e)}"
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def chat(self, message, history):
|
108 |
-
if self.
|
109 |
return "Mohon upload file CSV terlebih dahulu."
|
110 |
|
111 |
try:
|
112 |
-
#
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
#
|
116 |
-
|
|
|
|
|
117 |
|
118 |
-
#
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
except Exception as e:
|
130 |
import traceback
|
131 |
print(traceback.format_exc())
|
132 |
return f"Error: {str(e)}"
|
133 |
|
134 |
-
# UI Code
|
135 |
def create_gradio_interface():
|
136 |
-
with gr.Blocks(title="
|
137 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
138 |
chatbot_state = gr.State(lambda: None)
|
139 |
|
140 |
-
gr.HTML("<h1 style='text-align: center;'>
|
141 |
-
gr.HTML("<h3 style='text-align: center;'>
|
142 |
|
143 |
with gr.Row():
|
144 |
with gr.Column(scale=1):
|
@@ -148,14 +250,13 @@ def create_gradio_interface():
|
|
148 |
)
|
149 |
process_button = gr.Button("Proses CSV")
|
150 |
|
151 |
-
with gr.Accordion("
|
152 |
gr.Markdown("""
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
-
|
157 |
-
-
|
158 |
-
- Manajemen sesi per pengguna
|
159 |
""")
|
160 |
|
161 |
with gr.Column(scale=2):
|
@@ -164,8 +265,8 @@ def create_gradio_interface():
|
|
164 |
height=400
|
165 |
)
|
166 |
message_input = gr.Textbox(
|
167 |
-
label="Ketik
|
168 |
-
placeholder="
|
169 |
lines=2
|
170 |
)
|
171 |
submit_button = gr.Button("Kirim")
|
|
|
3 |
import uuid
|
4 |
import threading
|
5 |
import pandas as pd
|
6 |
+
import numpy as np
|
7 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
8 |
+
import torch
|
|
|
|
|
9 |
|
10 |
# Global model cache
|
11 |
MODEL_CACHE = {
|
12 |
"model": None,
|
13 |
+
"tokenizer": None,
|
14 |
"init_lock": threading.Lock()
|
15 |
}
|
16 |
|
|
|
18 |
os.makedirs("user_data", exist_ok=True)
|
19 |
|
20 |
def initialize_model_once():
    """Load the Phi-4-mini model and tokenizer on first use and cache them.

    The check and the load both happen while holding
    ``MODEL_CACHE["init_lock"]``; later calls find ``MODEL_CACHE["model"]``
    already populated and skip the load.

    Returns:
        A ``(model, tokenizer)`` tuple taken from ``MODEL_CACHE``.
    """
    checkpoint = "microsoft/Phi-4-mini-instruct"
    cache = MODEL_CACHE

    with cache["init_lock"]:
        needs_loading = cache["model"] is None
        if needs_loading:
            # The tokenizer is stored first, then the model; the model entry
            # is the one that marks the cache as initialised.
            cache["tokenizer"] = AutoTokenizer.from_pretrained(checkpoint)
            load_options = {
                "torch_dtype": torch.float16,
                "device_map": "auto",
            }
            cache["model"] = AutoModelForCausalLM.from_pretrained(
                checkpoint, **load_options
            )

    return cache["model"], cache["tokenizer"]
|
33 |
+
|
34 |
+
def generate_pandas_code(prompt, max_new_tokens=512):
    """Generate Python code for *prompt* using the cached Phi-4-mini model.

    Args:
        prompt: Full text prompt sent to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The generated code as a stripped string. If the completion contains a
        complete ```` ```python ... ``` ```` block, its contents are returned.
        Otherwise the text up to the first closing fence is returned (the
        prompt may already have opened the fence), or the whole completion
        when no fence is present.
    """
    import re

    model, tokenizer = initialize_model_once()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )

    # Drop the prompt at token level. Slicing the decoded string by the
    # decoded prompt's character length is fragile: decode(encode(prompt)) is
    # not guaranteed to reproduce the prompt character-for-character.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )

    # Case 1: the completion carries its own complete fenced block.
    fenced = re.search(r'```python\s*(.*?)\s*```', generated_text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    before, fence, after = generated_text.partition("```")
    if not fence:
        # Case 2: no fence at all, return the raw completion.
        return generated_text.strip()

    if before.strip():
        # Case 3: the prompt already opened the fence, so the code runs up to
        # the model's closing fence; anything after it is commentary.
        return before.strip()

    # Case 4: the completion starts with an opening fence that is never
    # closed (or carries no "python" tag). Take what follows it, minus an
    # optional language tag on the fence line.
    inner = after.partition("```")[0]
    inner = re.sub(r'\A(?:python|py)?[^\S\n]*\n', '', inner)
    return inner.strip()
|
60 |
|
61 |
class ChatBot:
|
62 |
def __init__(self, session_id):
|
63 |
self.session_id = session_id
|
64 |
+
self.csv_info = None
|
65 |
+
self.df = None
|
66 |
self.chat_history = []
|
|
|
67 |
self.user_dir = f"user_data/{session_id}"
|
68 |
os.makedirs(self.user_dir, exist_ok=True)
|
69 |
|
|
|
74 |
try:
|
75 |
# Handle file from Gradio
|
76 |
file_path = file.name if hasattr(file, 'name') else str(file)
|
77 |
+
file_name = os.path.basename(file_path)
|
78 |
|
79 |
+
# Load and save CSV directly with pandas
|
80 |
try:
|
81 |
+
self.df = pd.read_csv(file_path)
|
82 |
user_file_path = f"{self.user_dir}/uploaded.csv"
|
83 |
+
self.df.to_csv(user_file_path, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
+
# Store CSV info
|
86 |
+
self.csv_info = {
|
87 |
+
"filename": file_name,
|
88 |
+
"rows": self.df.shape[0],
|
89 |
+
"columns": self.df.shape[1],
|
90 |
+
"column_names": self.df.columns.tolist(),
|
91 |
+
}
|
92 |
+
|
93 |
+
print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
except Exception as e:
|
95 |
+
return f"Error membaca CSV: {str(e)}"
|
96 |
|
97 |
# Add file info to chat history
|
98 |
+
file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
|
99 |
self.chat_history.append(("System", file_info))
|
100 |
|
101 |
+
return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
|
102 |
except Exception as e:
|
103 |
import traceback
|
104 |
print(traceback.format_exc())
|
105 |
return f"Error pemrosesan file: {str(e)}"
|
106 |
|
107 |
+
def execute_query(self, code):
    """Execute pandas code against the loaded DataFrame and return its result.

    The code runs via ``exec`` in a namespace that exposes ``df``
    (``self.df``), ``pd`` and ``np``.

    NOTE(security): ``exec`` is NOT sandboxed and no timeout is enforced
    here. The code comes from a language model prompted with user input, so
    it must be treated as untrusted; isolate it before exposing this
    publicly.

    Args:
        code: Python source to execute.

    Returns:
        The value of the ``result`` variable if the code defines one.
        Otherwise the value of the last variable the code created, or
        ``self.df`` when there is none (or when that value is ``None``).

    Raises:
        Exception: If the code raises; the original error is chained as the
            cause.
    """
    reserved_names = ("df", "pd", "np", "__builtins__")

    # A single dict is used as both globals and locals. With separate dicts,
    # lambdas and nested functions in the executed code resolve free names
    # through globals only and therefore cannot see `df`.
    namespace = {"df": self.df, "pd": pd, "np": np}

    try:
        exec(code, namespace)
    except Exception as e:
        raise Exception(f"Gagal menjalankan kode: {str(e)}") from e

    if "result" in namespace:
        return namespace["result"]

    # No `result` variable: fall back to the last variable the code created
    # (dicts preserve insertion order).
    last_var = None
    for var_name, var_value in namespace.items():
        if var_name not in reserved_names:
            last_var = var_value

    if last_var is not None:
        return last_var
    return self.df  # Return the dataframe as default
|
132 |
+
|
133 |
def chat(self, message, history):
    """Answer a natural-language question about the uploaded CSV.

    Simple metadata questions (file name, column names, row/column counts)
    are answered directly from ``self.csv_info``. Anything else is turned
    into a prompt, sent to ``generate_pandas_code``, executed through
    ``self.execute_query`` and formatted as text.

    Args:
        message: The user's question.
        history: Chat history supplied by the UI; not used by this method.

    Returns:
        The reply text. Failures are reported as a reply string rather than
        raised.
    """
    import ast

    def assign_last_expression(source):
        """Return *source* with its trailing bare expression bound to ``result``.

        The source is returned unchanged when it already assigns ``result``,
        when it does not end in a bare expression, or when it cannot be
        parsed (the executor then reports the real error).
        """
        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError):
            return source
        if not tree.body:
            return source

        assigns_result = any(
            isinstance(node, ast.Name)
            and node.id == "result"
            and isinstance(node.ctx, ast.Store)
            for node in ast.walk(tree)
        )
        last_stmt = tree.body[-1]
        if assigns_result or not isinstance(last_stmt, ast.Expr):
            return source

        try:
            source_lines = source.split("\n")
            row = last_stmt.lineno - 1
            # col_offset is a UTF-8 byte offset, so splice on bytes.
            raw = source_lines[row].encode("utf-8")
            col = last_stmt.col_offset
            source_lines[row] = (
                raw[:col] + b"result = " + raw[col:]
            ).decode("utf-8")
            candidate = "\n".join(source_lines)
            ast.parse(candidate)
        except (IndexError, SyntaxError, ValueError):
            # Keep the untouched code if the rewrite is not clean.
            return source
        return candidate

    if self.df is None:
        return "Mohon upload file CSV terlebih dahulu."

    try:
        # Column labels may be non-strings (e.g. integer headers).
        column_list = ', '.join(str(name) for name in self.csv_info['column_names'])

        # Handle common metadata questions directly to save resources
        message_lower = message.lower()
        if "nama file" in message_lower:
            return f"Nama file CSV adalah: {self.csv_info['filename']}"
        elif "nama kolom" in message_lower:
            return f"Kolom dalam CSV: {column_list}"
        elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
            return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
        elif "jumlah kolom" in message_lower or "berapa kolom" in message_lower:
            return f"Jumlah kolom dalam CSV: {self.csv_info['columns']}"

        # Get sample data for context
        sample_str = self.df.head(5).to_string()
        data_types = {col: str(dtype) for col, dtype in self.df.dtypes.items()}

        # Create prompt for LLM. It ends with an opening code fence so the
        # model continues directly with code.
        prompt = "\n".join([
            "",
            "You are a data analyst that translates natural language questions into Python pandas code.",
            "",
            "DataFrame information:",
            f"- Column names: {column_list}",
            f"- Data types: {data_types}",
            f"- Number of rows: {self.csv_info['rows']}",
            "- Sample data:",
            f"{sample_str}",
            "",
            f"User question: {message}",
            "",
            "Write a short Python code using pandas to answer the user's question.",
            "The code must use the 'df' variable as the DataFrame name.",
            "The code should assign the final result to a variable named 'result'.",
            "Only return the Python code without any explanation.",
            "",
            "```python",
            "",
        ])

        # Generate code with Phi-4
        try:
            code = generate_pandas_code(prompt)

            # Bind a trailing bare expression to `result` if the code does
            # not assign it. Prefixing the whole snippet with "result = "
            # would break multi-line code that starts with a statement.
            code = assign_last_expression(code)
        except Exception as e:
            print(f"Error generating code: {str(e)}")
            # Fallback for basic questions
            if "rata-rata" in message_lower or "mean" in message_lower:
                code = "result = df.describe()"
            elif "jumlah" in message_lower or "count" in message_lower:
                code = "result = df.count()"
            else:
                return f"Maaf, saya tidak dapat menghasilkan kode untuk pertanyaan ini. Error: {str(e)}"

        # Execute the code and get result
        try:
            print(f"Executing code: {code}")
            result = self.execute_query(code)

            # Check if result is relevant to the question
            if result is None or (isinstance(result, pd.DataFrame) and result.empty):
                return "Maaf, kita tidak bisa mendapatkan informasi terkait pertanyaan anda di dalam file CSV anda."

            # Format result based on its type; long outputs are truncated
            # and the total size is appended.
            if isinstance(result, pd.DataFrame):
                if len(result) > 5:
                    result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
                else:
                    result_str = result.to_string()
            elif isinstance(result, (pd.Series, np.ndarray)):
                if len(result) > 10:
                    result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
                else:
                    result_str = str(result)
            elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
                result_str = str(result)
                if len(result) > 0:
                    result_str += f"\n\n[Total {len(result)} item]"
            else:
                result_str = str(result)

            # Format response
            response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"

            self.chat_history.append((message, response))
            return response

        except Exception as e:
            return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"

    except Exception as e:
        import traceback
        print(traceback.format_exc())
        return f"Error: {str(e)}"
|
235 |
|
236 |
+
# UI Code (sama seperti sebelumnya)
|
237 |
def create_gradio_interface():
|
238 |
+
with gr.Blocks(title="CSV Data Analyzer") as interface:
|
239 |
session_id = gr.State(lambda: str(uuid.uuid4()))
|
240 |
chatbot_state = gr.State(lambda: None)
|
241 |
|
242 |
+
gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
|
243 |
+
gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
|
244 |
|
245 |
with gr.Row():
|
246 |
with gr.Column(scale=1):
|
|
|
250 |
)
|
251 |
process_button = gr.Button("Proses CSV")
|
252 |
|
253 |
+
with gr.Accordion("Contoh Pertanyaan", open=False):
|
254 |
gr.Markdown("""
|
255 |
+
- "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
|
256 |
+
- "Hitung nilai rata-rata setiap kolom numerik"
|
257 |
+
- "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
|
258 |
+
- "Berapa jumlah baris dalam dataset ini?"
|
259 |
+
- "Berapa jumlah kolom dalam dataset ini?"
|
|
|
260 |
""")
|
261 |
|
262 |
with gr.Column(scale=2):
|
|
|
265 |
height=400
|
266 |
)
|
267 |
message_input = gr.Textbox(
|
268 |
+
label="Ketik pertanyaan Anda",
|
269 |
+
placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
|
270 |
lines=2
|
271 |
)
|
272 |
submit_button = gr.Button("Kirim")
|