hmrizal committed on
Commit 88c17a0 · verified · 1 Parent(s): 5057942

Update app.py

Files changed (1)
  1. app.py +127 -71
app.py CHANGED
@@ -3,11 +3,14 @@ import os
 import uuid
 import threading
 import pandas as pd
+import numpy as np
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import CTransformers
-from langchain.chains import ConversationalRetrievalChain
+from langchain.agents import create_pandas_dataframe_agent
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
 
 # Global model cache
 MODEL_CACHE = {
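A note on this hunk: `create_pandas_dataframe_agent` is added but never referenced in the rest of the diff, and `CSVLoader`, `HuggingFaceEmbeddings` and `FAISS` stay imported even though the FAISS retrieval path is removed below. The paths are the legacy monolithic `langchain` ones; on a newer split LangChain install, the imports this commit actually uses would typically come from the community/experimental packages, as in the hedged sketch below (package layout assumed, not part of this commit).

```python
# Hedged sketch, assuming a post-split LangChain install (not part of this commit).
from langchain_community.llms import CTransformers
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
```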
@@ -22,15 +25,16 @@ def initialize_model_once():
     """Initialize model once using CTransformers API"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
-            # Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
+            # Load Phi-2 model (smaller than Mistral)
             MODEL_CACHE["model"] = CTransformers(
-                model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
-                model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
-                model_type="mistral",
+                model="TheBloke/phi-2-GGUF",
+                model_file="phi-2.Q4_K_M.gguf",
+                model_type="phi2",
                 max_new_tokens=512,
-                temperature=0.2,
+                temperature=0.1,
                 top_p=0.9,
-                repetition_penalty=1.2
+                repetition_penalty=1.1,
+                context_length=2048
             )
 
     return MODEL_CACHE["model"]
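The model swap above keeps the lock-guarded, process-wide cache, so every session shares one Phi-2 instance. A minimal smoke test of that behaviour outside Gradio is sketched below; the script name, the prompt text and the callable-LLM style are assumptions based on the legacy LangChain API this file uses, not part of the commit.

```python
# Illustrative smoke test of the shared model cache (not part of this commit).
from app import initialize_model_once

llm_a = initialize_model_once()
llm_b = initialize_model_once()
assert llm_a is llm_b  # init_lock ensures the GGUF weights load only once per process

# Older LangChain LLM wrappers accept a plain prompt string when called directly (assumed here).
print(llm_a("Instruct: Jelaskan apa itu DataFrame pandas dalam satu kalimat.\nOutput:"))
```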
@@ -38,8 +42,9 @@ def initialize_model_once():
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
+        self.csv_info = None
+        self.df = None
         self.chat_history = []
-        self.chain = None
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
 
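One small gap here: `self.query_chain` is never initialized in `__init__`. The guard in `chat()` below only works because `self.df is None` short-circuits; if `process_file` manages to set `self.df` but then fails while building the query chain, the next `chat()` call raises `AttributeError` instead of returning the upload prompt. A one-line addition, offered as a suggestion rather than part of this commit, closes that window.

```python
# Suggested extra line for ChatBot.__init__ (not part of this commit):
self.query_chain = None  # lets chat() safely test `self.query_chain is None`
```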
@@ -50,95 +55,148 @@ class ChatBot:
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
+            file_name = os.path.basename(file_path)
 
-            # Verify and save CSV
+            # Load and save CSV directly with pandas
             try:
-                df = pd.read_csv(file_path)
+                self.df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
-                df.to_csv(user_file_path, index=False)
-                print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
-            except Exception as e:
-                return f"Error membaca CSV: {str(e)}"
-
-            # Load document
-            try:
-                loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
-                data = loader.load()
-                print(f"Documents loaded: {len(data)}")
-            except Exception as e:
-                return f"Error loading documents: {str(e)}"
-
-            # Create vector database
-            try:
-                db_path = f"{self.user_dir}/db_faiss"
-                embeddings = HuggingFaceEmbeddings(
-                    model_name='sentence-transformers/all-MiniLM-L6-v2',
-                    model_kwargs={'device': 'cpu'} # Explicitly set to CPU
-                )
+                self.df.to_csv(user_file_path, index=False)
 
-                db = FAISS.from_documents(data, embeddings)
-                db.save_local(db_path)
-                print(f"Vector database created at {db_path}")
+                # Store CSV info
+                self.csv_info = {
+                    "filename": file_name,
+                    "rows": self.df.shape[0],
+                    "columns": self.df.shape[1],
+                    "column_names": self.df.columns.tolist(),
+                }
+
+                print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
             except Exception as e:
-                return f"Error creating vector database: {str(e)}"
+                return f"Error membaca CSV: {str(e)}"
 
-            # Create LLM and chain
+            # Create query translator
             try:
                 llm = initialize_model_once()
-                self.chain = ConversationalRetrievalChain.from_llm(
-                    llm=llm,
-                    retriever=db.as_retriever(search_kwargs={"k": 4}),
-                    return_source_documents=True
+
+                query_template = """
+                Kamu adalah asisten yang mengubah pertanyaan natural language menjadi kode Python dengan pandas.
+
+                Informasi tentang DataFrame:
+                - Nama kolom: {column_names}
+                - Jumlah baris: {num_rows}
+                - Sample data:
+                {sample_data}
+
+                Pertanyaan pengguna: {question}
+
+                Ubah pertanyaan tersebut menjadi kode pandas yang bisa dijalankan. Kode harus ringkas, efisien, dan menggunakan variabel 'df'.
+                Berikan HANYA kode python saja, tanpa backtick, tanpa penjelasan.
+
+                Kode:
+                """
+
+                self.query_chain = LLMChain(
+                    llm=llm,
+                    prompt=PromptTemplate(
+                        input_variables=["column_names", "num_rows", "sample_data", "question"],
+                        template=query_template
+                    )
                 )
-                print("Chain created successfully")
+
+                print("Query translator created successfully")
             except Exception as e:
-                return f"Error creating chain: {str(e)}"
+                return f"Error creating query translator: {str(e)}"
 
             # Add file info to chat history
-            file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
+            file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
 
-            return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
+            return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
         except Exception as e:
            import traceback
            print(traceback.format_exc())
            return f"Error pemrosesan file: {str(e)}"
 
     def chat(self, message, history):
-        if self.chain is None:
+        if self.df is None or self.query_chain is None:
             return "Mohon upload file CSV terlebih dahulu."
 
         try:
-            # Process with the chain
-            result = self.chain({"question": message, "chat_history": self.chat_history})
+            # Handle metadata questions directly
+            message_lower = message.lower()
+            if "nama file" in message_lower:
+                return f"Nama file CSV adalah: {self.csv_info['filename']}"
+            elif "nama kolom" in message_lower:
+                return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
+            elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
+                return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
 
-            # Update chat history
-            answer = result["answer"]
+            # Get sample data for context
+            sample_str = self.df.head(3).to_string()
 
-            # Optional: Add source info to answer
-            sources = result.get("source_documents", [])
-            if sources:
-                source_text = "\n\nSumber:\n"
-                for i, doc in enumerate(sources[:2], 1): # Limit to top 2 sources
-                    source_text += f"{i}. {doc.page_content[:100]}...\n"
-                answer += source_text
-
-            self.chat_history.append((message, answer))
+            # Translate question to pandas code
+            code_response = self.query_chain.run(
+                column_names=str(self.csv_info["column_names"]),
+                num_rows=self.csv_info["rows"],
+                sample_data=sample_str,
+                question=message
+            )
 
-            return answer
+            # Clean and execute the code
+            try:
+                code = code_response.strip()
+                # Add safety prefix to prevent malicious code
+                if not code.startswith("df"):
+                    code = "result = " + code
+                else:
+                    code = "result = " + code
+
+                # Create local context with the dataframe
+                locals_dict = {"df": self.df, "pd": pd, "np": np}
+
+                # Execute the code
+                print(f"Executing code: {code}")
+                exec(code, {"pd": pd, "np": np}, locals_dict)
+                result = locals_dict.get("result", "No result returned")
+
+                # Format the result
+                if isinstance(result, pd.DataFrame):
+                    if len(result) > 5:
+                        result_str = result.head(5).to_string() + f"\n\n[{len(result)} baris ditemukan]"
+                    else:
+                        result_str = result.to_string()
+                elif isinstance(result, (pd.Series, np.ndarray)):
+                    result_str = str(result)
+                else:
+                    result_str = str(result)
+
+                # Build the response
+                response = f"Hasil analisis untuk pertanyaan: '{message}'\n\n"
+                response += f"Kode yang digunakan:\n```python\n{code}\n```\n\n"
+                response += f"Output:\n{result_str}"
+
+                self.chat_history.append((message, response))
+                return response
+
+            except Exception as e:
+                error_msg = f"Error mengeksekusi kode: {str(e)}\nKode yang dihasilkan:\n```python\n{code}\n```"
+                print(error_msg)
+                return error_msg
+
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
 
-# UI Code dan handler functions sama seperti sebelumnya
+# UI Code
 def create_gradio_interface():
-    with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
+    with gr.Blocks(title="CSV Data Analyzer") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
 
-        gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
-        gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
+        gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
+        gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
 
         with gr.Row():
             with gr.Column(scale=1):
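Two things stand out in this hunk. First, both branches of the `code.startswith("df")` check assign the same `"result = "` prefix, so the "safety prefix" comment describes a guard that is currently a no-op. Second, `exec` runs model-generated code with the default builtins available (Python injects them when `__builtins__` is missing from the globals dict), which is a large attack surface for a public Space. A common hardening step is to pass a trimmed-down builtins allowlist to `exec`; the helper below sketches that idea (the name, the allowlist and the wiring are assumptions, not part of this commit), and even then it is not a real sandbox.

```python
import pandas as pd
import numpy as np

# Hypothetical hardening sketch (not part of this commit): run model-generated
# pandas code with only an allowlisted set of builtins, so names like open,
# __import__ or exec are not reachable from the generated snippet.
SAFE_BUILTINS = {"len": len, "min": min, "max": max, "sum": sum, "round": round, "abs": abs}

def run_generated_code(code: str, df: pd.DataFrame):
    safe_globals = {"__builtins__": SAFE_BUILTINS, "pd": pd, "np": np}
    local_vars = {"df": df}
    exec(code, safe_globals, local_vars)  # still only a speed bump, not true isolation
    return local_vars.get("result", "No result returned")
```

For genuinely untrusted generated code, running the snippet in a separate process with a timeout remains the more robust option than any in-process restriction.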
@@ -148,14 +206,12 @@ def create_gradio_interface():
                 )
                 process_button = gr.Button("Proses CSV")
 
-                with gr.Accordion("Informasi Model", open=False):
+                with gr.Accordion("Contoh Pertanyaan", open=False):
                     gr.Markdown("""
-                    **Model**: Mistral-7B-Instruct-v0.2-GGUF
-
-                    **Fitur**:
-                    - GGUF model yang dioptimalkan untuk CPU
-                    - Efisien untuk analisis data dan percakapan
-                    - Manajemen sesi per pengguna
+                    - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
+                    - "Bagaimana distribusi kolom Age?"
+                    - "Hitung nilai rata-rata dan standar deviasi untuk setiap kolom numerik"
+                    - "Buat tabel frekuensi untuk kolom Outcome"
                     """)
 
             with gr.Column(scale=2):
@@ -164,8 +220,8 @@ def create_gradio_interface():
                     height=400
                 )
                 message_input = gr.Textbox(
-                    label="Ketik pesan Anda",
-                    placeholder="Tanyakan tentang data CSV Anda...",
+                    label="Ketik pertanyaan Anda",
+                    placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")
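For completeness, the reworked flow can be exercised without the Gradio UI. A minimal sketch, assuming app.py exposes `ChatBot` exactly as in this commit; the CSV filename and the questions are illustrative only.

```python
# Illustrative end-to-end check of the new ChatBot flow (not part of this commit).
from app import ChatBot

bot = ChatBot(session_id="local-test")
print(bot.process_file("diabetes.csv"))              # reads the CSV and builds the query translator
print(bot.chat("Berapa jumlah baris?", history=[]))  # answered directly from csv_info, no LLM call
print(bot.chat("Berapa jumlah data yang memiliki nilai Glucose di atas 150?", history=[]))  # LLM -> pandas -> exec
```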