Spaces:

hmrizal
/

CSVBot-Llama2

Sleeping

App Files Files Community

hmrizal committed 5 days ago

Commit

81f0d23

verified ·

1 Parent(s): 2fa763f

microsoft/phi4-mini-instruct

Browse files

Files changed (1) hide show

app.py +185 -84

app.py CHANGED Viewed

@@ -3,15 +3,14 @@ import os
 import uuid
 import threading
 import pandas as pd
-from langchain.document_loaders.csv_loader import CSVLoader
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.llms import CTransformers
-from langchain.chains import ConversationalRetrievalChain
 # Global model cache
 MODEL_CACHE = {
     "model": None,
     "init_lock": threading.Lock()
 }
@@ -19,27 +18,52 @@ MODEL_CACHE = {
 os.makedirs("user_data", exist_ok=True)
 def initialize_model_once():
-    """Initialize model once using CTransformers API"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
-            # Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
-            MODEL_CACHE["model"] = CTransformers(
-                model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
-                model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
-                model_type="mistral",
-                max_new_tokens=512,
-                temperature=0.2,
-                top_p=0.9,
-                repetition_penalty=1.2
             )
-    return MODEL_CACHE["model"]
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
         self.chat_history = []
-        self.chain = None
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
@@ -50,95 +74,173 @@ class ChatBot:
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
-            # Verify and save CSV
             try:
-                df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
-                df.to_csv(user_file_path, index=False)
-                print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
-            except Exception as e:
-                return f"Error membaca CSV: {str(e)}"
-            # Load document
-            try:
-                loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
-                data = loader.load()
-                print(f"Documents loaded: {len(data)}")
-            except Exception as e:
-                return f"Error loading documents: {str(e)}"
-            # Create vector database
-            try:
-                db_path = f"{self.user_dir}/db_faiss"
-                embeddings = HuggingFaceEmbeddings(
-                    model_name='sentence-transformers/all-MiniLM-L6-v2',
-                    model_kwargs={'device': 'cpu'}  # Explicitly set to CPU
-                )
-                db = FAISS.from_documents(data, embeddings)
-                db.save_local(db_path)
-                print(f"Vector database created at {db_path}")
-            except Exception as e:
-                return f"Error creating vector database: {str(e)}"
-            # Create LLM and chain
-            try:
-                llm = initialize_model_once()
-                self.chain = ConversationalRetrievalChain.from_llm(
-                    llm=llm,
-                    retriever=db.as_retriever(search_kwargs={"k": 4}),
-                    return_source_documents=True
-                )
-                print("Chain created successfully")
             except Exception as e:
-                return f"Error creating chain: {str(e)}"
             # Add file info to chat history
-            file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
-            return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error pemrosesan file: {str(e)}"
     def chat(self, message, history):
-        if self.chain is None:
             return "Mohon upload file CSV terlebih dahulu."
         try:
-            # Process with the chain
-            result = self.chain({"question": message, "chat_history": self.chat_history})
-            # Update chat history
-            answer = result["answer"]
-            # Optional: Add source info to answer
-            sources = result.get("source_documents", [])
-            if sources:
-                source_text = "\n\nSumber:\n"
-                for i, doc in enumerate(sources[:2], 1):  # Limit to top 2 sources
-                    source_text += f"{i}. {doc.page_content[:100]}...\n"
-                answer += source_text
-            self.chat_history.append((message, answer))
-            return answer
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
-# UI Code dan handler functions sama seperti sebelumnya
 def create_gradio_interface():
-    with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
-        gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
-        gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
         with gr.Row():
             with gr.Column(scale=1):
@@ -148,14 +250,13 @@ def create_gradio_interface():
                 )
                 process_button = gr.Button("Proses CSV")
-                with gr.Accordion("Informasi Model", open=False):
                     gr.Markdown("""
-                    **Model**: Mistral-7B-Instruct-v0.2-GGUF
-                    **Fitur**:
-                    - GGUF model yang dioptimalkan untuk CPU
-                    - Efisien untuk analisis data dan percakapan
-                    - Manajemen sesi per pengguna
                     """)
             with gr.Column(scale=2):
@@ -164,8 +265,8 @@ def create_gradio_interface():
                     height=400
                 )
                 message_input = gr.Textbox(
-                    label="Ketik pesan Anda",
-                    placeholder="Tanyakan tentang data CSV Anda...",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")

 import uuid
 import threading
 import pandas as pd
+import numpy as np
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 # Global model cache
 MODEL_CACHE = {
     "model": None,
+    "tokenizer": None,
     "init_lock": threading.Lock()
 }
 os.makedirs("user_data", exist_ok=True)
 def initialize_model_once():
+    """Initialize Phi-4-mini model once"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
+            # Load Phi-4-mini model
+            MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                "microsoft/Phi-4-mini-instruct",
+                torch_dtype=torch.float16,
+                device_map="auto"
             )
+    return MODEL_CACHE["model"], MODEL_CACHE["tokenizer"]
def generate_pandas_code(prompt, max_new_tokens=512):
    """Generate Python/pandas code for ``prompt`` with the cached model.

    Args:
        prompt: Full prompt text sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated code as a string, with Markdown code fences and any
        text after the closing fence removed.
    """
    model, tokenizer = initialize_model_once()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )

    # Drop the prompt by token count instead of by decoded-string length:
    # decoding the prompt on its own is not guaranteed to yield an exact
    # character prefix of the decoded full sequence.
    # NOTE(review): assumes generate() returns prompt tokens followed by the
    # new tokens, as the original string-slicing approach also assumed.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    return _extract_code(generated_text)


def _extract_code(generated_text):
    """Return the code portion of raw model output, without Markdown fences."""
    import re

    text = generated_text.strip()

    # Case 1: the model emitted a complete ```python ... ``` block.
    fenced = re.search(r'```python\s*(.*?)\s*```', text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # Case 2: no complete block. A prompt may already end with the opening
    # fence, so the completion holds only the code and the closing fence; the
    # output may also open a fence that is never closed (token limit hit).
    # Strip a leading fence if there is one and keep what precedes the next.
    text = re.sub(r'^```(?:python)?\s*', '', text)
    return text.split("```", 1)[0].strip()
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
+        self.csv_info = None
+        self.df = None
         self.chat_history = []
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
+            file_name = os.path.basename(file_path)
+            # Load and save CSV directly with pandas
             try:
+                self.df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
+                self.df.to_csv(user_file_path, index=False)
+                # Store CSV info
+                self.csv_info = {
+                    "filename": file_name,
+                    "rows": self.df.shape[0],
+                    "columns": self.df.shape[1],
+                    "column_names": self.df.columns.tolist(),
+                }
+                print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
             except Exception as e:
+                return f"Error membaca CSV: {str(e)}"
             # Add file info to chat history
+            file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
+            return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error pemrosesan file: {str(e)}"
+    def execute_query(self, code):
+        """Safely execute pandas code"""
+        try:
+            # Create local context with the dataframe
+            local_vars = {"df": self.df, "pd": pd, "np": np}
+            # Execute code with timeout
+            exec(code, {"pd": pd, "np": np}, local_vars)
+            # Get result
+            if "result" in local_vars:
+                return local_vars["result"]
+            else:
+                # If no result variable, find the last variable created
+                last_var = None
+                for var_name, var_value in local_vars.items():
+                    if var_name not in ["df", "pd", "np"] and var_name != "__builtins__":
+                        last_var = var_value
+                if last_var is not None:
+                    return last_var
+                else:
+                    return self.df  # Return the dataframe as default
+        except Exception as e:
+            raise Exception(f"Gagal menjalankan kode: {str(e)}")
     def chat(self, message, history):
+        if self.df is None:
             return "Mohon upload file CSV terlebih dahulu."
         try:
+            # Handle common metadata questions directly to save resources
+            message_lower = message.lower()
+            if "nama file" in message_lower:
+                return f"Nama file CSV adalah: {self.csv_info['filename']}"
+            elif "nama kolom" in message_lower:
+                return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
+            elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
+                return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
+            elif "jumlah kolom" in message_lower or "berapa kolom" in message_lower:
+                return f"Jumlah kolom dalam CSV: {self.csv_info['columns']}"
+            # Get sample data for context
+            sample_df = self.df.head(5)
+            sample_str = sample_df.to_string()
+            data_types = {col: str(dtype) for col, dtype in self.df.dtypes.items()}
+            # Create prompt for LLM
+            prompt = f"""
+            You are a data analyst that translates natural language questions into Python pandas code.
+            DataFrame information:
+            - Column names: {', '.join(self.csv_info['column_names'])}
+            - Data types: {data_types}
+            - Number of rows: {self.csv_info['rows']}
+            - Sample data:
+            {sample_str}
+            User question: {message}
+            Write a short Python code using pandas to answer the user's question.
+            The code must use the 'df' variable as the DataFrame name.
+            The code should assign the final result to a variable named 'result'.
+            Only return the Python code without any explanation.
+            ```python
+            """
+            # Generate code with Phi-4
+            try:
+                code = generate_pandas_code(prompt)
+                # Add result variable if not present
+                if not any(line.strip().startswith("result =") for line in code.split("\n")):
+                    if code.startswith("df."):
+                        code = "result = " + code
+                    elif not "result" in code:
+                        code = "result = " + code
+            except Exception as e:
+                print(f"Error generating code: {str(e)}")
+                # Fallback for basic questions
+                if "rata-rata" in message_lower or "mean" in message_lower:
+                    code = "result = df.describe()"
+                elif "jumlah" in message_lower or "count" in message_lower:
+                    code = "result = df.count()"
+                else:
+                    return f"Maaf, saya tidak dapat menghasilkan kode untuk pertanyaan ini. Error: {str(e)}"
+            # Execute the code and get result
+            try:
+                print(f"Executing code: {code}")
+                result = self.execute_query(code)
+                # Check if result is relevant to the question
+                if result is None or (isinstance(result, pd.DataFrame) and result.empty):
+                    return "Maaf, kita tidak bisa mendapatkan informasi terkait pertanyaan anda di dalam file CSV anda."
+                # Format result based on its type
+                if isinstance(result, pd.DataFrame):
+                    if len(result) > 5:
+                        result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
+                    else:
+                        result_str = result.to_string()
+                elif isinstance(result, (pd.Series, np.ndarray)):
+                    if len(result) > 10:
+                        result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
+                    else:
+                        result_str = str(result)
+                elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
+                    result_str = str(result)
+                    if len(result) > 0:
+                        result_str += f"\n\n[Total {len(result)} item]"
+                else:
+                    result_str = str(result)
+                # Format response
+                response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
+                self.chat_history.append((message, response))
+                return response
+            except Exception as e:
+                return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
+# UI code (same as before)
 def create_gradio_interface():
+    with gr.Blocks(title="CSV Data Analyzer") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
+        gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
+        gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
         with gr.Row():
             with gr.Column(scale=1):
                 )
                 process_button = gr.Button("Proses CSV")
+                with gr.Accordion("Contoh Pertanyaan", open=False):
                     gr.Markdown("""
+                    - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
+                    - "Hitung nilai rata-rata setiap kolom numerik"
+                    - "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
+                    - "Berapa jumlah baris dalam dataset ini?"
+                    - "Berapa jumlah kolom dalam dataset ini?"
                     """)
             with gr.Column(scale=2):
                     height=400
                 )
                 message_input = gr.Textbox(
+                    label="Ketik pertanyaan Anda",
+                    placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")