Spaces:

hmrizal
/

CSVBot-Llama2

Sleeping

App Files Community

hmrizal committed 7 days ago

Commit

df0d042

verified ·

1 Parent(s): fdad3c6

kembali ke mistral 7b instruct gguf

Browse files

Files changed (1) hide show

app.py +76 -174

app.py CHANGED Viewed

@@ -3,10 +3,11 @@ import os
 import uuid
 import threading
 import pandas as pd
-import numpy as np
 from langchain.llms import CTransformers
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
 # Global model cache
 MODEL_CACHE = {
@@ -21,15 +22,15 @@ def initialize_model_once():
     """Initialize model once using CTransformers API"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
-            # Load TinyLlama model
             MODEL_CACHE["model"] = CTransformers(
-                model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-                model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-                model_type="llama",
                 max_new_tokens=512,
-                temperature=0.1,
-                repetition_penalty=1.1,
-                context_length=2048
             )
     return MODEL_CACHE["model"]
@@ -37,9 +38,8 @@ def initialize_model_once():
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
-        self.csv_info = None
-        self.df = None
         self.chat_history = []
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
@@ -50,195 +50,95 @@ class ChatBot:
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
-            file_name = os.path.basename(file_path)
-            # Load and save CSV directly with pandas
             try:
-                self.df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
-                self.df.to_csv(user_file_path, index=False)
-                # Store CSV info
-                self.csv_info = {
-                    "filename": file_name,
-                    "rows": self.df.shape[0],
-                    "columns": self.df.shape[1],
-                    "column_names": self.df.columns.tolist(),
-                }
-                print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")
             except Exception as e:
                 return f"Error membaca CSV: {str(e)}"
-            # Create query translator
             try:
-                llm = initialize_model_once()
-                query_template = """
-                Kamu adalah asisten data yang mengubah pertanyaan bahasa natural menjadi kode Python dengan Pandas.
-                Informasi tentang DataFrame:
-                - Nama kolom: {column_names}
-                - Jumlah baris: {num_rows}
-                - Sampel data:
-                {sample_data}
-                Pertanyaan pengguna: {question}
-                Buat kode Python menggunakan pandas untuk menjawab pertanyaan tersebut.
-                Berikan HANYA kode Python saja, tanpa penjelasan atau apapun.
-                Kode harus menggunakan variabel 'df' sebagai nama DataFrame.
-                Kode:
-                """
-                self.query_chain = LLMChain(
-                    llm=llm,
-                    prompt=PromptTemplate(
-                        input_variables=["column_names", "num_rows", "sample_data", "question"],
-                        template=query_template
-                    )
                 )
-                print("Query translator created successfully")
             except Exception as e:
-                return f"Error creating query translator: {str(e)}"
             # Add file info to chat history
-            file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
-            return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error pemrosesan file: {str(e)}"
-    def execute_query(self, code):
-        """Safely execute pandas code"""
-        try:
-            # Create local context with the dataframe
-            local_vars = {"df": self.df, "pd": pd, "np": np}
-            # Execute code with timeout
-            exec(code, {"pd": pd, "np": np}, local_vars)
-            # Get result
-            if "result" in local_vars:
-                return local_vars["result"]
-            else:
-                # If no result variable, find the last variable created
-                last_var = None
-                for var_name, var_value in local_vars.items():
-                    if var_name not in ["df", "pd", "np"] and var_name != "__builtins__":
-                        last_var = var_value
-                if last_var is not None:
-                    return last_var
-                else:
-                    return self.df  # Return the dataframe as default
-        except Exception as e:
-            raise Exception(f"Gagal menjalankan kode: {str(e)}")
     def chat(self, message, history):
-        if self.df is None:
             return "Mohon upload file CSV terlebih dahulu."
         try:
-            # Handle metadata questions directly
-            message_lower = message.lower()
-            if "nama file" in message_lower:
-                return f"Nama file CSV adalah: {self.csv_info['filename']}"
-            elif "nama kolom" in message_lower:
-                return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
-            elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
-                return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
-            # Handle pre-defined analysis questions
-            if "glucose di atas 150" in message_lower:
-                code = "result = len(df[df['Glucose'] > 150])"
-            else:
-                # Get sample data for context
-                sample_str = self.df.head(3).to_string()
-                # Translate question to pandas code using LLM
-                try:
-                    code_response = self.query_chain.run(
-                        column_names=str(self.csv_info["column_names"]),
-                        num_rows=self.csv_info["rows"],
-                        sample_data=sample_str,
-                        question=message
-                    )
-                    # Clean the code
-                    code = code_response.strip().replace("```python", "").replace("```", "").strip()
-                    # Add result variable if not present
-                    if not any(line.strip().startswith("result =") for line in code.split("\n")):
-                        if code.startswith("df."):
-                            code = "result = " + code
-                        else:
-                            code = "result = df." + code
-                except Exception as e:
-                    # Fallback for common queries if LLM fails
-                    if "rata-rata" in message_lower or "mean" in message_lower:
-                        code = "result = df.describe()"
-                    elif "jumlah" in message_lower or "count" in message_lower:
-                        code = "result = df.count()"
-                    elif "distribusi" in message_lower:
-                        col = next((c for c in self.csv_info["column_names"] if c.lower() in message_lower), None)
-                        if col:
-                            code = f"result = df['{col}'].value_counts()"
-                        else:
-                            code = "result = df.describe()"
-                    else:
-                        return f"Maaf, saya tidak dapat memproses pertanyaan ini. Error: {str(e)}"
-            # Execute the code and get result
-            try:
-                print(f"Executing code: {code}")
-                result = self.execute_query(code)
-                # Format result based on its type
-                if isinstance(result, pd.DataFrame):
-                    if len(result) > 5:
-                        result_str = result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
-                    else:
-                        result_str = result.to_string()
-                elif isinstance(result, (pd.Series, np.ndarray)):
-                    if len(result) > 10:
-                        result_str = str(result[:10]) + f"\n\n[Total {len(result)} item]"
-                    else:
-                        result_str = str(result)
-                elif hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
-                    result_str = str(result)
-                    if len(result) > 0:
-                        result_str += f"\n\n[Total {len(result)} item]"
-                else:
-                    result_str = str(result)
-                # Format response
-                response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"
-                self.chat_history.append((message, response))
-                return response
-            except Exception as e:
-                return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
-# UI Code (tidak berubah dari sebelumnya)
 def create_gradio_interface():
-    with gr.Blocks(title="CSV Data Analyzer") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
-        gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
-        gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")
         with gr.Row():
             with gr.Column(scale=1):
@@ -248,12 +148,14 @@ def create_gradio_interface():
                 )
                 process_button = gr.Button("Proses CSV")
-                with gr.Accordion("Contoh Pertanyaan", open=False):
                     gr.Markdown("""
-                    - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
-                    - "Hitung nilai rata-rata setiap kolom numerik"
-                    - "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
-                    - "Berapa jumlah baris dalam dataset ini?"
                     """)
             with gr.Column(scale=2):
@@ -262,8 +164,8 @@ def create_gradio_interface():
                     height=400
                 )
                 message_input = gr.Textbox(
-                    label="Ketik pertanyaan Anda",
-                    placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")

 import uuid
 import threading
 import pandas as pd
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
 from langchain.llms import CTransformers
+from langchain.chains import ConversationalRetrievalChain
 # Global model cache
 MODEL_CACHE = {
     """Initialize model once using CTransformers API"""
     with MODEL_CACHE["init_lock"]:
         if MODEL_CACHE["model"] is None:
+            # Load Mistral-7B-Instruct-v0.2.Q4_K_M.gguf model
             MODEL_CACHE["model"] = CTransformers(
+                model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+                model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
+                model_type="mistral",
                 max_new_tokens=512,
+                temperature=0.2,
+                top_p=0.9,
+                repetition_penalty=1.2
             )
     return MODEL_CACHE["model"]
 class ChatBot:
     def __init__(self, session_id):
         self.session_id = session_id
         self.chat_history = []
+        self.chain = None
         self.user_dir = f"user_data/{session_id}"
         os.makedirs(self.user_dir, exist_ok=True)
         try:
             # Handle file from Gradio
             file_path = file.name if hasattr(file, 'name') else str(file)
+            # Verify and save CSV
             try:
+                df = pd.read_csv(file_path)
                 user_file_path = f"{self.user_dir}/uploaded.csv"
+                df.to_csv(user_file_path, index=False)
+                print(f"CSV verified: {df.shape[0]} rows, {len(df.columns)} columns")
             except Exception as e:
                 return f"Error membaca CSV: {str(e)}"
+            # Load document
             try:
+                loader = CSVLoader(file_path=file_path, encoding="utf-8", csv_args={'delimiter': ','})
+                data = loader.load()
+                print(f"Documents loaded: {len(data)}")
+            except Exception as e:
+                return f"Error loading documents: {str(e)}"
+            # Create vector database
+            try:
+                db_path = f"{self.user_dir}/db_faiss"
+                embeddings = HuggingFaceEmbeddings(
+                    model_name='sentence-transformers/all-MiniLM-L6-v2',
+                    model_kwargs={'device': 'cpu'}  # Explicitly set to CPU
                 )
+                db = FAISS.from_documents(data, embeddings)
+                db.save_local(db_path)
+                print(f"Vector database created at {db_path}")
             except Exception as e:
+                return f"Error creating vector database: {str(e)}"
+            # Create LLM and chain
+            try:
+                llm = initialize_model_once()
+                self.chain = ConversationalRetrievalChain.from_llm(
+                    llm=llm,
+                    retriever=db.as_retriever(search_kwargs={"k": 4}),
+                    return_source_documents=True
+                )
+                print("Chain created successfully")
+            except Exception as e:
+                return f"Error creating chain: {str(e)}"
             # Add file info to chat history
+            file_info = f"CSV berhasil dimuat dengan {df.shape[0]} baris dan {len(df.columns)} kolom. Kolom: {', '.join(df.columns.tolist())}"
             self.chat_history.append(("System", file_info))
+            return "File CSV berhasil diproses! Anda dapat mulai chat dengan Mistral 7B."
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error pemrosesan file: {str(e)}"
     def chat(self, message, history):
+        if self.chain is None:
             return "Mohon upload file CSV terlebih dahulu."
         try:
+            # Process with the chain
+            result = self.chain({"question": message, "chat_history": self.chat_history})
+            # Update chat history
+            answer = result["answer"]
+            # Optional: Add source info to answer
+            sources = result.get("source_documents", [])
+            if sources:
+                source_text = "\n\nSumber:\n"
+                for i, doc in enumerate(sources[:2], 1):  # Limit to top 2 sources
+                    source_text += f"{i}. {doc.page_content[:100]}...\n"
+                answer += source_text
+            self.chat_history.append((message, answer))
+            return answer
         except Exception as e:
             import traceback
             print(traceback.format_exc())
             return f"Error: {str(e)}"
+# UI Code dan handler functions sama seperti sebelumnya
 def create_gradio_interface():
+    with gr.Blocks(title="Chat with CSV using Mistral 7B") as interface:
         session_id = gr.State(lambda: str(uuid.uuid4()))
         chatbot_state = gr.State(lambda: None)
+        gr.HTML("<h1 style='text-align: center;'>Chat with CSV using Mistral 7B</h1>")
+        gr.HTML("<h3 style='text-align: center;'>Asisten analisis CSV yang powerful</h3>")
         with gr.Row():
             with gr.Column(scale=1):
                 )
                 process_button = gr.Button("Proses CSV")
+                with gr.Accordion("Informasi Model", open=False):
                     gr.Markdown("""
+                    **Model**: Mistral-7B-Instruct-v0.2-GGUF
+                    **Fitur**:
+                    - GGUF model yang dioptimalkan untuk CPU
+                    - Efisien untuk analisis data dan percakapan
+                    - Manajemen sesi per pengguna
                     """)
             with gr.Column(scale=2):
                     height=400
                 )
                 message_input = gr.Textbox(
+                    label="Ketik pesan Anda",
+                    placeholder="Tanyakan tentang data CSV Anda...",
                     lines=2
                 )
                 submit_button = gr.Button("Kirim")