import pandas as pd
import numpy as np
import re
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
import requests
from collections import Counter
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import unicodedata
from sklearn.cluster import KMeans
import datetime
    
# Muat data kamus
df_kamus_komen1 = pd.read_excel('data_komen_mundjidah_clean.xlsx')  # Kamus 1
df_kamus_komen2 = pd.read_excel('data_komen_warsubi_clean-v1.xlsx')  # Kamus 3

# Fungsi untuk memuat kamus normalisasi dari file lokal
def load_normalization_dict(file_path):
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
            normalization_dict = {}
            for line in lines:
                line = line.strip()
                if ':' in line:  # Memastikan format key:value
                    key, value = line.split(':', 1)  # Pisahkan berdasarkan ':'
                    key = key.strip('"')  # Hapus tanda kutip pada key
                    value = value.strip('",')  # Hapus tanda kutip dan koma pada value
                    normalization_dict[key.strip()] = value.strip()
            return normalization_dict
    except Exception as e:
        st.error(f"Gagal memuat kamus normalisasi: {e}")
        return {}

# Muat kamus normalisasi dari file lokal
normalization_file = "slang.txt"
normalization_dict = load_normalization_dict(normalization_file)

# Fungsi untuk melakukan normalisasi teks
def normalize_text(text, normalization_dict):
    words = text.split()
    normalized_words = [normalization_dict.get(word, word) for word in words]
    return " ".join(normalized_words)
    
# Fungsi untuk membersihkan komentar dari username
def get_known_usernames(data):
    # Cek apakah kolom "Author" atau "Username" ada
    if "Author" in data.columns:
        return set(data["Author"].str.strip().str.lower())
    elif "Username" in data.columns:
        return set(data["Username"].str.strip().str.lower())
    elif "Nama Akun" in data.columns:
        return set(data["Nama Akun"].str.strip().str.lower())
    else:
        # Jika tidak ada kolom, kembalikan set kosong
        return set()

def remove_usernames(comment, usernames):
    for username in usernames:
        pattern = rf'\b{re.escape(username)}\b'
        comment = re.sub(pattern, '', comment, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', comment.strip())

# Fungsi untuk membersihkan teks
def clean_text(text):
    text = str(text)
    
    # Menghapus URL dan mention serta hashtag
    text = re.sub(r'http[s]?://\S+', '', text)  # Hapus URL
    text = re.sub(r'@\w+|#\w+', '', text)  # Hapus mention dan hashtag
    
    # Mengganti angka tertentu menjadi kata
    text = re.sub(r'\b(01|1)\b', 'satu', text)
    text = re.sub(r'\b(02|2)\b', 'dua', text)
    
    # Menghapus angka lainnya
    text = re.sub(r'\b\d+\b', '', text)
    
    # Mengonversi karakter-karakter matematis atau bold menjadi karakter normal
    text = unicodedata.normalize('NFKD', text)  # Normalisasi karakter
    
    # Mengganti tanda baca (.,!?;:) dan emoji tertentu dengan spasi (' ')
    text = re.sub(r'[.,!?;:]', ' ', text)  # Ganti tanda baca tertentu dengan spasi
    text = re.sub(r'[🔥✨❤️]', ' ', text)  # Ganti emoji spesifik dengan spasi
    
    # Menghapus karakter yang tidak diinginkan kecuali huruf, angka, emoji ✌️ dan ☝️
    text = re.sub(r'[^\w\s\u2700-\u27BF\u2B50\u00A9\u00AE✌️☝️]', '', text)
    
    # Menurunkan huruf menjadi huruf kecil dan menghapus spasi ekstra
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # Menghapus spasi berlebihan

    return text

def load_slang_dict(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            slang_dict = {}
            for line in lines:
                line = line.strip()
                if ':' in line:  # Memastikan format key:value
                    key, value = line.split(':', 1)  # Pisahkan berdasarkan ':'
                    key = key.strip('"').strip()  # Hapus tanda kutip pada key dan spasi ekstra
                    value = value.strip('",').strip()  # Hapus tanda kutip dan koma pada value
                    slang_dict[key] = value
            return slang_dict
    except Exception as e:
        st.error(f"Terjadi kesalahan saat membaca file slang.txt: {e}")
        return {}
    
# Muat kamus normalisasi dari file lokal
normalization_file = "slang.txt"
normalization_dict = load_normalization_dict(normalization_file)

def save_slang_dict(slang_dict, file_path):
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            for key, value in slang_dict.items():
                # Tulis setiap pasangan key-value dalam format "key":"value"
                file.write(f'"{key}":"{value}",\n')
        st.success("Kamus normalisasi berhasil disimpan!")
    except Exception as e:
        st.error(f"Terjadi kesalahan saat menyimpan file slang.txt: {e}")

def load_keywords(file_path):
    """Membaca keywords dari file txt dengan format kategori."""
    keywords = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        current_category = None
        for line in f:
            line = line.strip()
            if re.match(r'^\[.*\]$', line):  # Mendeteksi kategori seperti [Co-Optimism]
                current_category = line.strip('[]')
                keywords[current_category] = []
            elif current_category and line:
                keywords[current_category].append(line)
    return keywords

def load_negative_keywords(file_path):
    """Membaca negative keywords dengan model identifier."""
    negative_keywords = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        current_model = None
        for line in f:
            line = line.strip()
            if re.match(r'^\[.*\]$', line):  # Mendeteksi model identifier seperti [Model Mundjidah]
                current_model = line.strip('[]')
                negative_keywords[current_model] = []
            elif current_model and line:
                negative_keywords[current_model].append(line)
    return negative_keywords

def save_keywords(file_path, keywords):
    """Menyimpan keywords ke file txt."""
    with open(file_path, 'w', encoding='utf-8') as f:
        for category, words in keywords.items():
            f.write(f"[{category}]\n")
            for word in words:
                f.write(f"{word}\n")
            f.write("\n")  # Tambahkan baris kosong antar kategori

def save_negative_keywords(file_path, negative_keywords):
    """Menyimpan negative keywords ke file txt."""
    with open(file_path, 'w', encoding='utf-8') as f:
        for model, words in negative_keywords.items():
            f.write(f"[{model}]\n")
            for word in words:
                f.write(f"{word}\n")
            f.write("\n")
            
# Fungsi untuk menyimpan data ke file Excel sesuai model
def save_to_data_train(data, model_name):
    file_paths = {
        "Model Mundjidah": 'data_komen_mundjidah_clean.xlsx',
        "Model Warsubi V1": 'data_komen_warsubi_clean-v1.xlsx'
    }
    file_path = file_paths.get(model_name)
    if not file_path:
        st.error("Model tidak dikenali. Pastikan model sesuai.")
        return

    # Coba baca file lama atau buat data kosong
    try:
        existing_data = pd.read_excel(file_path)
    except FileNotFoundError:
        existing_data = pd.DataFrame(columns=data.columns)

    # Gabungkan data baru dan hapus duplikat
    updated_data = pd.concat([existing_data, data], ignore_index=True)
    updated_data = updated_data.drop_duplicates(subset=['Comment', 'Cleaned_Text'])

    # Simpan data
    updated_data.to_excel(file_path, index=False)
    return file_path

# Definisi parameter
PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2'
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 1e-5

# Fungsi untuk melatih ulang model
def retrain_model(kamus_data, model_path):
    # Siapkan data
    X = kamus_data['Cleaned_Text']
    y = kamus_data['Brand Attitude']

    # Konversi label Brand Attitude ke angka
    label_map = {'Co-Likes': 0, 'Co-Support': 1, 'Co-Optimism': 2, 'Co-Negative': 3}
    y = y.map(label_map)

    # Split data menjadi training dan testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenisasi menggunakan BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)
    X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=128, return_tensors='tf')
    X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors='tf')

    # Load model BERT
    bert_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=4)

    # Optimizer dan loss function
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    # Compile model
    bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # Latih model
    bert_model.fit(
        X_train_tokens['input_ids'], y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_test_tokens['input_ids'], y_test)
    )

    # Simpan model
    bert_model.save_pretrained(model_path)
    

tf.config.set_visible_devices([], 'GPU')
# Tambahkan opsi di sidebar
menu = st.sidebar.selectbox("Pilih Menu", ["Upload Data", "Hasil Prediksi", "Perlu Validasi","Keyword BA","Normalisasi Kamus", "Overview Data","Retrain Model"])

if menu == "Upload Data":
    # Streamlit app
    st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude")
    
    # Pilihan model
    model_choice = st.selectbox("Pilih Model:", ["Model Mundjidah", "Model Warsubi V1"])
    
    # Upload file
    uploaded_file = st.file_uploader("Upload file Excel atau CSV", type=["xlsx", "csv"])

    if uploaded_file:
        try:
            # Baca file yang diunggah
            if uploaded_file.name.endswith('.xlsx'):
                data = pd.read_excel(uploaded_file)
            elif uploaded_file.name.endswith('.csv'):
                data = pd.read_csv(uploaded_file)
                
            st.session_state.data = data
    
            # Bersihkan data
            data.dropna(how='all', inplace=True)
            data['Comment'] = data['Comment'].fillna('')
            data = data[data['Comment'].str.strip() != '']
            
            # Proses pembersihan teks termasuk normalisasi
            known_usernames = get_known_usernames(data)
            data["Cleaned_Text"] = data["Comment"].apply(lambda x: remove_usernames(x, known_usernames))
            data["Cleaned_Text"] = data["Cleaned_Text"].apply(lambda x: normalize_text(clean_text(x), normalization_dict))
            
            keywords = load_keywords("keywords.txt")
            negative_keywords = load_negative_keywords("negative_keywords.txt")
            st.session_state.keywords = keywords
            st.session_state.negative_keywords = negative_keywords
            
            # Konfigurasi model berdasarkan pilihan
            if model_choice == "Model Mundjidah":
                sentiment_model_path = "mundjidah-model.h5"
                ba_model_path = "ba-mundjidah-model.h5"
                selected_df = df_kamus_komen1
                selected_negative_keywords = negative_keywords.get("Model Mundjidah", [])
                positive_keywords = ["semoga menang", "semoga", "baik", "bagus", "terbaik", "semangat", "mundjidah", "amin", "gas", "lanjutkan"]
                
            elif model_choice == "Model Warsubi V1":
                sentiment_model_path = "warsa-model.h5"
                ba_model_path = "ba-warsa-model.h5"
                selected_df = df_kamus_komen2
                selected_negative_keywords = negative_keywords.get("Model Warsubi V1", [])
                positive_keywords = ["hebat", "luar biasa", "bagus", "terbaik", "memilih dengan tepat", "all in abah subi", "pilih warsubi", "dua", "✌️", "abah", "sae","sehat","semangat"]
                
            else:  # Tambahan untuk model lain
                sentiment_model_path = "warsubi-v2-model.h5"
                ba_model_path = "ba-warsubi-v2-model.h5"
                positive_keywords = ["hebat"]
                negative_keywords = ["golput ae"]

            PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2'
            st.session_state['model_choice'] = model_choice
            
            # Load model sentimen
            try:
                sentiment_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=3)
                sentiment_model.load_weights(sentiment_model_path)
                tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)
            except Exception as e:
                st.error(f"Gagal memuat model sentimen: {e}")
                st.stop()
    
            # Fungsi prediksi sentimen dengan tambahan pencocokan keyword
            def predict_with_sentiment_model(text):
                if any(keyword.lower() in text.lower() for keyword in positive_keywords):
                    return 'positive'
                elif any(keyword.lower() in text.lower() for keyword in selected_negative_keywords):
                    return 'negative'

                # Prediksi menggunakan model jika tidak ada keyword yang cocok
                inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
                outputs = sentiment_model(inputs)
                logits = outputs.logits
                predicted_label = tf.argmax(logits, axis=1).numpy()[0]
                return ['negative', 'positive', 'neutral'][predicted_label]

            data['Sentimen_Prediksi'] = data['Cleaned_Text'].apply(predict_with_sentiment_model)
    
            # Load model Brand Attitude
            try:
                ba_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=4)
                ba_model.load_weights(ba_model_path)
            except Exception as e:
                st.error(f"Gagal memuat model Brand Attitude: {e}")
                st.stop()

            def predict_ba_with_model(text, ba_model, tokenizer, threshold=0.7):
                for label, keywords_list in keywords.items():
                    if any(keyword.lower() in text.lower() for keyword in keywords_list):
                        return label, 1.0  # Jika cocok keyword, prob = 1.0
                
                # Jika tidak ada keyword yang cocok, gunakan model untuk prediksi
                inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
                outputs = ba_model(inputs)
                logits = outputs.logits
                
                # Hitung probabilitas menggunakan softmax
                probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]
                max_prob = np.max(probabilities)  # Probabilitas tertinggi
                predicted_label_index = np.argmax(probabilities)  # Indeks dari label dengan probabilitas tertinggi
                predicted_label = ['Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative'][predicted_label_index]
                
                # Jika probabilitas tertinggi kurang dari threshold, set label sebagai 'Co-Likes' untuk review
                if max_prob < threshold:
                    predicted_label = 'Co-Likes'

                return predicted_label, max_prob
    
            # Menggunakan fungsi untuk menambahkan prediksi Brand Attitude ke data
            # data['Brand_Attitude'] = data['Cleaned_Text'].apply(lambda x: predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7))
            
            # Menambahkan hasil klasifikasi ke DataFrame
            data[['Brand_Attitude', 'Probabilitas']] = data['Cleaned_Text'].apply(
                lambda x: pd.Series(predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7))
            )

            # Tambahkan "Co-Negative" jika Sentimen_Prediksi adalah "negative"
            data['Brand_Attitude'] = data.apply(
                lambda row: "Co-Negative" if row['Sentimen_Prediksi'] == 'negative' else row['Brand_Attitude'], axis=1
            )

            # Jika Sentimen_Prediksi bukan "negative", tapi Brand_Attitude berisi "Co-Negative", ubah jadi "Co-Likes"
            data['Brand_Attitude'] = data.apply(
                lambda row: "Co-Likes" if row['Sentimen_Prediksi'] != 'negative' and row['Brand_Attitude'] == 'Co-Negative' else row['Brand_Attitude'], axis=1
            )

            st.session_state.classified_data = data
                        
            # Button to navigate to "Hasil Prediksi"
            st.success("Data berhasil diprediksi! Lihat di menu Hasil Prediksi.")
                
        except Exception as e:
            st.error(f"Terjadi kesalahan: {e}")

elif menu == "Hasil Prediksi":
    # Streamlit app
    if "classified_data" in st.session_state:
        data = st.session_state.classified_data
        st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude")
        
        # Tampilkan hasil
        st.write("Hasil Klasifikasi Sentimen dan Brand Attitude:")
        st.dataframe(data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']])

        # Distribusi level komentar
        st.write("Distribusi Level Komentar:")
        level_counts = data['Brand_Attitude'].value_counts()
        total_co_likes = level_counts.get('Co-Likes', 0)
        total_co_support = level_counts.get('Co-Support', 0)
        total_co_optimism = level_counts.get('Co-Optimism', 0)
        total_co_negative = level_counts.get('Co-Negative', 0)
        
        # Tampilkan total jumlah sentimen
        st.write(f"**Total BA Co-Likes:** {total_co_likes}")
        st.write(f"**Total BA Co-Support:** {total_co_support}")
        st.write(f"**Total BA Co-Optimism:** {total_co_optimism}")
        st.write(f"**Total BA Co-Negative:** {total_co_negative}")
        
        # Tampilkan jumlah setiap kategori
        st.bar_chart(level_counts)

        def generate_wordcloud(text):
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                max_words=200,
                colormap='viridis'
            ).generate(text)
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            return fig
                
        st.write("WordCloud Berdasarkan Brand Attitude:")
        for ba in ['Co-Likes', 'Co-Support', 'Co-Optimism','Co-Negative']:
            text = " ".join(data[data['Brand_Attitude'] == ba]['Cleaned_Text'].tolist())
            if text:
                st.write(f"WordCloud untuk Brand Attitude {ba.capitalize()}:")
                st.pyplot(generate_wordcloud(text))
        
        # Fungsi untuk tokenisasi teks
        def tokenize_text(text):
            """Membersihkan dan memisahkan teks menjadi kata-kata."""
            # Hilangkan tanda baca, konversi ke huruf kecil, dan split
            words = text.lower().replace('.', '').replace(',', '').split()
            return words

        # Fungsi untuk menghitung frekuensi kata
        def get_word_frequencies(data, column):
            """Menghitung frekuensi kata berdasarkan kolom teks tertentu."""
            all_words = []
            for text in data[column]:
                all_words.extend(tokenize_text(text))
            
            if len(all_words) == 0:
                return None  # Jika tidak ada kata yang ditemukan, kembalikan None
            return Counter(all_words)
        
        co_likes_data = data[data['Brand_Attitude'] == 'Co-Likes']
        co_support_data = data[data['Brand_Attitude'] == 'Co-Support']
        co_optimism_data = data[data['Brand_Attitude'] == 'Co-Optimism']
        co_negative_data = data[data['Brand_Attitude'] == 'Co-Negative']

        # Visualisasi chart untuk kata-kata di BA Co-Likes
        st.write("### Top Kata di BA Co-Likes")
        co_likes_word_counts = get_word_frequencies(co_likes_data, 'Cleaned_Text')
        if co_likes_word_counts is None:
            st.write("Tidak ada kata yang ditemukan di kategori Co-Likes.")
        else:
            co_likes_most_common = co_likes_word_counts.most_common(10)
            co_likes_words, co_likes_counts = zip(*co_likes_most_common)
            plt.figure(figsize=(10, 6))
            plt.barh(co_likes_words, co_likes_counts, color='green')
            plt.xlabel('Frequency')
            plt.ylabel('Words')
            plt.title('Top Words in Co-Likes Category')
            plt.gca().invert_yaxis()
            st.pyplot(plt)

        # Visualisasi chart untuk kata-kata di BA Co-Support
        st.write("### Top Kata di BA Co-Support")
        co_support_word_counts = get_word_frequencies(co_support_data, 'Cleaned_Text')
        if co_support_word_counts is None:
            st.write("Tidak ada kata yang ditemukan di kategori Co-Support.")
        else:
            co_support_most_common = co_support_word_counts.most_common(10)
            co_support_words, co_support_counts = zip(*co_support_most_common)
            plt.figure(figsize=(10, 6))
            plt.barh(co_support_words, co_support_counts, color='orange')
            plt.xlabel('Frequency')
            plt.ylabel('Words')
            plt.title('Top Words in Co-Support Category')
            plt.gca().invert_yaxis()
            st.pyplot(plt)

        # Visualisasi chart untuk kata-kata di BA Co-Optimism
        st.write("### Top Kata di BA Co-Optimism")
        co_optimism_word_counts = get_word_frequencies(co_optimism_data, 'Cleaned_Text')
        if co_optimism_word_counts is None:
            st.write("Tidak ada kata yang ditemukan di kategori Co-Optimism.")
        else:
            co_optimism_most_common = co_optimism_word_counts.most_common(10)
            co_optimism_words, co_optimism_counts = zip(*co_optimism_most_common)
            plt.figure(figsize=(10, 6))
            plt.barh(co_optimism_words, co_optimism_counts, color='blue')
            plt.xlabel('Frequency')
            plt.ylabel('Words')
            plt.title('Top Words in Co-Optimism Category')
            plt.gca().invert_yaxis()
            st.pyplot(plt)

        # Visualisasi chart untuk kata-kata di BA Co-Negative
        st.write("### Top Kata di BA Co-Negative")
        co_negative_word_counts = get_word_frequencies(co_negative_data, 'Cleaned_Text')
        if co_negative_word_counts is None:
            st.write("Tidak ada kata yang ditemukan di kategori Co-Negative.")
        else:
            co_negative_most_common = co_negative_word_counts.most_common(10)
            co_negative_words, co_negative_counts = zip(*co_negative_most_common)
            plt.figure(figsize=(10, 6))
            plt.barh(co_negative_words, co_negative_counts, color='red')
            plt.xlabel('Frequency')
            plt.ylabel('Words')
            plt.title('Top Words in Co-Negative Category')
            plt.gca().invert_yaxis()
            st.pyplot(plt)
    
        # Siapkan data untuk diperbarui
        new_data = data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']].copy()

    else:
        st.warning("Tidak ada hasil prediksi. Silakan upload data terlebih dahulu di menu 'Upload Data'.")

# Menu Perlu Validasi
elif menu == "Perlu Validasi":
    st.title("Komentar Perlu Validasi")

    # Periksa apakah data hasil klasifikasi tersedia
    if 'classified_data' not in st.session_state:
        st.error("Silakan klasifikasikan data terlebih dahulu di menu sebelumnya.")
    else:
        # Ambil data komentar yang probabilitasnya rendah
        data = st.session_state.classified_data
        
        if 'Status' not in data.columns:
            data['Status'] = False  # Default nilai False
        
        review_data = data[(data['Brand_Attitude'] == 'Co-Likes') & (data['Probabilitas'] < 0.7)]
        
        if review_data.empty:
            st.write("Tidak ada komentar yang memerlukan validasi saat ini.")
        else:
            # Proses Clustering
            st.write("### Clustering Komentar")
            vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
            X = vectorizer.fit_transform(review_data['Cleaned_Text'])

            # Slider untuk memilih jumlah cluster
            k = st.slider("Pilih jumlah cluster:", min_value=2, max_value=10, value=3)
            kmeans = KMeans(n_clusters=k, random_state=42)
            review_data['Cluster'] = kmeans.fit_predict(X)

            # Dropdown untuk memilih cluster
            cluster_ids = sorted(review_data['Cluster'].unique())
            selected_cluster = st.selectbox("Pilih Cluster untuk Ditampilkan:", cluster_ids)

            # Tampilkan tabel komentar berdasarkan cluster yang dipilih
            st.write(f"### Komentar di Cluster {selected_cluster}")
            cluster_data = review_data[review_data['Cluster'] == selected_cluster]
            st.dataframe(cluster_data[['Cleaned_Text', 'Brand_Attitude', 'Probabilitas']])

            # Form untuk validasi Brand Attitude
            st.write("### Validasi Brand Attitude")
            with st.form(key=f"form_cluster_{selected_cluster}"):
                update_all = st.checkbox("Ubah seluruh komentar dalam cluster ini")
                if update_all:
                    # Ubah semua komentar dalam cluster
                    new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:", 
                                                        ["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"],
                                                        key=f"all_{selected_cluster}")
                else:
                    # Ubah komentar tertentu dalam cluster
                    cleaned_text_to_update = st.selectbox("Pilih komentar untuk diubah:", cluster_data['Cleaned_Text'])
                    new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:", 
                                                        ["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"],
                                                        key=f"one_{selected_cluster}")
                
                submit_button = st.form_submit_button("Update Brand Attitude")

                if submit_button:
                    if update_all:
                        # Update seluruh komentar dalam cluster
                        review_data.loc[review_data['Cluster'] == selected_cluster, 'Brand_Attitude'] = new_brand_attitude
                        review_data.loc[review_data['Cluster'] == selected_cluster, 'Status'] = True
                        st.success(f"Brand Attitude untuk seluruh komentar di Cluster {selected_cluster} berhasil diperbarui menjadi: {new_brand_attitude}")
                    else:
                        # Update komentar tertentu
                        review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Brand_Attitude'] = new_brand_attitude
                        review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Status'] = True
                        st.success(f"Brand Attitude berhasil diperbarui untuk komentar: {cleaned_text_to_update}")

            # Update data hasil prediksi awal di session_state
            st.session_state.classified_data.loc[review_data.index, :] = review_data
            
# Menu Keyword BA
elif menu == "Keyword BA":
    st.subheader("Keyword BA Menu")

    # Load keywords dari file
    keywords = load_keywords("keywords.txt")
    negative_keywords = load_negative_keywords("negative_keywords.txt")

    # Ambil model yang digunakan dari session state
    current_model = st.session_state.get("model_choice", "Model Mundjidah")

    # Update Co-Negative keywords berdasarkan model
    if current_model in negative_keywords:
        keywords['Co-Negative'] = negative_keywords[current_model]
    else:
        keywords['Co-Negative'] = []

    # Pilih Brand Attitude dan tampilkan komentar
    st.write("### Pilih Brand Attitude untuk melihat komentarnya")
    ba_option = st.selectbox("Pilih Brand Attitude", list(keywords.keys()), index=0)

    # Tampilkan keyword untuk BA
    st.write(f"### Keyword untuk {ba_option}")
    st.write(", ".join(keywords[ba_option]))
    
    # Tampilkan komentar sesuai BA
    data = st.session_state.classified_data
    filtered_data = data[data['Brand_Attitude'] == ba_option]
    filtered_data = filtered_data.sort_values(by='Cleaned_Text', ascending=True)  # Sort ascending
    if filtered_data.empty:
        st.write("Tidak ada komentar yang ditemukan untuk Brand Attitude ini.")
    else:
        st.write(filtered_data[['Cleaned_Text', 'Brand_Attitude']])
    
    if 'Status' not in data.columns:
            data['Status'] = False  # Default nilai False
            
    # CRUD Operations
    st.write("### Kelola Keyword")
    with st.form("manage_keywords_form"):
        # Pilih keyword untuk diupdate atau dihapus
        selected_keyword = st.selectbox("Pilih Keyword untuk Diubah atau Dihapus", keywords[ba_option])
        new_keyword_value = st.text_input("Ubah Keyword (Kosongkan jika ingin menghapus)", value=selected_keyword)
        action = st.radio("Pilih Aksi", ["Update", "Delete"], index=0)
        manage_submit_button = st.form_submit_button("Lakukan Perubahan")

        if manage_submit_button:
            if action == "Update" and new_keyword_value.strip():
                # Update keyword
                index = keywords[ba_option].index(selected_keyword)
                keywords[ba_option][index] = new_keyword_value.strip()
                save_keywords("keywords.txt", keywords)  # Simpan perubahan
                st.success(f"Keyword '{selected_keyword}' berhasil diubah menjadi '{new_keyword_value.strip()}'.")
            elif action == "Delete":
                # Delete keyword
                keywords[ba_option].remove(selected_keyword)
                save_keywords("keywords.txt", keywords)  # Simpan perubahan
                st.success(f"Keyword '{selected_keyword}' berhasil dihapus.")
            else:
                st.warning("Masukkan keyword baru untuk update atau pilih aksi delete.")
                
    # Tampilkan semua Brand Attitude dengan filter dan search
    st.write("### Tabel Semua Data dengan Filter dan Pencarian")

    # Periksa apakah classified_data tersedia
    if "classified_data" in st.session_state:
        data = st.session_state.classified_data

        # Input teks untuk filter
        search_text = st.text_input("Cari berdasarkan teks komentar atau Brand Attitude:")

        # Filter data berdasarkan input teks
        if search_text:
            filtered_data = data[
                data['Cleaned_Text'].str.contains(search_text, case=False, na=False) | 
                data['Brand_Attitude'].str.contains(search_text, case=False, na=False)
            ]
        else:
            filtered_data = data

        edited_data = st.data_editor(
            filtered_data[['Cleaned_Text', 'Brand_Attitude']].copy(),
            use_container_width=True,
            key="ba_editor"
        )

        # Tombol untuk menyimpan perubahan
        if st.button("Simpan Perubahan"):
            # Update kolom Brand Attitude dan Status di data asli berdasarkan perubahan di tabel
            for index, row in edited_data.iterrows():
                original_row = filtered_data.loc[index]
                if row['Brand_Attitude'] != original_row['Brand_Attitude']:
                    data.loc[index, 'Brand_Attitude'] = row['Brand_Attitude']
                    data.loc[index, 'Status'] = True  # Tandai sebagai diupdate

            # Simpan kembali ke session_state
            st.session_state.classified_data = data
            st.success("Perubahan berhasil disimpan!")
    else:
        st.warning("Tidak ada data yang tersedia. Silakan upload data terlebih dahulu.")

    # Tambahkan keyword baru
    st.write("### Tambahkan Keyword Baru")
    with st.form("add_keyword_form"):
        new_ba = st.selectbox("Pilih Brand Attitude untuk Keyword Baru", list(keywords.keys()))
        new_keyword = st.text_input("Masukkan Keyword Baru")
        add_submit_button = st.form_submit_button("Tambah Keyword")

        if add_submit_button and new_keyword.strip():
            if new_ba == "Co-Negative":
                # Tambahkan keyword ke negative_keywords.txt
                negative_keywords[current_model].append(new_keyword.strip())
                save_negative_keywords("negative_keywords.txt", negative_keywords)
                st.success(f"Keyword Co-Negative '{new_keyword.strip()}' berhasil ditambahkan untuk model '{current_model}'!")
            else:
                # Tambahkan keyword ke keywords.txt
                keywords[new_ba].append(new_keyword.strip())
                save_keywords("keywords.txt", keywords)
                st.success(f"Keyword '{new_keyword.strip()}' berhasil ditambahkan ke {new_ba}!")

    # Simpan ke session_state
    st.session_state.classified_data = data
    st.session_state.keywords = keywords
    st.session_state.negative_keywords = negative_keywords

    
elif menu == "Normalisasi Kamus":
    st.subheader("Normalisasi Kamus")

    # Mengambil data dari session_state jika tersedia
    if 'classified_data' not in st.session_state:
        st.error("Silakan unggah file dan lakukan klasifikasi di menu 'Klasifikasi Sentimen' terlebih dahulu.")
    else:
        # Mengambil data yang telah diproses dan diklasifikasikan
        data = st.session_state.classified_data

        # Pastikan kolom 'Status' ada di DataFrame
        if 'Status' not in data.columns:
            data['Status'] = False  # Tambahkan kolom 'Status' jika belum ada

        # Tokenisasi dan hitung frekuensi kata
        def tokenize(text):
            return re.findall(r'\b\w+\b', text.lower())  # Tokenisasi kata-kata, huruf kecil semua

        # Fungsi untuk menormalkan kata-kata di dalam data
        def normalize_data(data, slang_dict):
            # Proses normalisasi kata
            def normalize_text(text):
                words = text.split()
                normalized_words = []
                updated = False
                for word in words:
                    if word in slang_dict:
                        normalized_words.append(slang_dict[word])
                        updated = True
                    else:
                        normalized_words.append(word)
                # Tandai status sebagai TRUE jika terjadi perubahan
                if updated:
                    data.loc[data['Cleaned_Text'] == text, 'Status'] = True
                return ' '.join(normalized_words)

            data['Cleaned_Text'] = data['Cleaned_Text'].apply(normalize_text)
            return data

        # Gabungkan semua komentar untuk tokenisasi
        all_comments = ' '.join(data['Cleaned_Text'])
        words = tokenize(all_comments)

        # Hitung frekuensi kata
        word_counts = Counter(words)

        # Filter kata yang frekuensinya lebih dari 10
        filtered_word_counts = {word: count for word, count in word_counts.items()}

        # Urutkan berdasarkan frekuensi
        sorted_words = sorted(filtered_word_counts.items(), key=lambda x: x[1], reverse=True)

        # Tampilkan tabel kata dan frekuensinya
        st.write("Berikut adalah daftar kata-kata hasil tokenisasi:")
        word_df = pd.DataFrame(sorted_words, columns=["Kata", "Frekuensi"])
        st.dataframe(word_df)

        # Membaca kamus normalisasi dari file
        slang_dict = load_slang_dict('slang.txt')

        if not slang_dict:
            st.write("Belum ada kamus normalisasi yang ditemukan.")
        else:
            # Menampilkan kamus normalisasi yang sudah ada
            st.write("### Kamus Normalisasi yang Sudah Ada")
            norm_dict_df = pd.DataFrame(list(slang_dict.items()), columns=["Kata Asli", "Kata Normalisasi"])
            st.dataframe(norm_dict_df)

        # Tambahkan fitur untuk meng-update kata normalisasi
        st.write("### Tambahkan Normalisasi Kata")
        with st.form("add_normalization_form"):
            new_word = st.text_input("Masukkan kata yang belum normal", "")
            normalized_word = st.text_input("Masukkan kata normalisasi", "")
            submit_button = st.form_submit_button("Tambah Normalisasi")

            if submit_button:
                if new_word and normalized_word:
                    # Menambahkan normalisasi kata baru ke kamus
                    slang_dict[new_word] = normalized_word
                    save_slang_dict(slang_dict, 'slang.txt')  # Simpan pembaruan ke file
                    st.success(f"Normalisasi kata '{new_word}' -> '{normalized_word}' berhasil ditambahkan!")
                else:
                    st.warning("Harap masukkan kata yang belum normal dan kata normalisasi!")

        # Setelah menambahkan normalisasi, kita akan menormalkan data
        if slang_dict:
            data = normalize_data(data, slang_dict)

            # Menampilkan hasil normalisasi
            st.write("Hasil Normalisasi pada Data:")
            st.dataframe(data[['Comment', 'Cleaned_Text', 'Status']])

            # Menyimpan data yang telah dinormalisasi ke session state
            st.session_state.classified_data = data


# Menu Overview Data
elif menu == "Overview Data":
    st.title("Overview Data")

    # Periksa apakah data sudah tersedia
    if 'classified_data' not in st.session_state:
        st.error("Silakan unggah dan klasifikasikan data di menu sebelumnya.")
    else:
        data = st.session_state.classified_data
        
        # Pastikan kolom 'Status' ada
        if 'Status' not in data.columns:
            data['Status'] = False  # Tambahkan kolom 'Status' jika belum ada

        # Tampilkan data akhir
        st.write("### Data Akhir:")
        final_data = data[['Cleaned_Text', 'Brand_Attitude', 'Status']].copy()
        st.dataframe(final_data)

        # Summary Perolehan Brand Attitude
        st.write("### Summary Perolehan Brand Attitude:")
        ba_summary = data['Brand_Attitude'].value_counts().reset_index()
        ba_summary.columns = ['Brand_Attitude', 'Jumlah']
        st.table(ba_summary)

        # Hitung jumlah data yang tervalidasi ulang (status == True)
        total_validated = data[data['Status'] == True].shape[0]
        st.write(f"### Total Data yang Tervalidasi Ulang: {total_validated}")
        
        # Tambahkan kolom hitungan Brand Attitude
        data['Co-Likes'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Likes' else 0)
        data['Co-Support'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Support' else 0)
        data['Co-Optimism'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Optimism' else 0)
        data['Co-Negative'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Negative' else 0)

        # Hitung sebaran Brand Attitude per Parent Link
        ba_per_parent_link_updated = data.groupby('Parent Link').agg({
            'Nama Akun': 'first',  # Ambil hanya 1 Nama Akun pertama
            'Co-Likes': 'sum',
            'Co-Support': 'sum',
            'Co-Optimism': 'sum',
            'Co-Negative': 'sum'
        }).reset_index()

        # Reorganisasi kolom
        ba_per_parent_link_updated = ba_per_parent_link_updated[['Nama Akun', 'Parent Link', 'Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative']]
        st.write("### Hasil Perolehan Brand Attitude per Postingan:")
        st.dataframe(ba_per_parent_link_updated)
        
                # Tombol untuk update ke database postingan
        st.write("### Update Perolehan ke Database Postingan")
        if st.button("Update ke 'Data Jombang.xlsx'"):
            try:
                # Cek apakah file "Data Jombang.xlsx" sudah ada
                try:
                    existing_data = pd.read_excel('Data Jombang.xlsx')
                except FileNotFoundError:
                    existing_data = pd.DataFrame(columns=ba_per_parent_link_updated.columns)

                # Gabungkan data baru ke existing_data berdasarkan 'Parent Link'
                updated_data = pd.concat([existing_data, ba_per_parent_link_updated]).drop_duplicates(subset='Parent Link', keep='last')

                # Simpan hasil pembaruan ke file Excel
                updated_data.to_excel('Data Jombang.xlsx', index=False)
                st.success("Data berhasil diperbarui ke 'Data Jombang.xlsx'!")
            except Exception as e:
                st.error(f"Terjadi kesalahan saat memperbarui data: {e}")

        # Tombol Kirim Data ke Database
        st.write("### Kirim Data ke Database")
        if st.button("Kirim Data ke Database"):
            try:
                # Tambahkan kolom Created At
                data['Created At'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                # Gabungkan dengan data lama jika ada
                try:
                    db_data = pd.read_excel('database_komen.xlsx')
                    db_data = pd.concat([db_data, data], ignore_index=True)
                    db_data = db_data.drop_duplicates()  # Hapus duplikat
                except FileNotFoundError:
                    db_data = data

                # Simpan hasil ke file Excel
                db_data.to_excel('database_komen.xlsx', index=False)
                st.success("Data berhasil dikirim ke database!")
            except Exception as e:
                st.error(f"Terjadi kesalahan saat menyimpan ke database: {e}")

        # Tombol Kirim Data ke Retraining
        st.write("### Kirim Data ke Retraining")
        if 'model_choice' in st.session_state:
            model_name = st.session_state['model_choice']
            st.write(f"Model yang digunakan: **{model_name}**")

            if st.button("Kirim Data ke Data Train"):
                try:
                    # Siapkan data yang akan dikirim ke data train
                    data_to_train = data.copy()
                    data_to_train['Sentimen_Aktual'] = data_to_train['Sentimen_Prediksi']
                    data_to_train['Brand Attitude'] = data_to_train['Brand_Attitude']
                    data_to_train['Date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                    # Reorganisasi kolom
                    data_to_train = data_to_train[['Comment', 'Sentimen_Aktual', 'Cleaned_Text', 
                                                    'Kandidat', 'Parent Link', 'Date', 'Brand Attitude']]

                    # Simpan data ke file train sesuai model
                    file_path = save_to_data_train(data_to_train, model_name)
                    st.success(f"Data berhasil dikirim ke retraining: **{file_path}**")
                except Exception as e:
                    st.error(f"Terjadi kesalahan: {e}")
        else:
            st.error("Model belum dipilih. Silakan klasifikasikan data terlebih dahulu.")
            
# Menu Retrain Model
elif menu == "Retrain Model":
    st.title("Retrain Model")
    kamus_option = st.selectbox(
        "Pilih Kamus yang Ingin Diedit:",
        ["data_komen_mundjidah_clean.xlsx", "data_komen_warsubi_clean-v1.xlsx"]
    )

    # Tentukan path model sesuai kamus
    model_paths = {
        "data_komen_mundjidah_clean.xlsx": "update_mundjidah-model",
        "data_komen_warsubi_clean-v1.xlsx": "update_warsubi-model"
    }
    model_path = model_paths[kamus_option]

    # Muat data kamus dari Excel
    try:
        kamus_data = pd.read_excel(kamus_option)

        st.write("### Tabel Kamus Saat Ini")
        edited_data = st.data_editor(
            kamus_data,
            use_container_width=True,
            height=500
        )

        # Simpan perubahan ke Excel
        if st.button("Simpan Perubahan"):
            edited_data.to_excel(kamus_option, index=False)
            st.success(f"Perubahan berhasil disimpan ke {kamus_option}!")

        # Tombol untuk retrain model
        if st.button("Retrain Model"):
            with st.spinner("Melatih ulang model..."):
                retrain_model(edited_data, model_path)
            st.success(f"Model berhasil dilatih ulang dan disimpan di path: {model_path}!")

    except Exception as e:
        st.error(f"Terjadi kesalahan saat memuat atau menyimpan kamus: {e}")