import pandas as pd import numpy as np import re import streamlit as st import matplotlib.pyplot as plt from wordcloud import WordCloud import nltk import requests from collections import Counter import tensorflow as tf from transformers import TFBertForSequenceClassification, BertTokenizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split import unicodedata from sklearn.cluster import KMeans import datetime # Muat data kamus df_kamus_komen1 = pd.read_excel('data_komen_mundjidah_clean.xlsx') # Kamus 1 df_kamus_komen2 = pd.read_excel('data_komen_warsubi_clean-v1.xlsx') # Kamus 3 # Fungsi untuk memuat kamus normalisasi dari file lokal def load_normalization_dict(file_path): try: with open(file_path, 'r') as file: lines = file.readlines() normalization_dict = {} for line in lines: line = line.strip() if ':' in line: # Memastikan format key:value key, value = line.split(':', 1) # Pisahkan berdasarkan ':' key = key.strip('"') # Hapus tanda kutip pada key value = value.strip('",') # Hapus tanda kutip dan koma pada value normalization_dict[key.strip()] = value.strip() return normalization_dict except Exception as e: st.error(f"Gagal memuat kamus normalisasi: {e}") return {} # Muat kamus normalisasi dari file lokal normalization_file = "slang.txt" normalization_dict = load_normalization_dict(normalization_file) # Fungsi untuk melakukan normalisasi teks def normalize_text(text, normalization_dict): words = text.split() normalized_words = [normalization_dict.get(word, word) for word in words] return " ".join(normalized_words) # Fungsi untuk membersihkan komentar dari username def get_known_usernames(data): # Cek apakah kolom "Author" atau "Username" ada if "Author" in data.columns: return set(data["Author"].str.strip().str.lower()) elif "Username" in data.columns: return set(data["Username"].str.strip().str.lower()) elif "Nama Akun" in data.columns: return set(data["Nama Akun"].str.strip().str.lower()) else: # Jika tidak ada kolom, kembalikan set kosong return set() def remove_usernames(comment, usernames): for username in usernames: pattern = rf'\b{re.escape(username)}\b' comment = re.sub(pattern, '', comment, flags=re.IGNORECASE) return re.sub(r'\s+', ' ', comment.strip()) # Fungsi untuk membersihkan teks def clean_text(text): text = str(text) # Menghapus URL dan mention serta hashtag text = re.sub(r'http[s]?://\S+', '', text) # Hapus URL text = re.sub(r'@\w+|#\w+', '', text) # Hapus mention dan hashtag # Mengganti angka tertentu menjadi kata text = re.sub(r'\b(01|1)\b', 'satu', text) text = re.sub(r'\b(02|2)\b', 'dua', text) # Menghapus angka lainnya text = re.sub(r'\b\d+\b', '', text) # Mengonversi karakter-karakter matematis atau bold menjadi karakter normal text = unicodedata.normalize('NFKD', text) # Normalisasi karakter # Mengganti tanda baca (.,!?;:) dan emoji tertentu dengan spasi (' ') text = re.sub(r'[.,!?;:]', ' ', text) # Ganti tanda baca tertentu dengan spasi text = re.sub(r'[🔥✨❤️]', ' ', text) # Ganti emoji spesifik dengan spasi # Menghapus karakter yang tidak diinginkan kecuali huruf, angka, emoji ✌️ dan ☝️ text = re.sub(r'[^\w\s\u2700-\u27BF\u2B50\u00A9\u00AE✌️☝️]', '', text) # Menurunkan huruf menjadi huruf kecil dan menghapus spasi ekstra text = text.lower() text = re.sub(r'\s+', ' ', text).strip() # Menghapus spasi berlebihan return text def load_slang_dict(file_path): try: with open(file_path, 'r', encoding='utf-8') as file: lines = file.readlines() slang_dict = {} for line in lines: line = line.strip() if ':' in line: # Memastikan format key:value key, value = line.split(':', 1) # Pisahkan berdasarkan ':' key = key.strip('"').strip() # Hapus tanda kutip pada key dan spasi ekstra value = value.strip('",').strip() # Hapus tanda kutip dan koma pada value slang_dict[key] = value return slang_dict except Exception as e: st.error(f"Terjadi kesalahan saat membaca file slang.txt: {e}") return {} # Muat kamus normalisasi dari file lokal normalization_file = "slang.txt" normalization_dict = load_normalization_dict(normalization_file) def save_slang_dict(slang_dict, file_path): try: with open(file_path, 'w', encoding='utf-8') as file: for key, value in slang_dict.items(): # Tulis setiap pasangan key-value dalam format "key":"value" file.write(f'"{key}":"{value}",\n') st.success("Kamus normalisasi berhasil disimpan!") except Exception as e: st.error(f"Terjadi kesalahan saat menyimpan file slang.txt: {e}") def load_keywords(file_path): """Membaca keywords dari file txt dengan format kategori.""" keywords = {} with open(file_path, 'r', encoding='utf-8') as f: current_category = None for line in f: line = line.strip() if re.match(r'^\[.*\]$', line): # Mendeteksi kategori seperti [Co-Optimism] current_category = line.strip('[]') keywords[current_category] = [] elif current_category and line: keywords[current_category].append(line) return keywords def load_negative_keywords(file_path): """Membaca negative keywords dengan model identifier.""" negative_keywords = {} with open(file_path, 'r', encoding='utf-8') as f: current_model = None for line in f: line = line.strip() if re.match(r'^\[.*\]$', line): # Mendeteksi model identifier seperti [Model Mundjidah] current_model = line.strip('[]') negative_keywords[current_model] = [] elif current_model and line: negative_keywords[current_model].append(line) return negative_keywords def save_keywords(file_path, keywords): """Menyimpan keywords ke file txt.""" with open(file_path, 'w', encoding='utf-8') as f: for category, words in keywords.items(): f.write(f"[{category}]\n") for word in words: f.write(f"{word}\n") f.write("\n") # Tambahkan baris kosong antar kategori def save_negative_keywords(file_path, negative_keywords): """Menyimpan negative keywords ke file txt.""" with open(file_path, 'w', encoding='utf-8') as f: for model, words in negative_keywords.items(): f.write(f"[{model}]\n") for word in words: f.write(f"{word}\n") f.write("\n") # Fungsi untuk menyimpan data ke file Excel sesuai model def save_to_data_train(data, model_name): file_paths = { "Model Mundjidah": 'data_komen_mundjidah_clean.xlsx', "Model Warsubi V1": 'data_komen_warsubi_clean-v1.xlsx' } file_path = file_paths.get(model_name) if not file_path: st.error("Model tidak dikenali. Pastikan model sesuai.") return # Coba baca file lama atau buat data kosong try: existing_data = pd.read_excel(file_path) except FileNotFoundError: existing_data = pd.DataFrame(columns=data.columns) # Gabungkan data baru dan hapus duplikat updated_data = pd.concat([existing_data, data], ignore_index=True) updated_data = updated_data.drop_duplicates(subset=['Comment', 'Cleaned_Text']) # Simpan data updated_data.to_excel(file_path, index=False) return file_path # Definisi parameter PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2' EPOCHS = 5 BATCH_SIZE = 32 LEARNING_RATE = 1e-5 # Fungsi untuk melatih ulang model def retrain_model(kamus_data, model_path): # Siapkan data X = kamus_data['Cleaned_Text'] y = kamus_data['Brand Attitude'] # Konversi label Brand Attitude ke angka label_map = {'Co-Likes': 0, 'Co-Support': 1, 'Co-Optimism': 2, 'Co-Negative': 3} y = y.map(label_map) # Split data menjadi training dan testing X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Tokenisasi menggunakan BERT tokenizer tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL) X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=128, return_tensors='tf') X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors='tf') # Load model BERT bert_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=4) # Optimizer dan loss function optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') # Compile model bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) # Latih model bert_model.fit( X_train_tokens['input_ids'], y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_test_tokens['input_ids'], y_test) ) # Simpan model bert_model.save_pretrained(model_path) tf.config.set_visible_devices([], 'GPU') # Tambahkan opsi di sidebar menu = st.sidebar.selectbox("Pilih Menu", ["Upload Data", "Hasil Prediksi", "Perlu Validasi","Keyword BA","Normalisasi Kamus", "Overview Data","Retrain Model"]) if menu == "Upload Data": # Streamlit app st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude") # Pilihan model model_choice = st.selectbox("Pilih Model:", ["Model Mundjidah", "Model Warsubi V1"]) # Upload file uploaded_file = st.file_uploader("Upload file Excel atau CSV", type=["xlsx", "csv"]) if uploaded_file: try: # Baca file yang diunggah if uploaded_file.name.endswith('.xlsx'): data = pd.read_excel(uploaded_file) elif uploaded_file.name.endswith('.csv'): data = pd.read_csv(uploaded_file) st.session_state.data = data # Bersihkan data data.dropna(how='all', inplace=True) data['Comment'] = data['Comment'].fillna('') data = data[data['Comment'].str.strip() != ''] # Proses pembersihan teks termasuk normalisasi known_usernames = get_known_usernames(data) data["Cleaned_Text"] = data["Comment"].apply(lambda x: remove_usernames(x, known_usernames)) data["Cleaned_Text"] = data["Cleaned_Text"].apply(lambda x: normalize_text(clean_text(x), normalization_dict)) keywords = load_keywords("keywords.txt") negative_keywords = load_negative_keywords("negative_keywords.txt") st.session_state.keywords = keywords st.session_state.negative_keywords = negative_keywords # Konfigurasi model berdasarkan pilihan if model_choice == "Model Mundjidah": sentiment_model_path = "mundjidah-model.h5" ba_model_path = "ba-mundjidah-model.h5" selected_df = df_kamus_komen1 selected_negative_keywords = negative_keywords.get("Model Mundjidah", []) positive_keywords = ["semoga menang", "semoga", "baik", "bagus", "terbaik", "semangat", "mundjidah", "amin", "gas", "lanjutkan"] elif model_choice == "Model Warsubi V1": sentiment_model_path = "warsa-model.h5" ba_model_path = "ba-warsa-model.h5" selected_df = df_kamus_komen2 selected_negative_keywords = negative_keywords.get("Model Warsubi V1", []) positive_keywords = ["hebat", "luar biasa", "bagus", "terbaik", "memilih dengan tepat", "all in abah subi", "pilih warsubi", "dua", "✌️", "abah", "sae","sehat","semangat"] else: # Tambahan untuk model lain sentiment_model_path = "warsubi-v2-model.h5" ba_model_path = "ba-warsubi-v2-model.h5" positive_keywords = ["hebat"] negative_keywords = ["golput ae"] PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2' st.session_state['model_choice'] = model_choice # Load model sentimen try: sentiment_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=3) sentiment_model.load_weights(sentiment_model_path) tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL) except Exception as e: st.error(f"Gagal memuat model sentimen: {e}") st.stop() # Fungsi prediksi sentimen dengan tambahan pencocokan keyword def predict_with_sentiment_model(text): if any(keyword.lower() in text.lower() for keyword in positive_keywords): return 'positive' elif any(keyword.lower() in text.lower() for keyword in selected_negative_keywords): return 'negative' # Prediksi menggunakan model jika tidak ada keyword yang cocok inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128) outputs = sentiment_model(inputs) logits = outputs.logits predicted_label = tf.argmax(logits, axis=1).numpy()[0] return ['negative', 'positive', 'neutral'][predicted_label] data['Sentimen_Prediksi'] = data['Cleaned_Text'].apply(predict_with_sentiment_model) # Load model Brand Attitude try: ba_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=4) ba_model.load_weights(ba_model_path) except Exception as e: st.error(f"Gagal memuat model Brand Attitude: {e}") st.stop() def predict_ba_with_model(text, ba_model, tokenizer, threshold=0.7): for label, keywords_list in keywords.items(): if any(keyword.lower() in text.lower() for keyword in keywords_list): return label, 1.0 # Jika cocok keyword, prob = 1.0 # Jika tidak ada keyword yang cocok, gunakan model untuk prediksi inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128) outputs = ba_model(inputs) logits = outputs.logits # Hitung probabilitas menggunakan softmax probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0] max_prob = np.max(probabilities) # Probabilitas tertinggi predicted_label_index = np.argmax(probabilities) # Indeks dari label dengan probabilitas tertinggi predicted_label = ['Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative'][predicted_label_index] # Jika probabilitas tertinggi kurang dari threshold, set label sebagai 'Co-Likes' untuk review if max_prob < threshold: predicted_label = 'Co-Likes' return predicted_label, max_prob # Menggunakan fungsi untuk menambahkan prediksi Brand Attitude ke data # data['Brand_Attitude'] = data['Cleaned_Text'].apply(lambda x: predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7)) # Menambahkan hasil klasifikasi ke DataFrame data[['Brand_Attitude', 'Probabilitas']] = data['Cleaned_Text'].apply( lambda x: pd.Series(predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7)) ) # Tambahkan "Co-Negative" jika Sentimen_Prediksi adalah "negative" data['Brand_Attitude'] = data.apply( lambda row: "Co-Negative" if row['Sentimen_Prediksi'] == 'negative' else row['Brand_Attitude'], axis=1 ) # Jika Sentimen_Prediksi bukan "negative", tapi Brand_Attitude berisi "Co-Negative", ubah jadi "Co-Likes" data['Brand_Attitude'] = data.apply( lambda row: "Co-Likes" if row['Sentimen_Prediksi'] != 'negative' and row['Brand_Attitude'] == 'Co-Negative' else row['Brand_Attitude'], axis=1 ) st.session_state.classified_data = data # Button to navigate to "Hasil Prediksi" st.success("Data berhasil diprediksi! Lihat di menu Hasil Prediksi.") except Exception as e: st.error(f"Terjadi kesalahan: {e}") elif menu == "Hasil Prediksi": # Streamlit app if "classified_data" in st.session_state: data = st.session_state.classified_data st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude") # Tampilkan hasil st.write("Hasil Klasifikasi Sentimen dan Brand Attitude:") st.dataframe(data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']]) # Distribusi level komentar st.write("Distribusi Level Komentar:") level_counts = data['Brand_Attitude'].value_counts() total_co_likes = level_counts.get('Co-Likes', 0) total_co_support = level_counts.get('Co-Support', 0) total_co_optimism = level_counts.get('Co-Optimism', 0) total_co_negative = level_counts.get('Co-Negative', 0) # Tampilkan total jumlah sentimen st.write(f"**Total BA Co-Likes:** {total_co_likes}") st.write(f"**Total BA Co-Support:** {total_co_support}") st.write(f"**Total BA Co-Optimism:** {total_co_optimism}") st.write(f"**Total BA Co-Negative:** {total_co_negative}") # Tampilkan jumlah setiap kategori st.bar_chart(level_counts) def generate_wordcloud(text): wordcloud = WordCloud( width=800, height=400, background_color='white', max_words=200, colormap='viridis' ).generate(text) fig, ax = plt.subplots(figsize=(10, 5)) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') return fig st.write("WordCloud Berdasarkan Brand Attitude:") for ba in ['Co-Likes', 'Co-Support', 'Co-Optimism','Co-Negative']: text = " ".join(data[data['Brand_Attitude'] == ba]['Cleaned_Text'].tolist()) if text: st.write(f"WordCloud untuk Brand Attitude {ba.capitalize()}:") st.pyplot(generate_wordcloud(text)) # Fungsi untuk tokenisasi teks def tokenize_text(text): """Membersihkan dan memisahkan teks menjadi kata-kata.""" # Hilangkan tanda baca, konversi ke huruf kecil, dan split words = text.lower().replace('.', '').replace(',', '').split() return words # Fungsi untuk menghitung frekuensi kata def get_word_frequencies(data, column): """Menghitung frekuensi kata berdasarkan kolom teks tertentu.""" all_words = [] for text in data[column]: all_words.extend(tokenize_text(text)) if len(all_words) == 0: return None # Jika tidak ada kata yang ditemukan, kembalikan None return Counter(all_words) co_likes_data = data[data['Brand_Attitude'] == 'Co-Likes'] co_support_data = data[data['Brand_Attitude'] == 'Co-Support'] co_optimism_data = data[data['Brand_Attitude'] == 'Co-Optimism'] co_negative_data = data[data['Brand_Attitude'] == 'Co-Negative'] # Visualisasi chart untuk kata-kata di BA Co-Likes st.write("### Top Kata di BA Co-Likes") co_likes_word_counts = get_word_frequencies(co_likes_data, 'Cleaned_Text') if co_likes_word_counts is None: st.write("Tidak ada kata yang ditemukan di kategori Co-Likes.") else: co_likes_most_common = co_likes_word_counts.most_common(10) co_likes_words, co_likes_counts = zip(*co_likes_most_common) plt.figure(figsize=(10, 6)) plt.barh(co_likes_words, co_likes_counts, color='green') plt.xlabel('Frequency') plt.ylabel('Words') plt.title('Top Words in Co-Likes Category') plt.gca().invert_yaxis() st.pyplot(plt) # Visualisasi chart untuk kata-kata di BA Co-Support st.write("### Top Kata di BA Co-Support") co_support_word_counts = get_word_frequencies(co_support_data, 'Cleaned_Text') if co_support_word_counts is None: st.write("Tidak ada kata yang ditemukan di kategori Co-Support.") else: co_support_most_common = co_support_word_counts.most_common(10) co_support_words, co_support_counts = zip(*co_support_most_common) plt.figure(figsize=(10, 6)) plt.barh(co_support_words, co_support_counts, color='orange') plt.xlabel('Frequency') plt.ylabel('Words') plt.title('Top Words in Co-Support Category') plt.gca().invert_yaxis() st.pyplot(plt) # Visualisasi chart untuk kata-kata di BA Co-Optimism st.write("### Top Kata di BA Co-Optimism") co_optimism_word_counts = get_word_frequencies(co_optimism_data, 'Cleaned_Text') if co_optimism_word_counts is None: st.write("Tidak ada kata yang ditemukan di kategori Co-Optimism.") else: co_optimism_most_common = co_optimism_word_counts.most_common(10) co_optimism_words, co_optimism_counts = zip(*co_optimism_most_common) plt.figure(figsize=(10, 6)) plt.barh(co_optimism_words, co_optimism_counts, color='blue') plt.xlabel('Frequency') plt.ylabel('Words') plt.title('Top Words in Co-Optimism Category') plt.gca().invert_yaxis() st.pyplot(plt) # Visualisasi chart untuk kata-kata di BA Co-Negative st.write("### Top Kata di BA Co-Negative") co_negative_word_counts = get_word_frequencies(co_negative_data, 'Cleaned_Text') if co_negative_word_counts is None: st.write("Tidak ada kata yang ditemukan di kategori Co-Negative.") else: co_negative_most_common = co_negative_word_counts.most_common(10) co_negative_words, co_negative_counts = zip(*co_negative_most_common) plt.figure(figsize=(10, 6)) plt.barh(co_negative_words, co_negative_counts, color='red') plt.xlabel('Frequency') plt.ylabel('Words') plt.title('Top Words in Co-Negative Category') plt.gca().invert_yaxis() st.pyplot(plt) # Siapkan data untuk diperbarui new_data = data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']].copy() else: st.warning("Tidak ada hasil prediksi. Silakan upload data terlebih dahulu di menu 'Upload Data'.") # Menu Perlu Validasi elif menu == "Perlu Validasi": st.title("Komentar Perlu Validasi") # Periksa apakah data hasil klasifikasi tersedia if 'classified_data' not in st.session_state: st.error("Silakan klasifikasikan data terlebih dahulu di menu sebelumnya.") else: # Ambil data komentar yang probabilitasnya rendah data = st.session_state.classified_data if 'Status' not in data.columns: data['Status'] = False # Default nilai False review_data = data[(data['Brand_Attitude'] == 'Co-Likes') & (data['Probabilitas'] < 0.7)] if review_data.empty: st.write("Tidak ada komentar yang memerlukan validasi saat ini.") else: # Proses Clustering st.write("### Clustering Komentar") vectorizer = TfidfVectorizer(max_features=500, stop_words='english') X = vectorizer.fit_transform(review_data['Cleaned_Text']) # Slider untuk memilih jumlah cluster k = st.slider("Pilih jumlah cluster:", min_value=2, max_value=10, value=3) kmeans = KMeans(n_clusters=k, random_state=42) review_data['Cluster'] = kmeans.fit_predict(X) # Dropdown untuk memilih cluster cluster_ids = sorted(review_data['Cluster'].unique()) selected_cluster = st.selectbox("Pilih Cluster untuk Ditampilkan:", cluster_ids) # Tampilkan tabel komentar berdasarkan cluster yang dipilih st.write(f"### Komentar di Cluster {selected_cluster}") cluster_data = review_data[review_data['Cluster'] == selected_cluster] st.dataframe(cluster_data[['Cleaned_Text', 'Brand_Attitude', 'Probabilitas']]) # Form untuk validasi Brand Attitude st.write("### Validasi Brand Attitude") with st.form(key=f"form_cluster_{selected_cluster}"): update_all = st.checkbox("Ubah seluruh komentar dalam cluster ini") if update_all: # Ubah semua komentar dalam cluster new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:", ["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"], key=f"all_{selected_cluster}") else: # Ubah komentar tertentu dalam cluster cleaned_text_to_update = st.selectbox("Pilih komentar untuk diubah:", cluster_data['Cleaned_Text']) new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:", ["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"], key=f"one_{selected_cluster}") submit_button = st.form_submit_button("Update Brand Attitude") if submit_button: if update_all: # Update seluruh komentar dalam cluster review_data.loc[review_data['Cluster'] == selected_cluster, 'Brand_Attitude'] = new_brand_attitude review_data.loc[review_data['Cluster'] == selected_cluster, 'Status'] = True st.success(f"Brand Attitude untuk seluruh komentar di Cluster {selected_cluster} berhasil diperbarui menjadi: {new_brand_attitude}") else: # Update komentar tertentu review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Brand_Attitude'] = new_brand_attitude review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Status'] = True st.success(f"Brand Attitude berhasil diperbarui untuk komentar: {cleaned_text_to_update}") # Update data hasil prediksi awal di session_state st.session_state.classified_data.loc[review_data.index, :] = review_data # Menu Keyword BA elif menu == "Keyword BA": st.subheader("Keyword BA Menu") # Load keywords dari file keywords = load_keywords("keywords.txt") negative_keywords = load_negative_keywords("negative_keywords.txt") # Ambil model yang digunakan dari session state current_model = st.session_state.get("model_choice", "Model Mundjidah") # Update Co-Negative keywords berdasarkan model if current_model in negative_keywords: keywords['Co-Negative'] = negative_keywords[current_model] else: keywords['Co-Negative'] = [] # Pilih Brand Attitude dan tampilkan komentar st.write("### Pilih Brand Attitude untuk melihat komentarnya") ba_option = st.selectbox("Pilih Brand Attitude", list(keywords.keys()), index=0) # Tampilkan keyword untuk BA st.write(f"### Keyword untuk {ba_option}") st.write(", ".join(keywords[ba_option])) # Tampilkan komentar sesuai BA data = st.session_state.classified_data filtered_data = data[data['Brand_Attitude'] == ba_option] filtered_data = filtered_data.sort_values(by='Cleaned_Text', ascending=True) # Sort ascending if filtered_data.empty: st.write("Tidak ada komentar yang ditemukan untuk Brand Attitude ini.") else: st.write(filtered_data[['Cleaned_Text', 'Brand_Attitude']]) if 'Status' not in data.columns: data['Status'] = False # Default nilai False # CRUD Operations st.write("### Kelola Keyword") with st.form("manage_keywords_form"): # Pilih keyword untuk diupdate atau dihapus selected_keyword = st.selectbox("Pilih Keyword untuk Diubah atau Dihapus", keywords[ba_option]) new_keyword_value = st.text_input("Ubah Keyword (Kosongkan jika ingin menghapus)", value=selected_keyword) action = st.radio("Pilih Aksi", ["Update", "Delete"], index=0) manage_submit_button = st.form_submit_button("Lakukan Perubahan") if manage_submit_button: if action == "Update" and new_keyword_value.strip(): # Update keyword index = keywords[ba_option].index(selected_keyword) keywords[ba_option][index] = new_keyword_value.strip() save_keywords("keywords.txt", keywords) # Simpan perubahan st.success(f"Keyword '{selected_keyword}' berhasil diubah menjadi '{new_keyword_value.strip()}'.") elif action == "Delete": # Delete keyword keywords[ba_option].remove(selected_keyword) save_keywords("keywords.txt", keywords) # Simpan perubahan st.success(f"Keyword '{selected_keyword}' berhasil dihapus.") else: st.warning("Masukkan keyword baru untuk update atau pilih aksi delete.") # Tampilkan semua Brand Attitude dengan filter dan search st.write("### Tabel Semua Data dengan Filter dan Pencarian") # Periksa apakah classified_data tersedia if "classified_data" in st.session_state: data = st.session_state.classified_data # Input teks untuk filter search_text = st.text_input("Cari berdasarkan teks komentar atau Brand Attitude:") # Filter data berdasarkan input teks if search_text: filtered_data = data[ data['Cleaned_Text'].str.contains(search_text, case=False, na=False) | data['Brand_Attitude'].str.contains(search_text, case=False, na=False) ] else: filtered_data = data edited_data = st.data_editor( filtered_data[['Cleaned_Text', 'Brand_Attitude']].copy(), use_container_width=True, key="ba_editor" ) # Tombol untuk menyimpan perubahan if st.button("Simpan Perubahan"): # Update kolom Brand Attitude dan Status di data asli berdasarkan perubahan di tabel for index, row in edited_data.iterrows(): original_row = filtered_data.loc[index] if row['Brand_Attitude'] != original_row['Brand_Attitude']: data.loc[index, 'Brand_Attitude'] = row['Brand_Attitude'] data.loc[index, 'Status'] = True # Tandai sebagai diupdate # Simpan kembali ke session_state st.session_state.classified_data = data st.success("Perubahan berhasil disimpan!") else: st.warning("Tidak ada data yang tersedia. Silakan upload data terlebih dahulu.") # Tambahkan keyword baru st.write("### Tambahkan Keyword Baru") with st.form("add_keyword_form"): new_ba = st.selectbox("Pilih Brand Attitude untuk Keyword Baru", list(keywords.keys())) new_keyword = st.text_input("Masukkan Keyword Baru") add_submit_button = st.form_submit_button("Tambah Keyword") if add_submit_button and new_keyword.strip(): if new_ba == "Co-Negative": # Tambahkan keyword ke negative_keywords.txt negative_keywords[current_model].append(new_keyword.strip()) save_negative_keywords("negative_keywords.txt", negative_keywords) st.success(f"Keyword Co-Negative '{new_keyword.strip()}' berhasil ditambahkan untuk model '{current_model}'!") else: # Tambahkan keyword ke keywords.txt keywords[new_ba].append(new_keyword.strip()) save_keywords("keywords.txt", keywords) st.success(f"Keyword '{new_keyword.strip()}' berhasil ditambahkan ke {new_ba}!") # Simpan ke session_state st.session_state.classified_data = data st.session_state.keywords = keywords st.session_state.negative_keywords = negative_keywords elif menu == "Normalisasi Kamus": st.subheader("Normalisasi Kamus") # Mengambil data dari session_state jika tersedia if 'classified_data' not in st.session_state: st.error("Silakan unggah file dan lakukan klasifikasi di menu 'Klasifikasi Sentimen' terlebih dahulu.") else: # Mengambil data yang telah diproses dan diklasifikasikan data = st.session_state.classified_data # Pastikan kolom 'Status' ada di DataFrame if 'Status' not in data.columns: data['Status'] = False # Tambahkan kolom 'Status' jika belum ada # Tokenisasi dan hitung frekuensi kata def tokenize(text): return re.findall(r'\b\w+\b', text.lower()) # Tokenisasi kata-kata, huruf kecil semua # Fungsi untuk menormalkan kata-kata di dalam data def normalize_data(data, slang_dict): # Proses normalisasi kata def normalize_text(text): words = text.split() normalized_words = [] updated = False for word in words: if word in slang_dict: normalized_words.append(slang_dict[word]) updated = True else: normalized_words.append(word) # Tandai status sebagai TRUE jika terjadi perubahan if updated: data.loc[data['Cleaned_Text'] == text, 'Status'] = True return ' '.join(normalized_words) data['Cleaned_Text'] = data['Cleaned_Text'].apply(normalize_text) return data # Gabungkan semua komentar untuk tokenisasi all_comments = ' '.join(data['Cleaned_Text']) words = tokenize(all_comments) # Hitung frekuensi kata word_counts = Counter(words) # Filter kata yang frekuensinya lebih dari 10 filtered_word_counts = {word: count for word, count in word_counts.items()} # Urutkan berdasarkan frekuensi sorted_words = sorted(filtered_word_counts.items(), key=lambda x: x[1], reverse=True) # Tampilkan tabel kata dan frekuensinya st.write("Berikut adalah daftar kata-kata hasil tokenisasi:") word_df = pd.DataFrame(sorted_words, columns=["Kata", "Frekuensi"]) st.dataframe(word_df) # Membaca kamus normalisasi dari file slang_dict = load_slang_dict('slang.txt') if not slang_dict: st.write("Belum ada kamus normalisasi yang ditemukan.") else: # Menampilkan kamus normalisasi yang sudah ada st.write("### Kamus Normalisasi yang Sudah Ada") norm_dict_df = pd.DataFrame(list(slang_dict.items()), columns=["Kata Asli", "Kata Normalisasi"]) st.dataframe(norm_dict_df) # Tambahkan fitur untuk meng-update kata normalisasi st.write("### Tambahkan Normalisasi Kata") with st.form("add_normalization_form"): new_word = st.text_input("Masukkan kata yang belum normal", "") normalized_word = st.text_input("Masukkan kata normalisasi", "") submit_button = st.form_submit_button("Tambah Normalisasi") if submit_button: if new_word and normalized_word: # Menambahkan normalisasi kata baru ke kamus slang_dict[new_word] = normalized_word save_slang_dict(slang_dict, 'slang.txt') # Simpan pembaruan ke file st.success(f"Normalisasi kata '{new_word}' -> '{normalized_word}' berhasil ditambahkan!") else: st.warning("Harap masukkan kata yang belum normal dan kata normalisasi!") # Setelah menambahkan normalisasi, kita akan menormalkan data if slang_dict: data = normalize_data(data, slang_dict) # Menampilkan hasil normalisasi st.write("Hasil Normalisasi pada Data:") st.dataframe(data[['Comment', 'Cleaned_Text', 'Status']]) # Menyimpan data yang telah dinormalisasi ke session state st.session_state.classified_data = data # Menu Overview Data elif menu == "Overview Data": st.title("Overview Data") # Periksa apakah data sudah tersedia if 'classified_data' not in st.session_state: st.error("Silakan unggah dan klasifikasikan data di menu sebelumnya.") else: data = st.session_state.classified_data # Pastikan kolom 'Status' ada if 'Status' not in data.columns: data['Status'] = False # Tambahkan kolom 'Status' jika belum ada # Tampilkan data akhir st.write("### Data Akhir:") final_data = data[['Cleaned_Text', 'Brand_Attitude', 'Status']].copy() st.dataframe(final_data) # Summary Perolehan Brand Attitude st.write("### Summary Perolehan Brand Attitude:") ba_summary = data['Brand_Attitude'].value_counts().reset_index() ba_summary.columns = ['Brand_Attitude', 'Jumlah'] st.table(ba_summary) # Hitung jumlah data yang tervalidasi ulang (status == True) total_validated = data[data['Status'] == True].shape[0] st.write(f"### Total Data yang Tervalidasi Ulang: {total_validated}") # Tambahkan kolom hitungan Brand Attitude data['Co-Likes'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Likes' else 0) data['Co-Support'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Support' else 0) data['Co-Optimism'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Optimism' else 0) data['Co-Negative'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Negative' else 0) # Hitung sebaran Brand Attitude per Parent Link ba_per_parent_link_updated = data.groupby('Parent Link').agg({ 'Nama Akun': 'first', # Ambil hanya 1 Nama Akun pertama 'Co-Likes': 'sum', 'Co-Support': 'sum', 'Co-Optimism': 'sum', 'Co-Negative': 'sum' }).reset_index() # Reorganisasi kolom ba_per_parent_link_updated = ba_per_parent_link_updated[['Nama Akun', 'Parent Link', 'Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative']] st.write("### Hasil Perolehan Brand Attitude per Postingan:") st.dataframe(ba_per_parent_link_updated) # Tombol untuk update ke database postingan st.write("### Update Perolehan ke Database Postingan") if st.button("Update ke 'Data Jombang.xlsx'"): try: # Cek apakah file "Data Jombang.xlsx" sudah ada try: existing_data = pd.read_excel('Data Jombang.xlsx') except FileNotFoundError: existing_data = pd.DataFrame(columns=ba_per_parent_link_updated.columns) # Gabungkan data baru ke existing_data berdasarkan 'Parent Link' updated_data = pd.concat([existing_data, ba_per_parent_link_updated]).drop_duplicates(subset='Parent Link', keep='last') # Simpan hasil pembaruan ke file Excel updated_data.to_excel('Data Jombang.xlsx', index=False) st.success("Data berhasil diperbarui ke 'Data Jombang.xlsx'!") except Exception as e: st.error(f"Terjadi kesalahan saat memperbarui data: {e}") # Tombol Kirim Data ke Database st.write("### Kirim Data ke Database") if st.button("Kirim Data ke Database"): try: # Tambahkan kolom Created At data['Created At'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # Gabungkan dengan data lama jika ada try: db_data = pd.read_excel('database_komen.xlsx') db_data = pd.concat([db_data, data], ignore_index=True) db_data = db_data.drop_duplicates() # Hapus duplikat except FileNotFoundError: db_data = data # Simpan hasil ke file Excel db_data.to_excel('database_komen.xlsx', index=False) st.success("Data berhasil dikirim ke database!") except Exception as e: st.error(f"Terjadi kesalahan saat menyimpan ke database: {e}") # Tombol Kirim Data ke Retraining st.write("### Kirim Data ke Retraining") if 'model_choice' in st.session_state: model_name = st.session_state['model_choice'] st.write(f"Model yang digunakan: **{model_name}**") if st.button("Kirim Data ke Data Train"): try: # Siapkan data yang akan dikirim ke data train data_to_train = data.copy() data_to_train['Sentimen_Aktual'] = data_to_train['Sentimen_Prediksi'] data_to_train['Brand Attitude'] = data_to_train['Brand_Attitude'] data_to_train['Date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # Reorganisasi kolom data_to_train = data_to_train[['Comment', 'Sentimen_Aktual', 'Cleaned_Text', 'Kandidat', 'Parent Link', 'Date', 'Brand Attitude']] # Simpan data ke file train sesuai model file_path = save_to_data_train(data_to_train, model_name) st.success(f"Data berhasil dikirim ke retraining: **{file_path}**") except Exception as e: st.error(f"Terjadi kesalahan: {e}") else: st.error("Model belum dipilih. Silakan klasifikasikan data terlebih dahulu.") # Menu Retrain Model elif menu == "Retrain Model": st.title("Retrain Model") kamus_option = st.selectbox( "Pilih Kamus yang Ingin Diedit:", ["data_komen_mundjidah_clean.xlsx", "data_komen_warsubi_clean-v1.xlsx"] ) # Tentukan path model sesuai kamus model_paths = { "data_komen_mundjidah_clean.xlsx": "update_mundjidah-model", "data_komen_warsubi_clean-v1.xlsx": "update_warsubi-model" } model_path = model_paths[kamus_option] # Muat data kamus dari Excel try: kamus_data = pd.read_excel(kamus_option) st.write("### Tabel Kamus Saat Ini") edited_data = st.data_editor( kamus_data, use_container_width=True, height=500 ) # Simpan perubahan ke Excel if st.button("Simpan Perubahan"): edited_data.to_excel(kamus_option, index=False) st.success(f"Perubahan berhasil disimpan ke {kamus_option}!") # Tombol untuk retrain model if st.button("Retrain Model"): with st.spinner("Melatih ulang model..."): retrain_model(edited_data, model_path) st.success(f"Model berhasil dilatih ulang dan disimpan di path: {model_path}!") except Exception as e: st.error(f"Terjadi kesalahan saat memuat atau menyimpan kamus: {e}")