committed on
Browse files
@@ -10,26 +10,14 @@ from collections import Counter
10 |
import tensorflow as tf
11 |
from transformers import TFBertForSequenceClassification, BertTokenizer
12 |
from sklearn.feature_extraction.text import TfidfVectorizer
13 |
from sklearn.metrics.pairwise import cosine_similarity
14 |
from sklearn.model_selection import train_test_split
15 |
16 |
# Muat data kamus
17 |
df_kamus_komen1 = pd.read_excel('data_komen_mundjidah_clean.xlsx') # Kamus 1
18 |
19 |
20 |
# Daftar kata kunci negatif dan positif
21 |
negative_keywords_model1 = ["pilih nomor dua", "nomor dua", "buruk", "jelek", "✌️", "dua", "jalan rusak", "leren", "perubahan", "ganti bupati", "warsa", "abah", "janji manis", "omong tok", "nyocot", "bacot"]
22 |
negative_keywords_model2 = ["pilih nomor satu", "nomor satu", "buruk", "jelek","☝️"]
23 |
negative_keywords_model3 = ["buruk", "jelek", "☝️", "golput", "serang", "mundjidah", "janji manis", "omong tok", "nyocot", "bacot", "carmuk","cari muka"]
24 |
25 |
positive_keywords_model1 = ["semoga menang", "semoga", "baik", "bagus", "terbaik", "semangat", "mundjidah", "amin", "gas"]
26 |
positive_keywords_model2 = ["hebat", "luar biasa", "bagus", "terbaik", "memilih dengan tepat", "all in abah subi", "pilih warsubi"]
27 |
positive_keywords_model3 = ["hebat", "luar biasa", "bagus", "terbaik", "memilih dengan tepat", "all in abah subi", "pilih warsubi", "coblos", "dukung", "pilih", "semangat" , "allahuakbar","subhanallah","gus kautsar", "pemimpin", "gus", "pendherek",
28 |
"salam dua jari", "pemimpin baru", "alhamdulillah","salam","sowan", "waalaikumsalam", "tambah maju", "tambah sejahtera", "makin maju", "makin sejahtera", "makin apik","hadir", "sip", "jos", "mantap bah",
29 |
"warsa", "warsubi", "warsa bupatiku", "setuju", "dukung abah", "abah", "dua", "nomor dua", "amin", "gas", "ayo dukung", "warsubi tok", "semoga menang", "warsa ae", "warsa ae liane up", "tiang sae","bantu","beri","kasih",
30 |
"selamat","pasti menang", "assalamualaikum", "unggul", "telak", "perubahan", "semoga", "warga sejahtera", "semakin sejahtera", "tambah apik", "ganti bupati","ngayomi", "alhamdulillah","barokalloh", "pilih abah", "pilih warsa",
31 |
"aamiin", "bismilah", "pasti menang", "bismillah", "aamiin", "calon pemimpin", "dukung abah subi", "alhamdulillah", "masyaallah","mashaallah", "menang", "pemimpin", "warsah", "lanjutkan abah", "lanjutkan"
32 |
"semangat", "optimis", "semoga", "yakin", "amanah", "mantap", "mantab", "komitmen", "mengayomi","merangkul","bupati","calon bupati","bupati", "bukan pencitraan", "dermawan", "bantuan", "no dua", "no ✌️"]
33 |
34 |
# Fungsi untuk memuat kamus normalisasi dari file lokal
35 |
def load_normalization_dict(file_path):
@@ -77,34 +65,192 @@ def remove_usernames(comment, usernames):
77 |
pattern = rf'\b{re.escape(username)}\b'
78 |
comment = re.sub(pattern, '', comment, flags=re.IGNORECASE)
79 |
return re.sub(r'\s+', ' ', comment.strip())
80 |
81 |
# Fungsi untuk membersihkan teks
82 |
def clean_text(text):
83 |
text = str(text)
84 |
85 |
86 |
text = re.sub(r'\b(01|1)\b', 'satu', text)
87 |
text = re.sub(r'\b(02|2)\b', 'dua', text)
88 |
text = re.sub(r'\b\d+\b', '', text)
89 |
90 |
91 |
92 |
93 |
94 |
def update_kamus(file_path, new_data):
95 |
96 |
97 |
98 |
99 |
100 |
except Exception as e:
101 |
102 |
103 |
104 |
# Tambahkan opsi di sidebar
105 |
menu = st.sidebar.selectbox("Pilih Menu", ["
106 |
107 |
if menu == "
108 |
# Streamlit app
109 |
st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude")
110 |
@@ -121,6 +267,8 @@ if menu == "Klasifikasi Sentimen":
121 |
data = pd.read_excel(uploaded_file)
122 |
123 |
data = pd.read_csv(uploaded_file)
124 |
125 |
# Bersihkan data
126 |
data.dropna(how='all', inplace=True)
@@ -131,28 +279,36 @@ if menu == "Klasifikasi Sentimen":
131 |
known_usernames = get_known_usernames(data)
132 |
data["Cleaned_Text"] = data["Comment"].apply(lambda x: remove_usernames(x, known_usernames))
133 |
data["Cleaned_Text"] = data["Cleaned_Text"].apply(lambda x: normalize_text(clean_text(x), normalization_dict))
134 |
135 |
# Konfigurasi model berdasarkan pilihan
136 |
if model_choice == "Model Mundjidah":
137 |
sentiment_model_path = "mundjidah-model.h5"
138 |
ba_model_path = "ba-mundjidah-model.h5"
139 |
140 |
141 |
142 |
elif model_choice == "Model Warsubi V1":
143 |
sentiment_model_path = "warsa-model.h5"
144 |
ba_model_path = "ba-warsa-model.h5"
145 |
146 |
147 |
148 |
149 |
else: # Tambahan untuk model lain
150 |
sentiment_model_path = "warsubi-v2-model.h5"
151 |
ba_model_path = "ba-warsubi-v2-model.h5"
152 |
positive_keywords = ["hebat"
153 |
negative_keywords = ["golput ae"
154 |
155 |
PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2'
156 |
# Load model sentimen
157 |
158 |
sentiment_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=3)
@@ -164,10 +320,9 @@ if menu == "Klasifikasi Sentimen":
164 |
165 |
# Fungsi prediksi sentimen dengan tambahan pencocokan keyword
166 |
def predict_with_sentiment_model(text):
167 |
# Pencocokan keyword
168 |
if any(keyword.lower() in text.lower() for keyword in positive_keywords):
169 |
return 'positive'
170 |
elif any(keyword.lower() in text.lower() for keyword in
171 |
return 'negative'
172 |
173 |
# Prediksi menggunakan model jika tidak ada keyword yang cocok
@@ -186,49 +341,37 @@ if menu == "Klasifikasi Sentimen":
186 |
except Exception as e:
187 |
st.error(f"Gagal memuat model Brand Attitude: {e}")
188 |
189 |
190 |
# Daftar keyword untuk masing-masing kategori
191 |
keywords = {
192 |
"Co-Optimism": ["semoga sehat selalu", "semoga sukses", "lanjutkan", "semangat", "sehat", "setuju", "ayo", "selamat", "sukses",
193 |
"semoga", "berharap", "mugo", "lebih maju", "optimis jombang satu", "bangga", "saget", "doa", "tambah maju",
194 |
"lebih maju", "tambah makmur", "tambah sejahtera", "majukan", "harap", "berharap", "menginginkan", "ingin",
195 |
"mendoakan", "sae bah", "bismilah", "cocok", "umkm maju", "butuh perubahan", "butuh ganti bupati", "memakmurkan",
196 |
"makmur", "buka lapangan kerja", "lancar", "lancar terus", "mugi", "bantuan", "sembako", "lebih baik", "tambah apik",
197 |
"sae", "tambah sae", "jombang maju bersama warsa", "jombang maju", "sejahtera", "yakin", "makin",
198 |
"optimis", "salam","jombang sejahtera","tambah sejahtera", "butuh pemimpin","bismillah", "warsa menang",
199 |
"menanti pemimpin", "bakalan maju", "bakalan sejahtera", "bakalan sukses","yakin", "majukan", "majulah", "doakan"],
200 |
201 |
"Co-Support": ["siap dukung", "all in", "menyala", "siap", "dukung", "gas", "warsa", "menang", "coblos", "coblos dua",
202 |
"ayo", "pilih dua", "pilih", "wonge abah", "warsubi tok", "merangkul", "program", "konkrit", "wong apik",
203 |
"baik", "niat apik", "merakyat", "mengayomi", "komitmen", "merangkul", "mendengar", "dengar", "panggah abah",
204 |
"panggah warsa", "antusias", "komitmen", "kebersamaan", "dukung abah", "dengan abah", "program konkrit", "abah satu",
205 |
"jombang satu", "orang baik", "pilih abah", "pilih warsa", "wonge abah", "ngopeni ngayomi mumpuni", "melu",
206 |
"tambah adem", "tambah sejuk", "dukung usaha", "no dua", "dukung umkm", "dukung ekonomi", "pendherek", "penderek",
207 |
"pengikut", "bismilah abah", "abah dua", "hadir support", "nggih", "turun tangan", "membantu", "bertindak",
208 |
"melaju", "program", "membantu", "bupati", "joss", "top", "jombang maju", "wayae", "wayahe", "maju", "mantap",
209 |
"abah", "bah", "ganti bupati", "sodaqoh", "wayahe ganti", "ganti", "meledak", "menyala", "dibutuhkan", "kawal",
210 |
"membara", "seru", "keren", "mantap", "istimewa", "ayo", "layak", "al in", "makin raket", "kerja nyata",
211 |
"selalu dihati", "pangah abah", "pangah warsa", "kebersaman", "dermawan", "sat set", "wat wet", "panggah abah",
212 |
"panggah warsa", "pangah warsa", "pangah", "wonge abah", "positif menang", "pemimpin", "wong mu"]
213 |
214 |
215 |
216 |
217 |
def predict_ba_with_model(text):
218 |
# Mengecek apakah teks mengandung kata-kata kunci dari kategori Co-Support atau Co-Optimism
219 |
for label, keywords_list in keywords.items():
220 |
if any(keyword.lower() in text.lower() for keyword in keywords_list):
221 |
return label # Jika
222 |
223 |
# Jika tidak ada keyword yang cocok, gunakan model untuk prediksi
224 |
inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
225 |
outputs = ba_model(inputs)
226 |
logits = outputs.logits
227 |
228 |
229 |
230 |
231 |
232 |
# Tambahkan "Co-Negative" jika Sentimen_Prediksi adalah "negative"
233 |
data['Brand_Attitude'] = data.apply(
234 |
lambda row: "Co-Negative" if row['Sentimen_Prediksi'] == 'negative' else row['Brand_Attitude'], axis=1
@@ -238,273 +381,599 @@ if menu == "Klasifikasi Sentimen":
238 |
data['Brand_Attitude'] = data.apply(
239 |
lambda row: "Co-Likes" if row['Sentimen_Prediksi'] != 'negative' and row['Brand_Attitude'] == 'Co-Negative' else row['Brand_Attitude'], axis=1
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
st.write("### Kalimat Netral")
312 |
st.write(data[data['Sentimen_Prediksi'] == 'neutral']['Comment'].tolist())
313 |
314 |
# Fungsi untuk tokenisasi teks
315 |
def tokenize_text(text):
    """Lowercase ``text``, drop periods and commas, and split it into words.

    Splitting is on runs of whitespace, so an empty or blank string yields
    an empty list.
    """
    lowered = text.lower()
    # Only '.' and ',' are removed; any other punctuation stays attached
    # to its word.
    for mark in ('.', ','):
        lowered = lowered.replace(mark, '')
    return lowered.split()
320 |
321 |
# Fungsi untuk menghitung frekuensi kata
322 |
def get_word_frequencies(data, column):
323 |
"""Menghitung frekuensi kata berdasarkan kolom teks tertentu."""
324 |
all_words = []
325 |
for text in data[column]:
326 |
327 |
return Counter(all_words)
328 |
329 |
# Filter data berdasarkan kategori
330 |
neutral_data = data[data['Sentimen_Prediksi'] == 'neutral']
331 |
co_likes_data = data[data['Brand_Attitude'] == 'Co-Likes']
332 |
333 |
# Hitung frekuensi kata untuk masing-masing kategori
334 |
neutral_word_counts = get_word_frequencies(neutral_data, 'Cleaned_Text')
335 |
co_likes_word_counts = get_word_frequencies(co_likes_data, 'Cleaned_Text')
336 |
337 |
# Visualisasi chart untuk kata-kata di sentimen neutral
338 |
st.write("### Top Kata di Sentimen Neutral")
339 |
neutral_most_common = neutral_word_counts.most_common(10)
340 |
neutral_words, neutral_counts = zip(*neutral_most_common)
341 |
342 |
plt.figure(figsize=(10, 6))
343 |
344 |
345 |
346 |
plt.title('Top Words in
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
plt.figure(figsize=(10, 6))
356 |
357 |
358 |
359 |
plt.title('Top Words in Co-
360 |
361 |
362 |
363 |
# Siapkan data untuk diperbarui
364 |
new_data = data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi']].copy()
365 |
new_data.rename(columns={'Sentimen_Prediksi': 'Sentimen_Aktual'}, inplace=True)
366 |
367 |
# Fungsi untuk mencari komentar yang mirip
368 |
def find_similar_comments(data, query_text, top_n=5):
    """Return the ``top_n`` rows of ``data`` most similar to ``query_text``.

    Similarity is the cosine similarity between the TF-IDF vector of the
    query and the TF-IDF vectors of the ``Cleaned_Text`` column. The
    vectorizer is fitted on ``data`` itself on every call.

    Args:
        data: DataFrame with a ``Cleaned_Text`` column.
        query_text: Text to compare every row against.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame holding the best-matching rows, sorted by descending
        similarity, with an added ``similarity`` column. ``data`` itself is
        not modified.
    """
    # NOTE(review): stop_words='english' is kept from the original; the
    # comments appear to be Indonesian, so it probably filters very little.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data['Cleaned_Text'])
    query_tfidf = vectorizer.transform([query_text])

    # cosine_similarity returns a (1, n_rows) matrix; row 0 is the query.
    similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)[0]

    # Score a copy instead of writing a 'similarity' column into the caller's
    # DataFrame, which previously leaked into every later use of `data`.
    scored = data.assign(similarity=similarity_scores)
    return scored.sort_values(by='similarity', ascending=False).head(top_n)
386 |
387 |
# Menampilkan data komentar yang mirip
388 |
st.write("Komentar yang Mirip dengan Sentimen yang Akan Diperbarui")
389 |
similar_comments = find_similar_comments(data, "Komentar yang ingin diubah sentimennya", top_n=5)
390 |
st.dataframe(similar_comments[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'similarity']])
391 |
392 |
# Menampilkan kolom input untuk mengubah sentimen dan brand attitude
393 |
new_sentiment = st.selectbox("Pilih Sentimen Baru", ['positive', 'negative', 'neutral'])
394 |
new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru", ['Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative'])
395 |
396 |
# Tombol untuk memperbarui sentimen dan brand attitude
397 |
if st.button("Perbarui Sentimen dan Brand Attitude"):
398 |
updated_comments = similar_comments.copy()
399 |
updated_comments['Sentimen_Aktual'] = new_sentiment
400 |
updated_comments['Brand_Attitude'] = new_brand_attitude
401 |
402 |
# Update data di database atau dataframe
403 |
# Misalnya, jika data disimpan dalam DataFrame `data`
404 |
for index, row in updated_comments.iterrows():
405 |
data.loc[data['Cleaned_Text'] == row['Cleaned_Text'], 'Sentimen_Aktual'] = row['Sentimen_Aktual']
406 |
data.loc[data['Cleaned_Text'] == row['Cleaned_Text'], 'Brand_Attitude'] = row['Brand_Attitude']
407 |
408 |
st.success("Sentimen dan Brand Attitude berhasil diperbarui!")
409 |
410 |
# # Menyimpan setiap baris ke dalam database
411 |
# for index, row in new_data.iterrows():
412 |
# comment = row['Comment']
413 |
# cleaned_text = row['Cleaned_Text']
414 |
# sentimen_aktual = row['Sentimen_Aktual']
415 |
416 |
# # Tambahkan tombol untuk memperbarui kamus
417 |
# if st.button("Perbarui Kamus"):
418 |
# new_data = data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi']].copy()
419 |
# new_data.rename(columns={'Sentimen_Prediksi': 'Sentimen_Aktual'}, inplace=True)
420 |
# update_kamus(selected_file, new_data)
421 |
422 |
except Exception as e:
423 |
st.error(f"Terjadi kesalahan: {e}")
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
# Siapkan data
435 |
X = kamus_data['Cleaned_Text']
436 |
y = kamus_data['Sentimen_Aktual']
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors='tf')
448 |
449 |
450 |
451 |
model_path = 'update_mundjidah-model.h5'
452 |
elif kamus_data == "data_komen_warsubi_clean-v1.xlsx":
453 |
model_path = 'update_warsubi-model.h5'
454 |
455 |
# Load model BERT untuk Sequence Classification
456 |
bert_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=3)
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
kamus_option = st.selectbox(
482 |
"Pilih Kamus yang Ingin Diedit:",
483 |
["data_komen_mundjidah_clean.xlsx", "data_komen_warsubi_clean-v1.xlsx"]
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
st.write("Kamus Saat Ini:")
493 |
# Tampilkan tabel yang dapat diedit
494 |
edited_data = st.data_editor(
495 |
496 |
497 |
498 |
499 |
500 |
# Tombol untuk menyimpan perubahan
501 |
if st.button("Simpan Perubahan"):
502 |
edited_data.to_excel(kamus_option, index=False)
503 |
st.success("Perubahan berhasil disimpan ke file Excel!")
504 |
505 |
506 |
507 |
508 |
509 |
510 |
10 |
import tensorflow as tf
11 |
from transformers import TFBertForSequenceClassification, BertTokenizer
12 |
from sklearn.feature_extraction.text import TfidfVectorizer
13 |
from sklearn.model_selection import train_test_split
14 |
import unicodedata
15 |
from sklearn.cluster import KMeans
16 |
import datetime
17 |
18 |
# Muat data kamus
19 |
df_kamus_komen1 = pd.read_excel('data_komen_mundjidah_clean.xlsx') # Kamus 1
20 |
df_kamus_komen2 = pd.read_excel('data_komen_warsubi_clean-v1.xlsx') # Kamus 3
21 |
22 |
# Fungsi untuk memuat kamus normalisasi dari file lokal
23 |
def load_normalization_dict(file_path):
65 |
pattern = rf'\b{re.escape(username)}\b'
66 |
comment = re.sub(pattern, '', comment, flags=re.IGNORECASE)
67 |
return re.sub(r'\s+', ' ', comment.strip())
68 |
69 |
# Fungsi untuk membersihkan teks
70 |
def clean_text(text):
    """Normalize a raw comment into lowercase text suitable for keyword matching.

    Steps, in order: remove URLs, @mentions and #hashtags; fold stylized
    (bold / mathematical) characters to their plain form; spell the numbers
    1/01 and 2/02 as "satu"/"dua"; remove every other standalone number;
    turn ``. , ! ? ; :`` and a few decorative emoji into spaces; strip all
    remaining symbols except word characters, whitespace and the retained
    emoji (including the hand gestures); lowercase; collapse whitespace.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text)

    # Remove URLs, mentions and hashtags.
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Fold stylized characters BEFORE the number rules so that digits such as
    # a mathematical-bold "2" are also mapped to "dua" or removed.
    text = unicodedata.normalize('NFKD', text)

    # Spell out the numbers 1/01 and 2/02 so they survive as words.
    text = re.sub(r'\b(01|1)\b', 'satu', text)
    text = re.sub(r'\b(02|2)\b', 'dua', text)

    # Remove every other standalone number.
    text = re.sub(r'\b\d+\b', '', text)

    # Replace basic punctuation with spaces.
    text = re.sub(r'[.,!?;:]', ' ', text)

    # Replace the decorative emoji (fire, sparkles, red heart) with spaces.
    # This is an alternation rather than a character class: inside a class
    # the variation selector U+FE0F that is part of the heart emoji would be
    # stripped from EVERY emoji, so the victory-hand and index-up emoji would
    # no longer match keyword entries that carry the selector. A selector
    # that does not follow a retained symbol is still turned into a space.
    text = re.sub(
        r'(?:\U0001F525|\u2728|\u2764)\uFE0F?'
        r'|(?<![\u2700-\u27BF\u2B50\u00A9\u00AE\u261D])\uFE0F',
        ' ',
        text,
    )

    # Drop everything except word characters, whitespace and the retained
    # symbols (dingbats block, star, (c), (R), index-up hand, selector).
    text = re.sub(r'[^\w\s\u2700-\u27BF\u2B50\u00A9\u00AE\u261D\uFE0F]', '', text)

    # Lowercase and collapse runs of whitespace.
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    return text
99 |
100 |
def load_slang_dict(file_path):
    """Load a slang-normalization dictionary from a ``key:value`` text file.

    Each line that contains a colon is split at the first colon. Surrounding
    whitespace, double quotes and a trailing comma are removed from both
    parts, so ``"gk":"tidak",`` and ``"gk" : "tidak"`` both yield
    ``{"gk": "tidak"}``. Lines without a colon are ignored.

    Args:
        file_path: Path of the UTF-8 encoded dictionary file.

    Returns:
        A dict mapping slang words to their normalized form. If the file
        cannot be read, the error is shown through ``st.error`` and an empty
        dict is returned.
    """
    try:
        slang_dict = {}
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if ':' not in line:  # Only "key:value" lines are entries.
                    continue
                key, value = line.split(':', 1)
                # Trim whitespace before the quotes: otherwise a space next
                # to the colon shields the adjacent quote from strip('"').
                key = key.strip().strip('"').strip()
                value = value.strip().strip('",').strip()
                slang_dict[key] = value
        return slang_dict
    except Exception as e:
        # Best-effort loader: report the problem in the UI and carry on with
        # an empty dictionary instead of stopping the app.
        st.error(f"Terjadi kesalahan saat membaca file slang.txt: {e}")
        return {}
116 |
117 |
# Muat kamus normalisasi dari file lokal
118 |
normalization_file = "slang.txt"
119 |
normalization_dict = load_normalization_dict(normalization_file)
120 |
121 |
def save_slang_dict(slang_dict, file_path):
122 |
123 |
with open(file_path, 'w', encoding='utf-8') as file:
124 |
for key, value in slang_dict.items():
125 |
# Tulis setiap pasangan key-value dalam format "key":"value"
126 |
127 |
st.success("Kamus normalisasi berhasil disimpan!")
128 |
except Exception as e:
129 |
st.error(f"Terjadi kesalahan saat menyimpan file slang.txt: {e}")
130 |
131 |
def load_keywords(file_path):
132 |
"""Membaca keywords dari file txt dengan format kategori."""
133 |
keywords = {}
134 |
with open(file_path, 'r', encoding='utf-8') as f:
135 |
current_category = None
136 |
for line in f:
137 |
line = line.strip()
138 |
if re.match(r'^\[.*\]$', line): # Mendeteksi kategori seperti [Co-Optimism]
139 |
current_category = line.strip('[]')
140 |
keywords[current_category] = []
141 |
elif current_category and line:
142 |
143 |
return keywords
144 |
145 |
def load_negative_keywords(file_path):
146 |
"""Membaca negative keywords dengan model identifier."""
147 |
negative_keywords = {}
148 |
with open(file_path, 'r', encoding='utf-8') as f:
149 |
current_model = None
150 |
for line in f:
151 |
line = line.strip()
152 |
if re.match(r'^\[.*\]$', line): # Mendeteksi model identifier seperti [Model Mundjidah]
153 |
current_model = line.strip('[]')
154 |
negative_keywords[current_model] = []
155 |
elif current_model and line:
156 |
157 |
return negative_keywords
158 |
159 |
def save_keywords(file_path, keywords):
160 |
"""Menyimpan keywords ke file txt."""
161 |
with open(file_path, 'w', encoding='utf-8') as f:
162 |
for category, words in keywords.items():
163 |
164 |
for word in words:
165 |
166 |
f.write("\n") # Tambahkan baris kosong antar kategori
167 |
168 |
def save_negative_keywords(file_path, negative_keywords):
169 |
"""Menyimpan negative keywords ke file txt."""
170 |
with open(file_path, 'w', encoding='utf-8') as f:
171 |
for model, words in negative_keywords.items():
172 |
173 |
for word in words:
174 |
175 |
176 |
177 |
# Fungsi untuk menyimpan data ke file Excel sesuai model
178 |
def save_to_data_train(data, model_name):
179 |
file_paths = {
180 |
"Model Mundjidah": 'data_komen_mundjidah_clean.xlsx',
181 |
"Model Warsubi V1": 'data_komen_warsubi_clean-v1.xlsx'
182 |
183 |
file_path = file_paths.get(model_name)
184 |
if not file_path:
185 |
st.error("Model tidak dikenali. Pastikan model sesuai.")
186 |
187 |
188 |
# Coba baca file lama atau buat data kosong
189 |
190 |
existing_data = pd.read_excel(file_path)
191 |
except FileNotFoundError:
192 |
existing_data = pd.DataFrame(columns=data.columns)
193 |
194 |
# Gabungkan data baru dan hapus duplikat
195 |
updated_data = pd.concat([existing_data, data], ignore_index=True)
196 |
updated_data = updated_data.drop_duplicates(subset=['Comment', 'Cleaned_Text'])
197 |
198 |
# Simpan data
199 |
updated_data.to_excel(file_path, index=False)
200 |
return file_path
201 |
202 |
# Definisi parameter
203 |
PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2'
204 |
205 |
206 |
207 |
208 |
# Fungsi untuk melatih ulang model
209 |
def retrain_model(kamus_data, model_path):
210 |
# Siapkan data
211 |
X = kamus_data['Cleaned_Text']
212 |
y = kamus_data['Brand Attitude']
213 |
214 |
# Konversi label Brand Attitude ke angka
215 |
label_map = {'Co-Likes': 0, 'Co-Support': 1, 'Co-Optimism': 2, 'Co-Negative': 3}
216 |
y =
217 |
218 |
# Split data menjadi training dan testing
219 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
220 |
221 |
# Tokenisasi menggunakan BERT tokenizer
222 |
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)
223 |
X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=128, return_tensors='tf')
224 |
X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors='tf')
225 |
226 |
# Load model BERT
227 |
bert_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=4)
228 |
229 |
# Optimizer dan loss function
230 |
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
231 |
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
232 |
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
233 |
234 |
# Compile model
235 |
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
236 |
237 |
# Latih model
238 |
239 |
X_train_tokens['input_ids'], y_train,
240 |
241 |
242 |
validation_data=(X_test_tokens['input_ids'], y_test)
243 |
244 |
245 |
# Simpan model
246 |
247 |
248 |
249 |
tf.config.set_visible_devices([], 'GPU')
250 |
# Tambahkan opsi di sidebar
251 |
menu = st.sidebar.selectbox("Pilih Menu", ["Upload Data", "Hasil Prediksi", "Perlu Validasi","Keyword BA","Normalisasi Kamus", "Overview Data","Retrain Model"])
252 |
253 |
if menu == "Upload Data":
254 |
# Streamlit app
255 |
st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude")
256 |
267 |
data = pd.read_excel(uploaded_file)
268 |
269 |
data = pd.read_csv(uploaded_file)
270 |
271 |
+ = data
272 |
273 |
# Bersihkan data
274 |
data.dropna(how='all', inplace=True)
279 |
known_usernames = get_known_usernames(data)
280 |
data["Cleaned_Text"] = data["Comment"].apply(lambda x: remove_usernames(x, known_usernames))
281 |
data["Cleaned_Text"] = data["Cleaned_Text"].apply(lambda x: normalize_text(clean_text(x), normalization_dict))
282 |
283 |
keywords = load_keywords("keywords.txt")
284 |
negative_keywords = load_negative_keywords("negative_keywords.txt")
285 |
st.session_state.keywords = keywords
286 |
st.session_state.negative_keywords = negative_keywords
287 |
288 |
# Konfigurasi model berdasarkan pilihan
289 |
if model_choice == "Model Mundjidah":
290 |
sentiment_model_path = "mundjidah-model.h5"
291 |
ba_model_path = "ba-mundjidah-model.h5"
292 |
selected_df = df_kamus_komen1
293 |
selected_negative_keywords = negative_keywords.get("Model Mundjidah", [])
294 |
positive_keywords = ["semoga menang", "semoga", "baik", "bagus", "terbaik", "semangat", "mundjidah", "amin", "gas", "lanjutkan"]
295 |
296 |
elif model_choice == "Model Warsubi V1":
297 |
sentiment_model_path = "warsa-model.h5"
298 |
ba_model_path = "ba-warsa-model.h5"
299 |
selected_df = df_kamus_komen2
300 |
selected_negative_keywords = negative_keywords.get("Model Warsubi V1", [])
301 |
positive_keywords = ["hebat", "luar biasa", "bagus", "terbaik", "memilih dengan tepat", "all in abah subi", "pilih warsubi", "dua", "✌️", "abah", "sae","sehat","semangat"]
302 |
303 |
else: # Tambahan untuk model lain
304 |
sentiment_model_path = "warsubi-v2-model.h5"
305 |
ba_model_path = "ba-warsubi-v2-model.h5"
306 |
positive_keywords = ["hebat"]
307 |
negative_keywords = ["golput ae"]
308 |
309 |
PRE_TRAINED_MODEL = 'indobenchmark/indobert-base-p2'
310 |
st.session_state['model_choice'] = model_choice
311 |
312 |
# Load model sentimen
313 |
314 |
sentiment_model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL, num_labels=3)
320 |
321 |
# Fungsi prediksi sentimen dengan tambahan pencocokan keyword
322 |
def predict_with_sentiment_model(text):
323 |
if any(keyword.lower() in text.lower() for keyword in positive_keywords):
324 |
return 'positive'
325 |
elif any(keyword.lower() in text.lower() for keyword in selected_negative_keywords):
326 |
return 'negative'
327 |
328 |
# Prediksi menggunakan model jika tidak ada keyword yang cocok
341 |
except Exception as e:
342 |
st.error(f"Gagal memuat model Brand Attitude: {e}")
343 |
344 |
345 |
def predict_ba_with_model(text, ba_model, tokenizer, threshold=0.7):
    """Predict the Brand Attitude label of ``text`` together with a confidence.

    Keyword rules are tried first: the first category in the enclosing
    ``keywords`` mapping that has a keyword occurring in the text
    (case-insensitive substring match) is returned with confidence 1.0.
    Otherwise the text is tokenized, passed to ``ba_model`` and the softmax
    of its logits decides the label.

    Args:
        text: Cleaned comment text.
        ba_model: Model called on the tokenizer output; its result must
            expose ``.logits``.
        tokenizer: Callable producing the model inputs.
        threshold: Minimum top probability needed to keep the model's label.

    Returns:
        ``(label, probability)``. When the top probability is below
        ``threshold`` the label falls back to ``'Co-Likes'`` while the
        probability is still the model's top probability.
    """
    lowered_text = text.lower()

    # Keyword rules take precedence over the model.
    for label, keywords_list in keywords.items():
        for keyword in keywords_list:
            if keyword.lower() in lowered_text:
                return label, 1.0  # Keyword hit: treated as certain.

    # No keyword matched: fall back to the model.
    encoded = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    logits = ba_model(encoded).logits

    # Softmax over the class axis; [0] selects the single input text.
    probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]
    top_index = np.argmax(probabilities)
    top_probability = np.max(probabilities)

    label_names = ['Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative']
    model_label = label_names[top_index]

    # Low-confidence predictions are reported as 'Co-Likes' for review.
    chosen_label = 'Co-Likes' if top_probability < threshold else model_label
    return chosen_label, top_probability
366 |
367 |
# Menggunakan fungsi untuk menambahkan prediksi Brand Attitude ke data
368 |
# data['Brand_Attitude'] = data['Cleaned_Text'].apply(lambda x: predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7))
369 |
370 |
# Menambahkan hasil klasifikasi ke DataFrame
371 |
data[['Brand_Attitude', 'Probabilitas']] = data['Cleaned_Text'].apply(
372 |
lambda x: pd.Series(predict_ba_with_model(x, ba_model, tokenizer, threshold=0.7))
373 |
374 |
375 |
# Tambahkan "Co-Negative" jika Sentimen_Prediksi adalah "negative"
376 |
data['Brand_Attitude'] = data.apply(
377 |
lambda row: "Co-Negative" if row['Sentimen_Prediksi'] == 'negative' else row['Brand_Attitude'], axis=1
381 |
data['Brand_Attitude'] = data.apply(
382 |
lambda row: "Co-Likes" if row['Sentimen_Prediksi'] != 'negative' and row['Brand_Attitude'] == 'Co-Negative' else row['Brand_Attitude'], axis=1
383 |
384 |
385 |
st.session_state.classified_data = data
386 |
387 |
# Button to navigate to "Hasil Prediksi"
388 |
st.success("Data berhasil diprediksi! Lihat di menu Hasil Prediksi.")
389 |
390 |
except Exception as e:
391 |
st.error(f"Terjadi kesalahan: {e}")
392 |
393 |
elif menu == "Hasil Prediksi":
394 |
# Streamlit app
395 |
if "classified_data" in st.session_state:
396 |
data = st.session_state.classified_data
397 |
st.title("Aplikasi Klasifikasi Sentimen dan Brand Attitude")
398 |
399 |
# Tampilkan hasil
400 |
st.write("Hasil Klasifikasi Sentimen dan Brand Attitude:")
401 |
st.dataframe(data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']])
402 |
403 |
# Distribusi level komentar
404 |
st.write("Distribusi Level Komentar:")
405 |
level_counts = data['Brand_Attitude'].value_counts()
406 |
total_co_likes = level_counts.get('Co-Likes', 0)
407 |
total_co_support = level_counts.get('Co-Support', 0)
408 |
total_co_optimism = level_counts.get('Co-Optimism', 0)
409 |
total_co_negative = level_counts.get('Co-Negative', 0)
410 |
411 |
# Tampilkan total jumlah sentimen
412 |
st.write(f"**Total BA Co-Likes:** {total_co_likes}")
413 |
st.write(f"**Total BA Co-Support:** {total_co_support}")
414 |
st.write(f"**Total BA Co-Optimism:** {total_co_optimism}")
415 |
st.write(f"**Total BA Co-Negative:** {total_co_negative}")
416 |
417 |
# Tampilkan jumlah setiap kategori
418 |
419 |
420 |
def generate_wordcloud(text):
421 |
wordcloud = WordCloud(
422 |
423 |
424 |
425 |
426 |
427 |
428 |
fig, ax = plt.subplots(figsize=(10, 5))
429 |
ax.imshow(wordcloud, interpolation='bilinear')
430 |
431 |
return fig
432 |
433 |
st.write("WordCloud Berdasarkan Brand Attitude:")
434 |
for ba in ['Co-Likes', 'Co-Support', 'Co-Optimism','Co-Negative']:
435 |
text = " ".join(data[data['Brand_Attitude'] == ba]['Cleaned_Text'].tolist())
436 |
if text:
437 |
st.write(f"WordCloud untuk Brand Attitude {ba.capitalize()}:")
438 |
439 |
440 |
# Fungsi untuk tokenisasi teks
441 |
def tokenize_text(text):
    """Lowercase ``text``, drop periods and commas, and split it into words.

    Splitting is on runs of whitespace, so an empty or blank string yields
    an empty list.
    """
    lowered = text.lower()
    # Only '.' and ',' are removed; any other punctuation stays attached
    # to its word.
    for mark in ('.', ','):
        lowered = lowered.replace(mark, '')
    return lowered.split()
446 |
447 |
# Fungsi untuk menghitung frekuensi kata
448 |
def get_word_frequencies(data, column):
449 |
"""Menghitung frekuensi kata berdasarkan kolom teks tertentu."""
450 |
all_words = []
451 |
for text in data[column]:
452 |
453 |
454 |
if len(all_words) == 0:
455 |
return None # Jika tidak ada kata yang ditemukan, kembalikan None
456 |
return Counter(all_words)
457 |
458 |
co_likes_data = data[data['Brand_Attitude'] == 'Co-Likes']
459 |
co_support_data = data[data['Brand_Attitude'] == 'Co-Support']
460 |
co_optimism_data = data[data['Brand_Attitude'] == 'Co-Optimism']
461 |
co_negative_data = data[data['Brand_Attitude'] == 'Co-Negative']
462 |
463 |
# Visualisasi chart untuk kata-kata di BA Co-Likes
464 |
st.write("### Top Kata di BA Co-Likes")
465 |
co_likes_word_counts = get_word_frequencies(co_likes_data, 'Cleaned_Text')
466 |
if co_likes_word_counts is None:
467 |
st.write("Tidak ada kata yang ditemukan di kategori Co-Likes.")
468 |
469 |
co_likes_most_common = co_likes_word_counts.most_common(10)
470 |
co_likes_words, co_likes_counts = zip(*co_likes_most_common)
471 |
plt.figure(figsize=(10, 6))
472 |
plt.barh(co_likes_words, co_likes_counts, color='green')
473 |
474 |
475 |
plt.title('Top Words in Co-Likes Category')
476 |
477 |
478 |
479 |
# Visualisasi chart untuk kata-kata di BA Co-Support
480 |
st.write("### Top Kata di BA Co-Support")
481 |
co_support_word_counts = get_word_frequencies(co_support_data, 'Cleaned_Text')
482 |
if co_support_word_counts is None:
483 |
st.write("Tidak ada kata yang ditemukan di kategori Co-Support.")
484 |
485 |
co_support_most_common = co_support_word_counts.most_common(10)
486 |
co_support_words, co_support_counts = zip(*co_support_most_common)
487 |
plt.figure(figsize=(10, 6))
488 |
plt.barh(co_support_words, co_support_counts, color='orange')
489 |
490 |
491 |
plt.title('Top Words in Co-Support Category')
492 |
493 |
494 |
495 |
# Visualisasi chart untuk kata-kata di BA Co-Optimism
496 |
st.write("### Top Kata di BA Co-Optimism")
497 |
co_optimism_word_counts = get_word_frequencies(co_optimism_data, 'Cleaned_Text')
498 |
if co_optimism_word_counts is None:
499 |
st.write("Tidak ada kata yang ditemukan di kategori Co-Optimism.")
500 |
501 |
co_optimism_most_common = co_optimism_word_counts.most_common(10)
502 |
co_optimism_words, co_optimism_counts = zip(*co_optimism_most_common)
503 |
plt.figure(figsize=(10, 6))
504 |
plt.barh(co_optimism_words, co_optimism_counts, color='blue')
505 |
506 |
507 |
plt.title('Top Words in Co-Optimism Category')
508 |
509 |
510 |
511 |
# Visualisasi chart untuk kata-kata di BA Co-Negative
512 |
st.write("### Top Kata di BA Co-Negative")
513 |
co_negative_word_counts = get_word_frequencies(co_negative_data, 'Cleaned_Text')
514 |
if co_negative_word_counts is None:
515 |
st.write("Tidak ada kata yang ditemukan di kategori Co-Negative.")
516 |
517 |
co_negative_most_common = co_negative_word_counts.most_common(10)
518 |
co_negative_words, co_negative_counts = zip(*co_negative_most_common)
519 |
plt.figure(figsize=(10, 6))
520 |
plt.barh(co_negative_words, co_negative_counts, color='red')
521 |
522 |
523 |
plt.title('Top Words in Co-Negative Category')
524 |
525 |
526 |
527 |
# Siapkan data untuk diperbarui
528 |
new_data = data[['Comment', 'Cleaned_Text', 'Sentimen_Prediksi', 'Brand_Attitude']].copy()
529 |
530 |
531 |
st.warning("Tidak ada hasil prediksi. Silakan upload data terlebih dahulu di menu 'Upload Data'.")
532 |
533 |
# Menu Perlu Validasi
534 |
elif menu == "Perlu Validasi":
535 |
st.title("Komentar Perlu Validasi")
536 |
537 |
# Periksa apakah data hasil klasifikasi tersedia
538 |
if 'classified_data' not in st.session_state:
539 |
st.error("Silakan klasifikasikan data terlebih dahulu di menu sebelumnya.")
540 |
541 |
# Ambil data komentar yang probabilitasnya rendah
542 |
data = st.session_state.classified_data
543 |
544 |
if 'Status' not in data.columns:
545 |
data['Status'] = False # Default nilai False
546 |
547 |
review_data = data[(data['Brand_Attitude'] == 'Co-Likes') & (data['Probabilitas'] < 0.7)]
548 |
549 |
if review_data.empty:
550 |
st.write("Tidak ada komentar yang memerlukan validasi saat ini.")
551 |
552 |
# Proses Clustering
553 |
st.write("### Clustering Komentar")
554 |
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
555 |
X = vectorizer.fit_transform(review_data['Cleaned_Text'])
556 |
557 |
# Slider untuk memilih jumlah cluster
558 |
k = st.slider("Pilih jumlah cluster:", min_value=2, max_value=10, value=3)
559 |
kmeans = KMeans(n_clusters=k, random_state=42)
560 |
review_data['Cluster'] = kmeans.fit_predict(X)
561 |
562 |
# Dropdown untuk memilih cluster
563 |
cluster_ids = sorted(review_data['Cluster'].unique())
564 |
selected_cluster = st.selectbox("Pilih Cluster untuk Ditampilkan:", cluster_ids)
565 |
566 |
# Tampilkan tabel komentar berdasarkan cluster yang dipilih
567 |
st.write(f"### Komentar di Cluster {selected_cluster}")
568 |
cluster_data = review_data[review_data['Cluster'] == selected_cluster]
569 |
st.dataframe(cluster_data[['Cleaned_Text', 'Brand_Attitude', 'Probabilitas']])
570 |
571 |
# Form untuk validasi Brand Attitude
572 |
st.write("### Validasi Brand Attitude")
573 |
with st.form(key=f"form_cluster_{selected_cluster}"):
574 |
update_all = st.checkbox("Ubah seluruh komentar dalam cluster ini")
575 |
if update_all:
576 |
# Ubah semua komentar dalam cluster
577 |
new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:",
578 |
["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"],
579 |
580 |
581 |
# Ubah komentar tertentu dalam cluster
582 |
cleaned_text_to_update = st.selectbox("Pilih komentar untuk diubah:", cluster_data['Cleaned_Text'])
583 |
new_brand_attitude = st.selectbox("Pilih Brand Attitude Baru:",
584 |
["Co-Likes", "Co-Support", "Co-Optimism", "Co-Negative"],
585 |
586 |
587 |
submit_button = st.form_submit_button("Update Brand Attitude")
588 |
589 |
if submit_button:
590 |
if update_all:
591 |
# Update seluruh komentar dalam cluster
592 |
review_data.loc[review_data['Cluster'] == selected_cluster, 'Brand_Attitude'] = new_brand_attitude
593 |
review_data.loc[review_data['Cluster'] == selected_cluster, 'Status'] = True
594 |
st.success(f"Brand Attitude untuk seluruh komentar di Cluster {selected_cluster} berhasil diperbarui menjadi: {new_brand_attitude}")
595 |
596 |
# Update komentar tertentu
597 |
review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Brand_Attitude'] = new_brand_attitude
598 |
review_data.loc[review_data['Cleaned_Text'] == cleaned_text_to_update, 'Status'] = True
599 |
st.success(f"Brand Attitude berhasil diperbarui untuk komentar: {cleaned_text_to_update}")
600 |
601 |
# Update data hasil prediksi awal di session_state
602 |
st.session_state.classified_data.loc[review_data.index, :] = review_data
603 |
604 |
# Menu Keyword BA
605 |
elif menu == "Keyword BA":
606 |
st.subheader("Keyword BA Menu")
607 |
608 |
# Load keywords dari file
609 |
keywords = load_keywords("keywords.txt")
610 |
negative_keywords = load_negative_keywords("negative_keywords.txt")
611 |
612 |
# Ambil model yang digunakan dari session state
613 |
current_model = st.session_state.get("model_choice", "Model Mundjidah")
614 |
615 |
# Update Co-Negative keywords berdasarkan model
616 |
if current_model in negative_keywords:
617 |
keywords['Co-Negative'] = negative_keywords[current_model]
618 |
619 |
keywords['Co-Negative'] = []
620 |
621 |
# Pilih Brand Attitude dan tampilkan komentar
622 |
st.write("### Pilih Brand Attitude untuk melihat komentarnya")
623 |
ba_option = st.selectbox("Pilih Brand Attitude", list(keywords.keys()), index=0)
624 |
625 |
# Tampilkan keyword untuk BA
626 |
st.write(f"### Keyword untuk {ba_option}")
627 |
st.write(", ".join(keywords[ba_option]))
628 |
629 |
# Tampilkan komentar sesuai BA
630 |
data = st.session_state.classified_data
631 |
filtered_data = data[data['Brand_Attitude'] == ba_option]
632 |
filtered_data = filtered_data.sort_values(by='Cleaned_Text', ascending=True) # Sort ascending
633 |
if filtered_data.empty:
634 |
st.write("Tidak ada komentar yang ditemukan untuk Brand Attitude ini.")
635 |
636 |
st.write(filtered_data[['Cleaned_Text', 'Brand_Attitude']])
637 |
638 |
if 'Status' not in data.columns:
639 |
data['Status'] = False # Default nilai False
640 |
641 |
# CRUD Operations
642 |
st.write("### Kelola Keyword")
643 |
with st.form("manage_keywords_form"):
644 |
# Pilih keyword untuk diupdate atau dihapus
645 |
selected_keyword = st.selectbox("Pilih Keyword untuk Diubah atau Dihapus", keywords[ba_option])
646 |
new_keyword_value = st.text_input("Ubah Keyword (Kosongkan jika ingin menghapus)", value=selected_keyword)
647 |
action ="Pilih Aksi", ["Update", "Delete"], index=0)
648 |
manage_submit_button = st.form_submit_button("Lakukan Perubahan")
649 |
650 |
if manage_submit_button:
651 |
if action == "Update" and new_keyword_value.strip():
652 |
# Update keyword
653 |
index = keywords[ba_option].index(selected_keyword)
654 |
keywords[ba_option][index] = new_keyword_value.strip()
655 |
save_keywords("keywords.txt", keywords) # Simpan perubahan
656 |
st.success(f"Keyword '{selected_keyword}' berhasil diubah menjadi '{new_keyword_value.strip()}'.")
657 |
elif action == "Delete":
658 |
# Delete keyword
659 |
660 |
save_keywords("keywords.txt", keywords) # Simpan perubahan
661 |
st.success(f"Keyword '{selected_keyword}' berhasil dihapus.")
662 |
663 |
st.warning("Masukkan keyword baru untuk update atau pilih aksi delete.")
664 |
665 |
# Tampilkan semua Brand Attitude dengan filter dan search
666 |
st.write("### Tabel Semua Data dengan Filter dan Pencarian")
667 |
668 |
# Periksa apakah classified_data tersedia
669 |
if "classified_data" in st.session_state:
670 |
data = st.session_state.classified_data
671 |
672 |
# Input teks untuk filter
673 |
search_text = st.text_input("Cari berdasarkan teks komentar atau Brand Attitude:")
674 |
675 |
# Filter data berdasarkan input teks
676 |
if search_text:
677 |
filtered_data = data[
678 |
data['Cleaned_Text'].str.contains(search_text, case=False, na=False) |
679 |
data['Brand_Attitude'].str.contains(search_text, case=False, na=False)
680 |
681 |
682 |
filtered_data = data
683 |
684 |
edited_data = st.data_editor(
685 |
filtered_data[['Cleaned_Text', 'Brand_Attitude']].copy(),
686 |
687 |
688 |
689 |
690 |
# Tombol untuk menyimpan perubahan
691 |
if st.button("Simpan Perubahan"):
692 |
# Update kolom Brand Attitude dan Status di data asli berdasarkan perubahan di tabel
693 |
for index, row in edited_data.iterrows():
694 |
original_row = filtered_data.loc[index]
695 |
if row['Brand_Attitude'] != original_row['Brand_Attitude']:
696 |
data.loc[index, 'Brand_Attitude'] = row['Brand_Attitude']
697 |
data.loc[index, 'Status'] = True # Tandai sebagai diupdate
698 |
699 |
# Simpan kembali ke session_state
700 |
st.session_state.classified_data = data
701 |
st.success("Perubahan berhasil disimpan!")
702 |
703 |
st.warning("Tidak ada data yang tersedia. Silakan upload data terlebih dahulu.")
704 |
705 |
# Tambahkan keyword baru
706 |
st.write("### Tambahkan Keyword Baru")
707 |
with st.form("add_keyword_form"):
708 |
new_ba = st.selectbox("Pilih Brand Attitude untuk Keyword Baru", list(keywords.keys()))
709 |
new_keyword = st.text_input("Masukkan Keyword Baru")
710 |
add_submit_button = st.form_submit_button("Tambah Keyword")
711 |
712 |
if add_submit_button and new_keyword.strip():
713 |
if new_ba == "Co-Negative":
714 |
# Tambahkan keyword ke negative_keywords.txt
715 |
716 |
save_negative_keywords("negative_keywords.txt", negative_keywords)
717 |
st.success(f"Keyword Co-Negative '{new_keyword.strip()}' berhasil ditambahkan untuk model '{current_model}'!")
718 |
719 |
# Tambahkan keyword ke keywords.txt
720 |
721 |
save_keywords("keywords.txt", keywords)
722 |
st.success(f"Keyword '{new_keyword.strip()}' berhasil ditambahkan ke {new_ba}!")
723 |
724 |
# Simpan ke session_state
725 |
st.session_state.classified_data = data
726 |
st.session_state.keywords = keywords
727 |
st.session_state.negative_keywords = negative_keywords
728 |
729 |
730 |
elif menu == "Normalisasi Kamus":
731 |
st.subheader("Normalisasi Kamus")
732 |
733 |
# Mengambil data dari session_state jika tersedia
734 |
if 'classified_data' not in st.session_state:
735 |
st.error("Silakan unggah file dan lakukan klasifikasi di menu 'Klasifikasi Sentimen' terlebih dahulu.")
736 |
737 |
# Mengambil data yang telah diproses dan diklasifikasikan
738 |
data = st.session_state.classified_data
739 |
740 |
# Pastikan kolom 'Status' ada di DataFrame
741 |
if 'Status' not in data.columns:
742 |
data['Status'] = False # Tambahkan kolom 'Status' jika belum ada
743 |
744 |
# Tokenisasi dan hitung frekuensi kata
745 |
def tokenize(text):
    """Return every word token found in *text*, lower-cased.

    Args:
        text: The string to tokenise.

    Returns:
        A list of the ``\\w+`` runs in the lower-cased text, in order of
        appearance; punctuation and whitespace are dropped.
    """
    # Lower-case first so the resulting tokens are case-insensitive.
    return re.findall(r'\b\w+\b', text.lower())
747 |
748 |
# Fungsi untuk menormalkan kata-kata di dalam data
749 |
def normalize_data(data, slang_dict):
750 |
# Proses normalisasi kata
751 |
def normalize_text(text):
752 |
words = text.split()
753 |
normalized_words = []
754 |
updated = False
755 |
for word in words:
756 |
if word in slang_dict:
757 |
758 |
updated = True
759 |
760 |
761 |
# Tandai status sebagai TRUE jika terjadi perubahan
762 |
if updated:
763 |
data.loc[data['Cleaned_Text'] == text, 'Status'] = True
764 |
return ' '.join(normalized_words)
765 |
766 |
data['Cleaned_Text'] = data['Cleaned_Text'].apply(normalize_text)
767 |
return data
768 |
769 |
# Gabungkan semua komentar untuk tokenisasi
770 |
all_comments = ' '.join(data['Cleaned_Text'])
771 |
words = tokenize(all_comments)
772 |
773 |
# Hitung frekuensi kata
774 |
word_counts = Counter(words)
775 |
776 |
# Collect every word with its count (no minimum-frequency threshold is applied)
777 |
filtered_word_counts = {word: count for word, count in word_counts.items()}
778 |
779 |
# Urutkan berdasarkan frekuensi
780 |
sorted_words = sorted(filtered_word_counts.items(), key=lambda x: x[1], reverse=True)
781 |
782 |
# Tampilkan tabel kata dan frekuensinya
783 |
st.write("Berikut adalah daftar kata-kata hasil tokenisasi:")
784 |
word_df = pd.DataFrame(sorted_words, columns=["Kata", "Frekuensi"])
785 |
786 |
787 |
# Membaca kamus normalisasi dari file
788 |
slang_dict = load_slang_dict('slang.txt')
789 |
790 |
if not slang_dict:
791 |
st.write("Belum ada kamus normalisasi yang ditemukan.")
792 |
793 |
# Menampilkan kamus normalisasi yang sudah ada
794 |
st.write("### Kamus Normalisasi yang Sudah Ada")
795 |
norm_dict_df = pd.DataFrame(list(slang_dict.items()), columns=["Kata Asli", "Kata Normalisasi"])
796 |
797 |
798 |
# Tambahkan fitur untuk meng-update kata normalisasi
799 |
st.write("### Tambahkan Normalisasi Kata")
800 |
with st.form("add_normalization_form"):
801 |
new_word = st.text_input("Masukkan kata yang belum normal", "")
802 |
normalized_word = st.text_input("Masukkan kata normalisasi", "")
803 |
submit_button = st.form_submit_button("Tambah Normalisasi")
804 |
805 |
if submit_button:
806 |
if new_word and normalized_word:
807 |
# Menambahkan normalisasi kata baru ke kamus
808 |
slang_dict[new_word] = normalized_word
809 |
save_slang_dict(slang_dict, 'slang.txt') # Simpan pembaruan ke file
810 |
st.success(f"Normalisasi kata '{new_word}' -> '{normalized_word}' berhasil ditambahkan!")
811 |
812 |
st.warning("Harap masukkan kata yang belum normal dan kata normalisasi!")
813 |
814 |
# Setelah menambahkan normalisasi, kita akan menormalkan data
815 |
if slang_dict:
816 |
data = normalize_data(data, slang_dict)
817 |
818 |
# Menampilkan hasil normalisasi
819 |
st.write("Hasil Normalisasi pada Data:")
820 |
st.dataframe(data[['Comment', 'Cleaned_Text', 'Status']])
821 |
822 |
# Menyimpan data yang telah dinormalisasi ke session state
823 |
st.session_state.classified_data = data
824 |
825 |
826 |
# Menu Overview Data
827 |
elif menu == "Overview Data":
828 |
st.title("Overview Data")
829 |
830 |
# Periksa apakah data sudah tersedia
831 |
if 'classified_data' not in st.session_state:
832 |
st.error("Silakan unggah dan klasifikasikan data di menu sebelumnya.")
833 |
834 |
data = st.session_state.classified_data
835 |
836 |
# Pastikan kolom 'Status' ada
837 |
if 'Status' not in data.columns:
838 |
data['Status'] = False # Tambahkan kolom 'Status' jika belum ada
839 |
840 |
# Tampilkan data akhir
841 |
st.write("### Data Akhir:")
842 |
final_data = data[['Cleaned_Text', 'Brand_Attitude', 'Status']].copy()
843 |
844 |
845 |
# Summary Perolehan Brand Attitude
846 |
st.write("### Summary Perolehan Brand Attitude:")
847 |
ba_summary = data['Brand_Attitude'].value_counts().reset_index()
848 |
ba_summary.columns = ['Brand_Attitude', 'Jumlah']
849 |
850 |
851 |
# Hitung jumlah data yang tervalidasi ulang (status == True)
852 |
total_validated = data[data['Status'] == True].shape[0]
853 |
st.write(f"### Total Data yang Tervalidasi Ulang: {total_validated}")
854 |
855 |
# Tambahkan kolom hitungan Brand Attitude
856 |
data['Co-Likes'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Likes' else 0)
857 |
data['Co-Support'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Support' else 0)
858 |
data['Co-Optimism'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Optimism' else 0)
859 |
data['Co-Negative'] = data['Brand_Attitude'].apply(lambda x: 1 if x == 'Co-Negative' else 0)
860 |
861 |
# Hitung sebaran Brand Attitude per Parent Link
862 |
ba_per_parent_link_updated = data.groupby('Parent Link').agg({
863 |
'Nama Akun': 'first', # Ambil hanya 1 Nama Akun pertama
864 |
'Co-Likes': 'sum',
865 |
'Co-Support': 'sum',
866 |
'Co-Optimism': 'sum',
867 |
'Co-Negative': 'sum'
868 |
869 |
870 |
# Reorganisasi kolom
871 |
ba_per_parent_link_updated = ba_per_parent_link_updated[['Nama Akun', 'Parent Link', 'Co-Likes', 'Co-Support', 'Co-Optimism', 'Co-Negative']]
872 |
st.write("### Hasil Perolehan Brand Attitude per Postingan:")
873 |
874 |
875 |
# Tombol untuk update ke database postingan
876 |
st.write("### Update Perolehan ke Database Postingan")
877 |
if st.button("Update ke 'Data Jombang.xlsx'"):
878 |
879 |
# Cek apakah file "Data Jombang.xlsx" sudah ada
880 |
881 |
existing_data = pd.read_excel('Data Jombang.xlsx')
882 |
except FileNotFoundError:
883 |
existing_data = pd.DataFrame(columns=ba_per_parent_link_updated.columns)
884 |
885 |
# Gabungkan data baru ke existing_data berdasarkan 'Parent Link'
886 |
updated_data = pd.concat([existing_data, ba_per_parent_link_updated]).drop_duplicates(subset='Parent Link', keep='last')
887 |
888 |
# Simpan hasil pembaruan ke file Excel
889 |
updated_data.to_excel('Data Jombang.xlsx', index=False)
890 |
st.success("Data berhasil diperbarui ke 'Data Jombang.xlsx'!")
891 |
except Exception as e:
892 |
st.error(f"Terjadi kesalahan saat memperbarui data: {e}")
893 |
894 |
# Tombol Kirim Data ke Database
895 |
st.write("### Kirim Data ke Database")
896 |
if st.button("Kirim Data ke Database"):
897 |
898 |
# Tambahkan kolom Created At
899 |
data['Created At'] ='%Y-%m-%d %H:%M:%S')
900 |
901 |
# Gabungkan dengan data lama jika ada
902 |
903 |
db_data = pd.read_excel('database_komen.xlsx')
904 |
db_data = pd.concat([db_data, data], ignore_index=True)
905 |
db_data = db_data.drop_duplicates() # Hapus duplikat
906 |
except FileNotFoundError:
907 |
db_data = data
908 |
909 |
# Simpan hasil ke file Excel
910 |
db_data.to_excel('database_komen.xlsx', index=False)
911 |
st.success("Data berhasil dikirim ke database!")
912 |
except Exception as e:
913 |
st.error(f"Terjadi kesalahan saat menyimpan ke database: {e}")
914 |
915 |
# Tombol Kirim Data ke Retraining
916 |
st.write("### Kirim Data ke Retraining")
917 |
if 'model_choice' in st.session_state:
918 |
model_name = st.session_state['model_choice']
919 |
st.write(f"Model yang digunakan: **{model_name}**")
920 |
921 |
if st.button("Kirim Data ke Data Train"):
922 |
923 |
# Siapkan data yang akan dikirim ke data train
924 |
data_to_train = data.copy()
925 |
data_to_train['Sentimen_Aktual'] = data_to_train['Sentimen_Prediksi']
926 |
data_to_train['Brand Attitude'] = data_to_train['Brand_Attitude']
927 |
data_to_train['Date'] ='%Y-%m-%d %H:%M:%S')
928 |
929 |
# Reorganisasi kolom
930 |
data_to_train = data_to_train[['Comment', 'Sentimen_Aktual', 'Cleaned_Text',
931 |
'Kandidat', 'Parent Link', 'Date', 'Brand Attitude']]
932 |
933 |
# Simpan data ke file train sesuai model
934 |
file_path = save_to_data_train(data_to_train, model_name)
935 |
st.success(f"Data berhasil dikirim ke retraining: **{file_path}**")
936 |
except Exception as e:
937 |
st.error(f"Terjadi kesalahan: {e}")
938 |
939 |
st.error("Model belum dipilih. Silakan klasifikasikan data terlebih dahulu.")
940 |
941 |
# Menu Retrain Model
942 |
elif menu == "Retrain Model":
943 |
st.title("Retrain Model")
944 |
kamus_option = st.selectbox(
945 |
"Pilih Kamus yang Ingin Diedit:",
946 |
["data_komen_mundjidah_clean.xlsx", "data_komen_warsubi_clean-v1.xlsx"]
947 |
948 |
949 |
# Tentukan path model sesuai kamus
950 |
model_paths = {
951 |
"data_komen_mundjidah_clean.xlsx": "update_mundjidah-model",
952 |
"data_komen_warsubi_clean-v1.xlsx": "update_warsubi-model"
953 |
954 |
model_path = model_paths[kamus_option]
955 |
956 |
# Muat data kamus dari Excel
957 |
958 |
kamus_data = pd.read_excel(kamus_option)
959 |
960 |
st.write("### Tabel Kamus Saat Ini")
961 |
edited_data = st.data_editor(
962 |
963 |
964 |
965 |
966 |
967 |
# Simpan perubahan ke Excel
968 |
if st.button("Simpan Perubahan"):
969 |
edited_data.to_excel(kamus_option, index=False)
970 |
st.success(f"Perubahan berhasil disimpan ke {kamus_option}!")
971 |
972 |
# Tombol untuk retrain model
973 |
if st.button("Retrain Model"):
974 |
with st.spinner("Melatih ulang model..."):
975 |
retrain_model(edited_data, model_path)
976 |
st.success(f"Model berhasil dilatih ulang dan disimpan di path: {model_path}!")
977 |
978 |
except Exception as e:
979 |
st.error(f"Terjadi kesalahan saat memuat atau menyimpan kamus: {e}")