zakyirhab0 committed
Commit 8045cec · verified · 1 Parent(s): b5b95ed

Upload 22 files

app.py ADDED
@@ -0,0 +1,1548 @@
1
+ import streamlit as st
2
+ import joblib
3
+ import pandas as pd
4
+ import re
5
+ import emoji
6
+ import json
7
+ import io
8
+ import unicodedata
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ from sklearn.model_selection import train_test_split, GridSearchCV
12
+ from sklearn.ensemble import RandomForestClassifier
13
+ from sklearn.metrics import accuracy_score, f1_score
14
+ from imblearn.over_sampling import SMOTE
15
+ import matplotlib.pyplot as plt
16
+ import os
17
+ from wordcloud import WordCloud
18
+ from sklearn.metrics import classification_report
19
+ from sklearn.cluster import KMeans
20
+ from sklearn.decomposition import PCA
21
+ from sklearn.preprocessing import FunctionTransformer
22
+ from sklearn.pipeline import Pipeline
23
+ from transformers import pipeline
24
+ from collections import Counter
25
+ import nltk
26
+ from nltk.corpus import stopwords
27
+ from datetime import datetime
28
+
29
+ # === Preprocessing Functions === #
30
+ candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]
31
+
32
+ # Download the stopwords corpus if it is not already available
33
+ nltk.download('stopwords')
34
+ stop_words = set(stopwords.words('indonesian'))
35
+
36
+ # Make sure the "BA Lainnya" data is available
37
+ if 'data_with_ba' in st.session_state:
38
+ ba_lainnya_data = st.session_state['data_with_ba']
39
+ else:
40
+ ba_lainnya_data = None
41
+
42
+ def translate_emojis(text):
43
+ return ''.join(c for c in text if not emoji.is_emoji(c)) # Remove all emojis
44
+
45
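+ # Apply NFKD normalization so compatibility characters are decomposed to their base forms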
+ def normalize_unicode(text):
46
+ return unicodedata.normalize('NFKD', text)
47
+
48
+ def clean_text(text):
49
+ text = str(text).casefold() # Convert to lowercase
50
+ text = re.sub(r'http\S+|www\S+', '', text) # Remove URLs
51
+ text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters
52
+ text = re.sub(r'\s+', ' ', text).strip() # Normalize spaces
53
+ return text
54
+
55
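+ # Merge each negation word with the word that follows it into a single token,
+ # e.g. 'tidak bagus' -> 'tidak_bagus', so negated phrases survive tokenization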
+ def handle_negation(text):
56
+ negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
57
+ words = text.split()
58
+ result = []
59
+ skip_next = False
60
+ for i, word in enumerate(words):
61
+ if word in negation_words and i + 1 < len(words):
62
+ result.append(f"{word}_{words[i + 1]}") # Combine negation with next word
63
+ skip_next = True
64
+ elif skip_next:
65
+ skip_next = False
66
+ else:
67
+ result.append(word)
68
+ return ' '.join(result)
69
+
70
+ def handle_replies(text):
71
+ text = re.sub(r'=--*@\w+', '', text) # Remove multi-level reply patterns
72
+ text = re.sub(r'=-*@\w+', '', text) # Remove single-level reply patterns
73
+ text = re.sub(r'@\w+', '', text) # Remove standalone @username mentions
74
+ return text
75
+
76
+ def translate_text(text, dictionary):
77
+ words = text.split()
78
+ return ' '.join([dictionary.get(word.lower(), word) for word in words]) # Translate words using dictionary
79
+
80
+ # Assign a sentiment label based on keyword matches
81
+ def assign_sentiment_based_on_keywords(comment, keyword_dict):
82
+ for sentiment, keywords in keyword_dict.items():
83
+ if any(keyword in comment for keyword in keywords):
84
+ return sentiment
85
+ return 'unknown'
86
+
87
+ # === Load Dictionaries === #
88
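+ # Load a normalization dictionary from JSON (word -> replacement) or from a
+ # two-column CSV of slang -> formal forms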
+ def load_dictionary(file_path, file_type='json'):
89
+ if file_type == 'json':
90
+ with open(file_path, 'r', encoding='utf-8') as file:
91
+ return json.load(file)
92
+ elif file_type == 'csv':
93
+ df = pd.read_csv(file_path, names=['slang', 'formal'])
94
+ return pd.Series(df['formal'].values, index=df['slang']).to_dict()
95
+
96
+ ntb_dict = load_dictionary('/content/ntb_dict.json', 'json')
97
+ slang_dict = load_dictionary('/content/kamusalay.csv', 'csv')
98
+
99
+ # === Utility Functions === #
100
+ # Append an update record to the JSON history file
101
+ def update_history_json(history_path, komentar, link, model_data, field, date):
102
+ # Convert a Timestamp to a string
103
+ if isinstance(date, pd.Timestamp):
104
+ date = date.strftime('%Y-%m-%d')
105
+
106
+ # Read the existing history from the JSON file
107
+ try:
108
+ with open(history_path, 'r') as file:
109
+ history_data = json.load(file)
110
+ except (FileNotFoundError, json.JSONDecodeError):
111
+ history_data = {}
112
+
113
+ # History key for a specific comment, link, and model_data combination
114
+ key = f"{komentar}_{link}_{model_data}"
115
+
116
+ # Append the new record under the matching key
117
+ if key in history_data:
118
+ history_data[key].append({field: date})
119
+ else:
120
+ history_data[key] = [{field: date}]
121
+
122
+ # Write the history back to the JSON file
123
+ with open(history_path, 'w') as file:
124
+ json.dump(history_data, file, indent=4)
125
+
126
+ # Update the training dataset with newly classified data
127
+ def update_training_dataset(output, candidate):
128
+ dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
129
+ history_path = f"history_{candidate.lower().replace(' ', '_')}.json"
130
+
131
+ try:
132
+ required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk']
133
+ output = output[required_columns].copy()
134
+
135
+ if 'predicted_category' in output.columns:
136
+ output['sentimen'] = output['predicted_category']
137
+ output.drop(columns=['predicted_category'], inplace=True)
138
+
139
+ output['tanggal_masuk'] = pd.Timestamp.now()
140
+
141
+ if os.path.exists(dataset_path):
142
+ existing_data = pd.read_excel(dataset_path)
143
+ else:
144
+ existing_data = pd.DataFrame(columns=required_columns)
145
+
146
+ # Add the 'update_ba' column if it does not exist yet
147
+ if 'update_ba' not in existing_data.columns:
148
+ existing_data['update_ba'] = None
149
+
150
+ # Add the 'missing_comment' column if it does not exist yet
151
+ if 'missing_comment' not in existing_data.columns:
152
+ existing_data['missing_comment'] = False
153
+
154
+ # Step 1: check for missing comments
155
+ train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict()
156
+ new_comments = output.groupby('link')['komentar'].apply(list).to_dict()
157
+
158
+ for link, comments in train_comments.items():
159
+ if link in new_comments:
160
+ new_comment_set = set(new_comments[link])
161
+ for comment in comments:
162
+ if comment not in new_comment_set:
163
+ existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True
164
+ else:
165
+ existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False
166
+
167
+ # Merge new rows into the existing data and record the history
168
+ def update_data(existing_data, new_data, history_path):
169
+ for index, row in new_data.iterrows():
170
+ komentar = row['komentar']
171
+ link = row['link']
172
+ model_data = row['model_data']
173
+ tanggal_klasifikasi = pd.Timestamp.now()
174
+
175
+ # Case 1: the same comment already exists for this link
176
+ existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)]
177
+ if not existing_entry.empty:
178
+ existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi
179
+ update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
180
+ else:
181
+ # Case 2: same link, different comment
182
+ existing_link_entry = existing_data[(existing_data['link'] == link)]
183
+ if not existing_link_entry.empty:
184
+ new_row = row.copy()
185
+ new_row['tanggal_masuk'] = tanggal_klasifikasi
186
+ new_row['update_ba'] = tanggal_klasifikasi
187
+ existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
188
+ update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
189
+ update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
190
+ else:
191
+ # Case 3: new link
192
+ new_row = row.copy()
193
+ new_row['tanggal_masuk'] = tanggal_klasifikasi
194
+ new_row['update_ba'] = tanggal_klasifikasi
195
+ existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
196
+ update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
197
+ update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
198
+
199
+ # Replace missing update_ba values with tanggal_masuk
200
+ existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
201
+ existing_data['update_ba'].fillna(existing_data['tanggal_masuk'], inplace=True)
202
+ return existing_data
203
+
204
+ updated_data = update_data(existing_data, output, history_path)
205
+ updated_data.to_excel(dataset_path, index=False)
206
+
207
+ st.success(f"Data successfully updated in {candidate}'s training dataset.")
208
+
209
+ if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any():
210
+ st.subheader("Missing Comments")
211
+ st.write("Comments that were found to be missing:")
212
+ st.dataframe(existing_data[existing_data['missing_comment']])
213
+ except KeyError as e:
214
+ st.error(f"Missing column in the dataset: {e}")
215
+ except Exception as e:
216
+ st.error(f"An error occurred: {e}")
217
+
218
+
219
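+ # Preprocess comments, build TF-IDF vectors, cluster them with KMeans,
+ # and add a 2D PCA projection for visualization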
+ def clustering_based_evaluation(df, n_clusters=10):
220
+ st.write("Starting preprocessing...")
221
+ df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis)
222
+ df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode)
223
+ df['reply_handled'] = df['normalized_unicode'].apply(handle_replies)
224
+ df['clean_text'] = df['reply_handled'].apply(clean_text)
225
+ df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
226
+ df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
227
+ df['negation_handled'] = df['translated_slang'].apply(handle_negation)
228
+
229
+ st.write("Generating TF-IDF vectors...")
230
+ tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=list(stop_words)) # use the Indonesian stopword list loaded above
231
+ tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled'])
232
+
233
+ st.write(f"Clustering into {n_clusters} clusters...")
234
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
235
+ df['Cluster'] = kmeans.fit_predict(tfidf_matrix)
236
+
237
+ st.write("Performing PCA for visualization...")
238
+ pca = PCA(n_components=2)
239
+ reduced_data = pca.fit_transform(tfidf_matrix.toarray())
240
+ df['PCA1'] = reduced_data[:, 0]
241
+ df['PCA2'] = reduced_data[:, 1]
242
+
243
+ st.write("Clustering completed successfully!")
244
+ return df
245
+
246
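+ # Load a candidate's training dataset and its JSON history, adding any missing required columns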
+ def load_and_process_data(dataset_path, history_path):
247
+ df = pd.read_excel(dataset_path)
248
+ df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
249
+
250
+ if df['tanggal_masuk'].isnull().any():
251
+ st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.")
252
+ df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True)
253
+
254
+ required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster']
255
+ for col in required_columns:
256
+ if col not in df.columns:
257
+ if col == 'tanggal_masuk':
258
+ df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
259
+ elif col == 'evaluated_by_cluster':
260
+ df[col] = False
261
+ else:
262
+ df[col] = None
263
+
264
+ df = df[required_columns]
265
+
266
+ try:
267
+ with open(history_path, "r") as f:
268
+ history = json.load(f)
269
+ except FileNotFoundError:
270
+ history = []
271
+
272
+ return df, history
273
+
274
+ # Define the function to handle Special Cluster
275
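+ # For each row: try a keyword match first; if nothing matches, fall back to the
+ # detector pipeline (a 'SARCASM' label maps to 'co sarkastic', anything else to 'Unknown')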
+ def handle_special_cluster(data, keywords, detector):
276
+ for index, row in data.iterrows():
277
+ text = row['negation_handled']
278
+
279
+ # Check whether the text contains any of the keywords
280
+ detected = False
281
+ for sentiment, words in keywords.items():
282
+ if any(word in text for word in words):
283
+ data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ')
284
+ data.loc[index, 'detected_by'] = 'keyword'
285
+ detected = True
286
+ break
287
+
288
+ if not detected:
289
+ # If no keyword matches, fall back to the sarcasm detector
290
+ result = detector(text)
291
+ if result[0]['label'] == 'SARCASM':
292
+ data.loc[index, 'predicted_category'] = 'co sarkastic'
293
+ data.loc[index, 'detected_by'] = 'sarcasm'
294
+ else:
295
+ # No matching sentiment found
296
+ data.loc[index, 'predicted_category'] = 'Unknown'
297
+ data.loc[index, 'detected_by'] = 'unknown'
298
+
299
+ return data
300
+
301
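+ # Lowercase, strip non-alphanumeric characters, split into words, and drop Indonesian stopwords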
+ def preprocess_text(text):
302
+ text = text.lower()
303
+ text = re.sub(r'\W+', ' ', text) # Remove non-alphanumeric characters
304
+ words = text.split()
305
+ words = [word for word in words if word not in stop_words]
306
+ return words
307
+
308
+ def display_word_frequencies(words, num_words):
309
+ st.subheader(f"Top {num_words} Words")
310
+ for word, freq in words:
311
+ st.write(f"{word}: {freq}")
312
+
313
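+ # Note: this helper expects a module-level DataFrame named `df` to exist when it is called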
+ def update_sentiment(index, new_sentimen):
314
+ df.loc[index, 'sentimen'] = new_sentimen
315
+ st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}")
316
+
317
+ # Load the `Corrected Comments` sheet from an Excel file
318
+ def load_corrected_comments_from_excel(file_path):
319
+ try:
320
+ return pd.read_excel(file_path, sheet_name='Corrected Comments')
321
+ except FileNotFoundError:
322
+ return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
323
+ except Exception as e:
324
+ print(f"Error loading corrected comments: {e}")
325
+ return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
326
+
327
+ # Save the `Corrected Comments` data to an Excel file
328
+ def save_corrected_comments_to_excel(data, file_path):
329
+ with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
330
+ data.to_excel(writer, sheet_name='Corrected Comments', index=False)
331
+
332
+ # Location of the Excel file that stores the `Corrected Comments` data
333
+ corrected_comments_file = 'corrected_comments.xlsx'
334
+
335
+ # Visualize the distribution of comments across clusters
336
+ def display_cluster_visualization(ba_lainnya_data):
337
+ st.subheader("Cluster Visualization")
338
+
339
+ # Count comments in the `Similar Sentiment` and `Special Cluster` categories
340
+ cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts()
341
+
342
+ # Build the bar chart
343
+ plt.figure(figsize=(10, 6))
344
+ plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
345
+ plt.xlabel('Cluster Name')
346
+ plt.ylabel('Number of Comments')
347
+ plt.title('Distribution of Comments in Similar Sentiment and Special Cluster')
348
+ plt.xticks(rotation=45)
349
+ plt.show()
350
+ st.pyplot(plt)
351
+
352
+ def run_clustering_for_ba_lainnya():
353
+ st.title("Clustering for 'BA Lainnya'")
354
+
355
+ if 'data_with_ba' not in st.session_state:
356
+ st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.")
357
+ st.stop()
358
+
359
+ ba_lainnya_data = st.session_state['data_with_ba']
360
+
361
+ st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows")
362
+
363
+ with open('keywords.json', 'r') as f:
364
+ keyword_dict = json.load(f)
365
+
366
+ selected_candidate = st.session_state['candidate']
367
+ candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_'))
368
+
369
+ if candidate_keywords is None:
370
+ st.error("Keywords for the selected candidate not found.")
371
+ st.stop()
372
+
373
+ sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert')
374
+
375
+ dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
376
+ corrected_comments_file = f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx"
377
+
378
+ try:
379
+ train_data = pd.read_excel(dataset_path)
380
+ labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])].copy() # copy so the column assignments below do not hit a slice warning
381
+
382
+ st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows")
383
+
384
+ ba_lainnya_data['Cluster'] = None
385
+ ba_lainnya_data['detected_by'] = None
386
+ ba_lainnya_data['Cluster_Name'] = None
387
+ ba_lainnya_data['corrected_by'] = "Not Corrected"
388
+ ba_lainnya_data['Sentiment'] = None # initialize to None so no incorrect default value is carried over
389
+
390
+ ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str)
391
+ ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str)
392
+ ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str)
393
+
394
+ for data in [ba_lainnya_data, labeled_data]:
395
+ data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis)
396
+ data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
397
+ data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
398
+ data['clean_text'] = data['reply_handled'].apply(clean_text)
399
+ data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, {}))
400
+ data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, {}))
401
+ data['negation_handled'] = data['translated_slang'].apply(handle_negation)
402
+ data['negation_handled'] = data['negation_handled'].fillna('')
403
+
404
+ combined_data = ba_lainnya_data.copy()
405
+ combined_data['Label'] = 'BA Lainnya'
406
+
407
+ for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
408
+ sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
409
+ sentimen_data['Label'] = sentimen
410
+ combined = pd.concat([combined_data, sentimen_data], ignore_index=True)
411
+
412
+ if len(combined) < 2:
413
+ st.warning(f"Not enough samples to cluster for {sentimen}.")
414
+ continue
415
+
416
+ vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
417
+ tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
418
+
419
+ st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...")
420
+ kmeans = KMeans(n_clusters=2, random_state=42)
421
+ combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
422
+
423
+ valid_indices = combined.index[:len(ba_lainnya_data)]
424
+ valid_indices = valid_indices.intersection(ba_lainnya_data.index)
425
+
426
+ ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster']
427
+ ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar"
428
+ ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar"
429
+ ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen
430
+
431
+ for index, row in ba_lainnya_data.iterrows():
432
+ if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None':
433
+ dissimilar_comment = ba_lainnya_data.loc[[index]].copy()
434
+ for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
435
+ sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
436
+ combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True)
437
+
438
+ if len(combined) < 2:
439
+ continue
440
+
441
+ tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
442
+ if tfidf_matrix.shape[0] == 0:
443
+ continue
444
+
445
+ kmeans = KMeans(n_clusters=2, random_state=42)
446
+ combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
447
+
448
+ if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0:
449
+ ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar"
450
+ ba_lainnya_data.loc[index, 'Sentiment'] = sentimen
451
+ break
452
+ else:
453
+ ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster'
454
+ ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster'
455
+ ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment'
456
+
457
+ ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x)
458
+
459
+ special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster']
460
+ if not special_cluster_data.empty:
461
+ special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector)
462
+ ba_lainnya_data.update(special_cluster_data)
463
+ ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster'
464
+ ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment'
465
+
466
+ st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
467
+
468
+ st.subheader("Detection Distribution in Special Cluster")
469
+ detection_counts = special_cluster_data['detected_by'].value_counts()
470
+ plt.figure(figsize=(10, 6))
471
+ plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red'])
472
+ plt.xlabel('Detection Method')
473
+ plt.ylabel('Number of Comments')
474
+ plt.title('Detection Distribution in Special Cluster')
475
+ plt.show()
476
+ st.pyplot(plt)
477
+
478
+ st.write("Top Keywords in Special Cluster")
479
+ for sentiment, keywords in candidate_keywords.items():
480
+ st.write(f"{sentiment}: {', '.join(keywords)}")
481
+
482
+ st.subheader("Special Cluster Details")
483
+ st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']])
484
+
485
+ corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
486
+ display_cluster_visualization(ba_lainnya_data)
487
+
488
+ st.subheader("Search and Filter Clusters")
489
+ search_term = st.text_input("Enter a keyword to search for in the comments:")
490
+
491
+ if search_term:
492
+ filtered_data = ba_lainnya_data[ba_lainnya_data['komentar'].str.contains(search_term, case=False, na=False)]
493
+ st.write(f"Filtered Data (Showing first 100 rows) for search term '{search_term}':")
494
+ st.dataframe(filtered_data.head(100))
495
+ else:
496
+ st.dataframe(ba_lainnya_data.head(100))
497
+
498
+ st.subheader("Clustered Data")
499
+ selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
500
+ cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster]
501
+ st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100))
502
+
503
+ new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
504
+ if st.button("Update Sentiment for this cluster"):
505
+ ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster'
506
+ ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen
507
+ st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}")
508
+
509
+ # Save and refresh Corrected Comments table and Cluster Visualization
510
+ corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
511
+ corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
512
+ save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
513
+ st.subheader("Corrected Comments")
514
+ st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
515
+ display_cluster_visualization(ba_lainnya_data)
516
+
517
+ st.subheader("Special Rules Based on Keywords")
518
+ keyword = st.text_input("Enter a keyword to set a rule:")
519
+ specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
520
+
521
+ if keyword:
522
+ new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
523
+ if st.button("Apply Rule"):
524
+ ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster
525
+ ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster
526
+ ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule'
527
+ ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster
528
+ st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.")
529
+
530
+ # Update keywords.json file to avoid duplicates
531
+ if selected_candidate.replace(' ', '_') in keyword_dict:
532
+ if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]:
533
+ if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]:
534
+ keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword)
535
+ else:
536
+ keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword]
537
+ else:
538
+ keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]}
539
+
540
+ with open('keywords.json', 'w') as f:
541
+ json.dump(keyword_dict, f)
542
+
543
+ st.success(f"Keyword '{keyword}' has been added to the keyword list.")
544
+
545
+ # Save and refresh Corrected Comments table and Cluster Visualization
546
+ corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
547
+ corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
548
+ save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
549
+ st.subheader("Corrected Comments")
550
+ st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
551
+ display_cluster_visualization(ba_lainnya_data)
552
+
553
+ st.subheader("Corrected Comments")
554
+ corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
555
+ st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
556
+
557
+ st.subheader("Visual Representation of Corrected Comments")
558
+ sentiment_counts = corrected_comments['sentimen'].value_counts()
559
+ plt.figure(figsize=(10, 6))
560
+ plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
561
+ plt.xlabel('Sentimen')
562
+ plt.ylabel('Number of Corrected Comments')
563
+ plt.title('Number of Corrected Comments by Sentiment')
564
+ plt.show()
565
+ st.pyplot(plt)
566
+
567
+ st.subheader("Download Options")
568
+ excel_buffer_cluster = io.BytesIO()
569
+ with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer:
570
+ ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data')
571
+ excel_buffer_cluster.seek(0)
572
+
573
+ st.download_button(
574
+ label=f"Download Clustered Data for {selected_candidate}",
575
+ data=excel_buffer_cluster,
576
+ file_name=f"clustered_data_{selected_candidate}.xlsx",
577
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
578
+ )
579
+
580
+ st.subheader("Save Corrected Comments and Non-'BA Lainnya' Data to Training Dataset")
581
+ if st.button("Save All to Dataset"):
582
+ try:
583
+ # Combine the classification results without the 'BA Lainnya' tag and the corrected comments
584
+ combined_data = pd.concat([st.session_state['data_without_ba'], corrected_comments], ignore_index=True)
585
+ combined_data['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
586
+ update_training_dataset(combined_data, st.session_state['candidate']) # call the dataset-update helper
587
+ st.success("Corrected comments and classified data without 'BA Lainnya' have been saved to the training dataset.")
588
+
589
+ # Clear the session state to prevent duplicate saves
590
+ st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns)
591
+ st.session_state['data_without_ba'] = pd.DataFrame(columns=corrected_comments.columns)
592
+
593
+ # Rerun automatically after saving to dataset
594
+ st.rerun()
595
+ except Exception as e:
596
+ st.error(f"An error occurred while saving the data: {e}")
597
+
598
+ except FileNotFoundError:
599
+ st.error(f"No dataset found for {selected_candidate}. Please add data to create the dataset.")
600
+ except Exception as e:
601
+ st.error(f"An unexpected error occurred: {e}")
602
+
603
+ # === Sidebar Navigation === #
604
+ menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification","Clustering for 'BA Lainnya'", "Update Keywords","View Training Dataset","Evaluate Data Train","Maximize Preprocessing","Retraining Model"])
605
+ if menu == "Model-Based Classification":
606
+ st.title("Model-Based Classification")
607
+ candidate = st.selectbox("Choose a candidate:", candidate_list)
608
+ model_path = f"/content/best_rf_model_{candidate.replace(' ', '_').lower()}.joblib"
609
+ vectorizer_path = f"/content/tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib"
610
+
611
+ # Save the selected candidate to session state
612
+ st.session_state['candidate'] = candidate
613
+
614
+ uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx'])
615
+
616
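+ # Note: st.cache_resource is generally the recommended cache for model objects;
+ # st.cache_data also works here but serializes and copies the returned objects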
+ @st.cache_data
617
+ def load_model_and_vectorizer(model_path, vectorizer_path):
618
+ """Load model and vectorizer, cache them for efficiency."""
619
+ try:
620
+ model = joblib.load(model_path)
621
+ vectorizer = joblib.load(vectorizer_path)
622
+ return model, vectorizer
623
+ except FileNotFoundError:
624
+ return None, None
625
+
626
+ model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)
627
+
628
+ if model is None or vectorizer is None:
629
+ st.error("Model or vectorizer not found for the selected candidate.")
630
+ st.stop()
631
+
632
+ # Save the vectorizer and model to session state
633
+ st.session_state['vectorizer'] = vectorizer
634
+ st.session_state['model'] = model
635
+
636
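+ # Comments are preprocessed in 10,000-row batches in parallel via joblib.Parallel,
+ # and the combined result is cached by st.cache_data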
+ @st.cache_data
637
+ def preprocess_data(data):
638
+ """Preprocess comments with batching for large datasets."""
639
+ from joblib import Parallel, delayed
640
+
641
+ def preprocess_batch(batch):
642
+ batch['translated_emojis'] = batch['komentar'].apply(translate_emojis)
643
+ batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode)
644
+ batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies)
645
+ batch['clean_text'] = batch['reply_handled'].apply(clean_text)
646
+ batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
647
+ batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
648
+ batch['negation_handled'] = batch['translated_slang'].apply(handle_negation)
649
+ return batch
650
+
651
+ batch_size = 10000 # Process 10,000 rows at a time
652
+ batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]
653
+ processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches)
654
+ return pd.concat(processed_batches, ignore_index=True)
655
+
656
+ # Process uploaded file
657
+ if uploaded_file:
658
+ try:
659
+ data = pd.read_excel(uploaded_file)
660
+ if 'komentar' not in data.columns:
661
+ st.error("The uploaded file must include a 'komentar' column.")
662
+ st.stop()
663
+ data = preprocess_data(data)
664
+ except Exception as e:
665
+ st.error(f"An error occurred while processing the file: {e}")
666
+ st.stop()
667
+ elif 'model_classified_data' in st.session_state:
668
+ data = st.session_state['model_classified_data']
669
+ else:
670
+ st.info("Please upload a file for classification.")
671
+ st.stop()
672
+
673
+ # Transform comments into TF-IDF vectors
674
+ try:
675
+ tfidf_data = vectorizer.transform(data['negation_handled'].fillna(''))
676
+ data['predicted_category'] = model.predict(tfidf_data)
677
+ data['probabilities'] = model.predict_proba(tfidf_data).tolist()
678
+ data['max_probability'] = data['probabilities'].apply(lambda x: max(x))
679
+ except Exception as e:
680
+ st.error(f"An error occurred during model prediction: {e}")
681
+ st.stop()
682
+
683
+ # Cache classified data
684
+ st.session_state['model_classified_data'] = data
685
+
686
+ # Interactive threshold adjustment
687
+ st.subheader("Set Threshold for 'BA Lainnya'")
688
+ threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01)
689
+
690
+ # Apply threshold to tag "BA Lainnya"
691
+ data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '')
692
+
693
+ # Separate data for visualization
694
+ data_without_ba = data[data['tag'] != 'BA Lainnya']
695
+ data_with_ba = data[data['tag'] == 'BA Lainnya']
696
+
697
+ # Save updated results to session state for dynamic updates
698
+ st.session_state['data_without_ba'] = data_without_ba
699
+ st.session_state['data_with_ba'] = data_with_ba
700
+
701
+ # Preview Results
702
+ st.subheader("Preview Results")
703
+
704
+ st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'")
705
+ if not data_without_ba.empty:
706
+ st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']])
707
+ else:
708
+ st.info("No high-probability classifications available.")
709
+
710
+ st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'")
711
+ if not data_with_ba.empty:
712
+ st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']])
713
+ else:
714
+ st.info("No low-probability classifications available.")
715
+
716
+ # Visualization: Sentiment Distribution
717
+ st.subheader("Sentiment Distribution Visualization")
718
+
719
+ def plot_distribution(data, title):
720
+ sentiment_counts = data['predicted_category'].value_counts()
721
+ fig, ax = plt.subplots()
722
+ ax.bar(sentiment_counts.index, sentiment_counts.values)
723
+ ax.set_title(title)
724
+ ax.set_xlabel("Sentiments")
725
+ ax.set_ylabel("Count")
726
+ st.pyplot(fig)
727
+
728
+ if not data_without_ba.empty:
729
+ plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')")
730
+ if not data_with_ba.empty:
731
+ plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')")
732
+
733
+ # Download Results
734
+ st.subheader("Download Results")
735
+ excel_buffer = io.BytesIO()
736
+ with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
737
+ data.to_excel(writer, index=False, sheet_name='Classification Results')
738
+ excel_buffer.seek(0)
739
+
740
+ st.download_button(
741
+ label="Download All Classification Results",
742
+ data=excel_buffer,
743
+ file_name=f"classification_results_{candidate}.xlsx",
744
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
745
+ )
746
+
747
+ # Save Non-"BA Lainnya" Data to Training Dataset
748
+ if not data_with_ba.empty:
749
+ st.warning("There are comments with 'BA Lainnya' tagging. Please proceed to 'Clustering for BA Lainnya'.")
750
+ elif not data_without_ba.empty:
751
+ st.subheader("Save Classified Data")
752
+ if st.button("Save Non-'BA Lainnya' Data to Training Dataset"):
753
+ try:
754
+ data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy()
755
+ data_to_save['sentimen'] = data_without_ba['predicted_category']
756
+ data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
757
+ update_training_dataset(data_to_save, candidate)
758
+ st.success("Data successfully saved to the training dataset.")
759
+ except Exception as e:
760
+ st.error(f"An error occurred while saving the data: {e}")
761
+ else:
762
+ st.info("No Non-'BA Lainnya' data available to save.")
763
+ pass # Placeholder
764
+
765
+ # Wire the function into the "Clustering for 'BA Lainnya'" page
766
+ if menu == "Clustering for 'BA Lainnya'":
767
+ selected_candidate = st.session_state.get('candidate') # the classification page stores the choice under 'candidate'
768
+ run_clustering_for_ba_lainnya()
769
+ pass # Placeholder
770
+
771
+ # Run this block only when the selected menu is "View Training Dataset"
772
+ if menu == "View Training Dataset":
773
+ st.title("View Training Dataset")
774
+
775
+ # Candidate selection header
776
+ st.header("Options")
777
+ selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view')
778
+
779
+ # Dataset paths
780
+ dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
781
+ history_path = f"history_{selected_candidate.lower().replace(' ', '_')}.json"
782
+
783
+ # Load the dataset
784
+ try:
785
+ df = pd.read_excel(dataset_path)
786
+
787
+ # Make sure the required columns exist
788
+ required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train']
789
+ for col in required_columns:
790
+ if col not in df.columns:
791
+ if col == 'evaluated_by_data_train':
792
+ df[col] = False
793
+
794
+ # Add the 'update_ba' column if it does not exist yet
795
+ if 'update_ba' not in df.columns:
796
+ df['update_ba'] = None
797
+
798
+ # Add the 'missing_comment' column if it does not exist yet
799
+ if 'missing_comment' not in df.columns:
800
+ df['missing_comment'] = False
801
+
802
+ # Ensure 'tanggal_masuk' and 'tanggal' are in proper datetime format
803
+ df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
804
+ df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')
805
+
806
+ # Keep only the date part of the date columns (drop the time)
807
+ df['tanggal'] = df['tanggal'].dt.date
808
+ df['tanggal_masuk'] = df['tanggal_masuk'].dt.date
809
+ df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date
810
+
811
+ # Handle NaT (Not a Time) values if present
812
+ if df['tanggal_masuk'].isnull().any():
813
+ st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.")
814
+ df['tanggal_masuk'].fillna(pd.Timestamp.now().date(), inplace=True)
815
+
816
+ if df['tanggal'].isnull().any():
817
+ st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.")
818
+ df['tanggal'].fillna(pd.Timestamp.now().date(), inplace=True)
819
+
820
+ # Add and fill the 'kandidat' column if it does not exist yet
821
+ if 'kandidat' not in df.columns:
822
+ df['kandidat'] = selected_candidate
823
+
824
+ # Take the subset of required columns
825
+ df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']]
826
+
827
+ # Replace missing update_ba values with tanggal_masuk
828
+ df['update_ba'].fillna(df['tanggal_masuk'], inplace=True)
829
+
830
+ # Show basic statistics
831
+ st.subheader(f"Training Dataset for {selected_candidate}")
832
+ st.write(f"**Total rows in dataset:** {len(df)}")
833
+
834
+ if not df.empty:
835
+ # Visualize the distribution of BA updates
836
+ st.subheader("Visualisasi Postingan Berdasarkan Update BA")
837
+ ba_update_counts = df['update_ba'].value_counts().sort_index()
838
+ fig, ax = plt.subplots(figsize=(10, 6))
839
+ ba_update_counts.plot(kind='bar', ax=ax, color='blue')
840
+ ax.set_title('Sebaran Postingan Berdasarkan Update BA')
841
+ ax.set_xlabel('Tanggal Update BA')
842
+ ax.set_ylabel('Jumlah Postingan')
843
+ plt.xticks(rotation=45)
844
+ plt.tight_layout()
845
+ st.pyplot(fig)
846
+
847
+ # Additional visualization: platform distribution
848
+ st.subheader("Sebaran Platform Berdasarkan Update BA")
849
+ platform_counts = df['Platform'].value_counts()
850
+ fig, ax = plt.subplots(figsize=(10, 6))
851
+ platform_counts.plot(kind='bar', ax=ax, color='green')
852
+ ax.set_title('Sebaran Platform Berdasarkan Update BA')
853
+ ax.set_xlabel('Platform')
854
+ ax.set_ylabel('Jumlah Postingan')
855
+ plt.xticks(rotation=45)
856
+ plt.tight_layout()
857
+ st.pyplot(fig)
858
+
859
+ # Visualize the number of missing comments per platform
860
+ st.subheader("Jumlah Komentar Hilang Berdasarkan Platform")
861
+ missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index()
862
+ fig, ax = plt.subplots(figsize=(10, 6))
863
+ missing_comments_by_platform.plot(kind='bar', ax=ax, color='red')
864
+ ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform')
865
+ ax.set_xlabel('Platform')
866
+ ax.set_ylabel('Jumlah Komentar Hilang')
867
+ plt.xticks(rotation=45)
868
+ plt.tight_layout()
869
+ st.pyplot(fig)
870
+
871
+ # Filter by validation status
872
+ st.subheader("Filter Data")
873
+ validation_filter = st.radio(
874
+ "Choose data type to view:",
875
+ ["All Data", "Validated Data", "Non-Validated Data"],
876
+ key='validation_filter'
877
+ )
878
+
879
+ if validation_filter == "Validated Data":
880
+ filtered_data = df[df['evaluated_by_data_train'] == True]
881
+ elif validation_filter == "Non-Validated Data":
882
+ filtered_data = df[df['evaluated_by_data_train'] == False]
883
+ else:
884
+ filtered_data = df
885
+
886
+ if not filtered_data.empty:
887
+ st.subheader(f"Filtered Data: {validation_filter}")
888
+ st.dataframe(filtered_data) # show all rows matching the filter
889
+ else:
890
+ st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.")
891
+
892
+ # Show the history of data additions
893
+ st.subheader("History of Data Additions")
894
+ try:
895
+ with open(history_path, "r") as f:
896
+ history = json.load(f)
897
+
898
+ history_list = []
899
+ for key, value in history.items():
900
+ for entry in value:
901
+ for k, v in entry.items():
902
+ history_list.append({
903
+ 'key': key,
904
+ 'field': k,
905
+ 'date': v
906
+ })
907
+
908
+ history_df = pd.DataFrame(history_list)
909
+ st.dataframe(history_df)
910
+ except FileNotFoundError:
911
+ st.write("No addition history available.")
912
+ except ValueError as e:
913
+ st.error(f"An error occurred while loading history data: {e}")
914
+
915
+ # Option to download the filtered dataset
916
+ st.subheader("Download Options")
917
+ if not filtered_data.empty:
918
+ excel_buffer = io.BytesIO()
919
+ with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
920
+ filtered_data.to_excel(writer, index=False, sheet_name='Filtered Dataset')
921
+ excel_buffer.seek(0)
922
+
923
+ st.download_button(
924
+ label=f"Download Filtered Dataset for {selected_candidate}",
925
+ data=excel_buffer,
926
+ file_name=f"filtered_training_dataset_{selected_candidate}.xlsx",
927
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
928
+ )
929
+
930
+ # Option to download the full dataset
931
+ if not df.empty:
932
+ excel_buffer_full = io.BytesIO()
933
+ with pd.ExcelWriter(excel_buffer_full, engine='xlsxwriter') as writer:
934
+ df.to_excel(writer, index=False, sheet_name='Training Dataset')
935
+ excel_buffer_full.seek(0)
936
+
937
+ st.download_button(
938
+ label=f"Download Full Training Dataset for {selected_candidate}",
939
+ data=excel_buffer_full,
940
+ file_name=f"training_dataset_{selected_candidate}.xlsx",
941
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
942
+ )
943
+
944
+ # Download data aggregated by date range
945
+ st.subheader("Download Aggregated Data by Date")
946
+
947
+ # Get the minimum and maximum dates as defaults for date_input
948
+ if not df['tanggal'].empty:
949
+ min_date = df['tanggal'].min()
950
+ max_date = df['tanggal'].max()
951
+
952
+ selected_start_date = st.date_input("Select start date for aggregation:", value=min_date)
953
+ selected_end_date = st.date_input("Select end date for aggregation:", value=max_date)
954
+
955
+ agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)]
956
+
957
+ if not agg_filtered_data.empty:
958
+ # Make sure the 'kandidat' column exists and is filled
959
+ agg_filtered_data['kandidat'] = selected_candidate
960
+
961
+ aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg(
962
+ co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()),
963
+ co_support=('sentimen', lambda x: (x == 'Co Support').sum()),
964
+ co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()),
965
+ co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()),
966
+ co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()),
967
+ missing_comments=('missing_comment', 'sum') # aggregate the missing_comment counts
968
+ ).reset_index()
969
+
970
+ # Add derived columns for comment counts
971
+ aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism']
972
+ aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative']
973
+ aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1)
974
+
975
+ st.dataframe(aggregated_data)
976
+
977
+ # Visualize BA-updated posts by platform within the selected date range
978
+ st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal")
979
+ ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate']
980
+ if not ba_update_range.empty:
981
+ plt.figure(figsize=(10, 6))
982
+ ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)')
983
+ plt.xlabel('Platform')
984
+ plt.ylabel('Jumlah Postingan')
985
+ st.pyplot(plt)
986
+
987
+ # Additional visualization: posts by date
988
+ st.subheader("Sebaran Postingan Berdasarkan Tanggal")
989
+ plt.figure(figsize=(10, 6))
990
+ ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal')
991
+ plt.xlabel('Tanggal')
992
+ plt.ylabel('Jumlah Postingan')
993
+ plt.xticks(rotation=45)
994
+ plt.tight_layout()
995
+ st.pyplot(plt)
996
+
997
+ # Additional visualization: BA update distribution
998
+ st.subheader("Sebaran Update BA")
999
+ plt.figure(figsize=(10, 6))
1000
+ ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA')
1001
+ plt.xlabel('Tanggal Update BA')
1002
+ plt.ylabel('Jumlah Postingan')
1003
+ plt.xticks(rotation=45)
1004
+ plt.tight_layout()
1005
+ st.pyplot(plt)
1006
+
1007
+ # Visualize comment counts by tanggal_masuk (entry date)
1008
+ st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk")
1009
+ plt.figure(figsize=(10, 6))
1010
+ agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk')
1011
+ plt.xlabel('Tanggal Masuk')
1012
+ plt.ylabel('Jumlah Komentar')
1013
+ plt.xticks(rotation=45)
1014
+ plt.tight_layout()
1015
+ st.pyplot(plt)
1016
+
1017
+ # Visualize the comment distribution per platform
1018
+ st.subheader("Sebaran Komentar di Tiap Platform")
1019
+ plt.figure(figsize=(10, 6))
1020
+ agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform')
1021
+ plt.xlabel('Platform')
1022
+ plt.ylabel('Jumlah Komentar')
1023
+ plt.xticks(rotation=45)
1024
+ plt.tight_layout()
1025
+ st.pyplot(plt)
1026
+
1027
+ # Visualize the number of missing comments per post
1028
+ st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan")
1029
+ plt.figure(figsize=(10, 6))
1030
+ aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan')
1031
+ plt.xlabel('Link')
1032
+ plt.ylabel('Jumlah Komentar Hilang')
1033
+ plt.xticks(rotation=45)
1034
+ plt.tight_layout()
1035
+ st.pyplot(plt)
1036
+ else:
1037
+ st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.")
1038
+
1039
+ # Option to download the aggregated dataset
1040
+ excel_buffer_aggregated = io.BytesIO()
1041
+ with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer:
1042
+ aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data')
1043
+ excel_buffer_aggregated.seek(0)
1044
+
1045
+ st.download_button(
1046
+ label=f"Download Aggregated Data by Date for {selected_candidate}",
1047
+ data=excel_buffer_aggregated,
1048
+ file_name=f"aggregated_data_{selected_candidate}.xlsx",
1049
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
1050
+ )
1051
+ else:
1052
+ st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
1053
+ else:
1054
+ st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
1055
+ else:
1056
+ st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
1057
+ except FileNotFoundError:
1058
+ st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
1059
+ except Exception as e:
1060
+ st.error(f"An error occurred: {e}")
1061
+ pass
1062
+
1063
+ if menu == "Evaluate Data Train":
1064
+ st.title("Evaluate Data Train")
1065
+
1066
+ selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate')
1067
+ dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
1068
+
1069
+ try:
1070
+ df = pd.read_excel(dataset_path)
1071
+
1072
+ # Load existing keyword dictionary
1073
+ try:
1074
+ with open('keywords.json', 'r') as f:
1075
+ keyword_dict = json.load(f)
1076
+ st.success("keywords.json loaded successfully.")
1077
+ except FileNotFoundError:
1078
+ st.error("keywords.json file not found. Please ensure the file is in the correct directory.")
1079
+ st.stop()
1080
+ except json.JSONDecodeError:
1081
+ st.error("keywords.json file is not a valid JSON. Please check the file format.")
1082
+ st.stop()
1083
+
1084
+ # Select candidate-specific keywords
1085
+ candidate_key = selected_candidate.replace(' ', '_')
1086
+ candidate_keywords = keyword_dict.get(candidate_key)
1087
+
1088
+ if not candidate_keywords:
1089
+ st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.")
1090
+ st.stop()
1091
+
1092
+ keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords]
1093
+
1094
+ # Data consistency validation
1095
+ st.subheader("Data Consistency Validation")
1096
+ missing_values = df.isnull().sum()
1097
+ st.write("Missing values in each column:")
1098
+ st.write(missing_values)
1099
+
1100
+ # Drop rows with missing values
1101
+ st.write("Removing rows with missing values...")
1102
+ df.dropna(inplace=True)
1103
+
1104
+ # Sentiment distribution
1105
+ st.subheader("Sentiment Distribution")
1106
+ sentiment_counts = df['sentimen'].value_counts()
1107
+ st.write("Number of comments for each sentiment:")
1108
+ st.write(sentiment_counts)
1109
+
1110
+ # Data quality check
1111
+ st.subheader("Data Quality Check")
1112
+ invalid_entries = df[df['komentar'].str.len() == 0]
1113
+ st.write(f"Number of invalid comments (empty): {len(invalid_entries)}")
1114
+ if len(invalid_entries) > 0:
1115
+ st.write("Invalid comments (empty):")
1116
+ st.dataframe(invalid_entries.head(100)) # preview the first 100 rows
1117
+
1118
+ # Detect inconsistent sentiment labels
1119
+ st.subheader("Inconsistent Sentiment Labels")
1120
+ duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)]
1121
+ inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique()
1122
+ inconsistent_labels = inconsistent_labels[inconsistent_labels > 1]
1123
+ if not inconsistent_labels.empty:
1124
+ inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)]
1125
+ st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}")
1126
+ st.dataframe(inconsistent_labels_df.head(100)) # preview the first 100 rows
1127
+ else:
1128
+ st.write("No comments with inconsistent sentiment labels found.")
1129
+
1130
+ # Problem handling
1131
+ st.subheader("Problem Handling")
1132
+
1133
+ # Remove empty comments
1134
+ st.write("Removing invalid (empty) comments...")
1135
+ df = df[df['komentar'].str.len() > 0]
1136
+
1137
+ # Interactive: resolve comments with inconsistent sentiment labels
1138
+ st.write("Resolving inconsistent sentiment labels...")
1139
+ if not inconsistent_labels.empty:
1140
+ for index, row in inconsistent_labels_df.iterrows():
1141
+ st.write(f"Comment: {row['komentar']}")
1142
+ sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist()
1143
+ new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}')
1144
+ if st.button("Update Sentiment", key=f'update_{index}'):
1145
+ update_sentiment(index, new_sentimen)
1146
+
1147
+ # Clustering using keywords and the sarcasm model
1148
+ st.write("Clustering comments using keywords and sarcasm model...")
1149
+ keyword_vectorizer = TfidfVectorizer(vocabulary=keywords)
1150
+ X_keywords = keyword_vectorizer.fit_transform(df['komentar'])
1151
+ kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords)
1152
+ df['cluster'] = kmeans.labels_
1153
+
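+ # Illustrative sketch (not part of the original app): n_clusters is fixed at 10 above; if that
+ # choice needs a sanity check, a silhouette sweep over a few values of k is one common option.
+ # from sklearn.metrics import silhouette_score
+ # for k in range(2, 11):
+ #     trial_labels = KMeans(n_clusters=k, random_state=0).fit_predict(X_keywords)
+ #     st.write(f"k={k}: silhouette={silhouette_score(X_keywords, trial_labels):.3f}")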
1154
+ # Identify clusters that need review
1155
+ cluster_sizes = df['cluster'].value_counts()
+ review_clusters = df[df['cluster'].isin(cluster_sizes[cluster_sizes > 10].index)]
1156
+ st.write("Clusters identified for review:")
1157
+ st.dataframe(review_clusters.head(100))  # Preview the first 100 rows
1158
+
1159
+ # Prompt the user to add new keywords
1160
+ st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
1161
+
1162
+ # Cluster visualization
1163
+ cluster_counts = df['cluster'].value_counts()
1164
+ st.write("Number of comments in each cluster:")
1165
+ st.write(cluster_counts)
1166
+
1167
+ # Display clustering results
1168
+ st.write("Comments clustered by patterns:")
1169
+ st.dataframe(df.head(100))  # Preview the first 100 rows
1170
+
1171
+ # Export the analysis and handling results
1172
+ st.subheader("Export Final Data")
1173
+ json_buffer = io.BytesIO()
1174
+ df.to_json(json_buffer, orient='records', lines=True)
1175
+ json_buffer.seek(0)
1176
+ st.download_button(
1177
+ label=f"Download Final Data for {selected_candidate}",
1178
+ data=json_buffer,
1179
+ file_name=f"final_data_{selected_candidate}.json",
1180
+ mime="application/json"
1181
+ )
1182
+
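+ # Usage note (illustrative): the export above is JSON Lines, so it can be reloaded later with
+ # pd.read_json(path, orient='records', lines=True).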
1183
+ except FileNotFoundError:
1184
+ st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
1185
+ except Exception as e:
1186
+ st.error(f"An error occurred: {e}")
1187
+
1188
+ pass # Placeholder
1189
+
1190
+ if menu == "Retraining Model":
1191
+ st.title("Retrain Model")
1192
+ selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))
1193
+
1194
+ dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
1195
+ model_path = f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib"
1196
+ vectorizer_path = f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib"
1197
+ retrain_history_path = f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json"
1198
+
1199
+ try:
1200
+ # Load dataset
1201
+ data = pd.read_excel(dataset_path)
1202
+
1203
+ # Ensure 'komentar' is string
1204
+ if 'komentar' not in data.columns:
1205
+ st.error("Dataset must include a 'komentar' column.")
1206
+ st.stop()
1207
+ else:
1208
+ # Convert all comments to string
1209
+ data['komentar'] = data['komentar'].fillna('').astype(str)
1210
+
1211
+ # Separate validated and unvalidated data
1212
+ if 'evaluated_by_cluster' in data.columns:
1213
+ validated_data = data[data['evaluated_by_cluster'] == True]
1214
+ unvalidated_data = data[data['evaluated_by_cluster'] == False]
1215
+ else:
1216
+ validated_data = pd.DataFrame(columns=data.columns)
1217
+ unvalidated_data = data
1218
+
1219
+ st.write(f"**Validated Data:** {len(validated_data)} rows")
1220
+ st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")
1221
+
1222
+ # Check if all data is validated
1223
+ if len(unvalidated_data) > 0:
1224
+ st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
1225
+ st.stop()
1226
+
1227
+ # Combine all data for preprocessing
1228
+ combined_data = validated_data # Only use validated data
1229
+
1230
+ # Preprocessing Function
1231
+ @st.cache_data(show_spinner=True)
1232
+ def preprocess_data(data):
1233
+ from joblib import Parallel, delayed
1234
+
1235
+ def preprocess_comment(comment):
1236
+ comment = translate_emojis(comment)
1237
+ comment = normalize_unicode(comment)
1238
+ comment = handle_replies(comment)
1239
+ comment = clean_text(comment)
1240
+ comment = translate_text(comment, ntb_dict)
1241
+ comment = translate_text(comment, slang_dict)
1242
+ comment = handle_negation(comment)
1243
+ return comment
1244
+
1245
+ data['processed_comments'] = Parallel(n_jobs=-1)(
1246
+ delayed(preprocess_comment)(c) for c in data['komentar']
1247
+ )
1248
+ return data
1249
+
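+ # Note: Parallel(n_jobs=-1) above fans the per-comment preprocessing out across all CPU cores,
+ # and st.cache_data caches the result so Streamlit reruns on the same dataset skip this step.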
1250
+ # Preprocessing
1251
+ st.write("Starting preprocessing...")
1252
+ combined_data = preprocess_data(combined_data)
1253
+
1254
+ if st.button("Retrain Model"):
1255
+ # Vectorization
1256
+ st.write("Vectorizing data...")
1257
+ vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
1258
+ X = vectorizer.fit_transform(combined_data['processed_comments'])
1259
+ y = combined_data['sentimen']
1260
+
1261
+ # Split Data
1262
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
1263
+
1264
+ # Handle Class Imbalance with SMOTE
1265
+ st.write("Balancing data with SMOTE...")
1266
+ smote = SMOTE(random_state=42, n_jobs=-1)
1267
+ X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
1268
+
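+ # Optional check (illustrative, not in the original flow): compare class counts before and
+ # after resampling to confirm SMOTE actually balanced the minority sentiment classes.
+ # st.write("Class counts before SMOTE:", pd.Series(y_train).value_counts().to_dict())
+ # st.write("Class counts after SMOTE:", pd.Series(y_train_res).value_counts().to_dict())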
1269
+ # Train Random Forest Model
1270
+ st.write("Training Random Forest model...")
1271
+ rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
1272
+ rf_model.fit(X_train_res, y_train_res)
1273
+
1274
+ # Evaluate on Training Data
1275
+ st.write("Evaluating model...")
1276
+ y_pred_train = rf_model.predict(X_train)
1277
+ accuracy_train = accuracy_score(y_train, y_pred_train)
1278
+ report_train = classification_report(y_train, y_pred_train, output_dict=True)
1279
+
1280
+ # Evaluate on Test Data
1281
+ y_pred_test = rf_model.predict(X_test)
1282
+ accuracy_test = accuracy_score(y_test, y_pred_test)
1283
+ report_test = classification_report(y_test, y_pred_test, output_dict=True)
1284
+
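+ # Hypothetical extension (assumption, not in the original code): the hyperparameters above are
+ # fixed; GridSearchCV, already imported at the top of this file, could be used to tune them.
+ # param_grid = {"n_estimators": [100, 200], "max_depth": [10, 20, None]}
+ # search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring="f1_weighted")
+ # search.fit(X_train_res, y_train_res)
+ # rf_model = search.best_estimator_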
1285
+ # Save Model and Vectorizer
1286
+ st.write("Saving model and vectorizer...")
1287
+ joblib.dump(rf_model, model_path)
1288
+ joblib.dump(vectorizer, vectorizer_path)
1289
+
1290
+ # Log Retraining History
1291
+ st.write("Logging retraining history...")
1292
+ try:
1293
+ with open(retrain_history_path, "r") as f:
1294
+ retrain_history = json.load(f)
1295
+ except FileNotFoundError:
1296
+ retrain_history = []
1297
+
1298
+ retrain_history.append({
1299
+ "date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
1300
+ "accuracy_on_train": accuracy_train,
1301
+ "accuracy_on_test": accuracy_test,
1302
+ "f1_score_on_train": report_train['weighted avg']['f1-score'],
1303
+ "f1_score_on_test": report_test['weighted avg']['f1-score'],
1304
+ })
1305
+
1306
+ with open(retrain_history_path, "w") as f:
1307
+ json.dump(retrain_history, f, indent=4)
1308
+
1309
+ # Display Results
1310
+ st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
1311
+ st.subheader("Model Metrics on Training Data")
1312
+ st.table(pd.DataFrame(report_train).T)
1313
+ st.subheader("Model Metrics on Test Data")
1314
+ st.table(pd.DataFrame(report_test).T)
1315
+
1316
+ # Show Retrain History
1317
+ st.subheader("Retrain History")
1318
+ st.json(retrain_history)
1319
+
1320
+ except FileNotFoundError:
1321
+ st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
1322
+ except Exception as e:
1323
+ st.error(f"An unexpected error occurred: {e}")
1324
+
1325
+ pass # Placeholder
1326
+
1327
+ if menu == "Maximize Preprocessing":
1328
+ st.title("Maximize Preprocessing")
1329
+
1330
+ # Load Dataset Train
1331
+ candidate = st.selectbox("Choose a candidate:", list(candidate_list))
1332
+ dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
1333
+ try:
1334
+ # Load dataset
1335
+ data = pd.read_excel(dataset_path)
1336
+
1337
+ # Ensure 'komentar' is string
1338
+ if 'komentar' not in data.columns:
1339
+ st.error("Dataset must include a 'komentar' column.")
1340
+ st.stop()
1341
+ data['komentar'] = data['komentar'].fillna('').astype(str)
1342
+
1343
+ # Preprocessing Steps
1344
+ @st.cache_data(show_spinner=True)
1345
+ def preprocess_data(data):
1346
+ st.write("Starting preprocessing...")
1347
+ data['translated_emojis'] = data['komentar'].apply(translate_emojis)
1348
+ data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
1349
+ data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
1350
+ data['clean_text'] = data['reply_handled'].apply(clean_text)
1351
+ data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
1352
+ data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
1353
+ data['negation_handled'] = data['translated_slang'].apply(handle_negation)
1354
+ return data
1355
+
1356
+ data = preprocess_data(data)
1357
+
1358
+ # Check Unmapped Words
1359
+ st.subheader("Check for Unmapped Words")
1360
+ all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split())
1361
+ unique_words = set(all_words)
1362
+ ntb_dict_keys = set(ntb_dict.keys())
1363
+ slang_dict_keys = set(slang_dict.keys())
1364
+ mapped_words = ntb_dict_keys.union(slang_dict_keys)
1365
+ unmapped_words = sorted(unique_words - mapped_words)
1366
+
1367
+ if unmapped_words:
1368
+ st.write(f"Found **{len(unmapped_words)} unmapped words.**")
1369
+
1370
+ # Let the user choose how many words to display
1371
+ max_words = st.slider(
1372
+ "Select number of words to display:",
1373
+ min_value=10,
1374
+ max_value=len(unmapped_words),
1375
+ value=min(50, len(unmapped_words)),
1376
+ step=10,
1377
+ )
1378
+
1379
+ # Build a DataFrame for table display
1380
+ unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"])
1381
+ st.dataframe(unmapped_df.head(max_words))
1382
+
1383
+ # Show how many of the total unmapped words are displayed
1384
+ st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.")
1385
+ else:
1386
+ st.success("No unmapped words found!")
1387
+
1388
+ # Add Words to Dictionary
1389
+ st.subheader("Add New Words to Dictionary")
1390
+ new_word = st.text_input("Enter new word:")
1391
+ normalized_word = st.text_input("Enter normalized form:")
1392
+ dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"])
1393
+
1394
+ if st.button("Add to Dictionary"):
1395
+ if new_word and normalized_word:
1396
+ if dictionary_choice == "Kamus Alay":
1397
+ slang_dict[new_word.lower()] = normalized_word
1398
+ st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.")
1399
+ elif dictionary_choice == "Kamus ntb":
1400
+ ntb_dict[new_word.lower()] = normalized_word
1401
+ st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.")
1402
+ else:
1403
+ st.warning("Please enter both the new word and its normalized form.")
1404
+
1405
+ # Save Updates to File
1406
+ st.subheader("Save Updated Dictionaries")
1407
+ if st.button("Save Kamus Alay"):
1408
+ kamus_alay_path = '/content/kamusalay.csv' # Adjust the path as needed
1409
+ pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]).to_csv(kamus_alay_path, index=False)
1410
+ st.success(f"Kamus Alay saved successfully to {kamus_alay_path}.")
1411
+
1412
+ if st.button("Save Kamus ntb"):
1413
+ kamus_ntb_path = '/content/ntb_dict.json' # Adjust the path as needed
1414
+ with open(kamus_ntb_path, 'w', encoding='utf-8') as f:
1415
+ json.dump(ntb_dict, f, indent=4)
1416
+ st.success(f"Kamus ntb saved successfully to {kamus_ntb_path}.")
1417
+ except FileNotFoundError:
1418
+ st.error(f"No training dataset found for {candidate}. Please ensure the dataset is available.")
1419
+ except Exception as e:
1420
+ st.error(f"An unexpected error occurred: {e}")
1421
+
1422
+ pass # Placeholder
1423
+
1424
+ if menu == "Update Keywords":
1425
+ st.title("Update Keywords")
1426
+
1427
+ # Load existing keyword dictionary
1428
+ with open('keywords.json', 'r') as f:
1429
+ keyword_dict = json.load(f)
1430
+
1431
+ # Show current keywords
1432
+ st.subheader("Current Keywords")
1433
+ candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
1434
+ for sentiment, keywords in keyword_dict[candidate].items():
1435
+ st.write(f"{sentiment}: {', '.join(keywords)}")
1436
+
1437
+ # Add new keyword
1438
+ st.subheader("Add New Keyword")
1439
+ new_keyword = st.text_input("Enter new keyword")
1440
+ selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))
1441
+
1442
+ if st.button("Add Keyword"):
1443
+ if new_keyword and selected_sentiment:
1444
+ keyword_dict[candidate][selected_sentiment].append(new_keyword)
1445
+ with open('keywords.json', 'w') as f:
1446
+ json.dump(keyword_dict, f, indent=4)
1447
+ st.success(f"Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
1448
+ else:
1449
+ st.error("Please enter a keyword and select a sentiment")
1450
+
1451
+ # Analyze Special Cluster
1452
+ st.subheader("Analyze Special Cluster")
1453
+ if 'ba_lainnya_data' in st.session_state:
1454
+ try:
1455
+ # Load the `Special Cluster` data directly
1456
+ special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
1457
+ if special_cluster_data.empty:
1458
+ st.warning("No data found in Special Cluster.")
1459
+ else:
1460
+ st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")
1461
+
1462
+ all_words_special = []
1463
+ for comment in special_cluster_data['negation_handled']:
1464
+ comment = translate_emojis(comment)
1465
+ comment = normalize_unicode(comment)
1466
+ comment = handle_replies(comment)
1467
+ comment = clean_text(comment)
1468
+ comment = translate_text(comment, {})  # NOTE: an empty dict makes this a no-op; pass ntb_dict / slang_dict here as in the other preprocessing paths
1469
+ comment = handle_negation(comment)
1470
+ words = preprocess_text(comment)
1471
+ all_words_special.extend(words)
1472
+
1473
+ # Calculate word frequencies
1474
+ word_freq_special = Counter(all_words_special)
1475
+
1476
+ # Add slider to select number of words to display
1477
+ num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
1478
+ most_common_words_special = word_freq_special.most_common(num_words_special)
1479
+
1480
+ # Display word frequencies as a table
1481
+ st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
1482
+ word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
1483
+ st.dataframe(word_freq_df_special)
1484
+
1485
+ except Exception as e:
1486
+ st.error(f"An error occurred: {e}")
1487
+ else:
1488
+ st.warning("No 'BA Lainnya' data found. Please classify comments first.")
1489
+
1490
+ # Analyze Training Data
1491
+ st.subheader("Analyze Training Data")
1492
+ dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
1493
+ try:
1494
+ train_data = pd.read_excel(dataset_path)
1495
+ if train_data.empty:
1496
+ st.warning("Training dataset is empty.")
1497
+ else:
1498
+ all_words_train = []
1499
+ for comment in train_data['komentar'].astype(str):
1500
+ comment = translate_emojis(comment)
1501
+ comment = normalize_unicode(comment)
1502
+ comment = handle_replies(comment)
1503
+ comment = clean_text(comment)
1504
+ comment = translate_text(comment, {})  # NOTE: an empty dict makes this a no-op; pass ntb_dict / slang_dict here as in the other preprocessing paths
1505
+ comment = handle_negation(comment)
1506
+ words = preprocess_text(comment)
1507
+ all_words_train.extend(words)
1508
+
1509
+ # Calculate word frequencies
1510
+ word_freq_train = Counter(all_words_train)
1511
+
1512
+ # Add slider to select number of words to display
1513
+ num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
1514
+ most_common_words_train = word_freq_train.most_common(num_words_train)
1515
+
1516
+ # Display word frequencies as a table
1517
+ st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
1518
+ word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
1519
+ st.dataframe(word_freq_df_train)
1520
+
1521
+ except FileNotFoundError:
1522
+ st.error(f"Training dataset for {candidate} not found.")
1523
+ except Exception as e:
1524
+ st.error(f"An error occurred: {e}")
1525
+
1526
+ # Option to export keywords
1527
+ st.subheader("Export Keywords")
1528
+ json_buffer = io.BytesIO()
1529
+ json_buffer.write(json.dumps(keyword_dict).encode('utf-8'))
1530
+ json_buffer.seek(0)
1531
+ st.download_button(
1532
+ label="Export Keywords",
1533
+ data=json_buffer,
1534
+ file_name="keywords.json",
1535
+ mime="application/json"
1536
+ )
1537
+
1538
+ # Option to import keywords
1539
+ st.subheader("Import Keywords")
1540
+ uploaded_file = st.file_uploader("Choose a JSON file", type="json")
1541
+ if uploaded_file is not None:
1542
+ imported_keywords = json.load(uploaded_file)
1543
+ keyword_dict.update(imported_keywords)
1544
+ with open('keywords.json', 'w') as f:
1545
+ json.dump(keyword_dict, f, indent=4)
1546
+ st.success("Keywords imported successfully")
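+ # Illustrative safeguard (assumption, not in the original code): the uploaded JSON could be
+ # validated before merging, e.g. that every candidate maps sentiment names to lists of keywords:
+ # if not all(isinstance(v, dict) and all(isinstance(kws, list) for kws in v.values())
+ #            for v in imported_keywords.values()):
+ #     st.error("Unexpected structure in the uploaded keywords file.")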
1547
+ pass
1548
+
best_rf_model_indah_dhamayanti_putri.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e28bb4dacc869a7b71e089118a07351a8de60175fce28b3b8e2b8c01e651ceba
3
+ size 15044729
best_rf_model_lalu_muhamad_iqbal.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d421efb1c2b5876a1dab8d8dc11a351d75064bf0c32a24ef3df3a9913670182
3
+ size 44015033
best_rf_model_m_suhaili.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:949b91ab83ead20c613ced16780bda52780ecac0d56c0f32ccec44131d44ebe2
3
+ size 13640633
best_rf_model_musyafirin.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cb226ee3fab98e8d2af12cd5329f73beb54e4b5a1fa871d01c2c2029a31d5d2
3
+ size 6092665
best_rf_model_sitti_rohmi_djalilah.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eecdcd277c204d1d771cd767169ab0f026ed8544516f7f43389aab32f0a27a6
3
+ size 24894969
best_rf_model_zulkieflimansyah.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f5d7ce2da36aadee463102fa50e42d658ac46d3a690ff81329d7a4d81956b0f
3
+ size 23188793
datasetntbnew_indah_dhamayanti_putri.xlsx ADDED
Binary file (57.4 kB). View file
 
datasetntbnew_lalu_muhamad_iqbal.xlsx ADDED
Binary file (104 kB). View file
 
datasetntbnew_m_suhaili.xlsx ADDED
Binary file (57.9 kB). View file
 
datasetntbnew_musyafirin.xlsx ADDED
Binary file (88.5 kB). View file
 
datasetntbnew_sitti_rohmi_djalilah.xlsx ADDED
Binary file (67.5 kB). View file
 
datasetntbnew_zulkieflimansyah.xlsx ADDED
Binary file (67.6 kB). View file
 
kamusalay.csv ADDED
@@ -0,0 +1,238 @@
1
+ ajh,saja
2
+ ajj,saja
3
+ akoh,aku
4
+ amaca,ah masa
5
+ amuh,kamu
6
+ aph,apa
7
+ apose,apa
8
+ apz,apa
9
+ aq,saya
10
+ baryaw,sabar ya
11
+ baryw,sabar ya
12
+ bryaw,sabar ya
13
+ bryw,sabar ya
14
+ bay,selamat tinggal
15
+ besoq,besok
16
+ beud,banget
17
+ bhay,selamat tinggal
18
+ bisya,bisa
19
+ biza,bisa
20
+ bntr,sebentar
21
+ bokap,ayah
22
+ bkap,ayah
23
+ bkp,ayah
24
+ bokaps,ayah
25
+ bokapz,ayah
26
+ bs,bisa
27
+ bsa,bisa
28
+ bsk,besok
29
+ bsoq,besok
30
+ bte,bosan
31
+ bozen,bosan
32
+ bozn,bosan
33
+ bzn,bosan
34
+ bzen,bosan
35
+ cabut,pergi
36
+ caiank,sayang
37
+ cekola,sekolah
38
+ cekolah,sekolah
39
+ celalaw,selalu
40
+ celalu,selalu
41
+ cemungudh,semangat
42
+ cemungut,semangat
43
+ cemunguth,semangat
44
+ cibuq,sibuk
45
+ cini,sini
46
+ ciyus,serius
47
+ cll,selalu
48
+ cllu,selalu
49
+ cllw,selalu
50
+ cpe,capek
51
+ cpee,capek
52
+ cewe,cewek
53
+ cwe,cewek
54
+ cowo,cowok
55
+ cwo,cowok
56
+ dah,sudah
57
+ dapa,ada apa
58
+ dapah,ada apa
59
+ dftr,daftar
60
+ dh,sudah
61
+ dimance,dimana
62
+ dimandose,dimana
63
+ dimans,dimana
64
+ duluw,dulu
65
+ ea,ya
66
+ emg,memang
67
+ eteb,bosan
68
+ g,tidak
69
+ ga,tidak
70
+ gabut,menganggur
71
+ gak,tidak
72
+ gakz,tidak
73
+ gatau,tidak tahu
74
+ gataw,tidak tahu
75
+ gengges,ganggu
76
+ ghiy,lagi
77
+ gi,lagi
78
+ gk,tidak
79
+ gpp,tidak apa apa
80
+ gtw,tidak tahu
81
+ gue,saya
82
+ gw,saya
83
+ gx,tidak
84
+ hums,rumah
85
+ humz,rumah
86
+ huum,iya
87
+ iy,iya
88
+ iyach,iya
89
+ iyap,iya
90
+ iyapz,iya
91
+ iyup,iya
92
+ iyupz,iya
93
+ iz,iya
94
+ iza,iya
95
+ izza,iya
96
+ jamber,jam berapa
97
+ jd,jadi
98
+ jdi,jadi
99
+ jg,juga
100
+ jga,juga
101
+ jgn,jangan
102
+ jngan,jangan
103
+ jngn,jangan
104
+ kacian,kasihan
105
+ kaka,kakak
106
+ kau,kamu
107
+ keles,kali
108
+ kenapah,kenapa
109
+ kenaps,kenapa
110
+ kenapz,kenapa
111
+ kepo,ingin tahu
112
+ keyen,keren
113
+ khan,kan
114
+ khanz,kan
115
+ kk,kakak
116
+ klo,kalau
117
+ klw,kalau
118
+ km,kamu
119
+ kmrn,kemarin
120
+ kmu,kamu
121
+ knp,kenapa
122
+ koq,kok
123
+ kpan,kapan
124
+ kpn,kapan
125
+ kuq,kok
126
+ kuy,ayo
127
+ kw,kamu
128
+ kzl,kesal
129
+ lam,salam
130
+ leh,boleh
131
+ lo,kamu
132
+ loe,kamu
133
+ lom,belum
134
+ low,kalau
135
+ lp,lupa
136
+ lu,kamu
137
+ luchu,lucu
138
+ lum,belum
139
+ lun,belum
140
+ luthu,lucu
141
+ lw,kamu
142
+ maacih,terima kasih
143
+ maap,maaf
144
+ mager,malas bergerak
145
+ makaci,terima kasih
146
+ maw,mau
147
+ miapa,demi apa
148
+ miapah,demi apa
149
+ misal'a,misalnya
150
+ muup,maaf
151
+ mu'uv,maaf
152
+ mw,mau
153
+ nak,anak
154
+ naq,anak
155
+ nax,anak
156
+ nda,tidak
157
+ ndak,tidak
158
+ ndax,tidak
159
+ ngabungin,menggabungkan
160
+ ngajak,mengajak
161
+ ngerokok,merokok
162
+ ngga,tidak
163
+ nggak,tidak
164
+ nggax,tidak
165
+ nggesek,menggesek
166
+ nggosok,menggosok
167
+ ngibul,berbohong
168
+ nyokap,ibu
169
+ nykap,ibu
170
+ nykaps,ibu
171
+ nykapz,ibu
172
+ nykp,ibu
173
+ nich,ini
174
+ nntn,menonton
175
+ ntn,menonton
176
+ oc,oke
177
+ oce,oke
178
+ ohh,oh
179
+ ok,oke
180
+ okedech,oke
181
+ okedeh,oke
182
+ okeh,oke
183
+ okz,oke
184
+ org,orang
185
+ ouch,oh
186
+ ouh,oh
187
+ owh,oh
188
+ pasutri,pasangan suami istri
189
+ paz,pas
190
+ pengen,ingin
191
+ pengin,ingin
192
+ pgn,ingin
193
+ psti,pasti
194
+ pzt,pasti
195
+ q,saya
196
+ qaqa,kakak
197
+ qq,kakak
198
+ rmh,rumah
199
+ sabeb,bebas
200
+ sabi,bisa
201
+ salfok,salah fokus
202
+ saltum,salah kostum
203
+ sdh,sudah
204
+ selaw,santai
205
+ selow,santai
206
+ shap,siap
207
+ shaps,siap
208
+ syipp,sip
209
+ syp,siapa
210
+ tau,tahu
211
+ tauk,tahu
212
+ tdk,tidak
213
+ telp,telepon
214
+ tgl,tanggal
215
+ thx,terima kasih
216
+ tipi,televisi
217
+ tp,tapi
218
+ tq,terima kasih
219
+ trims,terima kasih
220
+ trimz,terima kasih
221
+ tuch,itu
222
+ tw,tahu
223
+ u,kamu
224
+ u,kamu
225
+ udah,sudah
226
+ udd,sudah
227
+ udh,sudah
228
+ uga,juga
229
+ von,telepon
230
+ w,saya
231
+ wad,buat
232
+ wat,buat
233
+ yank,sayang
234
+ yap,ya
235
+ yaw,ya
236
+ yoi,iya
237
+ yups,ya
238
+ yupz,ya
keywords.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "Musyafirin": {
3
+ "Co Likes": ["keren", "bagus", "diakui", "disukai", "tegas"],
4
+ "Co Support": ["pemimpin baik", "pilihan tepat", "jujur", "adil", "kinerja baik"],
5
+ "Co Optimism": ["maju terus", "berhasil", "terdepan", "pengaruh positif", "optimis"],
6
+ "Co Negative": ["kekurangan", "buruk", "tidak peduli", "masalah", "tidak mampu"],
7
+ "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
8
+ },
9
+ "Sitti_Rohmi_Djalillah": {
10
+ "Co Likes": ["baik", "cantik", "inspiratif", "cerdas", "menarik"],
11
+ "Co Support": ["hebat", "terbaik", "pemimpin bijak", "solid", "juara", "unggul"],
12
+ "Co Optimism": ["masa depan cerah", "harapan", "kepercayaan", "optimis", "juara"],
13
+ "Co Negative": ["gagal", "tidak mendukung", "lemah", "tidak suka", "korupsi"],
14
+ "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
15
+ },
16
+ "Zulkieflimansyah": {
17
+ "Co Likes": ["inspiratif", "cerdas", "berprestasi", "bagus", "terpuji"],
18
+ "Co Support": ["terbaik", "pemimpin inspiratif", "solid", "juara", "bijaksana"],
19
+ "Co Optimism": ["optimis", "harapan", "masa depan", "kemenangan", "perubahan positif"],
20
+ "Co Negative": ["tidak berprestasi", "isu korupsi", "lemah", "tidak mendukung"],
21
+ "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
22
+ },
23
+ "Lalu_Muhamad_Iqbal": {
24
+ "Co Likes": ["bagus", "baik", "keren", "disukai", "cocok"],
25
+ "Co Support": ["dukung", "pilih", "mantap", "semangat", "nomor satu", "hebat"],
26
+ "Co Optimism": ["harapan", "optimis", "menang", "sukses", "terbaik", "pemimpin"],
27
+ "Co Negative": ["fitnah", "bohong", "tidak suka", "jelek", "kalah", "buruk"],
28
+ "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
29
+ },
30
+ "Indah_Dhamayanti_Putri": {
31
+ "Co Likes": ["bagus", "menarik", "cocok", "cantik", "baik hati"],
32
+ "Co Support": ["semangat", "mantap", "pilihan tepat", "hebat", "menang", "dukung terus"],
33
+ "Co Optimism": ["sukses", "maju", "terbaik", "inspirasi", "masa depan", "optimis"],
34
+ "Co Negative": ["isu", "korupsi", "tidak baik", "cacat", "buruk", "jelek"],
35
+ "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
36
+ }
37
+ }
ntb_dict.json ADDED
@@ -0,0 +1,396 @@
1
+ {
2
+ "gawe": "kerja",
3
+ "kepeng": "uang",
4
+ "mae": "datang",
5
+ "menyaman": "nyaman",
6
+ "bere": "berani",
7
+ "muter": "berjalan-jalan",
8
+ "endek": "tidak",
9
+ "lang": "belum",
10
+ "ngena": "makan",
11
+ "tongka": "pergi",
12
+ "nanem": "tanam",
13
+ "menteleng": "melihat",
14
+ "tepek": "tangan",
15
+ "dewe": "sendiri",
16
+ "sambel": "sambal",
17
+ "kene": "sini",
18
+ "bare": "baru",
19
+ "kek": "seperti",
20
+ "sedek": "sedikit",
21
+ "buin": "lagi",
22
+ "bareng": "bersama",
23
+ "beleng": "belok",
24
+ "reng": "orang",
25
+ "batur": "teman",
26
+ "lepok": "berbicara",
27
+ "gubuk": "rumah",
28
+ "lombok": "cabe",
29
+ "santun": "hormat",
30
+ "jelo": "jelek",
31
+ "susur": "bersih",
32
+ "laek": "naik",
33
+ "tembe": "kemudian",
34
+ "kereng": "keras",
35
+ "kajang": "jatuh",
36
+ "raos": "enak",
37
+ "tampah": "kotoran",
38
+ "engat": "ingat",
39
+ "ken": "kenal",
40
+ "baro": "kemarin",
41
+ "silo": "tidur",
42
+ "temek": "kecil",
43
+ "gole": "pergi",
44
+ "betuk": "buat",
45
+ "peng": "sakit",
46
+ "taman": "tambah",
47
+ "kunci": "kunci",
48
+ "sesu": "siap",
49
+ "pon": "sudah",
50
+ "kut": "kamu",
51
+ "gete": "besar",
52
+ "lingeh": "dengar",
53
+ "bueh": "jauh",
54
+ "male": "malu",
55
+ "pacong": "pelit",
56
+ "sate": "tidak ada",
57
+ "gati": "sangat",
58
+ "dase": "hidup",
59
+ "pukul": "pukul",
60
+ "rugu": "bodoh",
61
+ "tengaq": "tengah",
62
+ "juak": "jual",
63
+ "bijak": "bijak",
64
+ "seman": "sehat",
65
+ "masok": "masuk",
66
+ "lauk": "ikan",
67
+ "lengit": "hilang",
68
+ "pek": "samping",
69
+ "peteng": "gelap",
70
+ "rangkat": "angkat",
71
+ "sarak": "cepat",
72
+ "selak": "takut",
73
+ "tapok": "tutup",
74
+ "tepuk": "tangan",
75
+ "tere": "kiri",
76
+ "tuan": "tuan",
77
+ "ungak": "lompat",
78
+ "turun": "turun",
79
+ "waktu": "waktu",
80
+ "wuri": "belakang",
81
+ "yakin": "yakin",
82
+ "zaman": "zaman",
83
+ "nggawe": "sedang bekerja",
84
+ "ngena-ngena": "sedang makan",
85
+ "nanemin": "menanamkan",
86
+ "ngelingeh": "mendengarkan",
87
+ "nenga": "melihat",
88
+ "nengaq": "melihat",
89
+ "golet": "berpergian",
90
+ "lepokin": "membicarakan",
91
+ "betukin": "membuat",
92
+ "masukin": "memasukkan",
93
+ "jualin": "menjual",
94
+ "angkatin": "mengangkat",
95
+ "melangit": "melihat ke atas",
96
+ "nenggo": "menunggu",
97
+ "sedek-sedek": "sedikit-sedikit",
98
+ "bare-tek": "baru saja",
99
+ "lepok-lepok": "pembicaraan",
100
+ "dase-dase": "menghidupkan",
101
+ "paconge": "kepelitan",
102
+ "tapokin": "menutupkan",
103
+ "kerengin": "mengeraskan",
104
+ "silo-siloan": "sering tidur",
105
+ "ndek-nggawe": "tidak bekerja",
106
+ "kereng-kereng": "sangat keras",
107
+ "pacong-pacong": "sangat pelit",
108
+ "beleng-beleng": "belokan",
109
+ "tapok-tapok": "penutupan",
110
+ "kepeng-kepeng": "uang-uang",
111
+ "anake": "anaknya",
112
+ "nyong": "saya",
113
+ "ku": "aku",
114
+ "kit": "kita",
115
+ "iyong": "dia",
116
+ "nene": "mereka",
117
+ "geteh": "besar",
118
+ "alang": "tinggi",
119
+ "lendong": "lembut",
120
+ "ngele": "panas",
121
+ "se": "satu",
122
+ "due": "dua",
123
+ "telu": "tiga",
124
+ "empat": "empat",
125
+ "lima": "lima",
126
+ "enem": "enam",
127
+ "pitu": "tujuh",
128
+ "wolu": "delapan",
129
+ "sia": "sembilan",
130
+ "sepulu": "sepuluh",
131
+ "sewelas": "sebelas",
132
+ "duwelas": "dua belas",
133
+ "selikur": "dua puluh satu",
134
+ "telu likur": "dua puluh tiga",
135
+ "sekedik": "sedikit",
136
+ "bare-bare": "baru-baru",
137
+ "tembe-tembe": "nanti-nanti",
138
+ "reng-reng": "orang-orang",
139
+ "sambel-sambel": "bermacam-macam sambal",
140
+ "silo-silo": "berkali-kali tidur",
141
+ "sate-sate": "tidak ada sama sekali",
142
+ "gole-gole": "sering pergi",
143
+ "batur-batur": "teman-teman",
144
+ "ane": "saya",
145
+ "eto": "itu",
146
+ "maej": "mari",
147
+ "tangkong": "naik",
148
+ "tie": "di sana",
149
+ "skek": "sedikit",
150
+ "arik": "adik",
151
+ "tabah pribadi": "kuat secara pribadi",
152
+ "suhu": "guru",
153
+ "side": "anda",
154
+ "arak": "minuman keras",
155
+ "ruan": "ruang",
156
+ "paut": "ikat",
157
+ "jari": "jadi",
158
+ "penjuluk": "julukan",
159
+ "ndekn": "tidak (NTB)",
160
+ "care": "peduli",
161
+ "nenak": "enak",
162
+ "k'tuan": "tuan",
163
+ "ndek": "tidak",
164
+ "akak": "kakak",
165
+ "milu": "ikut",
166
+ "ust": "ustadz",
167
+ "laun": "pelan-pelan",
168
+ "mun": "kalau",
169
+ "wah": "wah",
170
+ "jadi": "jadi",
171
+ "gubernur": "gubernur",
172
+ "ja": "sudah",
173
+ "ngkah": "langkah",
174
+ "noglh": "menyusul",
175
+ "berbaur": "berbaur",
176
+ "karingan": "kering",
177
+ "aran": "nama",
178
+ "nane": "nama panggilan",
179
+ "kancen": "teman",
180
+ "nyalon": "calonkan diri",
181
+ "biase": "biasa",
182
+ "boyaq": "bohong",
183
+ "suare": "suara",
184
+ "lemaq": "bagus",
185
+ "ngengat": "memukul",
186
+ "bae": "baik",
187
+ "ndkn": "tidak (variant)",
188
+ "mle": "memulai",
189
+ "te": "ke sana",
190
+ "isik": "isi",
191
+ "sak": "sempit",
192
+ "iye": "iya",
193
+ "muk": "mulut",
194
+ "melek": "melek",
195
+ "ky": "seperti",
196
+ "kire": "kirikanan",
197
+ "jemaq": "banyak",
198
+ "seandaian": "seandainya",
199
+ "ne": "di sini",
200
+ "mele": "pergi",
201
+ "ye": "dia",
202
+ "malik": "balik",
203
+ "maraq": "semangat",
204
+ "ngini": "disini",
205
+ "perli": "sindir",
206
+ "melene": "lemah",
207
+ "ampok": "sampai",
208
+ "manto": "mantap",
209
+ "nge": "kamu",
210
+ "lalo": "pergi",
211
+ "ndk": "tidak",
212
+ "ta": "jangan",
213
+ "taok": "ke sana",
214
+ "pilen": "pemilu",
215
+ "min": "makanan ringan",
216
+ "dwg": "dengar",
217
+ "selebung": "tutup",
218
+ "enden": "endapkan",
219
+ "unin": "suara",
220
+ "mule": "pulang",
221
+ "lamun": "jika",
222
+ "ndkmn": "tidak mungkin",
223
+ "pilek": "pemilu",
224
+ "jak": "pergi",
225
+ "wayen": "waktu",
226
+ "pesilak": "minta tolong",
227
+ "balen": "kembali",
228
+ "pastin": "pastikan",
229
+ "laguk": "lagu",
230
+ "poton": "potong",
231
+ "idungm": "hidung",
232
+ "lamper": "lampirkan",
233
+ "sik": "juga",
234
+ "gemes": "tertarik",
235
+ "pete": "kacang panjang",
236
+ "yg": "yang",
237
+ "geratis": "gratis",
238
+ "melak": "melakukan",
239
+ "wahm": "wah",
240
+ "abotk": "berat",
241
+ "eak": "iya",
242
+ "belecok": "berbelok",
243
+ "mauk": "masuk",
244
+ "bdoe": "bodoh",
245
+ "mesak": "merasa",
246
+ "kentok": "kena",
247
+ "nani": "nanti",
248
+ "melen": "mendengar",
249
+ "besile": "berita",
250
+ "kance": "teman",
251
+ "gub": "daerah",
252
+ "bedengah": "tengah",
253
+ "lirimn": "lihat",
254
+ "wea": "anda",
255
+ "adoo": "ada",
256
+ "tenak": "makan",
257
+ "tye": "siapa",
258
+ "juluk": "julukan",
259
+ "peneng": "tenang",
260
+ "ampureeee": "maafkan",
261
+ "eku": "aku",
262
+ "loq": "siapa",
263
+ "maukn": "mau",
264
+ "angen": "bisa",
265
+ "kake": "takut",
266
+ "seragem": "seragam",
267
+ "senu": "biasa",
268
+ "keruan": "sangat",
269
+ "tepileq": "bisa",
270
+ "taon": "tahun",
271
+ "man": "saya",
272
+ "dait": "kait",
273
+ "sengak": "pintar",
274
+ "uah": "wah",
275
+ "surukm": "suruh",
276
+ "lasing": "berlaku",
277
+ "komenank": "komentar",
278
+ "jage": "jaga",
279
+ "melem": "makan",
280
+ "mako": "maaf",
281
+ "pileklah": "sudah",
282
+ "sdh": "sudah",
283
+ "permakoan": "pergaulan",
284
+ "ape": "apa",
285
+ "ite": "itu",
286
+ "jakm": "jaket",
287
+ "sai": "saya",
288
+ "maseh": "masih",
289
+ "maukm": "mau",
290
+ "timak": "ambil",
291
+ "auk": "satu",
292
+ "an": "saya",
293
+ "tadahn": "menangkap",
294
+ "kenak": "kena",
295
+ "berugak": "berdiri",
296
+ "elen": "lihat",
297
+ "setil": "segala",
298
+ "heh": "hei",
299
+ "kanatooo": "kenapa",
300
+ "made": "sudah",
301
+ "mpoipu": "mencari",
302
+ "panjamba": "panjang",
303
+ "ncau": "cau",
304
+ "rew": "redha",
305
+ "ur": "mau",
306
+ "karukumi": "berkurang",
307
+ "lokina": "di sini",
308
+ "wara": "uang",
309
+ "tanda-tanda": "tanda",
310
+ "ompu": "panggil",
311
+ "suki": "suka",
312
+ "doho": "bisa",
313
+ "ede": "di",
314
+ "na": "ada",
315
+ "noro": "apa",
316
+ "weaku": "aku",
317
+ "ragam": "beragam",
318
+ "ndi": "itu",
319
+ "aumu": "saya",
320
+ "ba": "baik",
321
+ "ma": "ya",
322
+ "meta": "mata",
323
+ "de": "di",
324
+ "bolpoin": "pulpen",
325
+ "wa": "wah",
326
+ "mpoi": "sampai",
327
+ "ba loan": "tidak ada",
328
+ "dahu": "kebun",
329
+ "k ntuwu": "kuat",
330
+ "weki": "hai",
331
+ "dou doho": "sangat",
332
+ "ringu": "melihat",
333
+ "aka": "sebutan",
334
+ "ncau re": "cau",
335
+ "ina": "ibu",
336
+ "mpanga": "mendengar",
337
+ "au": "saya",
338
+ "baba": "ayah",
339
+ "pala": "kepala",
340
+ "ngahi": "indah",
341
+ "hafa": "terus",
342
+ "karaka": "gampang",
343
+ "podaku": "saya",
344
+ "ne'e": "disini",
345
+ "wati": "perempuan",
346
+ "dahuna": "ada",
347
+ "loko": "tangan",
348
+ "ro": "sana",
349
+ "waura": "tempat",
350
+ "mbuku": "buku",
351
+ "konee": "kamu",
352
+ "matundu": "kebun",
353
+ "piti": "kecil",
354
+ "mudh": "mudah",
355
+ "progrm": "program",
356
+ "kturunanx": "turunan",
357
+ "ndiha": "disana",
358
+ "ece": "anak",
359
+ "kamanae": "kemana",
360
+ "ngomi": "ngomong",
361
+ "malao": "berlari",
362
+ "ipi": "ujung",
363
+ "sangufi": "bisa",
364
+ "hambu": "sangat",
365
+ "hondo": "berasa",
366
+ "langgengkan": "terus",
367
+ "jelung": "terkenal",
368
+ "kece": "keren",
369
+ "nggih": "ya",
370
+ "mlang": "jalan",
371
+ "tepung": "ketemu",
372
+ "ketok": "kelihatan",
373
+ "tamba": "obat",
374
+ "tulung": "tolong",
375
+ "wet": "air",
376
+ "ndemek": "menyentuh",
377
+ "nyandak": "mengambil",
378
+ "mbet": "memeluk",
379
+ "tepe": "mendorong",
380
+ "kliru": "salah",
381
+ "luweh": "lebih",
382
+ "akeh": "banyak",
383
+ "cemeng": "hitam",
384
+ "abang": "merah",
385
+ "jembar": "luas",
386
+ "ngombe": "minum",
387
+ "nyonggo": "membawa",
388
+ "nyilih": "meminjam",
389
+ "krempyeng-krempyeng": "sedikit demi sedikit",
390
+ "tekuk-tekuk": "membungkuk-bungkuk",
391
+ "sampeyan": "kamu (halus)",
392
+ "awak": "badan",
393
+ "satus": "seratus",
394
+ "sewu": "seribu",
395
+ "sejuta": "sejuta"
396
+ }
tfidf_vectorizer_indah_dhamayanti_putri.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:928fa30e9c66bd841663f7cef0c25adbbce5e51031219779a226eae424b63783
3
+ size 24377
tfidf_vectorizer_lalu_muhamad_iqbal.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86d123b6d2e126182603c7727ac6d9afa98b5598e4f92d5053070d86f3090ae7
3
+ size 68943
tfidf_vectorizer_m_suhaili.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ccdd6525e6fb4b9e0e2ea42734e4b8945ab7cc2f38a708cf6afab3bd0272c4
3
+ size 36399
tfidf_vectorizer_musyafirin.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93d68cfb116d6688f4b797bb1d46e37701d66102cefdaaf9bcb9c24f737402ae
3
+ size 41568
tfidf_vectorizer_sitti_rohmi_djalilah.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e500ec5c0ec3d1982e0a4fa33df38d8f34449cab9d1e4d89d267a83a546cae
3
+ size 55027
tfidf_vectorizer_zulkieflimansyah.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5036b20528b2b6b9cd9651573d564d219292d07c1f0817a90fd761410e42ed6
3
+ size 48165