Spaces (Sleeping)
zakyirhab0 committed on
Upload 22 files
- app.py +1548 -0
- best_rf_model_indah_dhamayanti_putri.joblib +3 -0
- best_rf_model_lalu_muhamad_iqbal.joblib +3 -0
- best_rf_model_m_suhaili.joblib +3 -0
- best_rf_model_musyafirin.joblib +3 -0
- best_rf_model_sitti_rohmi_djalilah.joblib +3 -0
- best_rf_model_zulkieflimansyah.joblib +3 -0
- datasetntbnew_indah_dhamayanti_putri.xlsx +0 -0
- datasetntbnew_lalu_muhamad_iqbal.xlsx +0 -0
- datasetntbnew_m_suhaili.xlsx +0 -0
- datasetntbnew_musyafirin.xlsx +0 -0
- datasetntbnew_sitti_rohmi_djalilah.xlsx +0 -0
- datasetntbnew_zulkieflimansyah.xlsx +0 -0
- kamusalay.csv +238 -0
- keywords.json +37 -0
- ntb_dict.json +396 -0
- tfidf_vectorizer_indah_dhamayanti_putri.joblib +3 -0
- tfidf_vectorizer_lalu_muhamad_iqbal.joblib +3 -0
- tfidf_vectorizer_m_suhaili.joblib +3 -0
- tfidf_vectorizer_musyafirin.joblib +3 -0
- tfidf_vectorizer_sitti_rohmi_djalilah.joblib +3 -0
- tfidf_vectorizer_zulkieflimansyah.joblib +3 -0
app.py
ADDED
@@ -0,0 +1,1548 @@
import streamlit as st
import joblib
import pandas as pd
import re
import emoji
import json
import io
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from transformers import pipeline
from collections import Counter
import nltk
from nltk.corpus import stopwords
from datetime import datetime

# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]

# Download the stopwords if they have not been downloaded yet
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Make sure the "BA Lainnya" data is available
if 'data_with_ba' in st.session_state:
    ba_lainnya_data = st.session_state['data_with_ba']
else:
    ba_lainnya_data = None

def translate_emojis(text):
    return ''.join(c for c in text if not emoji.is_emoji(c))  # Remove all emojis

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text)

def clean_text(text):
    text = str(text).casefold()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

def handle_negation(text):
    negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
    words = text.split()
    result = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negation_words and i + 1 < len(words):
            result.append(f"{word}_{words[i + 1]}")  # Combine negation with next word
            skip_next = True
        elif skip_next:
            skip_next = False
        else:
            result.append(word)
    return ' '.join(result)

def handle_replies(text):
    text = re.sub(r'=--*@\w+', '', text)  # Remove multi-level reply patterns
    text = re.sub(r'=-*@\w+', '', text)  # Remove single-level reply patterns
    text = re.sub(r'@\w+', '', text)  # Remove standalone @username mentions
    return text

def translate_text(text, dictionary):
    words = text.split()
    return ' '.join([dictionary.get(word.lower(), word) for word in words])  # Translate words using dictionary

# Assign a sentiment based on keyword matches
def assign_sentiment_based_on_keywords(comment, keyword_dict):
    for sentiment, keywords in keyword_dict.items():
        if any(keyword in comment for keyword in keywords):
            return sentiment
    return 'unknown'

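# A minimal sketch of how the preprocessing helpers above compose on one raw comment.
# The sample text and the tiny slang map are hypothetical; the real ntb_dict and
# slang_dict are loaded from files further below.
#
#   raw = "=-@user gubernur tdk gagal membangun NTB 😀"
#   step = handle_replies(normalize_unicode(translate_emojis(raw)))
#   step = clean_text(step)                        # -> "gubernur tdk gagal membangun ntb"
#   step = translate_text(step, {"tdk": "tidak"})  # -> "gubernur tidak gagal membangun ntb"
#   step = handle_negation(step)                   # -> "gubernur tidak_gagal membangun ntb"
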
# === Load Dictionaries === #
def load_dictionary(file_path, file_type='json'):
    if file_type == 'json':
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    elif file_type == 'csv':
        df = pd.read_csv(file_path, names=['slang', 'formal'])
        return pd.Series(df['formal'].values, index=df['slang']).to_dict()

# The dictionary files are uploaded alongside app.py, so load them by relative path
ntb_dict = load_dictionary('ntb_dict.json', 'json')
slang_dict = load_dictionary('kamusalay.csv', 'csv')

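# Expected shape of the two dictionary sources, based on load_dictionary above
# (the example rows are illustrative, not taken from the uploaded files):
#
#   kamusalay.csv  -- two columns, no header row: slang,formal   e.g. "gk,tidak"
#   ntb_dict.json  -- a flat {"regional word": "standard Indonesian word"} mapping,
#                     since translate_text() looks words up with dictionary.get(word)
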
# === Utility Functions === #
# Update the JSON file that tracks the update history
def update_history_json(history_path, komentar, link, model_data, field, date):
    # Convert a Timestamp to a string
    if isinstance(date, pd.Timestamp):
        date = date.strftime('%Y-%m-%d')

    # Read the history from the JSON file
    try:
        with open(history_path, 'r') as file:
            history_data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        history_data = {}

    # History key for this particular comment, link, and model_data
    key = f"{komentar}_{link}_{model_data}"

    # Append the new history entry under the matching key
    if key in history_data:
        history_data[key].append({field: date})
    else:
        history_data[key] = [{field: date}]

    # Write the history back to the JSON file
    with open(history_path, 'w') as file:
        json.dump(history_data, file, indent=4)

# Update the training dataset with new data
def update_training_dataset(output, candidate):
    dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
    history_path = f"history_{candidate.lower().replace(' ', '_')}.json"

    try:
        required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk']
        output = output[required_columns].copy()

        if 'predicted_category' in output.columns:
            output['sentimen'] = output['predicted_category']
            output.drop(columns=['predicted_category'], inplace=True)

        output['tanggal_masuk'] = pd.Timestamp.now()

        if os.path.exists(dataset_path):
            existing_data = pd.read_excel(dataset_path)
        else:
            existing_data = pd.DataFrame(columns=required_columns)

        # Add the 'update_ba' column if it does not exist yet
        if 'update_ba' not in existing_data.columns:
            existing_data['update_ba'] = None

        # Add the 'missing_comment' column if it does not exist yet
        if 'missing_comment' not in existing_data.columns:
            existing_data['missing_comment'] = False

        # Step 1: check for missing comments
        train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict()
        new_comments = output.groupby('link')['komentar'].apply(list).to_dict()

        for link, comments in train_comments.items():
            if link in new_comments:
                new_comment_set = set(new_comments[link])
                for comment in comments:
                    if comment not in new_comment_set:
                        existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True
                    else:
                        existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False

        # Helper that merges the new rows into the existing data
        def update_data(existing_data, new_data, history_path):
            for index, row in new_data.iterrows():
                komentar = row['komentar']
                link = row['link']
                model_data = row['model_data']
                tanggal_klasifikasi = pd.Timestamp.now()

                # The same comment already exists
                existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)]
                if not existing_entry.empty:
                    existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi
                    update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
                else:
                    # Same link, different comment
                    existing_link_entry = existing_data[(existing_data['link'] == link)]
                    if not existing_link_entry.empty:
                        new_row = row.copy()
                        new_row['tanggal_masuk'] = tanggal_klasifikasi
                        new_row['update_ba'] = tanggal_klasifikasi
                        existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
                        update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
                        update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
                    else:
                        # New link
                        new_row = row.copy()
                        new_row['tanggal_masuk'] = tanggal_klasifikasi
                        new_row['update_ba'] = tanggal_klasifikasi
                        existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
                        update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
                        update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)

            # Replace None values in update_ba with tanggal_masuk
            existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
            existing_data['update_ba'].fillna(existing_data['tanggal_masuk'], inplace=True)
            return existing_data

        updated_data = update_data(existing_data, output, history_path)
        updated_data.to_excel(dataset_path, index=False)

        st.success(f"Data successfully updated in {candidate}'s training dataset.")

        if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any():
            st.subheader("Missing Comments")
            st.write("Comments that were found to be missing:")
            st.dataframe(existing_data[existing_data['missing_comment']])
    except KeyError as e:
        st.error(f"Missing column in the dataset: {e}")
    except Exception as e:
        st.error(f"An error occurred: {e}")

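# Shape of the per-candidate history file maintained by update_history_json above
# (the dates shown are illustrative):
#
#   {
#     "<komentar>_<link>_<model_data>": [
#       {"tanggal_masuk": "2024-11-01"},
#       {"update_ba": "2024-11-08"}
#     ]
#   }
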
def clustering_based_evaluation(df, n_clusters=10):
    st.write("Starting preprocessing...")
    df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis)
    df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode)
    df['reply_handled'] = df['normalized_unicode'].apply(handle_replies)
    df['clean_text'] = df['reply_handled'].apply(clean_text)
    df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
    df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
    df['negation_handled'] = df['translated_slang'].apply(handle_negation)

    st.write("Generating TF-IDF vectors...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled'])

    st.write(f"Clustering into {n_clusters} clusters...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

    st.write("Performing PCA for visualization...")
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    df['PCA1'] = reduced_data[:, 0]
    df['PCA2'] = reduced_data[:, 1]

    st.write("Clustering completed successfully!")
    return df

def load_and_process_data(dataset_path, history_path):
    df = pd.read_excel(dataset_path)
    df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')

    if df['tanggal_masuk'].isnull().any():
        st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.")
        df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True)

    required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster']
    for col in required_columns:
        if col not in df.columns:
            if col == 'tanggal_masuk':
                df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
            elif col == 'evaluated_by_cluster':
                df[col] = False
            else:
                df[col] = None

    df = df[required_columns]

    try:
        with open(history_path, "r") as f:
            history = json.load(f)
    except FileNotFoundError:
        history = []

    return df, history

# Define the function to handle the Special Cluster
def handle_special_cluster(data, keywords, detector):
    for index, row in data.iterrows():
        text = row['negation_handled']

        # Check whether the text contains any of the keywords
        detected = False
        for sentiment, words in keywords.items():
            if any(word in text for word in words):
                data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ')
                data.loc[index, 'detected_by'] = 'keyword'
                detected = True
                break

        if not detected:
            # If no keyword matched, fall back to sarcasm detection
            result = detector(text)
            if result[0]['label'] == 'SARCASM':
                data.loc[index, 'predicted_category'] = 'co sarkastic'
                data.loc[index, 'detected_by'] = 'sarcasm'
            else:
                # No matching sentiment was found
                data.loc[index, 'predicted_category'] = 'Unknown'
                data.loc[index, 'detected_by'] = 'unknown'

    return data

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return words

def display_word_frequencies(words, num_words):
    st.subheader(f"Top {num_words} Words")
    for word, freq in words:
        st.write(f"{word}: {freq}")

def update_sentiment(index, new_sentimen):
    df.loc[index, 'sentimen'] = new_sentimen
    st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}")

# Load the `Corrected Comments` data from an Excel file
def load_corrected_comments_from_excel(file_path):
    try:
        return pd.read_excel(file_path, sheet_name='Corrected Comments')
    except FileNotFoundError:
        return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
    except Exception as e:
        print(f"Error loading corrected comments: {e}")
        return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])

# Save the `Corrected Comments` data to an Excel file
def save_corrected_comments_to_excel(data, file_path):
    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        data.to_excel(writer, sheet_name='Corrected Comments', index=False)

# Location of the Excel file that stores the `Corrected Comments` data
corrected_comments_file = 'corrected_comments.xlsx'

# Visualize how comments are distributed across clusters
def display_cluster_visualization(ba_lainnya_data):
    st.subheader("Cluster Visualization")

    # Count comments in the `Similar Sentiment` and `Special Cluster` categories
    cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts()

    # Draw a bar chart
    plt.figure(figsize=(10, 6))
    plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
    plt.xlabel('Cluster Name')
    plt.ylabel('Number of Comments')
    plt.title('Distribution of Comments in Similar Sentiment and Special Cluster')
    plt.xticks(rotation=45)
    plt.show()
    st.pyplot(plt)

def run_clustering_for_ba_lainnya():
    st.title("Clustering for 'BA Lainnya'")

    if 'data_with_ba' not in st.session_state:
        st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.")
        st.stop()

    ba_lainnya_data = st.session_state['data_with_ba']

    st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows")

    with open('keywords.json', 'r') as f:
        keyword_dict = json.load(f)

    selected_candidate = st.session_state['candidate']
    candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_'))

    if candidate_keywords is None:
        st.error("Keywords for the selected candidate not found.")
        st.stop()

    sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert')

    dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
    corrected_comments_file = f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx"

    try:
        train_data = pd.read_excel(dataset_path)
        labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])]

        st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows")

        ba_lainnya_data['Cluster'] = None
        ba_lainnya_data['detected_by'] = None
        ba_lainnya_data['Cluster_Name'] = None
        ba_lainnya_data['corrected_by'] = "Not Corrected"
        ba_lainnya_data['Sentiment'] = None  # Initialize with None so there is no misleading default value

        ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str)
        ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str)
        ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str)

        for data in [ba_lainnya_data, labeled_data]:
            data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis)
            data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
            data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
            data['clean_text'] = data['reply_handled'].apply(clean_text)
            data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, {}))
            data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, {}))
            data['negation_handled'] = data['translated_slang'].apply(handle_negation)
            data['negation_handled'] = data['negation_handled'].fillna('')

        combined_data = ba_lainnya_data.copy()
        combined_data['Label'] = 'BA Lainnya'

        for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
            sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
            sentimen_data['Label'] = sentimen
            combined = pd.concat([combined_data, sentimen_data], ignore_index=True)

            if len(combined) < 2:
                st.warning(f"Not enough samples to cluster for {sentimen}.")
                continue

            vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
            tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])

            st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...")
            kmeans = KMeans(n_clusters=2, random_state=42)
            combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)

            valid_indices = combined.index[:len(ba_lainnya_data)]
            valid_indices = valid_indices.intersection(ba_lainnya_data.index)

            ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster']
            ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar"
            ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar"
            ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen

        for index, row in ba_lainnya_data.iterrows():
            if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None':
                dissimilar_comment = ba_lainnya_data.loc[[index]].copy()
                for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
                    sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
                    combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True)

                    if len(combined) < 2:
                        continue

                    tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
                    if tfidf_matrix.shape[0] == 0:
                        continue

                    kmeans = KMeans(n_clusters=2, random_state=42)
                    combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)

                    if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0:
                        ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar"
                        ba_lainnya_data.loc[index, 'Sentiment'] = sentimen
                        break
                    else:
                        ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster'
                        ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster'
                        ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment'

        ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x)

        special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster']
        if not special_cluster_data.empty:
            special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector)
            ba_lainnya_data.update(special_cluster_data)
            ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster'
            ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment'

            st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")

            st.subheader("Detection Distribution in Special Cluster")
            detection_counts = special_cluster_data['detected_by'].value_counts()
            plt.figure(figsize=(10, 6))
            plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red'])
            plt.xlabel('Detection Method')
            plt.ylabel('Number of Comments')
            plt.title('Detection Distribution in Special Cluster')
            plt.show()
            st.pyplot(plt)

            st.write("Top Keywords in Special Cluster")
            for sentiment, keywords in candidate_keywords.items():
                st.write(f"{sentiment}: {', '.join(keywords)}")

            st.subheader("Special Cluster Details")
            st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']])

        corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
        display_cluster_visualization(ba_lainnya_data)

        st.subheader("Search and Filter Clusters")
        search_term = st.text_input("Enter a keyword to search for in the comments:")

        if search_term:
            filtered_data = ba_lainnya_data[ba_lainnya_data['komentar'].str.contains(search_term, case=False, na=False)]
            st.write(f"Filtered Data (Showing first 100 rows) for search term '{search_term}':")
            st.dataframe(filtered_data.head(100))
        else:
            st.dataframe(ba_lainnya_data.head(100))

        st.subheader("Clustered Data")
        selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
        cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster]
        st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100))

        new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
        if st.button("Update Sentiment for this cluster"):
            ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster'
            ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen
            st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}")

            # Save and refresh the Corrected Comments table and the Cluster Visualization
            corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
            corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
            save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
            st.subheader("Corrected Comments")
            st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
            display_cluster_visualization(ba_lainnya_data)

        st.subheader("Special Rules Based on Keywords")
        keyword = st.text_input("Enter a keyword to set a rule:")
        specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique()))

        if keyword:
            new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
            if st.button("Apply Rule"):
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule'
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster
                st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.")

                # Update the keywords.json file, avoiding duplicates
                if selected_candidate.replace(' ', '_') in keyword_dict:
                    if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]:
                        if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]:
                            keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword)
                    else:
                        keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword]
                else:
                    keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]}

                with open('keywords.json', 'w') as f:
                    json.dump(keyword_dict, f)

                st.success(f"Keyword '{keyword}' has been added to the keyword list.")

                # Save and refresh the Corrected Comments table and the Cluster Visualization
                corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
                corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
                save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
                st.subheader("Corrected Comments")
                st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
                display_cluster_visualization(ba_lainnya_data)

        st.subheader("Corrected Comments")
        corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
        st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))

        st.subheader("Visual Representation of Corrected Comments")
        sentiment_counts = corrected_comments['sentimen'].value_counts()
        plt.figure(figsize=(10, 6))
        plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
        plt.xlabel('Sentimen')
        plt.ylabel('Number of Corrected Comments')
        plt.title('Number of Corrected Comments by Sentiment')
        plt.show()
        st.pyplot(plt)

        st.subheader("Download Options")
        excel_buffer_cluster = io.BytesIO()
        with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer:
            ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data')
        excel_buffer_cluster.seek(0)

        st.download_button(
            label=f"Download Clustered Data for {selected_candidate}",
            data=excel_buffer_cluster,
            file_name=f"clustered_data_{selected_candidate}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        st.subheader("Save Corrected Comments and Non-'BA Lainnya' Data to Training Dataset")
        if st.button("Save All to Dataset"):
            try:
                # Combine the classification results without the 'BA Lainnya' tag and the corrected comments
                combined_data = pd.concat([st.session_state['data_without_ba'], corrected_comments], ignore_index=True)
                combined_data['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
                update_training_dataset(combined_data, st.session_state['candidate'])  # Call the new function
                st.success("Corrected comments and classified data without 'BA Lainnya' have been saved to the training dataset.")

                # Clear the session state to prevent duplicate saves
                st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns)
                st.session_state['data_without_ba'] = pd.DataFrame(columns=corrected_comments.columns)

                # Rerun automatically after saving to dataset
                st.rerun()
            except Exception as e:
                st.error(f"An error occurred while saving the data: {e}")

    except FileNotFoundError:
        st.error(f"No dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")

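# Structure that keywords.json is expected to have, based on how it is read and
# written above (candidate key taken from candidate_list; keyword values illustrative):
#
#   {
#     "Lalu_Muhamad_Iqbal": {
#       "co_support": ["dukung", "menang"],
#       "co_negative": ["gagal"]
#     }
#   }
#
# handle_special_cluster() turns a matching key into a label by replacing '_' with ' ',
# while the "Apply Rule" handler appends new keywords under the selected sentiment label.
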
# === Sidebar Navigation === #
menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification", "Clustering for 'BA Lainnya'", "Update Keywords", "View Training Dataset", "Evaluate Data Train", "Maximize Preprocessing", "Retraining Model"])
if menu == "Model-Based Classification":
    st.title("Model-Based Classification")
    candidate = st.selectbox("Choose a candidate:", candidate_list)
    # Model artifacts are uploaded alongside app.py, so load them by relative path
    model_path = f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib"
    vectorizer_path = f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib"

    # Save the selected candidate to session state
    st.session_state['candidate'] = candidate

    uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx'])

    @st.cache_data
    def load_model_and_vectorizer(model_path, vectorizer_path):
        """Load model and vectorizer, cache them for efficiency."""
        try:
            model = joblib.load(model_path)
            vectorizer = joblib.load(vectorizer_path)
            return model, vectorizer
        except FileNotFoundError:
            return None, None

    model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)

    if not model or not vectorizer:
        st.error("Model or vectorizer not found for the selected candidate.")
        st.stop()

    # Save the vectorizer and model to session state
    st.session_state['vectorizer'] = vectorizer
    st.session_state['model'] = model

    @st.cache_data
    def preprocess_data(data):
        """Preprocess comments with batching for large datasets."""
        from joblib import Parallel, delayed

        def preprocess_batch(batch):
            batch['translated_emojis'] = batch['komentar'].apply(translate_emojis)
            batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode)
            batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies)
            batch['clean_text'] = batch['reply_handled'].apply(clean_text)
            batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
            batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
            batch['negation_handled'] = batch['translated_slang'].apply(handle_negation)
            return batch

        batch_size = 10000  # Process 10,000 rows at a time
        batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]
        processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches)
        return pd.concat(processed_batches, ignore_index=True)

    # Process uploaded file
    if uploaded_file:
        try:
            data = pd.read_excel(uploaded_file)
            if 'komentar' not in data.columns:
                st.error("The uploaded file must include a 'komentar' column.")
                st.stop()
            data = preprocess_data(data)
        except Exception as e:
            st.error(f"An error occurred while processing the file: {e}")
            st.stop()
    elif 'model_classified_data' in st.session_state:
        data = st.session_state['model_classified_data']
    else:
        st.info("Please upload a file for classification.")
        st.stop()

    # Transform comments into TF-IDF vectors
    try:
        tfidf_data = vectorizer.transform(data['negation_handled'].fillna(''))
        data['predicted_category'] = model.predict(tfidf_data)
        data['probabilities'] = model.predict_proba(tfidf_data).tolist()
        data['max_probability'] = data['probabilities'].apply(lambda x: max(x))
    except Exception as e:
        st.error(f"An error occurred during model prediction: {e}")
        st.stop()

    # Cache classified data
    st.session_state['model_classified_data'] = data

    # Interactive threshold adjustment
    st.subheader("Set Threshold for 'BA Lainnya'")
    threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01)

    # Apply threshold to tag "BA Lainnya"
    data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '')

    # Separate data for visualization
    data_without_ba = data[data['tag'] != 'BA Lainnya']
    data_with_ba = data[data['tag'] == 'BA Lainnya']

    # Save updated results to session state for dynamic updates
    st.session_state['data_without_ba'] = data_without_ba
    st.session_state['data_with_ba'] = data_with_ba

    # Preview Results
    st.subheader("Preview Results")

    st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'")
    if not data_without_ba.empty:
        st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']])
    else:
        st.info("No high-probability classifications available.")

    st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'")
    if not data_with_ba.empty:
        st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']])
    else:
        st.info("No low-probability classifications available.")

    # Visualization: Sentiment Distribution
    st.subheader("Sentiment Distribution Visualization")

    def plot_distribution(data, title):
        sentiment_counts = data['predicted_category'].value_counts()
        fig, ax = plt.subplots()
        ax.bar(sentiment_counts.index, sentiment_counts.values)
        ax.set_title(title)
        ax.set_xlabel("Sentiments")
        ax.set_ylabel("Count")
        st.pyplot(fig)

    if not data_without_ba.empty:
        plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')")
    if not data_with_ba.empty:
        plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')")

    # Download Results
    st.subheader("Download Results")
    excel_buffer = io.BytesIO()
    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
        data.to_excel(writer, index=False, sheet_name='Classification Results')
    excel_buffer.seek(0)

    st.download_button(
        label="Download All Classification Results",
        data=excel_buffer,
        file_name=f"classification_results_{candidate}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    # Save Non-"BA Lainnya" Data to Training Dataset
    if not data_with_ba.empty:
        st.warning("There are comments with 'BA Lainnya' tagging. Please proceed to 'Clustering for BA Lainnya'.")
    elif not data_without_ba.empty:
        st.subheader("Save Classified Data")
        if st.button("Save Non-'BA Lainnya' Data to Training Dataset"):
            try:
                data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy()
                data_to_save['sentimen'] = data_without_ba['predicted_category']
                data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
                update_training_dataset(data_to_save, candidate)
                st.success("Data successfully saved to the training dataset.")
            except Exception as e:
                st.error(f"An error occurred while saving the data: {e}")
    else:
        st.info("No Non-'BA Lainnya' data available to save.")
    pass  # Placeholder

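# Columns the classification upload is expected to carry, based on the code above
# (only 'komentar' is checked explicitly; the save step also selects the others):
#
#   model_data, Platform, komentar, link, kandidat, tanggal
#
# A row whose highest class probability falls below the slider threshold (default 0.80)
# is tagged 'BA Lainnya' and routed to the clustering page instead of being saved.
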
765 |
+
# Integrasi fungsi dalam halaman "Clustering for 'BA Lainnya'"
|
766 |
+
if menu == "Clustering for 'BA Lainnya'":
|
767 |
+
selected_candidate = st.session_state['selected_candidate'] if 'selected_candidate' in st.session_state else None
|
768 |
+
run_clustering_for_ba_lainnya()
|
769 |
+
pass # Placeholder
|
770 |
+
|
771 |
+
# Memastikan kode ini hanya dijalankan jika menu yang dipilih adalah "View Training Dataset"
|
772 |
+
if menu == "View Training Dataset":
|
773 |
+
st.title("View Training Dataset")
|
774 |
+
|
775 |
+
# Header untuk memilih kandidat
|
776 |
+
st.header("Options")
|
777 |
+
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view')
|
778 |
+
|
779 |
+
# Path dataset
|
780 |
+
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
|
781 |
+
history_path = f"history_{selected_candidate.lower().replace(' ', '_')}.json"
|
782 |
+
|
783 |
+
# Memuat dataset
|
784 |
+
try:
|
785 |
+
df = pd.read_excel(dataset_path)
|
786 |
+
|
787 |
+
# Memastikan kolom yang diperlukan ada
|
788 |
+
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train']
|
789 |
+
for col in required_columns:
|
790 |
+
if col not in df.columns:
|
791 |
+
if col == 'evaluated_by_data_train':
|
792 |
+
df[col] = False
|
793 |
+
|
794 |
+
# Menambahkan kolom 'update_ba' jika belum ada
|
795 |
+
if 'update_ba' not in df.columns:
|
796 |
+
df['update_ba'] = None
|
797 |
+
|
798 |
+
# Menambahkan kolom 'missing_comment' jika belum ada
|
799 |
+
if 'missing_comment' not in df.columns:
|
800 |
+
df['missing_comment'] = False
|
801 |
+
|
802 |
+
# Pastikan 'tanggal_masuk' dan 'tanggal' dalam format datetime yang benar
|
803 |
+
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
|
804 |
+
df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')
|
805 |
+
|
806 |
+
# Konversi kolom tanggal agar hanya menyimpan bagian tanggal tanpa waktu
|
807 |
+
df['tanggal'] = df['tanggal'].dt.date
|
808 |
+
df['tanggal_masuk'] = df['tanggal_masuk'].dt.date
|
809 |
+
df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date
|
810 |
+
|
811 |
+
# Menangani nilai NaT (Not a Time) jika ada
|
812 |
+
if df['tanggal_masuk'].isnull().any():
|
813 |
+
st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.")
|
814 |
+
df['tanggal_masuk'].fillna(pd.Timestamp.now().date(), inplace=True)
|
815 |
+
|
816 |
+
if df['tanggal'].isnull().any():
|
817 |
+
st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.")
|
818 |
+
df['tanggal'].fillna(pd.Timestamp.now().date(), inplace=True)
|
819 |
+
|
820 |
+
# Menambahkan kolom 'kandidat' jika belum ada dan mengisinya
|
821 |
+
if 'kandidat' not in df.columns:
|
822 |
+
df['kandidat'] = selected_candidate
|
823 |
+
|
824 |
+
# Mengambil subset kolom yang diperlukan
|
825 |
+
df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']]
|
826 |
+
|
827 |
+
# Perbarui nilai None di update_ba dengan tanggal_masuk
|
828 |
+
df['update_ba'].fillna(df['tanggal_masuk'], inplace=True)
|
829 |
+
|
830 |
+
# Menampilkan statistik dasar
|
831 |
+
st.subheader(f"Training Dataset for {selected_candidate}")
|
832 |
+
st.write(f"**Total rows in dataset:** {len(df)}")
|
833 |
+
|
834 |
+
if not df.empty:
|
835 |
+
# Visualisasi sebaran update BA
|
836 |
+
st.subheader("Visualisasi Postingan Berdasarkan Update BA")
|
837 |
+
ba_update_counts = df['update_ba'].value_counts().sort_index()
|
838 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
839 |
+
ba_update_counts.plot(kind='bar', ax=ax, color='blue')
|
840 |
+
ax.set_title('Sebaran Postingan Berdasarkan Update BA')
|
841 |
+
ax.set_xlabel('Tanggal Update BA')
|
842 |
+
ax.set_ylabel('Jumlah Postingan')
|
843 |
+
plt.xticks(rotation=45)
|
844 |
+
plt.tight_layout()
|
845 |
+
st.pyplot(fig)
|
846 |
+
|
847 |
+
# Visualisasi tambahan sebaran platform
|
848 |
+
st.subheader("Sebaran Platform Berdasarkan Update BA")
|
849 |
+
platform_counts = df['Platform'].value_counts()
|
850 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
851 |
+
platform_counts.plot(kind='bar', ax=ax, color='green')
|
852 |
+
ax.set_title('Sebaran Platform Berdasarkan Update BA')
|
853 |
+
ax.set_xlabel('Platform')
|
854 |
+
ax.set_ylabel('Jumlah Postingan')
|
855 |
+
plt.xticks(rotation=45)
|
856 |
+
plt.tight_layout()
|
857 |
+
st.pyplot(fig)
|
858 |
+
|
859 |
+
# Visualisasi jumlah komentar hilang berdasarkan platform
|
860 |
+
st.subheader("Jumlah Komentar Hilang Berdasarkan Platform")
|
861 |
+
missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index()
|
862 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
863 |
+
missing_comments_by_platform.plot(kind='bar', ax=ax, color='red')
|
864 |
+
ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform')
|
865 |
+
ax.set_xlabel('Platform')
|
866 |
+
ax.set_ylabel('Jumlah Komentar Hilang')
|
867 |
+
plt.xticks(rotation=45)
|
868 |
+
plt.tight_layout()
|
869 |
+
st.pyplot(fig)
|
870 |
+
|
871 |
+
# Filter berdasarkan status validasi
|
872 |
+
st.subheader("Filter Data")
|
873 |
+
validation_filter = st.radio(
|
874 |
+
"Choose data type to view:",
|
875 |
+
["All Data", "Validated Data", "Non-Validated Data"],
|
876 |
+
key='validation_filter'
|
877 |
+
)
|
878 |
+
|
879 |
+
if validation_filter == "Validated Data":
|
880 |
+
filtered_data = df[df['evaluated_by_data_train'] == True]
|
881 |
+
elif validation_filter == "Non-Validated Data":
|
882 |
+
filtered_data = df[df['evaluated_by_data_train'] == False]
|
883 |
+
else:
|
884 |
+
filtered_data = df
|
885 |
+
|
886 |
+
if not filtered_data.empty:
|
887 |
+
st.subheader(f"Filtered Data: {validation_filter}")
|
888 |
+
st.dataframe(filtered_data) # Menampilkan semua data yang sesuai dengan filter
|
889 |
+
else:
|
890 |
+
st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.")
|
891 |
+
|
892 |
+
# Menampilkan riwayat penambahan data
|
893 |
+
st.subheader("History of Data Additions")
|
894 |
+
try:
|
895 |
+
with open(history_path, "r") as f:
|
896 |
+
history = json.load(f)
|
897 |
+
|
898 |
+
history_list = []
|
899 |
+
for key, value in history.items():
|
900 |
+
for entry in value:
|
901 |
+
for k, v in entry.items():
|
902 |
+
history_list.append({
|
903 |
+
'key': key,
|
904 |
+
'field': k,
|
905 |
+
'date': v
|
906 |
+
})
|
907 |
+
|
908 |
+
history_df = pd.DataFrame(history_list)
|
909 |
+
st.dataframe(history_df)
|
910 |
+
except FileNotFoundError:
|
911 |
+
st.write("No addition history available.")
|
912 |
+
except ValueError as e:
|
913 |
+
st.error(f"An error occurred while loading history data: {e}")

            # Option to download the filtered dataset
            st.subheader("Download Options")
            if not filtered_data.empty:
                excel_buffer = io.BytesIO()
                with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
                    filtered_data.to_excel(writer, index=False, sheet_name='Filtered Dataset')
                excel_buffer.seek(0)

                st.download_button(
                    label=f"Download Filtered Dataset for {selected_candidate}",
                    data=excel_buffer,
                    file_name=f"filtered_training_dataset_{selected_candidate}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

            # Option to download the full dataset
            if not df.empty:
                excel_buffer_full = io.BytesIO()
                with pd.ExcelWriter(excel_buffer_full, engine='xlsxwriter') as writer:
                    df.to_excel(writer, index=False, sheet_name='Training Dataset')
                excel_buffer_full.seek(0)

                st.download_button(
                    label=f"Download Full Training Dataset for {selected_candidate}",
                    data=excel_buffer_full,
                    file_name=f"training_dataset_{selected_candidate}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

            # Feature to download the dataset based on a date filter
            st.subheader("Download Aggregated Data by Date")

            # Use the minimum and maximum dates as defaults for date_input
            if not df['tanggal'].empty:
                min_date = df['tanggal'].min()
                max_date = df['tanggal'].max()

                selected_start_date = st.date_input("Select start date for aggregation:", value=min_date)
                selected_end_date = st.date_input("Select end date for aggregation:", value=max_date)

                agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)]

                if not agg_filtered_data.empty:
                    # Make sure the 'kandidat' column exists and is filled
                    agg_filtered_data['kandidat'] = selected_candidate

                    aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg(
                        co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()),
                        co_support=('sentimen', lambda x: (x == 'Co Support').sum()),
                        co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()),
                        co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()),
                        co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()),
                        missing_comments=('missing_comment', 'sum')  # also aggregate missing_comment
                    ).reset_index()

                    # Add derived columns for comment counts
                    aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism']
                    aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative']
                    aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1)

                    st.dataframe(aggregated_data)

                    # Posts updated by BA, split by platform, within the selected date range
                    st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal")
                    ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate']
                    if not ba_update_range.empty:
                        plt.figure(figsize=(10, 6))
                        ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)')
                        plt.xlabel('Platform')
                        plt.ylabel('Jumlah Postingan')
                        st.pyplot(plt)

                        # Additional plot: distribution of posts by date
                        st.subheader("Sebaran Postingan Berdasarkan Tanggal")
                        plt.figure(figsize=(10, 6))
                        ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal')
                        plt.xlabel('Tanggal')
                        plt.ylabel('Jumlah Postingan')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Additional plot: distribution of BA updates
                        st.subheader("Sebaran Update BA")
                        plt.figure(figsize=(10, 6))
                        ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA')
                        plt.xlabel('Tanggal Update BA')
                        plt.ylabel('Jumlah Postingan')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Number of comments by tanggal_masuk
                        st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk")
                        plt.figure(figsize=(10, 6))
                        agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk')
                        plt.xlabel('Tanggal Masuk')
                        plt.ylabel('Jumlah Komentar')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Distribution of comments per platform
                        st.subheader("Sebaran Komentar di Tiap Platform")
                        plt.figure(figsize=(10, 6))
                        agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform')
                        plt.xlabel('Platform')
                        plt.ylabel('Jumlah Komentar')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Number of missing comments per post
                        st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan")
                        plt.figure(figsize=(10, 6))
                        aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan')
                        plt.xlabel('Link')
                        plt.ylabel('Jumlah Komentar Hilang')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)
                    else:
                        st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.")

                    # Option to download the aggregated dataset
                    excel_buffer_aggregated = io.BytesIO()
                    with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer:
                        aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data')
                    excel_buffer_aggregated.seek(0)

                    st.download_button(
                        label=f"Download Aggregated Data by Date for {selected_candidate}",
                        data=excel_buffer_aggregated,
                        file_name=f"aggregated_data_{selected_candidate}.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                else:
                    st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
            else:
                st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
        else:
            st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An error occurred: {e}")
    pass

if menu == "Evaluate Data Train":
    st.title("Evaluate Data Train")

    selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate')
    dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"

    try:
        df = pd.read_excel(dataset_path)

        # Load existing keyword dictionary
        try:
            with open('keywords.json', 'r') as f:
                keyword_dict = json.load(f)
            st.success("keywords.json loaded successfully.")
        except FileNotFoundError:
            st.error("keywords.json file not found. Please ensure the file is in the correct directory.")
            st.stop()
        except json.JSONDecodeError:
            st.error("keywords.json file is not a valid JSON. Please check the file format.")
            st.stop()

        # Select candidate-specific keywords
        candidate_key = selected_candidate.replace(' ', '_')
        candidate_keywords = keyword_dict.get(candidate_key)

        if not candidate_keywords:
            st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.")
            st.stop()

        keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords]

        # Data consistency validation
        st.subheader("Data Consistency Validation")
        missing_values = df.isnull().sum()
        st.write("Missing values in each column:")
        st.write(missing_values)

        # Remove rows with missing values
        st.write("Removing rows with missing values...")
        df.dropna(inplace=True)

        # Sentiment distribution
        st.subheader("Sentiment Distribution")
        sentiment_counts = df['sentimen'].value_counts()
        st.write("Number of comments for each sentiment:")
        st.write(sentiment_counts)

        # Data quality
        st.subheader("Data Quality Check")
        invalid_entries = df[df['komentar'].str.len() == 0]
        st.write(f"Number of invalid comments (empty): {len(invalid_entries)}")
        if len(invalid_entries) > 0:
            st.write("Invalid comments (empty):")
            st.dataframe(invalid_entries.head(100))  # Preview the first 100 rows

        # Detect inconsistent sentiment labels
        st.subheader("Inconsistent Sentiment Labels")
        duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)]
        inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique()
        inconsistent_labels = inconsistent_labels[inconsistent_labels > 1]
        if not inconsistent_labels.empty:
            inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)]
            st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}")
            st.dataframe(inconsistent_labels_df.head(100))  # Preview the first 100 rows
        else:
            st.write("No comments with inconsistent sentiment labels found.")

        # Problem handling
        st.subheader("Problem Handling")

        # Remove empty comments
        st.write("Removing invalid (empty) comments...")
        df = df[df['komentar'].str.len() > 0]

        # Interactive: handle comments with inconsistent sentiment labels
        st.write("Resolving inconsistent sentiment labels...")
        if not inconsistent_labels.empty:
            for index, row in inconsistent_labels_df.iterrows():
                st.write(f"Comment: {row['komentar']}")
                sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist()
                new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}')
                if st.button("Update Sentiment", key=f'update_{index}'):
                    update_sentiment(index, new_sentimen)

        # Clustering using keywords and the sarcasm model
        st.write("Clustering comments using keywords and sarcasm model...")
        keyword_vectorizer = TfidfVectorizer(vocabulary=keywords)
        X_keywords = keyword_vectorizer.fit_transform(df['komentar'])
        kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords)
        df['cluster'] = kmeans.labels_

        # Identify clusters that need review
        review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)]
        st.write("Clusters identified for review:")
        st.dataframe(review_clusters.head(100))  # Preview the first 100 rows

        # Prompt the user to add keywords
        st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")

        # Cluster visualization
        cluster_counts = df['cluster'].value_counts()
        st.write("Number of comments in each cluster:")
        st.write(cluster_counts)

        # Show the clustering results
        st.write("Comments clustered by patterns:")
        st.dataframe(df.head(100))  # Preview the first 100 rows

        # Export the analysis and handling results
        st.subheader("Export Final Data")
        json_buffer = io.BytesIO()
        df.to_json(json_buffer, orient='records', lines=True)
        json_buffer.seek(0)
        st.download_button(
            label=f"Download Final Data for {selected_candidate}",
            data=json_buffer,
            file_name=f"final_data_{selected_candidate}.json",
            mime="application/json"
        )

    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An error occurred: {e}")

    pass  # Placeholder

if menu == "Retraining Model":
    st.title("Retrain Model")
    selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))

    dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
    model_path = f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib"
    vectorizer_path = f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib"
    retrain_history_path = f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json"

    try:
        # Load dataset
        data = pd.read_excel(dataset_path)

        # Ensure 'komentar' is string
        if 'komentar' not in data.columns:
            st.error("Dataset must include a 'komentar' column.")
            st.stop()
        else:
            # Convert all comments to string
            data['komentar'] = data['komentar'].fillna('').astype(str)

        # Separate validated and unvalidated data
        if 'evaluated_by_cluster' in data.columns:
            validated_data = data[data['evaluated_by_cluster'] == True]
            unvalidated_data = data[data['evaluated_by_cluster'] == False]
        else:
            validated_data = pd.DataFrame(columns=data.columns)
            unvalidated_data = data

        st.write(f"**Validated Data:** {len(validated_data)} rows")
        st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")

        # Check if all data is validated
        if len(unvalidated_data) > 0:
            st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
            st.stop()

        # Combine all data for preprocessing
        combined_data = validated_data  # Only use validated data

        # Preprocessing function
        @st.cache_data(show_spinner=True)
        def preprocess_data(data):
            from joblib import Parallel, delayed

            def preprocess_comment(comment):
                comment = translate_emojis(comment)
                comment = normalize_unicode(comment)
                comment = handle_replies(comment)
                comment = clean_text(comment)
                comment = translate_text(comment, ntb_dict)
                comment = translate_text(comment, slang_dict)
                comment = handle_negation(comment)
                return comment

            data['processed_comments'] = Parallel(n_jobs=-1)(
                delayed(preprocess_comment)(c) for c in data['komentar']
            )
            return data

        # Preprocessing
        st.write("Starting preprocessing...")
        combined_data = preprocess_data(combined_data)

        if st.button("Retrain Model"):
            # Vectorization
            st.write("Vectorizing data...")
            vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
            X = vectorizer.fit_transform(combined_data['processed_comments'])
            y = combined_data['sentimen']

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Handle class imbalance with SMOTE
            st.write("Balancing data with SMOTE...")
            smote = SMOTE(random_state=42, n_jobs=-1)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

            # Train Random Forest model
            st.write("Training Random Forest model...")
            rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
            rf_model.fit(X_train_res, y_train_res)

            # Evaluate on training data
            st.write("Evaluating model...")
            y_pred_train = rf_model.predict(X_train)
            accuracy_train = accuracy_score(y_train, y_pred_train)
            report_train = classification_report(y_train, y_pred_train, output_dict=True)

            # Evaluate on test data
            y_pred_test = rf_model.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred_test)
            report_test = classification_report(y_test, y_pred_test, output_dict=True)

            # Save model and vectorizer
            st.write("Saving model and vectorizer...")
            joblib.dump(rf_model, model_path)
            joblib.dump(vectorizer, vectorizer_path)

            # Log retraining history
            st.write("Logging retraining history...")
            try:
                with open(retrain_history_path, "r") as f:
                    retrain_history = json.load(f)
            except FileNotFoundError:
                retrain_history = []

            retrain_history.append({
                "date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
                "accuracy_on_train": accuracy_train,
                "accuracy_on_test": accuracy_test,
                "f1_score_on_train": report_train['weighted avg']['f1-score'],
                "f1_score_on_test": report_test['weighted avg']['f1-score'],
            })

            with open(retrain_history_path, "w") as f:
                json.dump(retrain_history, f, indent=4)

            # Display results
            st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
            st.subheader("Model Metrics on Training Data")
            st.table(pd.DataFrame(report_train).T)
            st.subheader("Model Metrics on Test Data")
            st.table(pd.DataFrame(report_test).T)

            # Show retrain history
            st.subheader("Retrain History")
            st.json(retrain_history)

    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")

    pass  # Placeholder

if menu == "Maximize Preprocessing":
    st.title("Maximize Preprocessing")

    # Load the training dataset
    candidate = st.selectbox("Choose a candidate:", list(candidate_list))
    dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
    try:
        # Load dataset
        data = pd.read_excel(dataset_path)

        # Ensure 'komentar' is string
        if 'komentar' not in data.columns:
            st.error("Dataset must include a 'komentar' column.")
            st.stop()
        data['komentar'] = data['komentar'].fillna('').astype(str)

        # Preprocessing steps
        @st.cache_data(show_spinner=True)
        def preprocess_data(data):
            st.write("Starting preprocessing...")
            data['translated_emojis'] = data['komentar'].apply(translate_emojis)
            data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
            data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
            data['clean_text'] = data['reply_handled'].apply(clean_text)
            data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
            data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
            data['negation_handled'] = data['translated_slang'].apply(handle_negation)
            return data

        data = preprocess_data(data)

        # Check for unmapped words
        st.subheader("Check for Unmapped Words")
        all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split())
        unique_words = set(all_words)
        ntb_dict_keys = set(ntb_dict.keys())
        slang_dict_keys = set(slang_dict.keys())
        mapped_words = ntb_dict_keys.union(slang_dict_keys)
        unmapped_words = sorted(unique_words - mapped_words)

        if unmapped_words:
            st.write(f"Found **{len(unmapped_words)} unmapped words.**")

            # Choose how many words to display
            max_words = st.slider(
                "Select number of words to display:",
                min_value=10,
                max_value=len(unmapped_words),
                value=min(50, len(unmapped_words)),
                step=10,
            )

            # Build a DataFrame for the table view
            unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"])
            st.dataframe(unmapped_df.head(max_words))

            # Show the total count if needed
            st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.")
        else:
            st.success("No unmapped words found!")

        # Add words to a dictionary
        st.subheader("Add New Words to Dictionary")
        new_word = st.text_input("Enter new word:")
        normalized_word = st.text_input("Enter normalized form:")
        dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"])

        if st.button("Add to Dictionary"):
            if new_word and normalized_word:
                if dictionary_choice == "Kamus Alay":
                    slang_dict[new_word.lower()] = normalized_word
                    st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.")
                elif dictionary_choice == "Kamus ntb":
                    ntb_dict[new_word.lower()] = normalized_word
                    st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.")
            else:
                st.warning("Please enter both the new word and its normalized form.")

        # Save updates to file
        st.subheader("Save Updated Dictionaries")
        if st.button("Save Kamus Alay"):
            kamus_alay_path = '/content/kamusalay.csv'  # Adjust the path as needed
            pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]).to_csv(kamus_alay_path, index=False)
            st.success(f"Kamus Alay saved successfully to {kamus_alay_path}.")

        if st.button("Save Kamus ntb"):
            kamus_ntb_path = '/content/ntb_dict.json'  # Adjust the path as needed
            with open(kamus_ntb_path, 'w', encoding='utf-8') as f:
                json.dump(ntb_dict, f, indent=4)
            st.success(f"Kamus ntb saved successfully to {kamus_ntb_path}.")
    except FileNotFoundError:
        st.error(f"No training dataset found for {candidate}. Please ensure the dataset is available.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")

    pass  # Placeholder

if menu == "Update Keywords":
    st.title("Update Keywords")

    # Load existing keyword dictionary
    with open('keywords.json', 'r') as f:
        keyword_dict = json.load(f)

    # Show current keywords
    st.subheader("Current Keywords")
    candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
    for sentiment, keywords in keyword_dict[candidate].items():
        st.write(f"{sentiment}: {', '.join(keywords)}")

    # Add a new keyword
    st.subheader("Add New Keyword")
    new_keyword = st.text_input("Enter new keyword")
    selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))

    if st.button("Add Keyword"):
        if new_keyword and selected_sentiment:
            keyword_dict[candidate][selected_sentiment].append(new_keyword)
            with open('keywords.json', 'w') as f:
                json.dump(keyword_dict, f, indent=4)
            st.success(f"Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
        else:
            st.error("Please enter a keyword and select a sentiment")

    # Analyze Special Cluster
    st.subheader("Analyze Special Cluster")
    if 'ba_lainnya_data' in st.session_state:
        try:
            # Load the Special Cluster data directly
            special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
            if special_cluster_data.empty:
                st.warning("No data found in Special Cluster.")
            else:
                st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")

                all_words_special = []
                for comment in special_cluster_data['negation_handled']:
                    comment = translate_emojis(comment)
                    comment = normalize_unicode(comment)
                    comment = handle_replies(comment)
                    comment = clean_text(comment)
                    comment = translate_text(comment, {})  # Adjust based on your dictionary
                    comment = handle_negation(comment)
                    words = preprocess_text(comment)
                    all_words_special.extend(words)

                # Calculate word frequencies
                word_freq_special = Counter(all_words_special)

                # Add slider to select number of words to display
                num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
                most_common_words_special = word_freq_special.most_common(num_words_special)

                # Display word frequencies as a table
                st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
                word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
                st.dataframe(word_freq_df_special)

        except Exception as e:
            st.error(f"An error occurred: {e}")
    else:
        st.warning("No 'BA Lainnya' data found. Please classify comments first.")

    # Analyze training data
    st.subheader("Analyze Training Data")
    dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
    try:
        train_data = pd.read_excel(dataset_path)
        if train_data.empty:
            st.warning("Training dataset is empty.")
        else:
            all_words_train = []
            for comment in train_data['komentar'].astype(str):
                comment = translate_emojis(comment)
                comment = normalize_unicode(comment)
                comment = handle_replies(comment)
                comment = clean_text(comment)
                comment = translate_text(comment, {})  # Adjust based on your dictionary
                comment = handle_negation(comment)
                words = preprocess_text(comment)
                all_words_train.extend(words)

            # Calculate word frequencies
            word_freq_train = Counter(all_words_train)

            # Add slider to select number of words to display
            num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
            most_common_words_train = word_freq_train.most_common(num_words_train)

            # Display word frequencies as a table
            st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
            word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
            st.dataframe(word_freq_df_train)

    except FileNotFoundError:
        st.error(f"Training dataset for {candidate} not found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")

    # Option to export keywords
    st.subheader("Export Keywords")
    json_buffer = io.BytesIO()
    json_buffer.write(json.dumps(keyword_dict).encode('utf-8'))
    json_buffer.seek(0)
    st.download_button(
        label="Export Keywords",
        data=json_buffer,
        file_name="keywords.json",
        mime="application/json"
    )

    # Option to import keywords
    st.subheader("Import Keywords")
    uploaded_file = st.file_uploader("Choose a JSON file", type="json")
    if uploaded_file is not None:
        imported_keywords = json.load(uploaded_file)
        keyword_dict.update(imported_keywords)
        with open('keywords.json', 'w') as f:
            json.dump(keyword_dict, f, indent=4)
        st.success("Keywords imported successfully")
    pass

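Taken together, the commit pairs each candidate's training workbook (datasetntbnew_*.xlsx) with a fitted TF-IDF vectorizer and a Random Forest model. As a quick orientation for reviewers, here is a minimal sketch of how one of those per-candidate artifact pairs could be loaded for a single prediction. The file-naming convention follows app.py above; preprocess_comment is a hypothetical stand-in for the app's cleaning chain (emoji translation, Unicode normalization, reply handling, dictionary lookups, negation handling), not the actual helper.

import joblib

def preprocess_comment(comment: str) -> str:
    # Hypothetical stand-in: app.py chains translate_emojis, normalize_unicode,
    # handle_replies, clean_text, translate_text (ntb_dict / slang_dict) and handle_negation here.
    return comment

def predict_sentiment(comment: str, candidate: str) -> str:
    # Load the per-candidate artifacts using the same naming scheme as app.py.
    key = candidate.lower().replace(' ', '_')
    vectorizer = joblib.load(f"tfidf_vectorizer_{key}.joblib")
    model = joblib.load(f"best_rf_model_{key}.joblib")
    features = vectorizer.transform([preprocess_comment(comment)])
    return model.predict(features)[0]

# Example call (hypothetical comment text):
# predict_sentiment("mantap, dukung terus", "Lalu Muhamad Iqbal")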
best_rf_model_indah_dhamayanti_putri.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e28bb4dacc869a7b71e089118a07351a8de60175fce28b3b8e2b8c01e651ceba
size 15044729

best_rf_model_lalu_muhamad_iqbal.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d421efb1c2b5876a1dab8d8dc11a351d75064bf0c32a24ef3df3a9913670182
size 44015033

best_rf_model_m_suhaili.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:949b91ab83ead20c613ced16780bda52780ecac0d56c0f32ccec44131d44ebe2
size 13640633

best_rf_model_musyafirin.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7cb226ee3fab98e8d2af12cd5329f73beb54e4b5a1fa871d01c2c2029a31d5d2
size 6092665

best_rf_model_sitti_rohmi_djalilah.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7eecdcd277c204d1d771cd767169ab0f026ed8544516f7f43389aab32f0a27a6
size 24894969

best_rf_model_zulkieflimansyah.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f5d7ce2da36aadee463102fa50e42d658ac46d3a690ff81329d7a4d81956b0f
size 23188793

datasetntbnew_indah_dhamayanti_putri.xlsx
ADDED
Binary file (57.4 kB).

datasetntbnew_lalu_muhamad_iqbal.xlsx
ADDED
Binary file (104 kB).

datasetntbnew_m_suhaili.xlsx
ADDED
Binary file (57.9 kB).

datasetntbnew_musyafirin.xlsx
ADDED
Binary file (88.5 kB).

datasetntbnew_sitti_rohmi_djalilah.xlsx
ADDED
Binary file (67.5 kB).

datasetntbnew_zulkieflimansyah.xlsx
ADDED
Binary file (67.6 kB).
kamusalay.csv
ADDED
@@ -0,0 +1,238 @@
1 |
+
ajh,saja
|
2 |
+
ajj,saja
|
3 |
+
akoh,aku
|
4 |
+
amaca,ah masa
|
5 |
+
amuh,kamu
|
6 |
+
aph,apa
|
7 |
+
apose,apa
|
8 |
+
apz,apa
|
9 |
+
aq,saya
|
10 |
+
baryaw,sabar ya
|
11 |
+
baryw,sabar ya
|
12 |
+
bryaw,sabar ya
|
13 |
+
bryw,sabar ya
|
14 |
+
bay,selamat tinggal
|
15 |
+
besoq,besok
|
16 |
+
beud,banget
|
17 |
+
bhay,selamat tinggal
|
18 |
+
bisya,bisa
|
19 |
+
biza,bisa
|
20 |
+
bntr,sebentar
|
21 |
+
bokap,ayah
|
22 |
+
bkap,ayah
|
23 |
+
bkp,ayah
|
24 |
+
bokaps,ayah
|
25 |
+
bokapz,ayah
|
26 |
+
bs,bisa
|
27 |
+
bsa,bisa
|
28 |
+
bsk,besok
|
29 |
+
bsoq,besok
|
30 |
+
bte,bosan
|
31 |
+
bozen,bosan
|
32 |
+
bozn,bosan
|
33 |
+
bzn,bosan
|
34 |
+
bzen,bosan
|
35 |
+
cabut,pergi
|
36 |
+
caiank,sayang
|
37 |
+
cekola,sekolah
|
38 |
+
cekolah,sekolah
|
39 |
+
celalaw,selalu
|
40 |
+
celalu,selalu
|
41 |
+
cemungudh,semangat
|
42 |
+
cemungut,semangat
|
43 |
+
cemunguth,semangat
|
44 |
+
cibuq,sibuk
|
45 |
+
cini,sini
|
46 |
+
ciyus,serius
|
47 |
+
cll,selalu
|
48 |
+
cllu,selalu
|
49 |
+
cllw,selalu
|
50 |
+
cpe,capek
|
51 |
+
cpee,capek
|
52 |
+
cewe,cewek
|
53 |
+
cwe,cewek
|
54 |
+
cowo,cowok
|
55 |
+
cwo,cowok
|
56 |
+
dah,sudah
|
57 |
+
dapa,ada apa
|
58 |
+
dapah,ada apa
|
59 |
+
dftr,daftar
|
60 |
+
dh,sudah
|
61 |
+
dimance,dimana
|
62 |
+
dimandose,dimana
|
63 |
+
dimans,dimana
|
64 |
+
duluw,dulu
|
65 |
+
ea,ya
|
66 |
+
emg,memang
|
67 |
+
eteb,bosan
|
68 |
+
g,tidak
|
69 |
+
ga,tidak
|
70 |
+
gabut,menganggur
|
71 |
+
gak,tidak
|
72 |
+
gakz,tidak
|
73 |
+
gatau,tidak tahu
|
74 |
+
gataw,tidak tahu
|
75 |
+
gengges,ganggu
|
76 |
+
ghiy,lagi
|
77 |
+
gi,lagi
|
78 |
+
gk,tidak
|
79 |
+
gpp,tidak apa apa
|
80 |
+
gtw,tidak tahu
|
81 |
+
gue,saya
|
82 |
+
gw,saya
|
83 |
+
gx,tidak
|
84 |
+
hums,rumah
|
85 |
+
humz,rumah
|
86 |
+
huum,iya
|
87 |
+
iy,iya
|
88 |
+
iyach,iya
|
89 |
+
iyap,iya
|
90 |
+
iyapz,iya
|
91 |
+
iyup,iya
|
92 |
+
iyupz,iya
|
93 |
+
iz,iya
|
94 |
+
iza,iya
|
95 |
+
izza,iya
|
96 |
+
jamber,jam berapa
|
97 |
+
jd,jadi
|
98 |
+
jdi,jadi
|
99 |
+
jg,juga
|
100 |
+
jga,juga
|
101 |
+
jgn,jangan
|
102 |
+
jngan,jangan
|
103 |
+
jngn,jangan
|
104 |
+
kacian,kasihan
|
105 |
+
kaka,kakak
|
106 |
+
kau,kamu
|
107 |
+
keles,kali
|
108 |
+
kenapah,kenapa
|
109 |
+
kenaps,kenapa
|
110 |
+
kenapz,kenapa
|
111 |
+
kepo,ingin tahu
|
112 |
+
keyen,keren
|
113 |
+
khan,kan
|
114 |
+
khanz,kan
|
115 |
+
kk,kakak
|
116 |
+
klo,kalau
|
117 |
+
klw,kalau
|
118 |
+
km,kamu
|
119 |
+
kmrn,kemarin
|
120 |
+
kmu,kamu
|
121 |
+
knp,kenapa
|
122 |
+
koq,kok
|
123 |
+
kpan,kapan
|
124 |
+
kpn,kapan
|
125 |
+
kuq,kok
|
126 |
+
kuy,ayo
|
127 |
+
kw,kamu
|
128 |
+
kzl,kesal
|
129 |
+
lam,salam
|
130 |
+
leh,boleh
|
131 |
+
lo,kamu
|
132 |
+
loe,kamu
|
133 |
+
lom,belum
|
134 |
+
low,kalau
|
135 |
+
lp,lupa
|
136 |
+
lu,kamu
|
137 |
+
luchu,lucu
|
138 |
+
lum,belum
|
139 |
+
lun,belum
|
140 |
+
luthu,lucu
|
141 |
+
lw,kamu
|
142 |
+
maacih,terima kasih
|
143 |
+
maap,maaf
|
144 |
+
mager,malas bergerak
|
145 |
+
makaci,terima kasih
|
146 |
+
maw,mau
|
147 |
+
miapa,demi apa
|
148 |
+
miapah,demi apa
|
149 |
+
misal'a,misalnya
|
150 |
+
muup,maaf
|
151 |
+
mu'uv,maaf
|
152 |
+
mw,mau
|
153 |
+
nak,anak
|
154 |
+
naq,anak
|
155 |
+
nax,anak
|
156 |
+
nda,tidak
|
157 |
+
ndak,tidak
|
158 |
+
ndax,tidak
|
159 |
+
ngabungin,menggabungkan
|
160 |
+
ngajak,mengajak
|
161 |
+
ngerokok,merokok
|
162 |
+
ngga,tidak
|
163 |
+
nggak,tidak
|
164 |
+
nggax,tidak
|
165 |
+
nggesek,menggesek
|
166 |
+
nggosok,menggosok
|
167 |
+
ngibul,berbohong
|
168 |
+
nyokap,ibu
|
169 |
+
nykap,ibu
|
170 |
+
nykaps,ibu
|
171 |
+
nykapz,ibu
|
172 |
+
nykp,ibu
|
173 |
+
nich,ini
|
174 |
+
nntn,menonton
|
175 |
+
ntn,menonton
|
176 |
+
oc,oke
|
177 |
+
oce,oke
|
178 |
+
ohh,oh
|
179 |
+
ok,oke
|
180 |
+
okedech,oke
|
181 |
+
okedeh,oke
|
182 |
+
okeh,oke
|
183 |
+
okz,oke
|
184 |
+
org,orang
|
185 |
+
ouch,oh
|
186 |
+
ouh,oh
|
187 |
+
owh,oh
|
188 |
+
pasutri,pasangan suami istri
|
189 |
+
paz,pas
|
190 |
+
pengen,ingin
|
191 |
+
pengin,ingin
|
192 |
+
pgn,ingin
|
193 |
+
psti,pasti
|
194 |
+
pzt,pasti
|
195 |
+
q,saya
|
196 |
+
qaqa,kakak
|
197 |
+
qq,kakak
|
198 |
+
rmh,rumah
|
199 |
+
sabeb,bebas
|
200 |
+
sabi,bisa
|
201 |
+
salfok,salah fokus
|
202 |
+
saltum,salah kostum
|
203 |
+
sdh,sudah
|
204 |
+
selaw,santai
|
205 |
+
selow,santai
|
206 |
+
shap,siap
|
207 |
+
shaps,siap
|
208 |
+
syipp,sip
|
209 |
+
syp,siapa
|
210 |
+
tau,tahu
|
211 |
+
tauk,tahu
|
212 |
+
tdk,tidak
|
213 |
+
telp,telepon
|
214 |
+
tgl,tanggal
|
215 |
+
thx,terima kasih
|
216 |
+
tipi,televisi
|
217 |
+
tp,tapi
|
218 |
+
tq,terima kasih
|
219 |
+
trims,terima kasih
|
220 |
+
trimz,terima kasih
|
221 |
+
tuch,itu
|
222 |
+
tw,tahu
|
223 |
+
u,kamu
|
224 |
+
u,kamu
|
225 |
+
udah,sudah
|
226 |
+
udd,sudah
|
227 |
+
udh,sudah
|
228 |
+
uga,juga
|
229 |
+
von,telepon
|
230 |
+
w,saya
|
231 |
+
wad,buat
|
232 |
+
wat,buat
|
233 |
+
yank,sayang
|
234 |
+
yap,ya
|
235 |
+
yaw,ya
|
236 |
+
yoi,iya
|
237 |
+
yups,ya
|
238 |
+
yupz,ya
|
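kamusalay.csv stores the slang dictionary as plain slang,formal pairs, one per line and without a header row. Below is a small sketch of how such a file could be read into the slang_dict mapping that the preprocessing code consults; the path and the header-skipping guard are assumptions, not taken from app.py.

import csv

def load_slang_dict(path: str = "kamusalay.csv") -> dict:
    # Assumption: each row is "slang,formal"; skip a header row if one is ever written
    # (the "Save Kamus Alay" button in app.py writes columns named slang/formal).
    mapping = {}
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            if len(row) >= 2 and row[0].strip().lower() != "slang":
                mapping[row[0].strip().lower()] = row[1].strip()
    return mapping

# slang_dict = load_slang_dict()
# slang_dict["aq"] -> "saya"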
keywords.json
ADDED
@@ -0,0 +1,37 @@
{
    "Musyafirin": {
        "Co Likes": ["keren", "bagus", "diakui", "disukai", "tegas"],
        "Co Support": ["pemimpin baik", "pilihan tepat", "jujur", "adil", "kinerja baik"],
        "Co Optimism": ["maju terus", "berhasil", "terdepan", "pengaruh positif", "optimis"],
        "Co Negative": ["kekurangan", "buruk", "tidak peduli", "masalah", "tidak mampu"],
        "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
    },
    "Sitti_Rohmi_Djalillah": {
        "Co Likes": ["baik", "cantik", "inspiratif", "cerdas", "menarik"],
        "Co Support": ["hebat", "terbaik", "pemimpin bijak", "solid", "juara", "unggul"],
        "Co Optimism": ["masa depan cerah", "harapan", "kepercayaan", "optimis", "juara"],
        "Co Negative": ["gagal", "tidak mendukung", "lemah", "tidak suka", "korupsi"],
        "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
    },
    "Zulkieflimansyah": {
        "Co Likes": ["inspiratif", "cerdas", "berprestasi", "bagus", "terpuji"],
        "Co Support": ["terbaik", "pemimpin inspiratif", "solid", "juara", "bijaksana"],
        "Co Optimism": ["optimis", "harapan", "masa depan", "kemenangan", "perubahan positif"],
        "Co Negative": ["tidak berprestasi", "isu korupsi", "lemah", "tidak mendukung"],
        "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
    },
    "Lalu_Muhamad_Iqbal": {
        "Co Likes": ["bagus", "baik", "keren", "disukai", "cocok"],
        "Co Support": ["dukung", "pilih", "mantap", "semangat", "nomor satu", "hebat"],
        "Co Optimism": ["harapan", "optimis", "menang", "sukses", "terbaik", "pemimpin"],
        "Co Negative": ["fitnah", "bohong", "tidak suka", "jelek", "kalah", "buruk"],
        "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
    },
    "Indah_Dhamayanti_Putri": {
        "Co Likes": ["bagus", "menarik", "cocok", "cantik", "baik hati"],
        "Co Support": ["semangat", "mantap", "pilihan tepat", "hebat", "menang", "dukung terus"],
        "Co Optimism": ["sukses", "maju", "terbaik", "inspirasi", "masa depan", "optimis"],
        "Co Negative": ["isu", "korupsi", "tidak baik", "cacat", "buruk", "jelek"],
        "Co Sarkastic": ["oh tentu", "iya benar", "seolah-olah", "oh hebat", "pasti", "benar sekali", "sangat meyakinkan", "tidak mungkin", "teruskan", "oh iya"]
    }
}
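keywords.json groups each candidate's keywords under the five sentiment labels used in the training data. As a rough illustration only (the app's actual labelling relies on the trained models and TF-IDF clustering, not on plain substring matching), keyword hits per label could be counted like this; the function name is hypothetical.

import json

def keyword_hits(comment: str, candidate_key: str, path: str = "keywords.json") -> dict:
    # candidate_key uses underscores, e.g. "Lalu_Muhamad_Iqbal", matching the JSON keys above.
    with open(path, "r", encoding="utf-8") as f:
        keyword_dict = json.load(f)
    text = comment.lower()
    return {
        sentiment: sum(1 for kw in kws if kw.lower() in text)
        for sentiment, kws in keyword_dict[candidate_key].items()
    }

# keyword_hits("mantap, dukung terus nomor satu", "Lalu_Muhamad_Iqbal")
# -> a count per label; highest for "Co Support" in this example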
ntb_dict.json
ADDED
@@ -0,0 +1,396 @@
1 |
+
{
|
2 |
+
"gawe": "kerja",
|
3 |
+
"kepeng": "uang",
|
4 |
+
"mae": "datang",
|
5 |
+
"menyaman": "nyaman",
|
6 |
+
"bere": "berani",
|
7 |
+
"muter": "berjalan-jalan",
|
8 |
+
"endek": "tidak",
|
9 |
+
"lang": "belum",
|
10 |
+
"ngena": "makan",
|
11 |
+
"tongka": "pergi",
|
12 |
+
"nanem": "tanam",
|
13 |
+
"menteleng": "melihat",
|
14 |
+
"tepek": "tangan",
|
15 |
+
"dewe": "sendiri",
|
16 |
+
"sambel": "sambal",
|
17 |
+
"kene": "sini",
|
18 |
+
"bare": "baru",
|
19 |
+
"kek": "seperti",
|
20 |
+
"sedek": "sedikit",
|
21 |
+
"buin": "lagi",
|
22 |
+
"bareng": "bersama",
|
23 |
+
"beleng": "belok",
|
24 |
+
"reng": "orang",
|
25 |
+
"batur": "teman",
|
26 |
+
"lepok": "berbicara",
|
27 |
+
"gubuk": "rumah",
|
28 |
+
"lombok": "cabe",
|
29 |
+
"santun": "hormat",
|
30 |
+
"jelo": "jelek",
|
31 |
+
"susur": "bersih",
|
32 |
+
"laek": "naik",
|
33 |
+
"tembe": "kemudian",
|
34 |
+
"kereng": "keras",
|
35 |
+
"kajang": "jatuh",
|
36 |
+
"raos": "enak",
|
37 |
+
"tampah": "kotoran",
|
38 |
+
"engat": "ingat",
|
39 |
+
"ken": "kenal",
|
40 |
+
"baro": "kemarin",
|
41 |
+
"silo": "tidur",
|
42 |
+
"temek": "kecil",
|
43 |
+
"gole": "pergi",
|
44 |
+
"betuk": "buat",
|
45 |
+
"peng": "sakit",
|
46 |
+
"taman": "tambah",
|
47 |
+
"kunci": "kunci",
|
48 |
+
"sesu": "siap",
|
49 |
+
"pon": "sudah",
|
50 |
+
"kut": "kamu",
|
51 |
+
"gete": "besar",
|
52 |
+
"lingeh": "dengar",
|
53 |
+
"bueh": "jauh",
|
54 |
+
"male": "malu",
|
55 |
+
"pacong": "pelit",
|
56 |
+
"sate": "tidak ada",
|
57 |
+
"gati": "sangat",
|
58 |
+
"dase": "hidup",
|
59 |
+
"pukul": "pukul",
|
60 |
+
"rugu": "bodoh",
|
61 |
+
"tengaq": "tengah",
|
62 |
+
"juak": "jual",
|
63 |
+
"bijak": "bijak",
|
64 |
+
"seman": "sehat",
|
65 |
+
"masok": "masuk",
|
66 |
+
"lauk": "ikan",
|
67 |
+
"lengit": "hilang",
|
68 |
+
"pek": "samping",
|
69 |
+
"peteng": "gelap",
|
70 |
+
"rangkat": "angkat",
|
71 |
+
"sarak": "cepat",
|
72 |
+
"selak": "takut",
|
73 |
+
"tapok": "tutup",
|
74 |
+
"tepuk": "tangan",
|
75 |
+
"tere": "kiri",
|
76 |
+
"tuan": "tuan",
|
77 |
+
"ungak": "lompat",
|
78 |
+
"turun": "turun",
|
79 |
+
"waktu": "waktu",
|
80 |
+
"wuri": "belakang",
|
81 |
+
"yakin": "yakin",
|
82 |
+
"zaman": "zaman",
|
83 |
+
"nggawe": "sedang bekerja",
|
84 |
+
"ngena-ngena": "sedang makan",
|
85 |
+
"nanemin": "menanamkan",
|
86 |
+
"ngelingeh": "mendengarkan",
|
87 |
+
"nenga": "melihat",
|
88 |
+
"nengaq": "melihat",
|
89 |
+
"golet": "berpergian",
|
90 |
+
"lepokin": "membicarakan",
|
91 |
+
"betukin": "membuat",
|
92 |
+
"masukin": "memasukkan",
|
93 |
+
"jualin": "menjual",
|
94 |
+
"angkatin": "mengangkat",
|
95 |
+
"melangit": "melihat ke atas",
|
96 |
+
"nenggo": "menunggu",
|
97 |
+
"sedek-sedek": "sedikit-sedikit",
|
98 |
+
"bare-tek": "baru saja",
|
99 |
+
"lepok-lepok": "pembicaraan",
|
100 |
+
"dase-dase": "menghidupkan",
|
101 |
+
"paconge": "kepelitan",
|
102 |
+
"tapokin": "menutupkan",
|
103 |
+
"kerengin": "mengeraskan",
|
104 |
+
"silo-siloan": "sering tidur",
|
105 |
+
"ndek-nggawe": "tidak bekerja",
|
106 |
+
"kereng-kereng": "sangat keras",
|
107 |
+
"pacong-pacong": "sangat pelit",
|
108 |
+
"beleng-beleng": "belokan",
|
109 |
+
"tapok-tapok": "penutupan",
|
110 |
+
"kepeng-kepeng": "uang-uang",
|
111 |
+
"anake": "anaknya",
|
112 |
+
"nyong": "saya",
|
113 |
+
"ku": "aku",
|
114 |
+
"kit": "kita",
|
115 |
+
"iyong": "dia",
|
116 |
+
"nene": "mereka",
|
117 |
+
"geteh": "besar",
|
118 |
+
"alang": "tinggi",
|
119 |
+
"lendong": "lembut",
|
120 |
+
"ngele": "panas",
|
121 |
+
"se": "satu",
|
122 |
+
"due": "dua",
|
123 |
+
"telu": "tiga",
|
124 |
+
"empat": "empat",
|
125 |
+
"lima": "lima",
|
126 |
+
"enem": "enam",
|
127 |
+
"pitu": "tujuh",
|
128 |
+
"wolu": "delapan",
|
129 |
+
"sia": "sembilan",
|
130 |
+
"sepulu": "sepuluh",
|
131 |
+
"sewelas": "sebelas",
|
132 |
+
"duwelas": "dua belas",
|
133 |
+
"selikur": "dua puluh satu",
|
134 |
+
"telu likur": "dua puluh tiga",
|
135 |
+
"sekedik": "sedikit",
|
136 |
+
"bare-bare": "baru-baru",
|
137 |
+
"tembe-tembe": "nanti-nanti",
|
138 |
+
"reng-reng": "orang-orang",
|
139 |
+
"sambel-sambel": "bermacam-macam sambal",
|
140 |
+
"silo-silo": "berkali-kali tidur",
|
141 |
+
"sate-sate": "tidak ada sama sekali",
|
142 |
+
"gole-gole": "sering pergi",
|
143 |
+
"batur-batur": "teman-teman",
|
144 |
+
"ane": "saya",
|
145 |
+
"eto": "itu",
|
146 |
+
"maej": "mari",
|
147 |
+
"tangkong": "naik",
|
148 |
+
"tie": "di sana",
|
149 |
+
"skek": "sedikit",
|
150 |
+
"arik": "adik",
|
151 |
+
"tabah pribadi": "kuat secara pribadi",
|
152 |
+
"suhu": "guru",
|
153 |
+
"side": "anda",
|
154 |
+
"arak": "minuman keras",
|
155 |
+
"ruan": "ruang",
|
156 |
+
"paut": "ikat",
|
157 |
+
"jari": "jadi",
|
158 |
+
"penjuluk": "julukan",
|
159 |
+
"ndekn": "tidak (NTB)",
|
160 |
+
"care": "peduli",
|
161 |
+
"nenak": "enak",
|
162 |
+
"k'tuan": "tuan",
|
163 |
+
"ndek": "tidak",
|
164 |
+
"akak": "kakak",
|
165 |
+
"milu": "ikut",
|
166 |
+
"ust": "ustadz",
|
167 |
+
"laun": "pelan-pelan",
|
168 |
+
"mun": "kalau",
|
169 |
+
"wah": "wah",
|
170 |
+
"jadi": "jadi",
|
171 |
+
"gubernur": "gubernur",
|
172 |
+
"ja": "sudah",
|
173 |
+
"ngkah": "langkah",
|
174 |
+
"noglh": "menyusul",
|
175 |
+
"berbaur": "berbaur",
|
176 |
+
"karingan": "kering",
|
177 |
+
"aran": "nama",
|
178 |
+
"nane": "nama panggilan",
|
179 |
+
"kancen": "teman",
|
180 |
+
"nyalon": "calonkan diri",
|
181 |
+
"biase": "biasa",
|
182 |
+
"boyaq": "bohong",
|
183 |
+
"suare": "suara",
|
184 |
+
"lemaq": "bagus",
|
185 |
+
"ngengat": "memukul",
|
186 |
+
"bae": "baik",
|
187 |
+
"ndkn": "tidak (variant)",
|
188 |
+
"mle": "memulai",
|
189 |
+
"te": "ke sana",
|
190 |
+
"isik": "isi",
|
191 |
+
"sak": "sempit",
|
192 |
+
"iye": "iya",
|
193 |
+
"muk": "mulut",
|
194 |
+
"melek": "melek",
|
195 |
+
"ky": "seperti",
|
196 |
+
"kire": "kirikanan",
|
197 |
+
"jemaq": "banyak",
|
198 |
+
"seandaian": "seandainya",
|
199 |
+
"ne": "di sini",
|
200 |
+
"mele": "pergi",
|
201 |
+
"ye": "dia",
|
202 |
+
"malik": "balik",
|
203 |
+
"maraq": "semangat",
|
204 |
+
"ngini": "disini",
|
205 |
+
"perli": "sindir",
|
206 |
+
"melene": "lemah",
|
207 |
+
"ampok": "sampai",
|
208 |
+
"manto": "mantap",
|
209 |
+
"nge": "kamu",
|
210 |
+
"lalo": "pergi",
|
211 |
+
"ndk": "tidak",
|
212 |
+
"ta": "jangan",
|
213 |
+
"taok": "ke sana",
|
214 |
+
"pilen": "pemilu",
|
215 |
+
"min": "makanan ringan",
|
216 |
+
"dwg": "dengar",
|
217 |
+
"selebung": "tutup",
|
218 |
+
"enden": "endapkan",
|
219 |
+
"unin": "suara",
|
220 |
+
"mule": "pulang",
|
221 |
+
"lamun": "jika",
|
222 |
+
"ndkmn": "tidak mungkin",
|
223 |
+
"pilek": "pemilu",
|
224 |
+
"jak": "pergi",
|
225 |
+
"wayen": "waktu",
|
226 |
+
"pesilak": "minta tolong",
|
227 |
+
"balen": "kembali",
|
228 |
+
"pastin": "pastikan",
|
229 |
+
"laguk": "lagu",
|
230 |
+
"poton": "potong",
|
231 |
+
"idungm": "hidung",
|
232 |
+
"lamper": "lampirkan",
|
233 |
+
"sik": "juga",
|
234 |
+
"gemes": "tertarik",
|
235 |
+
"pete": "kacang panjang",
|
236 |
+
"yg": "yang",
|
237 |
+
"geratis": "gratis",
|
238 |
+
"melak": "melakukan",
|
239 |
+
"wahm": "wah",
|
240 |
+
"abotk": "berat",
|
241 |
+
"eak": "iya",
|
242 |
+
"belecok": "berbelok",
|
243 |
+
"mauk": "masuk",
|
244 |
+
"bdoe": "bodoh",
|
245 |
+
"mesak": "merasa",
|
246 |
+
"kentok": "kena",
|
247 |
+
"nani": "nanti",
|
248 |
+
"melen": "mendengar",
|
249 |
+
"besile": "berita",
|
250 |
+
"kance": "teman",
|
251 |
+
"gub": "daerah",
|
252 |
+
"bedengah": "tengah",
|
253 |
+
"lirimn": "lihat",
|
254 |
+
"wea": "anda",
|
255 |
+
"adoo": "ada",
|
256 |
+
"tenak": "makan",
|
257 |
+
"tye": "siapa",
|
258 |
+
"juluk": "julukan",
|
259 |
+
"peneng": "tenang",
|
260 |
+
"ampureeee": "maafkan",
|
261 |
+
"eku": "aku",
|
262 |
+
"loq": "siapa",
|
263 |
+
"maukn": "mau",
|
264 |
+
"angen": "bisa",
|
265 |
+
"kake": "takut",
|
266 |
+
"seragem": "seragam",
|
267 |
+
"senu": "biasa",
|
268 |
+
"keruan": "sangat",
|
269 |
+
"tepileq": "bisa",
|
270 |
+
"taon": "tahun",
|
271 |
+
"man": "saya",
|
272 |
+
"dait": "kait",
|
273 |
+
"sengak": "pintar",
|
274 |
+
"uah": "wah",
|
275 |
+
"surukm": "suruh",
|
276 |
+
"lasing": "berlaku",
|
277 |
+
"komenank": "komentar",
|
278 |
+
"jage": "jaga",
|
279 |
+
"melem": "makan",
|
280 |
+
"mako": "maaf",
|
281 |
+
"pileklah": "sudah",
|
282 |
+
"sdh": "sudah",
|
283 |
+
"permakoan": "pergaulan",
|
284 |
+
"ape": "apa",
|
285 |
+
"ite": "itu",
|
286 |
+
"jakm": "jaket",
|
287 |
+
"sai": "saya",
|
288 |
+
"maseh": "masih",
|
289 |
+
"maukm": "mau",
|
290 |
+
"timak": "ambil",
|
291 |
+
"auk": "satu",
|
292 |
+
"an": "saya",
|
293 |
+
"tadahn": "menangkap",
|
294 |
+
"kenak": "kena",
|
295 |
+
"berugak": "berdiri",
|
296 |
+
"elen": "lihat",
|
297 |
+
"setil": "segala",
|
298 |
+
"heh": "hei",
|
299 |
+
"kanatooo": "kenapa",
|
300 |
+
"made": "sudah",
|
301 |
+
"mpoipu": "mencari",
|
302 |
+
"panjamba": "panjang",
|
303 |
+
"ncau": "cau",
|
304 |
+
"rew": "redha",
|
305 |
+
"ur": "mau",
|
306 |
+
"karukumi": "berkurang",
|
307 |
+
"lokina": "di sini",
|
308 |
+
"wara": "uang",
|
309 |
+
"tanda-tanda": "tanda",
|
310 |
+
"ompu": "panggil",
|
311 |
+
"suki": "suka",
|
312 |
+
"doho": "bisa",
|
313 |
+
"ede": "di",
|
314 |
+
"na": "ada",
|
315 |
+
"noro": "apa",
|
316 |
+
"weaku": "aku",
|
317 |
+
"ragam": "beragam",
|
318 |
+
"ndi": "itu",
|
319 |
+
"aumu": "saya",
|
320 |
+
"ba": "baik",
|
321 |
+
"ma": "ya",
|
322 |
+
"meta": "mata",
|
323 |
+
"de": "di",
|
324 |
+
"bolpoin": "pulpen",
|
325 |
+
"wa": "wah",
|
326 |
+
"mpoi": "sampai",
|
327 |
+
"ba loan": "tidak ada",
|
328 |
+
"dahu": "kebun",
|
329 |
+
"k ntuwu": "kuat",
|
330 |
+
"weki": "hai",
|
331 |
+
"dou doho": "sangat",
|
332 |
+
"ringu": "melihat",
|
333 |
+
"aka": "sebutan",
|
334 |
+
"ncau re": "cau",
|
335 |
+
"ina": "ibu",
|
336 |
+
"mpanga": "mendengar",
|
337 |
+
"au": "saya",
|
338 |
+
"baba": "ayah",
|
339 |
+
"pala": "kepala",
|
340 |
+
"ngahi": "indah",
|
341 |
+
"hafa": "terus",
|
342 |
+
"karaka": "gampang",
|
343 |
+
"podaku": "saya",
|
344 |
+
"ne'e": "disini",
|
345 |
+
"wati": "perempuan",
|
346 |
+
"dahuna": "ada",
|
347 |
+
"loko": "tangan",
|
348 |
+
"ro": "sana",
|
349 |
+
"waura": "tempat",
|
350 |
+
"mbuku": "buku",
|
351 |
+
"konee": "kamu",
|
352 |
+
"matundu": "kebun",
|
353 |
+
"piti": "kecil",
|
354 |
+
"mudh": "mudah",
|
355 |
+
"progrm": "program",
|
356 |
+
"kturunanx": "turunan",
|
357 |
+
"ndiha": "disana",
|
358 |
+
"ece": "anak",
|
359 |
+
"kamanae": "kemana",
|
360 |
+
"ngomi": "ngomong",
|
361 |
+
"malao": "berlari",
|
362 |
+
"ipi": "ujung",
|
363 |
+
"sangufi": "bisa",
|
364 |
+
"hambu": "sangat",
|
365 |
+
"hondo": "berasa",
|
366 |
+
"langgengkan": "terus",
|
367 |
+
"jelung": "terkenal",
|
368 |
+
"kece": "keren",
|
369 |
+
"nggih": "ya",
|
370 |
+
"mlang": "jalan",
|
371 |
+
"tepung": "ketemu",
|
372 |
+
"ketok": "kelihatan",
|
373 |
+
"tamba": "obat",
|
374 |
+
"tulung": "tolong",
|
375 |
+
"wet": "air",
|
376 |
+
"ndemek": "menyentuh",
|
377 |
+
"nyandak": "mengambil",
|
378 |
+
"mbet": "memeluk",
|
379 |
+
"tepe": "mendorong",
|
380 |
+
"kliru": "salah",
|
381 |
+
"luweh": "lebih",
|
382 |
+
"akeh": "banyak",
|
383 |
+
"cemeng": "hitam",
|
384 |
+
"abang": "merah",
|
385 |
+
"jembar": "luas",
|
386 |
+
"ngombe": "minum",
|
387 |
+
"nyonggo": "membawa",
|
388 |
+
"nyilih": "meminjam",
|
389 |
+
"krempyeng-krempyeng": "sedikit demi sedikit",
|
390 |
+
"tekuk-tekuk": "membungkuk-bungkuk",
|
391 |
+
"sampeyan": "kamu (halus)",
|
392 |
+
"awak": "badan",
|
393 |
+
"satus": "seratus",
|
394 |
+
"sewu": "seribu",
|
395 |
+
"sejuta": "sejuta"
|
396 |
+
}
|
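ntb_dict.json maps regional (Sasak/NTB) words to standard Indonesian. Here is a minimal sketch of the kind of token-by-token substitution a translate_text(comment, dictionary) helper would perform; the real helper is defined in app.py, so treat this as an illustrative stand-in rather than its exact implementation.

import json

with open("ntb_dict.json", "r", encoding="utf-8") as f:
    ntb_dict = json.load(f)

def translate_text(comment: str, dictionary: dict) -> str:
    # Replace each whitespace-delimited token found in the dictionary; keep everything else.
    return " ".join(dictionary.get(token, token) for token in comment.lower().split())

# translate_text("ndek kepeng", ntb_dict) -> "tidak uang"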
tfidf_vectorizer_indah_dhamayanti_putri.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:928fa30e9c66bd841663f7cef0c25adbbce5e51031219779a226eae424b63783
size 24377

tfidf_vectorizer_lalu_muhamad_iqbal.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86d123b6d2e126182603c7727ac6d9afa98b5598e4f92d5053070d86f3090ae7
size 68943

tfidf_vectorizer_m_suhaili.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b5ccdd6525e6fb4b9e0e2ea42734e4b8945ab7cc2f38a708cf6afab3bd0272c4
size 36399

tfidf_vectorizer_musyafirin.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93d68cfb116d6688f4b797bb1d46e37701d66102cefdaaf9bcb9c24f737402ae
size 41568

tfidf_vectorizer_sitti_rohmi_djalilah.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90e500ec5c0ec3d1982e0a4fa33df38d8f34449cab9d1e4d89d267a83a546cae
size 55027

tfidf_vectorizer_zulkieflimansyah.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d5036b20528b2b6b9cd9651573d564d219292d07c1f0817a90fd761410e42ed6
size 48165