File size: 20,323 Bytes
d35f894 4a3ec7a 00f20fe d35f894 4a3ec7a e6b9234 d35f894 16b920f d35f894 24f9f13 d35f894 24f9f13 d35f894 24f9f13 d35f894 24f9f13 d35f894 24f9f13 d35f894 24f9f13 41457e8 15b606e a8fe0b7 d35f894 cf55003 7e56286 6f065f8 7e56286 e8c15a5 21d0ed5 7e56286 16b920f 7e56286 d35f894 1d246f3 16b920f 1d246f3 d35f894 e8c15a5 6f065f8 1d246f3 eaab897 21d0ed5 0c9cbe9 e8c15a5 0c9cbe9 e8c15a5 eaab897 e8c15a5 eaab897 e8c15a5 eaab897 e8c15a5 eaab897 d35f894 ef91cc2 8adc6f8 d35f894 0b0a15c d35f894 8adc6f8 d35f894 0a19809 d35f894 0a19809 d35f894 e8c15a5 1d246f3 6ea428f 1d246f3 1c314bc 1d246f3 deecdee ef91cc2 dc081f9 ef91cc2 6feef59 dc081f9 78d7324 dc081f9 ef91cc2 dc081f9 ef91cc2 6feef59 dc081f9 ef91cc2 dc081f9 d35f894 6feef59 d35f894 222b841 601154c 7e56286 1d246f3 0fbd120 aaeb0a5 2e68b49 87238e9 2e68b49 14a6264 2e68b49 bae16f7 321fe57 845af00 2e68b49 14a6264 2e68b49 0c99ce7 cf55003 0c99ce7 deecdee 3dc9465 deecdee 3dc9465 deecdee 0c99ce7 14a6264 81d223f 14a6264 6feef59 cd34a68 2e68b49 321fe57 2e68b49 16b920f 2e68b49 1d246f3 2e68b49 321fe57 986afa3 0b0a15c 2e68b49 944ad8c 2e68b49 14a6264 2e68b49 222b841 2e68b49 8545923 2e68b49 8545923 5d34b59 8545923 2e68b49 14a6264 0c99ce7 cf55003 0c99ce7 deecdee 25ee7eb deecdee 0c99ce7 36a2c5c 0c99ce7 cf55003 0c99ce7 deecdee 0c99ce7 64708f7 14a6264 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 |
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
# Impor library tambahan
#import matplotlib.pyplot as plt
#import seaborn as sns
#import plotly.express as px
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
#from transformers import pipeline
# Fungsi untuk membersihkan teks dengan ekspresi reguler
#@st.cache_data
def clean_text(text):
    """Normalise a raw tweet into lowercase text without URLs, symbols or digits.

    The steps run in a fixed order: drop non-ASCII characters, URLs,
    mentions, hashtags, HTML entities, punctuation and digits; collapse
    runs of spaces; strip; lowercase; finally squeeze any character that
    is repeated three or more times down to a single one ("yukkk" -> "yuk").

    Args:
        text: The raw text. ``None`` and float NaN (e.g. an empty
            spreadsheet cell) are treated as empty text; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Guard against missing values so re.sub never receives a non-string.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Step 1: remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Step 2: remove URLs
    text = re.sub(r'http[s]?://.[a-zA-Z0-9./_?=%&#+!]+', '', text)
    text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9./_?=%&#+!]+', '', text)
    # Step 3: remove mentions
    text = re.sub(r'@[\w]+', '', text)
    # Step 4: remove hashtags
    text = re.sub(r'#([\w]+)', '', text)
    # Step 5: remove whole HTML entities (&amp; &gt; &lt;) so their names do
    # not survive as words, then any stray '&' or '>'
    text = re.sub(r'&(?:amp|gt|lt);|[&>]', '', text)
    # Step 6: remove special characters (symbols)
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)
    # Step 7: remove digits
    text = re.sub(r'[0-9]+', '', text)
    # Step 8: collapse repeated spaces into one
    text = re.sub(' +', ' ', text)
    # Step 9: trim leading/trailing whitespace
    text = text.strip()
    # Step 10: lowercase
    text = text.lower()
    # Step 11: squeeze a character repeated three or more times (e.g. "yukkk")
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    return text
@st.cache_resource
def load_file(kamus_path, kamus_sendiri_path):
    """Load the slang dictionaries, stop words, TF-IDF vectorizer and classifiers.

    Args:
        kamus_path: Path to the JSON file holding the base slang lexicon.
        kamus_sendiri_path: Path to the JSON file holding additional slang
            entries; these are merged into (and override) the base lexicon.

    Returns:
        A tuple ``(lookp_dict, stop_words, tfidf_vectorizer, model_ensemble,
        model_nb, model_lr)``.

    Note:
        The TF-IDF vectorizer path is read from the module-level name
        ``tfidf_model_path`` and the classifier paths are hard-coded, so none
        of them take part in the cache key of ``st.cache_resource``.
    """
    # JSON is read as UTF-8 explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open(kamus_path, encoding='utf-8') as f:
        lookp_dict = json.load(f)
    # Custom slang entries that are missing from the base lexicon
    with open(kamus_sendiri_path, encoding='utf-8') as f:
        kamus_gaul_baru = json.load(f)
    # Merge the custom entries into the base lexicon
    lookp_dict.update(kamus_gaul_baru)
    nltk.download("stopwords")
    stop_words = set(stopwords.words("indonesian"))
    tfidf_vectorizer = joblib.load(tfidf_model_path)
    model_ensemble = joblib.load('ensemble_clf_soft_smote.joblib')
    model_nb = joblib.load('naive_bayes_model_smote.joblib')
    model_lr = joblib.load('logreg_model_smote.joblib')
    return lookp_dict, stop_words, tfidf_vectorizer, model_ensemble, model_nb, model_lr
# Fungsi untuk normalisasi kata gaul
#@st.cache_data
def normalize_slang(text, slang_dict):
    """Swap every whitespace-separated token found in *slang_dict* for its mapped value.

    Tokens without an entry are kept as they are; the result is re-joined
    with single spaces.
    """
    replaced = []
    for token in text.split():
        replaced.append(slang_dict.get(token, token))
    return ' '.join(replaced)
#---------------------------------------------------NLTK Remove Stopwords----------------------------------------------------------------------
#@st.cache_data
def remove_stopwords(text, stop_words):
    """Return *text* with every token that appears in *stop_words* dropped.

    The text is split on whitespace and the surviving tokens are joined
    back with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return " ".join(kept_tokens)
#---------------------------------------------------TFIDF----------------------------------------------------------------------
# Memuat model TF-IDF dengan joblib (pastikan path-nya benar)
# Fungsi untuk ekstraksi fitur TF-IDF
#@st.cache_data
#def extract_tfidf_features(texts, _tfidf_vectorizer):
# tfidf_matrix = tfidf_vectorizer.transform(texts)
# return tfidf_matrix
#---------------------------------------------------Milih Model----------------------------------------------------------------------
# Fungsi untuk memilih model berdasarkan pilihan pengguna
def select_sentiment_model(selected_model, model_enesmble, model_nb, model_lr):
    """Return the classifier that matches the model name chosen by the user.

    Args:
        selected_model: Display name of the model ("Ensemble",
            "Random Forest", "Naive Bayes" or "Logistic Regression").
        model_enesmble: The ensemble classifier. The misspelled parameter
            name is kept so existing callers are not broken.
        model_nb: The Naive Bayes classifier.
        model_lr: The Logistic Regression classifier.

    Returns:
        The matching classifier. "Random Forest" and any unrecognised name
        fall back to the ensemble classifier.
    """
    # Use the arguments that were passed in, not a same-named module global.
    models = {
        "Ensemble": model_enesmble,
        "Random Forest": model_enesmble,
        "Naive Bayes": model_nb,
        "Logistic Regression": model_lr,
    }
    return models.get(selected_model, model_enesmble)
# Fungsi untuk prediksi sentimen
def predict_sentiment(text, _sentiment_model, _tfidf_vectorizer, slang_dict):
    """Classify one text and return its sentiment label with a matching emoticon.

    Args:
        text: The raw input text.
        _sentiment_model: Classifier exposing ``predict``; its output is
            expected to be a one-element sequence holding 0, 1 or 2.
        _tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        slang_dict: Mapping used to normalise slang tokens.

    Returns:
        A tuple ``(sentiment_label, emoticon)`` where the label is one of
        "Negatif", "Netral" or "Positif".
    """
    # Step 1: clean the text and normalise slang
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, slang_dict)
    # Step 2: extract TF-IDF features
    tfidf_matrix = _tfidf_vectorizer.transform([norm_slang_text])
    # Step 3: predict; a single document yields a single prediction
    prediction = _sentiment_model.predict(tfidf_matrix)
    # Step 4: translate the class index into its label
    labels = {0: "Negatif", 1: "Netral", 2: "Positif"}
    sentiment_label = labels[int(prediction[0])]
    # Pick the emoticon from the label (not from the raw prediction array,
    # which never equals a label string).
    # NOTE(review): the original emoji literals were garbled; these are
    # stand-ins - confirm the intended glyphs.
    emoticons = {
        "Positif": "\U0001F60A",  # smiling face
        "Negatif": "\U0001F61E",  # disappointed face
        "Netral": "\U0001F610",   # neutral face
    }
    return sentiment_label, emoticons[sentiment_label]
@st.cache_data
def buat_chart(df, target_year):
    """Render a stacked bar chart of label counts per month for one year.

    Args:
        df: DataFrame with a 'Date' column (anything ``pd.to_datetime``
            accepts) and a 'label' column holding "Negatif", "Netral" or
            "Positif". The caller's frame is not modified.
        target_year: The year to plot; converted with ``int``.

    Returns:
        None. Output is written to the Streamlit page; a warning is shown
        instead of a chart when the year has no rows.
    """
    target_year = int(target_year)
    st.write(f"Bar Chart Tahun {target_year}:")
    # Work on a copy so the helper columns never leak into the caller's frame.
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    # Keep only the requested year; copy so the assignments below act on an
    # independent frame rather than on a slice of `df`.
    df_filtered = df[df['year'] == target_year].copy()
    if df_filtered.empty:
        st.warning(f"Tidak ada data untuk tahun {target_year}.")
        return
    # Month labels in calendar order, e.g. "Januari 2023"
    nama_bulan = [
        'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni',
        'Juli', 'Agustus', 'September', 'Oktober', 'November', 'Desember',
    ]
    months_order = [f'{nama} {target_year}' for nama in nama_bulan]
    bulan_mapping = dict(enumerate(months_order, start=1))
    # An ordered categorical makes sorting/grouping follow the calendar
    # instead of the alphabet.
    df_filtered['month'] = pd.Categorical(
        df_filtered['month'].map(bulan_mapping),
        categories=months_order,
        ordered=True,
    )
    df_filtered = df_filtered.sort_values('month')
    # Colour assigned to each sentiment label
    warna_label = {
        'Negatif': '#FF9AA2',
        'Netral': '#FFDAC1',
        'Positif': '#B5EAD7',
    }
    # observed=False keeps months without data as zero-height bars.
    counts = (
        df_filtered.groupby(['month', 'label'], observed=False)
        .size()
        .unstack()
        .fillna(0)
    )
    # Derive the colours from the plotted columns so they always line up.
    st.bar_chart(counts, color=[warna_label[label] for label in counts.columns])
@st.cache_data(show_spinner = 'On progress, please wait...')
def all_data_process(texts, df, lookp_dict, stop_words, _sentiment_model, _tfidf_vectorizer):
    """Run the full preprocessing and prediction pipeline over every text.

    Args:
        texts: Iterable of raw texts to analyse.
        df: The uploaded DataFrame; only its columns and, when present,
            its 'Date' column are read here.
        lookp_dict: Slang-normalisation mapping.
        stop_words: Collection of stop words to drop.
        _sentiment_model: Classifier handed to ``predict_sentiment``.
        _tfidf_vectorizer: Vectorizer handed to ``predict_sentiment``.

    Returns:
        ``(results, analisis)``: a list with one dict per text (with a
        leading 'Date' key when the frame has that column) and a flag that
        is True when at least one text was processed. When the frame has
        no 'Text' column a warning is shown and ``([], False)`` is returned.
    """
    if 'Text' not in df.columns:
        st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
        return [], False

    with_date = 'Date' in df.columns
    if with_date:
        pairs = zip(texts, df['Date'])
    else:
        pairs = ((item, None) for item in texts)

    rows = []
    for raw_text, stamp in pairs:
        label, emo = predict_sentiment(raw_text, _sentiment_model, _tfidf_vectorizer, lookp_dict)
        cleaned = clean_text(raw_text)
        normalised = normalize_slang(cleaned, lookp_dict)
        without_stopwords = remove_stopwords(normalised, stop_words)
        # 'Date' goes first so it stays the leading column of the result table.
        row = {'Date': stamp} if with_date else {}
        row.update({
            'Text': raw_text,
            'cleaned-text': cleaned,
            'normalisasi-text': normalised,
            'stopwords-remove': without_stopwords,
            'label': label,
            'emotikon': emo,
        })
        rows.append(row)
    return rows, bool(rows)
# Fungsi untuk membuat tautan unduhan
def get_table_download_link(df, download_format):
    """Build an HTML anchor that downloads *df* as an XLSX or CSV file.

    The file content is embedded in the link as a base64 data URI, so
    nothing is written to disk and the link works without a file server.

    Args:
        df: The DataFrame to export.
        download_format: "XLSX" for an Excel workbook; any other value
            produces CSV.

    Returns:
        The ``<a>`` element as a string.
    """
    # Imported locally because the module does not import these names.
    from base64 import b64encode
    from io import BytesIO

    if download_format == "XLSX":
        buffer = BytesIO()
        df.to_excel(buffer, index=False)
        payload = b64encode(buffer.getvalue()).decode()
        mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        return f'<a href="data:{mime};base64,{payload}" download="hasil_sentimen.xlsx">Unduh File XLSX</a>'
    csv = df.to_csv(index=False)
    return f'<a href="data:file/csv;base64,{b64encode(csv.encode()).decode()}" download="hasil_sentimen.csv">Unduh File CSV</a>'
# Title
st.title("Sentiment Analysis : Based on Tweets Biskita Transpakuan Bogor 2022-2023")
preference_barchart_date = False
# Defaults for values that are only assigned on some input paths, so later
# code can read them without hitting a NameError.
user_input = ""
uploaded_file = None
df = None
texts = []
bar = "Distribusi Kelas"
target_year = None
# ----------------------------- General Settings -----------------------------
with st.sidebar:
    st.subheader('Settings :')
    with st.expander("General Settings :"):
        # Model picker
        selected_model = st.selectbox("Pilih Model Sentimen:", ("Ensemble", "Naive Bayes", "Logistic Regression", "Transformer"))
        # Manual text or XLSX upload
        input_option = st.radio("Pilih metode input:", ("Teks Manual", "Unggah Berkas XLSX"))
        if input_option == "Teks Manual":
            # Free-text input from the user
            user_input = st.text_area("Masukkan teks:", "")
        else:
            # XLSX upload
            uploaded_file = st.file_uploader("Unggah berkas XLSX", type=["xlsx"])
            st.caption("Pastikan berkas XLSX Anda memiliki kolom yang bernama :blue[Text] _(Maks.500 data)_.")
            st.caption("Jika terdapat kolom type :blue[datetime], ganti nama kolom menjadi :blue[Date]")
            if uploaded_file is not None:
                df = pd.read_excel(uploaded_file)
                # Only the first 500 rows are analysed
                df = df[:500]
                # Check for the column first, then for content; only a
                # present, non-empty column is accepted as input.
                if 'Text' not in df.columns:
                    st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
                elif df['Text'].empty:
                    st.warning("Kolom 'Text' harus mempunyai value.")
                else:
                    texts = df['Text']
                if "Date" in df.columns and not df['Date'].empty:
                    dates = df['Date']
                    # A usable Date column unlocks the time-based bar chart options
                    preference_barchart_date = True
    # --------------------------- Preference Settings ---------------------------
    with st.expander("Preference Settings:"):
        colormap = st.selectbox("Pilih Warna Wordclouds :", ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds", "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu", "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"])
        if preference_barchart_date:
            bar = st.selectbox("Pilih Tampilan Bar Chart :", ("Distribusi Kelas", "Distribusi Kelas Berdasarkan Waktu"), index = 0)
            # The first four characters of the stringified date are the year
            df_target_year = df['Date'].astype(str)
            target_year = st.selectbox("Pilih Tahun Bar Chart :", df_target_year.str[:4].unique())
    # The icon is the "information" emoji (U+2139 U+FE0F), written as an
    # escape so it survives re-encoding of this file.
    st.info('Tekan "Analysis" kembali jika tampilan menghilang', icon = '\u2139\ufe0f')
    button = st.button("Analysis")
# Tab labels use escaped emoji so they survive re-encoding of this file.
# NOTE(review): only the "Creator" emoji could be recovered from the garbled
# source; the other three are stand-ins - confirm the intended glyphs.
tab1, tab2, tab3, tab4 = st.tabs([
    "\U0001F4DA Documentation",  # books
    "\U0001F4C8 Results",        # chart increasing
    "\U0001F935 Creator",        # person in tuxedo
    "\U0001F50E More",           # magnifying glass
])
with tab1:
    @st.cache_resource
    def tab_1():
        """Render the usage guide and links to the related scraping tools."""
        st.header("Documentation:")
        # Built line by line and rendered explicitly so the Markdown layout
        # does not depend on how this source file is indented.
        guide_lines = [
            "",
            "Langkah - langkah :",
            "1. Buka sidebar sebelah kiri",
            "2. Buka General Settings",
            "3. Pilih Model",
            "4. Pilih Input ('Text Manual', 'File Xlsx')",
            "   - Input manual dapat berisi banyak input, lakukan dengan tekan 'enter' untuk menambah line baru",
            "5. File xlsx harus memiliki kolom 'Text'",
            "6. Kolom type datetime \"%Y-%m-%d %H:%M:%S\" harus bernama 'Date', untuk mengaktifkan fitur tambahan",
            "7. Buka Preferences Settings untuk menyetel tampilan Wordclouds/Barchart",
            "8. Klik Analysis",
            "9. Buka tab Results",
            "",
        ]
        st.markdown("\n".join(guide_lines))
        st.write('Data bisa dicari di sini:')
        # (image file, caption, link) for each related tool, one per column
        related_tools = [
            ('playstore.png', 'Scraping Playstore Reviews',
             "https://huggingface.co/spaces/naufalnashif/scraping-playstore-reviews"),
            ('News.png', 'Scraping News Headline',
             "https://huggingface.co/spaces/naufalnashif/scraping-news-headline"),
            ('Ecommerce.png', 'Scraping Ecommerce Product',
             "https://huggingface.co/spaces/naufalnashif/scraping-ecommerce-2023"),
        ]
        for column, (image, caption, link) in zip(st.columns(3), related_tools):
            with column:
                st.image(image, caption = caption)
                st.markdown(f"[{link}]({link})")
    tab_1()
with tab2:
    st.header("Results:")
    kamus_path = '_json_colloquial-indonesian-lexicon (1).txt'
    kamus_sendiri_path = 'kamus_gaul_custom.txt'
    # Read by load_file as a module-level name, so it must be set before the call.
    tfidf_model_path = 'X_tfidf_model.joblib'
    lookp_dict, stop_words, tfidf_vectorizer, model_ensemble, model_nb, model_lr = load_file(kamus_path, kamus_sendiri_path)
    # Arguments follow the function's parameter order (ensemble, NB, LR) so
    # each menu entry selects the model it names.
    sentiment_model = select_sentiment_model(selected_model, model_ensemble, model_nb, model_lr)
    # Sentiment analysis
    results = []
    analisis = False
    if input_option == "Teks Manual" and user_input:
        if button:
            # Each non-blank line of the text area is analysed as its own text
            for text in user_input.split('\n'):
                if not text.strip():
                    continue
                sentiment_label, emoticon = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
                cleaned_text = clean_text(text)
                norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
                tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)
                results.append({
                    'Text': text,
                    'cleaned-text': cleaned_text,
                    'normalisasi-text': norm_slang_text,
                    'stopwords-remove': tanpa_stopwords,
                    'label': sentiment_label,
                    'emotikon': emoticon,
                })
            analisis = bool(results)
    elif input_option == "Unggah Berkas XLSX" and uploaded_file is not None:
        if button:
            # Take the texts straight from the frame; all_data_process shows
            # the warning itself when the 'Text' column is missing.
            upload_texts = df['Text'] if 'Text' in df.columns else []
            results, analisis = all_data_process(upload_texts, df, lookp_dict, stop_words, sentiment_model, tfidf_vectorizer)
    if results and analisis:
        df_results = pd.DataFrame(results)
        # Two columns: word cloud on the left, bar chart on the right
        columns = st.columns(2)
        with columns[0]:
            st.write("Wordclouds:")
            all_texts = " ".join(
                result['stopwords-remove']
                for result in results
                if result['stopwords-remove'] is not None and not pd.isna(result['stopwords-remove'])
            ).strip()
            wordcloud = None
            if all_texts:
                try:
                    wordcloud = WordCloud(width=800, height=660, background_color='white',
                                          colormap=colormap,          # text colours
                                          contour_color='black',      # contour colour
                                          contour_width=2,            # contour width
                                          mask=None,                  # no custom shape
                                          ).generate(all_texts)
                except ValueError:
                    # Raised by WordCloud when no usable word is left
                    wordcloud = None
            if wordcloud is not None:
                st.image(wordcloud.to_array())
            else:
                st.write("Tidak ada data untuk ditampilkan dalam Word Cloud.")
        with columns[1]:
            if 'Date' in df_results.columns and bar == "Distribusi Kelas Berdasarkan Waktu":
                # Label distribution per month of the selected year
                buat_chart(df_results, target_year)
            else:
                # Overall label distribution
                st.write("Bar Chart :")
                st.bar_chart(
                    df_results["label"].value_counts()
                )
        # Result table inside a collapsible box
        with st.expander("Hasil Analisis Sentimen"):
            st.write(pd.DataFrame(results))
        # CSV export of the result table (results is non-empty in this branch)
        csv = df_results.to_csv(index=False)
        st.download_button(label="Unduh CSV", data=csv, key="csv_download", file_name="hasil_sentimen.csv")
    else:
        st.write("Tidak ada data untuk ditampilkan")
with tab3:
    @st.cache_resource
    def tab_3():
        """Render the creator's profile picture, greeting and social links."""
        st.header("Profile:")
        st.image('https://github.com/naufalnashif/naufalnashif.github.io/blob/main/assets/img/my-profile-semhas.jpeg?raw=true', caption='Naufal Nashif')
        st.subheader('Hello, nice to meet you !')
        # Profile links, rendered in this order
        profile_links = (
            ("GitHub", "https://github.com/naufalnashif/"),
            ("Instagram", "https://www.instagram.com/naufal.nashif/"),
        )
        for site, url in profile_links:
            st.markdown(f"{site}: [{url}]({url})")
    tab_3()
with tab4:
    @st.cache_resource
    def tab_4():
        """Render a three-column gallery linking to the related scraping tools."""
        st.header("More:")
        # (image file, caption, link) for each gallery entry, left to right
        gallery = (
            ('playstore.png', 'Scraping Playstore Reviews',
             "https://huggingface.co/spaces/naufalnashif/scraping-playstore-reviews"),
            ('News.png', 'Scraping News Headline',
             "https://huggingface.co/spaces/naufalnashif/scraping-news-headline"),
            ('Ecommerce.png', 'Scraping Ecommerce Product',
             "https://huggingface.co/spaces/naufalnashif/scraping-ecommerce-2023"),
        )
        slots = st.columns(3)
        for slot, (picture, title, url) in zip(slots, gallery):
            with slot:
                st.image(picture, caption = title)
                st.markdown(f"[{url}]({url})")
    tab_4()
# Divider followed by the page footer
st.divider()
st.write('Thank you for trying the demo!')
# The copyright sign is written as an escape (U+00A9 U+FE0F) so it survives
# re-encoding of this file.
st.caption('Best regards, Naufal Nashif :sunglasses: | \u00a9\ufe0f 2023')
|