import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import joblib
from base64 import b64encode  # used by get_table_download_link
from sklearn.feature_extraction.text import TfidfVectorizer

# Additional libraries
#import matplotlib.pyplot as plt
#import seaborn as sns
#import plotly.express as px
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
#from transformers import pipeline


# Clean text with regular expressions
@st.cache_data
def clean_text(text):
    # Step 1: Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Step 2: Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'pic\.twitter\.com/\S+', '', text)

    # Step 3: Remove mentions
    text = re.sub(r'@[\w]+', '', text)

    # Step 4: Remove hashtags
    text = re.sub(r'#([\w]+)', '', text)

    # Step 5: Remove the HTML entities '&amp;' and '&gt;'
    text = re.sub(r'&amp;|&gt;', '', text)

    # Step 6: Remove special characters (symbols)
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)

    # Step 7: Remove digits
    text = re.sub(r'[0-9]+', '', text)

    # Step 8: Collapse multiple spaces into one
    text = re.sub(' +', ' ', text)

    # Step 9: Strip leading and trailing whitespace
    text = text.strip()

    # Step 10: Lowercase the text
    text = text.lower()

    # Step 11: Collapse runs of three or more identical characters (e.g. "yukkk" -> "yuk")
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    return text
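
# Illustrative example (hypothetical tweet, not taken from the dataset):
#   clean_text("Naik Biskita yukkk!!! @temanku https://t.co/abc #transpakuan 123")
#   -> "naik biskita yuk"
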
@st.cache_data
def load_file(kamus_path, kamus_sendiri_path):
    # Read Salsabila's colloquial Indonesian (slang) lexicon
    with open(kamus_path) as f:
        data = f.read()
    lookp_dict = json.loads(data)

    # My own slang entries that are missing from Salsabila's lexicon
    with open(kamus_sendiri_path) as f:
        kamus_sendiri = f.read()
    kamus_gaul_baru = json.loads(kamus_sendiri)

    # Merge the new slang entries into the existing lexicon
    lookp_dict.update(kamus_gaul_baru)

    nltk.download("stopwords")
    stop_words = set(stopwords.words("indonesian"))
    return lookp_dict, stop_words

kamus_path = '_json_colloquial-indonesian-lexicon (1).txt'
kamus_sendiri_path = 'kamus_gaul_custom.txt'
lookp_dict, stop_words = load_file(kamus_path, kamus_sendiri_path)

# Normalize slang words using the lexicon
@st.cache_data
def normalize_slang(text, slang_dict):
    words = text.split()
    normalized_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(normalized_words)
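
# Illustrative example (hypothetical lexicon entries, not necessarily in the real files):
#   normalize_slang("gk bisa bgt", {"gk": "tidak", "bgt": "banget"})
#   -> "tidak bisa banget"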

#---------------------------------------------------NLTK Remove Stopwords----------------------------------------------------------------------

@st.cache_data
def remove_stopwords(text, stop_words):
    # Split the text into words
    words = text.split()

    # Drop Indonesian stopwords
    words = [word for word in words if word not in stop_words]

    return " ".join(words)
#---------------------------------------------------TF-IDF----------------------------------------------------------------------
# Load the fitted TF-IDF vectorizer with joblib (make sure the path is correct)
tfidf_model_path = 'X_tfidf_model.joblib'
tfidf_vectorizer = joblib.load(tfidf_model_path)
#---------------------------------------------------Model Selection----------------------------------------------------------------------

# Pick a model based on the user's choice
@st.cache_resource  # cache_resource is the idiomatic cache for loaded ML models
def select_sentiment_model(selected_model):
    if selected_model == "Ensemble":
        model_path = 'ensemble_clf_soft_smote.joblib'
    elif selected_model == "Random Forest":
        model_path = 'best_rf_model_smote.joblib'
    elif selected_model == "Naive Bayes":
        model_path = 'naive_bayes_model_smote.joblib'
    elif selected_model == "Logistic Regression":
        model_path = 'logreg_model_smote.joblib'
    else:
        # Fall back to the default model if the choice is not recognized
        model_path = 'ensemble_clf_soft_smote.joblib'

    model = joblib.load(model_path)
    return model
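
# Illustrative call (the .joblib artifacts above must ship alongside this app):
#   sentiment_model = select_sentiment_model("Ensemble")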


# Predict sentiment for a single text
def predict_sentiment(text, _model, _tfidf_vectorizer, slang_dict):
    # Step 1: Clean and normalize the text
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, slang_dict)

    # Step 2: Extract TF-IDF features
    tfidf_matrix = _tfidf_vectorizer.transform([norm_slang_text])

    # Step 3: Run the sentiment prediction
    sentiment = _model.predict(tfidf_matrix)

    # Step 4: Map the class index to a sentiment label
    labels = {0: "Negatif", 1: "Netral", 2: "Positif"}
    sentiment_label = labels[int(sentiment[0])]

    return sentiment_label
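
# Illustrative call (assumes the vectorizer and a model loaded as above):
#   predict_sentiment("biskita transpakuan nyaman", sentiment_model, tfidf_vectorizer, lookp_dict)
#   -> one of "Negatif" / "Netral" / "Positif"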
@st.cache_data
def get_emoticon(sentiment):
    if sentiment == "Positif":
        emoticon = "πŸ˜„"  # Emoticon for positive sentiment
    elif sentiment == "Negatif":
        emoticon = "😞"  # Emoticon for negative sentiment
    else:
        emoticon = "😐"  # Emoticon for neutral sentiment

    return emoticon

@st.cache_data
def buat_chart(df, target_year):
    target_year = int(target_year)
    st.write(f"Bar Chart Tahun {target_year}:")

    # Derive month and year columns
    df['Date'] = pd.to_datetime(df['Date'])  # Convert 'Date' column to datetime
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year

    # Filter the DataFrame for the desired year (copy to avoid SettingWithCopyWarning)
    df_filtered = df[df['year'] == target_year].copy()

    # Stop early if there is no data for the target year
    if df_filtered.empty:
        st.warning(f"Tidak ada data untuk tahun {target_year}.")
        return

    # Map month numbers to month names
    bulan_mapping = {
        1: f'Januari {target_year}',
        2: f'Februari {target_year}',
        3: f'Maret {target_year}',
        4: f'April {target_year}',
        5: f'Mei {target_year}',
        6: f'Juni {target_year}',
        7: f'Juli {target_year}',
        8: f'Agustus {target_year}',
        9: f'September {target_year}',
        10: f'Oktober {target_year}',
        11: f'November {target_year}',
        12: f'Desember {target_year}'
    }

    # Replace the numeric 'month' values using the mapping
    df_filtered['month'] = df_filtered['month'].replace(bulan_mapping)

    # Color for each category in the 'label' column
    warna_label = {
        'Negatif': '#FF9AA2',
        'Netral': '#FFDAC1',
        'Positif': '#B5EAD7'
    }

    # Sorted unique labels
    unique_label = sorted(df_filtered['label'].unique())

    # Ensure months appear in calendar order
    months_order = [
        f'Januari {target_year}', f'Februari {target_year}', f'Maret {target_year}', f'April {target_year}', f'Mei {target_year}', f'Juni {target_year}',
        f'Juli {target_year}', f'Agustus {target_year}', f'September {target_year}', f'Oktober {target_year}', f'November {target_year}', f'Desember {target_year}'
    ]

    # Sort the DataFrame by the custom month order
    df_filtered['month'] = pd.Categorical(df_filtered['month'], categories=months_order, ordered=True)
    df_filtered = df_filtered.sort_values('month')

    # Stacked bar chart with manual colors
    st.bar_chart(
        df_filtered.groupby(['month', 'label']).size().unstack().fillna(0),
        color=[warna_label[label] for label in unique_label]
    )
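
# Illustrative call (expects a DataFrame with 'Date' and 'label' columns):
#   buat_chart(df_results, 2022)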
    
@st.cache_data
def all_data_process(texts, df, _sentiment_model, _tfidf_vectorizer, lookp_dict, stop_words):
    results = []
    analisis = False

    if 'Text' in df.columns:
        # Use the passed-in (underscore-prefixed, unhashed) model and vectorizer so the
        # results follow the user's current model choice
        has_date = 'Date' in df.columns
        dates = df['Date'] if has_date else [None] * len(texts)
        for text, date in zip(texts, dates):
            sentiment_label = predict_sentiment(text, _sentiment_model, _tfidf_vectorizer, lookp_dict)
            emoticon = get_emoticon(sentiment_label)
            cleaned_text = clean_text(text)
            norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
            tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

            result_entry = {
                'Text': text,
                'cleaned-text': cleaned_text,
                'normalisasi-text': norm_slang_text,
                'stopwords-remove': tanpa_stopwords,
                'label': sentiment_label,
                'emotikon': emoticon,
            }
            if has_date:
                # Put 'Date' first so it shows as the leading column
                result_entry = {'Date': date, **result_entry}

            results.append(result_entry)

        analisis = True
    else:
        st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")

    return results, analisis
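
# Illustrative call (mirrors how the Results tab below invokes it):
#   results, analisis = all_data_process(df['Text'], df, sentiment_model, tfidf_vectorizer, lookp_dict, stop_words)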
    
# Build a download link for the results table
@st.cache_data
def get_table_download_link(df, download_format):
    if download_format == "XLSX":
        # Note: a bare file href only works if the file is actually served; Streamlit does
        # not serve arbitrary local files, so prefer st.download_button for XLSX output.
        df.to_excel("hasil_sentimen.xlsx", index=False)
        return '<a href="hasil_sentimen.xlsx" download="hasil_sentimen.xlsx">Unduh File XLSX</a>'
    else:
        csv = df.to_csv(index=False)
        return f'<a href="data:file/csv;base64,{b64encode(csv.encode()).decode()}" download="hasil_sentimen.csv">Unduh File CSV</a>'


# Title
st.title("Sentiment Analysis : Based on Tweets Biskita Transpakuan Bogor 2022-2023")
preference_barchart_date = False
#-----------------------------------------------------General Settings---------------------------------------------------------------
with st.sidebar:
    st.subheader('Settings :')
    with st.expander("General Settings :"):
        # Widget to pick the sentiment model (every option maps to a branch in select_sentiment_model)
        selected_model = st.selectbox("Pilih Model Sentimen:", ("Ensemble", "Random Forest", "Naive Bayes", "Logistic Regression"))

        # Load the model chosen by the user
        sentiment_model = select_sentiment_model(selected_model)
        
        # Choose manual text input or an XLSX file
        input_option = st.radio("Pilih metode input:", ("Teks Manual", "Unggah Berkas XLSX"))

        if input_option == "Teks Manual":
            # Text input from the user
            user_input = st.text_area("Masukkan teks:", "")
        else:
            # XLSX file input
            uploaded_file = st.file_uploader("Unggah berkas XLSX", type=["xlsx"])
            st.caption("Pastikan berkas XLSX Anda memiliki kolom yang bernama :blue[Text] _(Maks.500 data)_.")
            st.caption("Jika terdapat kolom type :blue[datetime], ganti nama kolom menjadi :blue[Date]")

            if uploaded_file is not None:
                df = pd.read_excel(uploaded_file)
                df = df[:500]
                if 'Text' not in df.columns:
                    st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
                elif df['Text'].empty:
                    st.warning("Kolom 'Text' harus mempunyai value.")
                else:
                    texts = df['Text']  # Adjust to the column name in your XLSX file
                if "Date" in df.columns:
                    if not df['Date'].empty:
                        dates = df['Date']
                        preference_barchart_date = True
    #-----------------------------------------------------Preference Settings--------------------------------------------------
    with st.expander("Preference Settings :"):
        colormap = st.selectbox("Pilih Warna Wordclouds :", ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds", "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu", "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"])
        if preference_barchart_date:
            bar = st.selectbox("Pilih Tampilan Bar Chart :", ("Distribusi Kelas", "Distribusi Kelas Berdasarkan Waktu"), index=0)
            # Offer the years present in the 'Date' column (works for both string and datetime columns)
            target_year = st.selectbox("Pilih Tahun Bar Chart :", pd.to_datetime(df['Date']).dt.year.unique())
    st.info('Tekan "Analysis" kembali jika tampilan menghilang', icon='ℹ️')
    button = st.button("Analysis")

tab1, tab2, tab3 = st.tabs(["πŸ“‹ Documentation", "πŸ“ˆ Results", "🀡 Creator"])

with tab1:
    st.header("Documentation :")
    '''
    Steps :
    1. Open the sidebar on the left
    2. Open General Settings
    3. Pick a model
    4. Pick an input method ('Teks Manual' or 'Unggah Berkas XLSX')
    5. An XLSX file must contain a column named 'Text'
    6. If there is a datetime-typed column named 'Date', an extra chart option becomes available
    7. Open Preference Settings to configure the Wordclouds/Bar Chart display
    8. Click Analysis
    9. Open the Results tab
    '''

with tab2:
    st.header("Results :")
    # Sentiment analysis
    results = []
    analisis = False
    if input_option == "Teks Manual" and user_input:
        if button:
            # Split the user input into separate lines
            user_texts = user_input.split('\n')
            for text in user_texts:
                sentiment_label = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
                emoticon = get_emoticon(sentiment_label)
                cleaned_text = clean_text(text)
                norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
                tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

                results.append({
                    'Text': text,
                    'cleaned-text': cleaned_text,
                    'normalisasi-text': norm_slang_text,
                    'stopwords-remove': tanpa_stopwords,
                    'label': sentiment_label,
                    'emotikon': emoticon,
                })
                analisis = True

    elif input_option == "Unggah Berkas XLSX" and uploaded_file is not None:
        if button:
            results, analisis = all_data_process(texts, df, sentiment_model, tfidf_vectorizer, lookp_dict, stop_words)

    if results and analisis:
        df_results = pd.DataFrame(results)
        # Split the view into two columns
        columns = st.columns(2)

        # First column: word cloud
        with columns[0]:
            st.write("Wordclouds :")
            all_texts = [result['stopwords-remove'] for result in results if result['stopwords-remove'] is not None and not pd.isna(result['stopwords-remove'])]
            all_texts = " ".join(all_texts)

            if all_texts:
                wordcloud = WordCloud(width=800, height=660, background_color='white',
                                      colormap=colormap,      # Font colors
                                      contour_color='black',  # Contour color
                                      contour_width=2,        # Contour width
                                      mask=None,              # Use a mask for custom shapes
                                      ).generate(all_texts)
                st.image(wordcloud.to_array())
            else:
                st.write("Tidak ada data untuk ditampilkan dalam Word Cloud.")

        # Second column: bar chart, over time if a 'Date' column is available and selected
        if 'Date' in df_results.columns and bar == "Distribusi Kelas Berdasarkan Waktu":
            if not df_results['Date'].empty:
                with columns[1]:
                    buat_chart(df_results, target_year)
        else:
            with columns[1]:
                st.write("Bar Chart :")
                st.bar_chart(df_results["label"].value_counts())

        # Show the sentiment analysis results in an expandable box
        with st.expander("Hasil Analisis Sentimen"):
            st.write(df_results)

        # Serialize the results to CSV and offer a download button
        csv = df_results.to_csv(index=False)
        st.download_button(label="Unduh CSV", data=csv, key="csv_download", file_name="hasil_sentimen.csv")

with tab3:
    st.header("Profile :")
    st.image('https://naufalnashif.github.io/assets/images/WhatsApp%20Image%202023-01-26%20at%2020.37.17.jpeg', caption='Naufal Nashif')
    st.subheader('Hello, nice to meet you !')
    # Link to GitHub
    github_link = "https://github.com/naufalnashif/"
    st.markdown(f"GitHub: [{github_link}]({github_link})")

    # Link to Instagram
    instagram_link = "https://www.instagram.com/naufal.nashif/"
    st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")

# Divider
st.divider()
st.write('Thank you for trying the demo!')
left, right = st.columns(2)
with left:
    st.caption('Best regards, Naufal Nashif :sunglasses:')
with right:
    st.caption('©️ 2023')