|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
import json |
|
import joblib |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from wordcloud import WordCloud |
|
|
|
|
|
|
|
# Page-level setup: the browser-tab title, then the heading shown on the page.
_PAGE_TITLE = "naufalnashif-ML"
_APP_HEADING = "Aplikasi ML Analisis Sentimen based on data Biskita Transpakuan"

st.set_page_config(page_title=_PAGE_TITLE)
st.title(_APP_HEADING)
|
|
|
|
|
def clean_text(text):
    """Strip noise from a piece of free-form text and return the cleaned string.

    The steps run in a fixed order: drop non-ASCII characters, URLs,
    ``pic.twitter.com`` links, ``@mentions``, ``#hashtags`` (tag word
    included), punctuation and digits; collapse runs of spaces; strip;
    lower-case; and finally reduce any word character repeated three or more
    times in a row to a single occurrence.

    Args:
        text: The raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # re.sub only accepts strings; guard against missing or numeric values so
    # one bad input does not raise TypeError for the whole run.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
            return ""
        text = str(text)

    # Non-ASCII characters.
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Links.
    text = re.sub(r'http[s]?://.[a-zA-Z0-9./_?=%&#+!]+', '', text)
    text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9./_?=%&#+!]+', '', text)

    # Mentions, then hashtags together with their tag word.
    text = re.sub(r'@[\w]+', '', text)
    text = re.sub(r'#([\w]+)', '', text)

    # Punctuation and digits.
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)
    text = re.sub(r'[0-9]+', '', text)

    # The removals above leave gaps; squeeze runs of spaces and trim the ends.
    text = re.sub(' +', ' ', text)
    text = text.strip()

    text = text.lower()

    # Elongated spellings: "bagusss" -> "bagus".
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    return text
|
|
|
|
|
# Base lexicon: a JSON document stored in a .txt file. The resulting dict is
# later passed to normalize_slang / predict_sentiment as the replacement table.
kamus_path = '_json_colloquial-indonesian-lexicon (1).txt'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open(kamus_path, encoding='utf-8') as f:
    data = f.read()
lookp_dict = json.loads(data)

# Custom additions, in the same JSON format.
kamus_sendiri_path = 'kamus_gaul_custom.txt'
with open(kamus_sendiri_path, encoding='utf-8') as f:
    kamus_sendiri = f.read()
kamus_gaul_baru = json.loads(kamus_sendiri)

# Entries from the custom file override same-keyed entries of the base lexicon.
lookp_dict.update(kamus_gaul_baru)
|
|
|
|
|
def normalize_slang(text, slang_dict):
    """Replace each whitespace-separated token of *text* using *slang_dict*.

    Tokens that are not keys of the mapping are kept unchanged. The tokens are
    re-joined with single spaces, so the original spacing is not preserved.
    """
    return ' '.join(slang_dict.get(token, token) for token in text.split())
|
|
|
|
|
def extract_tfidf_features(texts, tfidf_vectorizer):
    """Return the result of ``tfidf_vectorizer.transform(texts)``.

    The vectorizer is only transformed with, never fitted, inside this helper.
    """
    return tfidf_vectorizer.transform(texts)
|
|
|
|
|
# Load the persisted vectorizer from disk.
tfidf_model_path = 'X_tfidf_model.joblib'
tfidf_vectorizer = joblib.load(tfidf_model_path)

# Classifier names offered to the user in the selection widget.
_model_choices = ("Ensemble", "Naive Bayes", "Logistic Regression")
selected_model = st.selectbox("Pilih Model Sentimen:", _model_choices)
|
|
|
|
|
def select_sentiment_model(selected_model):
    """Load and return the classifier that corresponds to *selected_model*.

    Recognised names are "Ensemble", "Random Forest", "Naive Bayes" and
    "Logistic Regression"; any other value falls back to the ensemble model.
    """
    fallback_path = 'ensemble_clf_soft_smote.joblib'
    path_by_name = {
        "Ensemble": fallback_path,
        "Random Forest": 'best_rf_model_smote.joblib',
        "Naive Bayes": 'naive_bayes_model_smote.joblib',
        "Logistic Regression": 'logreg_model_smote.joblib',
    }
    return joblib.load(path_by_name.get(selected_model, fallback_path))


# Classifier used for every prediction made during this script run.
sentiment_model = select_sentiment_model(selected_model)
|
|
|
|
|
|
|
def predict_sentiment(text, model, tfidf_vectorizer, slang_dict):
    """Classify one text and return its sentiment label.

    The text is cleaned with ``clean_text``, normalized with
    ``normalize_slang``, vectorized with ``tfidf_vectorizer.transform`` and
    passed to ``model.predict``.

    Args:
        text: Raw input text.
        model: Object exposing ``predict``; its prediction must be convertible
            to one of the integer codes 0, 1 or 2.
        tfidf_vectorizer: Object exposing ``transform``.
        slang_dict: Replacement table handed to ``normalize_slang``.

    Returns:
        "Negatif", "Netral" or "Positif".

    Raises:
        KeyError: If the predicted code is not 0, 1 or 2.
    """
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, slang_dict)

    # A single-element list: exactly one document is vectorized.
    tfidf_matrix = tfidf_vectorizer.transform([norm_slang_text])

    sentiment = model.predict(tfidf_matrix)

    labels = {0: "Negatif", 1: "Netral", 2: "Positif"}

    # One document in, one prediction out. Index it explicitly: calling int()
    # on a whole array (even a one-element one) is deprecated in recent NumPy.
    return labels[int(sentiment[0])]
|
|
|
def get_emoticon(sentiment):
    """Return an emoji for a sentiment label.

    "Positif" and "Negatif" each map to their own emoji; every other value
    (including "Netral") maps to the neutral face. The emoji are written as
    Unicode escapes so the literals survive re-encoding of this file.
    """
    if sentiment == "Positif":
        return "\U0001F604"  # grinning face with smiling eyes
    if sentiment == "Negatif":
        return "\U0001F61E"  # disappointed face
    return "\U0001F610"  # neutral face
|
|
|
|
|
def get_table_download_link(df, download_format):
    """Return an HTML ``<a>`` tag for downloading *df*.

    Args:
        df: DataFrame to export.
        download_format: "XLSX" writes ``hasil_sentimen.xlsx`` to the working
            directory and links to it by relative path; any other value embeds
            the CSV export in a base64 ``data:`` URI.

    Returns:
        The anchor tag as a string.
    """
    # b64encode is not among the module-level imports, so bring it into scope
    # here; without it the CSV branch raises NameError.
    from base64 import b64encode

    if download_format == "XLSX":
        df.to_excel("hasil_sentimen.xlsx", index=False)
        # NOTE(review): this href is a relative path to a file on the server's
        # disk - confirm the hosting setup actually serves it.
        return '<a href="hasil_sentimen.xlsx" download="hasil_sentimen.xlsx">Unduh File XLSX</a>'

    csv = df.to_csv(index=False)
    encoded = b64encode(csv.encode()).decode()
    return f'<a href="data:file/csv;base64,{encoded}" download="hasil_sentimen.csv">Unduh File CSV</a>'
|
|
|
|
|
input_option = st.radio("Pilih metode input:", ("Teks Manual", "Unggah Berkas XLSX"))

# Texts queued for classification, whichever input mode supplied them.
texts = []

if input_option == "Teks Manual":
    user_input = st.text_area("Masukkan teks:", "")
    # One text per line; blank lines carry nothing to classify, so skip them
    # instead of producing a prediction for an empty string.
    user_texts = [line for line in user_input.split('\n') if line.strip()]
    texts = user_texts
else:
    uploaded_file = st.file_uploader("Unggah berkas XLSX", type=["xlsx"])
    st.write("**Pastikan berkas XLSX Anda memiliki kolom yang bernama 'Text'.**")

    if uploaded_file is not None:
        df = pd.read_excel(uploaded_file)

        if 'Text' not in df.columns:
            # Reported once, here; nothing is queued for prediction.
            st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
        else:
            # pandas reads empty cells as NaN and may return numeric cells as
            # numbers; the regex-based cleaning only accepts strings.
            texts = df['Text'].dropna().astype(str)

# Each result row: (raw text, cleaned text, slang-normalized text, label, emoji).
results = []

for text in texts:
    sentiment_label = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
    emoticon = get_emoticon(sentiment_label)
    # The intermediate texts are recomputed here because predict_sentiment
    # only returns the label and they are needed for display.
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
    results.append((text, cleaned_text, norm_slang_text, sentiment_label, emoticon))
|
|
|
|
|
|
|
# Two side-by-side columns: word cloud on the left, chart on the right.
columns = st.columns(2)

with columns[0]:
    if results:
        # Index 2 of each result tuple is the slang-normalized text.
        all_texts = " ".join(
            result[2]
            for result in results
            if result[2] is not None and not pd.isna(result[2])
        )

        st.subheader("Word Cloud")

        wordcloud = None
        # Joining several empty texts yields a whitespace-only string, which is
        # truthy but contains no words - test the stripped value.
        if all_texts.strip():
            try:
                wordcloud = WordCloud(
                    width=800,
                    height=660,
                    background_color='white',
                    colormap='Purples',
                    contour_color='black',
                    contour_width=2,
                    mask=None,
                ).generate(all_texts)
            except ValueError:
                # generate() raises ValueError when no usable word remains
                # (for example only stop words); show the message instead.
                wordcloud = None

        if wordcloud is not None:
            st.image(wordcloud.to_array())
        else:
            st.write("Tidak ada data untuk ditampilkan dalam Word Cloud.")
|
|
|
|
|
with columns[1]:
    st.subheader("Chart")
    if results:
        df_results = pd.DataFrame(
            results,
            columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"],
        )
        sns.set_style("whitegrid")

        # Fixed class order so the bars always appear in the same position.
        class_labels = ["Negatif", "Netral", "Positif"]

        # fill_value=0 gives a class with no results a count of zero instead
        # of NaN.
        value_counts = (
            df_results["Hasil Analisis Sentimen"]
            .value_counts()
            .reindex(class_labels, fill_value=0)
        )

        fig, ax = plt.subplots()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        plt.xticks(rotation=45)

        st.pyplot(fig)
        # Figures made through pyplot stay registered until closed; release
        # this one so repeated script runs do not accumulate open figures.
        plt.close(fig)
|
|
|
|
|
# Collapsible section holding the full per-text results table.
with st.expander("Hasil Analisis Sentimen"):
    _result_columns = ["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"]
    _results_frame = pd.DataFrame(results, columns=_result_columns)
    st.table(_results_frame)
|
|
|
|
|
|
|
import io

st.subheader("Unduh Hasil")
download_format = st.selectbox("Pilih format unduhan:", ["XLSX", "CSV"])

if results:
    df = pd.DataFrame(
        results,
        columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"],
    )

    if download_format == "XLSX":
        # Build the workbook in memory. A fixed file name on disk would be
        # shared by every session of the app, and reading it back through a
        # bare open() leaves the file handle unclosed.
        xlsx_buffer = io.BytesIO()
        df.to_excel(xlsx_buffer, index=False)

        st.download_button(
            label="Unduh XLSX",
            data=xlsx_buffer.getvalue(),
            key="xlsx_download",
            file_name="hasil_sentimen.xlsx",
        )
    else:
        csv = df.to_csv(index=False)

        st.download_button(
            label="Unduh CSV",
            data=csv,
            key="csv_download",
            file_name="hasil_sentimen.csv",
        )
else:
    st.write("Tidak ada data untuk diunduh.")
|
|
|
|
|
|
|
# Footer: a divider, the author's profile links, and a closing note.
st.divider()

github_link = "https://github.com/naufalnashif/"
instagram_link = "https://www.instagram.com/naufal.nashif/"

# Each link is rendered as "<label>: [url](url)" markdown.
for _label, _url in (("GitHub", github_link), ("Instagram", instagram_link)):
    st.markdown(f"{_label}: [{_url}]({_url})")

for _closing_line in ('Thank you for trying the demo!', 'Best regards, Naufal Nashif'):
    st.write(_closing_line)
|
|