|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
import json |
|
import joblib |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from wordcloud import WordCloud |
|
|
|
|
|
|
|
# Streamlit page setup — must run before any other st.* call in the script.
st.set_page_config(page_title="naufalnashif-ML")
|
|
|
|
|
def clean_text(text):
    """Normalize a raw tweet/comment for sentiment analysis.

    Steps: coerce to str, drop non-ASCII characters, strip URLs and
    pic.twitter.com links, remove @mentions, #hashtags, punctuation and
    digits, collapse whitespace, then lowercase.

    Args:
        text: raw input. Non-str values (e.g. NaN cells read from an
            Excel sheet) are coerced with str() instead of raising.

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(text, str):
        # Spreadsheet cells can surface as float('nan'); re.sub needs str.
        text = str(text)

    # Remove non-ASCII characters (emoji, curly quotes, ...).
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove URLs. Dots inside the character class are literal; the stray
    # '.' wildcard after '://' in the original pattern is gone.
    text = re.sub(r'http[s]?://[a-zA-Z0-9./_?=%&#+!-]+', '', text)
    # Literal 'pic.twitter.com/...' links — dots escaped so '.' no longer
    # matches any character.
    text = re.sub(r'pic\.twitter\.com/[a-zA-Z0-9./_?=%&#+!]+', '', text)

    # Remove @mentions and #hashtags.
    text = re.sub(r'@[\w]+', '', text)
    text = re.sub(r'#([\w]+)', '', text)

    # Remove punctuation/symbols.
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)

    # Remove digits.
    text = re.sub(r'[0-9]+', '', text)

    # Collapse runs of spaces, trim the ends, and lowercase.
    text = re.sub(' +', ' ', text)
    return text.strip().lower()
|
|
|
|
|
# Load the base colloquial-Indonesian lexicon (slang -> standard word).
# The file is JSON despite the .txt extension. Encoding is pinned to UTF-8
# so behaviour does not depend on the OS default locale.
kamus_path = '_json_colloquial-indonesian-lexicon (1).txt'
with open(kamus_path, encoding='utf-8') as f:
    lookp_dict = json.load(f)
|
|
|
|
|
# Manual additions / corrections layered on top of the base lexicon.
# This dict is merged into lookp_dict via dict.update(), so entries here
# override the base lexicon. Identity mappings (e.g. 'yah': 'yah') are
# intentional: they pin a word so the base lexicon cannot rewrite it.
# The original literal contained duplicate keys ('anjr' twice with
# conflicting values, 'trans pakuan' twice); only the effective (last)
# mapping of each key is kept, so the resulting dict is unchanged.
kamus_gaul_baru = {
    # Typos / phonetic spellings -> standard Indonesian.
    'kurangg': 'kurang',
    'udaa': 'udah',
    'mnurut': 'menurut',
    'seputat': 'seputar',
    'ijo': 'hijau',
    'dmma': 'dimana',
    'keboen': 'kebun',
    'aseekk': 'asik',
    'aseek': 'asik',
    'bliau': 'beliau',
    'berpaa': 'berapa',
    'berpa': 'berapa',
    'bggtt': 'banget',
    'cntoh': 'contoh',
    'jrg': 'jarang',
    'msi': 'masih',
    'kesampeian': 'kesampaian',
    'dtgnya': 'datangnya',
    'dtg': 'datang',
    'dngin': 'dingin',
    'ktub': 'kutub',
    'brngkt': 'berangkat',
    'antra': 'antara',
    'pinuh': 'penuh',
    'smpet': 'sempat',
    'sempet': 'sempat',
    'makai': 'memakai',
    'bukann': 'bukan',
    'skrgg': 'sekarang',
    'makasihh': 'terimakasih',
    'moga': 'semoga',
    'amgkot': 'angkot',
    'dah': 'udah',
    'tkt': 'takut',
    'umumm': 'umum',
    'umuum': 'umum',
    'drtd': 'daritadi',
    'drtdi': 'daritadi',
    'cpk': 'lelah',
    'capek': 'lelah',
    'capk': 'lelah',
    'cpek': 'lelah',
    'wkwkwk': 'wkwk',
    # Profanity variants normalized to a single canonical form.
    'anyinh': 'anjing',
    'anjrot': 'anjing',
    'ajgg': 'anjing',
    'anzink': 'anjing',
    'anjirt': 'anjing',
    'anjink': 'anjing',
    'anjir': 'anjing',
    'ajg': 'anjing',
    'anjg': 'anjing',
    'anjrit': 'anjing',
    'anjig': 'anjing',
    'anjigg': 'anjing',
    'anjingg': 'anjing',
    'asu': 'anjing',
    'anjer': 'anjing',
    'njir': 'anjing',
    # 'anjr' appeared twice upstream ('anjir', then 'anjing'); the later
    # value won, so only the effective mapping is kept.
    'anjr': 'anjing',
    'bgst': 'bangsat',
    'cok': 'jancok',
    'cokk': 'jancok',
    'cook': 'jancok',
    'cookk': 'jancok',
    'gblk': 'goblok',
    # First-person pronoun variants -> 'aku'.
    'gua': 'aku',
    'gweh': 'aku',
    'guah': 'aku',
    'gw': 'aku',
    'gwah': 'aku',
    'gue': 'aku',
    # 'gais' spelling variants.
    'ges': 'gais',
    'gays': 'gais',
    'geys': 'gais',
    # Domain term: keep the transit brand as a single token.
    'trans pakuan': 'transpakuan',
    # Identity mappings that guard against base-lexicon rewrites.
    'gabisa': 'gabisa',
    'yah': 'yah',
}
|
|
|
|
|
# Merge the manual corrections into the base lexicon (manual entries win).
lookp_dict.update(kamus_gaul_baru)
|
|
|
|
|
def normalize_slang(text, slang_dict):
    """Replace each whitespace-separated token with its canonical form.

    Tokens that are not present in ``slang_dict`` pass through unchanged.

    Args:
        text: input string of space-separated tokens.
        slang_dict: mapping of slang token -> standard token.

    Returns:
        The normalized string, tokens re-joined with single spaces.
    """
    return ' '.join(slang_dict.get(token, token) for token in text.split())
|
|
|
|
|
def extract_tfidf_features(texts, tfidf_vectorizer):
    """Vectorize *texts* with an already-fitted TF-IDF vectorizer.

    Only ``transform`` is called (never ``fit``), so the training-time
    vocabulary is preserved.
    """
    return tfidf_vectorizer.transform(texts)
|
|
|
|
|
# Pre-fitted TF-IDF vectorizer serialized with joblib at training time.
tfidf_model_path = 'X_tfidf_model.joblib'
tfidf_vectorizer = joblib.load(tfidf_model_path)
|
|
|
|
|
def predict_sentiment(text, model, tfidf_vectorizer, slang_dict):
    """Classify *text* as 'Negatif', 'Netral' or 'Positif'.

    Pipeline: clean -> slang normalization -> TF-IDF transform -> predict.

    Args:
        text: raw input text.
        model: fitted classifier exposing ``predict``.
        tfidf_vectorizer: fitted vectorizer exposing ``transform``.
        slang_dict: slang -> standard word mapping for normalization.

    Returns:
        The human-readable sentiment label.
    """
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, slang_dict)

    # Transform only (the vectorizer is already fitted).
    tfidf_matrix = tfidf_vectorizer.transform([norm_slang_text])

    sentiment = model.predict(tfidf_matrix)

    # predict() returns a length-1 array; index it explicitly because
    # int() on an ndim > 0 ndarray is deprecated since NumPy 1.25 and
    # raises on NumPy 2.x.
    labels = {0: "Negatif", 1: "Netral", 2: "Positif"}
    return labels[int(sentiment[0])]
|
|
|
|
|
# Sentiment classifier artifact. The filename suggests a soft-voting
# ensemble trained on SMOTE-balanced data — confirm against training code.
sentiment_model_path = 'ensemble_clf_soft_smote.joblib'
sentiment_model = joblib.load(sentiment_model_path)
|
|
|
def get_emoticon(sentiment): |
|
if sentiment == "Positif": |
|
emoticon = "π" |
|
elif sentiment == "Negatif": |
|
emoticon = "π" |
|
else: |
|
emoticon = "π" |
|
|
|
return emoticon |
|
|
|
|
|
def get_table_download_link(df, download_format):
    """Build an HTML ``<a>`` download link for *df*.

    Args:
        df: pandas DataFrame to export.
        download_format: 'XLSX' for Excel; anything else produces CSV.

    Returns:
        HTML anchor markup with the file embedded as a base64 data URI.
        (The original code referenced ``b64encode`` without importing it —
        a guaranteed NameError — and the XLSX branch linked to a
        server-local path that browsers cannot fetch; both are fixed by
        embedding the bytes directly.)
    """
    from base64 import b64encode  # local import keeps module-level deps untouched

    if download_format == "XLSX":
        # Write the workbook, then embed its bytes so the link works from
        # the client's browser.
        df.to_excel("hasil_sentimen.xlsx", index=False)
        with open("hasil_sentimen.xlsx", "rb") as f:
            b64 = b64encode(f.read()).decode()
        mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return (f'<a href="data:{mime};base64,{b64}" '
                f'download="hasil_sentimen.xlsx">Unduh File XLSX</a>')
    else:
        csv = df.to_csv(index=False)
        return (f'<a href="data:file/csv;base64,{b64encode(csv.encode()).decode()}" '
                f'download="hasil_sentimen.csv">Unduh File CSV</a>')
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Input + analysis driver: the user either types text manually (one entry
# per line) or uploads an XLSX file that must contain a 'Text' column; each
# entry is then classified and collected into `results`.
# ---------------------------------------------------------------------------
st.title("Aplikasi ML Analisis Sentimen based on data Biskita Transpakuan")

input_option = st.radio("Pilih metode input:", ("Teks Manual", "Unggah Berkas XLSX"))

if input_option == "Teks Manual":
    # Free-form text box; multiple texts are separated by newlines.
    user_input = st.text_area("Masukkan teks:", "")
else:
    uploaded_file = st.file_uploader("Unggah berkas XLSX", type=["xlsx"])
    st.write("**Pastikan berkas XLSX Anda memiliki kolom yang bernama 'Text'.**")

    if uploaded_file is not None:
        df = pd.read_excel(uploaded_file)

        if 'Text' not in df.columns:
            st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
        else:
            # Series of raw texts analysed in the loop below.
            texts = df['Text']

# Each result row: (raw text, cleaned text, normalized text, label, emoticon).
results = []

if input_option == "Teks Manual" and user_input:
    # One sentiment prediction per line of the manual input.
    user_texts = user_input.split('\n')
    for text in user_texts:
        sentiment_label = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
        emoticon = get_emoticon(sentiment_label)
        # NOTE(review): cleaning/normalization is recomputed here even though
        # predict_sentiment already performs the same steps internally.
        cleaned_text = clean_text(text)
        norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
        results.append((text, cleaned_text, norm_slang_text, sentiment_label, emoticon))

elif input_option == "Unggah Berkas XLSX" and uploaded_file is not None:
    # `df` and `texts` only exist when an upload was parsed above; the
    # column check guards the `texts` access.
    if 'Text' in df.columns:
        for text in texts:
            sentiment_label = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
            emoticon = get_emoticon(sentiment_label)
            cleaned_text = clean_text(text)
            norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
            results.append((text, cleaned_text, norm_slang_text, sentiment_label, emoticon))
    else:
        st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
|
# ---------------------------------------------------------------------------
# Visualisation: word cloud (left column), sentiment-distribution bar chart
# (right column), and a detail table inside an expander.
# ---------------------------------------------------------------------------
columns = st.columns(2)

with columns[0]:
    if results:
        # Use the normalized text (tuple index 2); skip None/NaN entries.
        all_texts = [result[2] for result in results if result[2] is not None and not pd.isna(result[2])]
        all_texts = " ".join(all_texts)

        st.subheader("Word Cloud")

        if all_texts:
            wordcloud = WordCloud(width=800, height=660, background_color='white',
                                  colormap='Purples',
                                  contour_color='black',
                                  contour_width=2,
                                  mask=None,
                                  ).generate(all_texts)
            st.image(wordcloud.to_array())
        else:
            st.write("Tidak ada data untuk ditampilkan dalam Word Cloud.")

with columns[1]:
    st.subheader("Chart")
    if results:
        df_results = pd.DataFrame(results, columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"])
        sns.set_style("whitegrid")

        # Fixed label order so the bars always appear Negatif/Netral/Positif.
        class_labels = ["Negatif", "Netral", "Positif"]

        value_counts = df_results["Hasil Analisis Sentimen"].value_counts()

        # Reindex so classes absent from the data still get a slot
        # (their count becomes NaN and plots as an empty bar).
        value_counts = value_counts.reindex(class_labels)

        fig, ax = plt.subplots()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        plt.xticks(rotation=45)

        st.pyplot(fig)

with st.expander("Hasil Analisis Sentimen"):
    # Full per-text results table (raw, cleaned, normalized, label, emoticon).
    st.table(pd.DataFrame(results, columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"]))
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Download section: export the results table as XLSX or CSV via
# st.download_button.
# ---------------------------------------------------------------------------
st.subheader("Unduh Hasil")
download_format = st.selectbox("Pilih format unduhan:", ["XLSX", "CSV"])
if results:
    if download_format == "XLSX":
        df = pd.DataFrame(results, columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"])
        # Written to the app's working directory, then streamed back to
        # the client by the download button below.
        df.to_excel("hasil_sentimen.xlsx", index=False)

        st.download_button(label="Unduh XLSX", data=open("hasil_sentimen.xlsx", "rb").read(), key="xlsx_download", file_name="hasil_sentimen.xlsx")

    else:
        df = pd.DataFrame(results, columns=["Teks", "Cleaned Text", "Norm Text", "Hasil Analisis Sentimen", "Emotikon"])
        csv = df.to_csv(index=False)

        st.download_button(label="Unduh CSV", data=csv, key="csv_download", file_name="hasil_sentimen.csv")
else:
    st.write("Tidak ada data untuk diunduh.")
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Footer: author/profile links.
# ---------------------------------------------------------------------------
st.divider()

github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")

instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")

st.write('Thank you for trying the demo!')
st.write('Best regards, Naufal Nashif')
|
|