import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, TFBertForSequenceClassification

nltk.download('punkt')
nltk.download('stopwords')

df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")
df.rename(columns={
    'Sentiment': 'sentiment',
    'Pasangan Calon': 'calon',
    'Text Tweet': 'text'
}, inplace=True)
df.dropna(inplace=True)

# Preprocessing
def clean_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
    text = re.sub(r"@\S+", "", text)                   # remove mentions
    text = re.sub(r"#\S+", "", text)                   # remove hashtags
    text = re.sub(r"\d+", "", text)                    # remove numbers
    text = re.sub(r"[^\w\s]", "", text)                # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)           # collapse repeated characters
    text = text.strip()                                # strip leading/trailing whitespace
    text = text.lower()                                # lowercase
    return text

stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
stopword_pilkada.columns = ['stopword']

stop_words = set(stopwords.words('indonesian'))
additional_sw = set(stopword_pilkada.stopword.values)
stop_words = stop_words.union(additional_sw)

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

text_to_process = "sangat gak bagus pak ahok"
processed_text = preprocess_text(text_to_process)
print(processed_text)

df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
print("Train Data Size: ", len(df_train))      # 70%
print("Validation Data Size: ", len(df_val))   # 15%
print("Test Data Size: ", len(df_test))        # 15%

PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
vocab = tokenizer.get_vocab()

# Check the token length distribution to pick a maximum sequence length
token_lens = []
for txt in df["text"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))

MAX_LEN = 60

# Map string labels to integers for all three splits so each can be encoded once
df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})

def encode_sentence(sent):
    return tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LEN,
        return_attention_mask=True,
        return_token_type_ids=True
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }, label

def encode_dataset(ds, limit=-1):
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []
    for i, (_, row) in enumerate(ds.iterrows()):
        if 0 < limit <= i:
            break
        # Encode each tweet once and reuse the result
        encoded = encode_sentence(row["text"])
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]
        token_type_ids = encoded["token_type_ids"]
        label = row["sentiment"]
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        token_type_ids_list.append(token_type_ids)
        label_list.append(label)
    return tf.data.Dataset.from_tensor_slices((
        input_ids_list,
        attention_mask_list,
        token_type_ids_list,
        label_list
    )).map(map_example_to_dict)

EPOCH = 1
BATCH_SIZE = 42
LEARNING_RATE = 1e-5

df_train_shuffled = df_train.sample(frac=1, random_state=42)

train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
val_data = encode_dataset(df_val).batch(BATCH_SIZE)
test_data = encode_dataset(df_test).batch(BATCH_SIZE)

model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer, loss=loss, metrics=[metric])

# batch_size is omitted here: the datasets are already batched, and Keras
# rejects an explicit batch_size when x is a tf.data.Dataset
history = model.fit(
    train_data,
    epochs=EPOCH,
    validation_data=val_data
)

# Evaluate the model on the held-out test set
# (test labels were already converted to integers alongside train/val)
model.evaluate(test_data)

y_pred = model.predict(test_data)
y_actual = np.concatenate([y for x, y in test_data], axis=0)

labels = ["negative", "positive"]

def predict(text):
    encoded = encode_sentence(text)
    input_ids = tf.expand_dims(encoded["input_ids"], 0)
    attention_mask = tf.expand_dims(encoded["attention_mask"], 0)
    token_type_ids = tf.expand_dims(encoded["token_type_ids"], 0)
    outputs = model({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    })
    return labels[np.argmax(tf.nn.softmax(outputs.logits, axis=1).numpy()[0])]
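
# A minimal sketch of one way to turn the test-set predictions above into
# per-class metrics with the sklearn helpers already imported. It assumes
# model.predict returns an output object with a .logits attribute (or a tuple
# whose first element is the logits); the names y_pred_logits, y_pred_labels,
# and cm are illustrative, not from the original script.
y_pred_logits = y_pred.logits if hasattr(y_pred, "logits") else y_pred[0]
y_pred_labels = np.argmax(y_pred_logits, axis=1)

print("Accuracy:", accuracy_score(y_actual, y_pred_labels))
print(classification_report(y_actual, y_pred_labels, target_names=labels))

# Confusion matrix as a heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_actual, y_pred_labels)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Example usage of predict(); the raw tweet is preprocessed first so inference
# matches the training-time preprocessing
print(predict(preprocess_text("sangat gak bagus pak ahok")))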