Update test.py
test.py CHANGED
@@ -1,182 +1,182 @@

The diff rewrites the whole file, but the only content change is at line 132, where the assignment that previously had no value (invalid syntax) is given one:

-EPOCH =
+EPOCH = 20

The full updated test.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, TFBertForSequenceClassification

nltk.download('punkt')
nltk.download('stopwords')
df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")

df.rename(columns={
    'Sentiment': 'sentiment',
    'Pasangan Calon': 'calon',
    'Text Tweet': 'text'
}, inplace=True)

df.dropna(inplace=True)
# Preprocessing
def clean_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
    text = re.sub(r"@\S+", "", text)  # remove mentions
    text = re.sub(r"#\S+", "", text)  # remove hashtags
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # collapse characters repeated three or more times
    text = text.strip()  # strip leading/trailing whitespace
    text = text.lower()  # lowercase
    return text

stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
stopword_pilkada.columns = ['stopword']

stop_words = set(stopwords.words('indonesian'))
additional_sw = set(stopword_pilkada.stopword.values)
stop_words = stop_words.union(additional_sw)

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

text_to_process = "sangat gak bagus pak ahok"
processed_text = preprocess_text(text_to_process)
print(processed_text)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

print("Train Data Size: ", len(df_train))  # 70%
print("Validation Data Size: ", len(df_val))  # 15%
print("Test Data Size: ", len(df_test))  # 15%
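# Tokenization: load the IndoBERT tokenizer and inspect tweet token lengths
# to pick a fixed MAX_LEN for padding/truncation.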
PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
vocab = tokenizer.get_vocab()

# Check the token-length distribution to choose a maximum sequence length
token_lens = []

for txt in df["text"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))

MAX_LEN = 60

df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
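# Encoding helpers: convert each tweet into BERT inputs (input_ids, attention_mask,
# token_type_ids) padded/truncated to MAX_LEN, and wrap them in a tf.data.Dataset.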
def encode_sentence(sent):
    return tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LEN,
        return_attention_mask=True,
        return_token_type_ids=True
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }, label

def encode_dataset(ds, limit=-1):
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []

    for index, row in ds.iterrows():
        if limit > 0 and index >= limit:
            break

        encoded = encode_sentence(row["text"])
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]
        token_type_ids = encoded["token_type_ids"]
        label = row["sentiment"]

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        token_type_ids_list.append(token_type_ids)
        label_list.append(label)

    return tf.data.Dataset.from_tensor_slices((
        input_ids_list,
        attention_mask_list,
        token_type_ids_list,
        label_list
    )).map(map_example_to_dict)
EPOCH = 20
BATCH_SIZE = 42
LEARNING_RATE = 1e-5

df_train_shuffled = df_train.sample(frac=1, random_state=42)
train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
val_data = encode_dataset(df_val).batch(BATCH_SIZE)
test_data = encode_dataset(df_test).batch(BATCH_SIZE)
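# Model: fine-tune the pretrained IndoBERT encoder with a 2-class classification head.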
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)

model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer, loss=loss, metrics=[metric])

history = model.fit(
    train_data,
    epochs=EPOCH,
    batch_size=BATCH_SIZE,
    validation_data=val_data
)
# Convert string labels to numeric format for the test dataset
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})

# Create the test_data with the updated DataFrame
test_data = encode_dataset(df_test).batch(BATCH_SIZE)

# Evaluate the model
model.evaluate(test_data)

y_pred = model.predict(test_data)
y_actual = np.concatenate([y for x, y in test_data], axis=0)

labels = ["negative", "positive"]
def predict(text):
    encoded = encode_sentence(text)
    input_ids = tf.expand_dims(encoded["input_ids"], 0)
    attention_mask = tf.expand_dims(encoded["attention_mask"], 0)
    token_type_ids = tf.expand_dims(encoded["token_type_ids"], 0)

    outputs = model([input_ids, attention_mask, token_type_ids])
    return labels[np.argmax(tf.nn.softmax(outputs[0], axis=1).numpy()[0])]
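classification_report, accuracy_score, and confusion_matrix are imported above but never used. A minimal follow-up sketch, not part of the committed file and assuming model.predict returns an output object with a logits array (as recent transformers versions do), for summarizing the test predictions already stored in y_pred and y_actual:

# Sketch (assumption: y_pred.logits holds the raw class scores from model.predict)
y_pred_labels = np.argmax(y_pred.logits, axis=1)  # predicted class index per tweet
print(accuracy_score(y_actual, y_pred_labels))  # overall accuracy
print(confusion_matrix(y_actual, y_pred_labels))  # 2x2 confusion matrix
print(classification_report(y_actual, y_pred_labels, target_names=labels))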