Update test.py
test.py CHANGED
@@ -1,182 +1,182 @@

The diff rewrites the whole file, but the only content change is at line 132, where the assignment that previously had no value (invalid syntax) is given one:

-EPOCH =
+EPOCH = 20

The full updated test.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, TFBertForSequenceClassification

nltk.download('punkt')
nltk.download('stopwords')
df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")

df.rename(columns={
    'Sentiment': 'sentiment',
    'Pasangan Calon': 'calon',
    'Text Tweet': 'text'
}, inplace=True)

df.dropna(inplace=True)
# Preprocessing
def clean_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
    text = re.sub(r"@\S+", "", text)  # remove mentions
    text = re.sub(r"#\S+", "", text)  # remove hashtags
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # collapse characters repeated three or more times
    text = text.strip()  # strip leading/trailing whitespace
    text = text.lower()  # lowercase
    return text

stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
stopword_pilkada.columns = ['stopword']

stop_words = set(stopwords.words('indonesian'))
additional_sw = set(stopword_pilkada.stopword.values)
stop_words = stop_words.union(additional_sw)

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

text_to_process = "sangat gak bagus pak ahok"
processed_text = preprocess_text(text_to_process)
print(processed_text)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

print("Train Data Size: ", len(df_train))  # 70%
print("Validation Data Size: ", len(df_val))  # 15%
print("Test Data Size: ", len(df_test))  # 15%
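# Tokenization: load the IndoBERT tokenizer and inspect tweet token lengths
# to pick a fixed MAX_LEN for padding/truncation.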
PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
vocab = tokenizer.get_vocab()

# Check the token-length distribution to choose a maximum sequence length
token_lens = []

for txt in df["text"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))

MAX_LEN = 60

df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
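# Encoding helpers: convert each tweet into BERT inputs (input_ids, attention_mask,
# token_type_ids) padded/truncated to MAX_LEN, and wrap them in a tf.data.Dataset.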
def encode_sentence(sent):
    return tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LEN,
        return_attention_mask=True,
        return_token_type_ids=True
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }, label

def encode_dataset(ds, limit=-1):
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []

    for index, row in ds.iterrows():
        if limit > 0 and index >= limit:
            break

        encoded = encode_sentence(row["text"])
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]
        token_type_ids = encoded["token_type_ids"]
        label = row["sentiment"]

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        token_type_ids_list.append(token_type_ids)
        label_list.append(label)

    return tf.data.Dataset.from_tensor_slices((
        input_ids_list,
        attention_mask_list,
        token_type_ids_list,
        label_list
    )).map(map_example_to_dict)
EPOCH = 20
BATCH_SIZE = 42
LEARNING_RATE = 1e-5

df_train_shuffled = df_train.sample(frac=1, random_state=42)
train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
val_data = encode_dataset(df_val).batch(BATCH_SIZE)
test_data = encode_dataset(df_test).batch(BATCH_SIZE)
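# Model: fine-tune the pretrained IndoBERT encoder with a 2-class classification head.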
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)

model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer, loss=loss, metrics=[metric])

history = model.fit(
    train_data,
    epochs=EPOCH,
    batch_size=BATCH_SIZE,
    validation_data=val_data
)
# Convert string labels to numeric format for the test dataset
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})

# Create the test_data with the updated DataFrame
test_data = encode_dataset(df_test).batch(BATCH_SIZE)

# Evaluate the model
model.evaluate(test_data)

y_pred = model.predict(test_data)
y_actual = np.concatenate([y for x, y in test_data], axis=0)

labels = ["negative", "positive"]
def predict(text):
    encoded = encode_sentence(text)
    input_ids = tf.expand_dims(encoded["input_ids"], 0)
    attention_mask = tf.expand_dims(encoded["attention_mask"], 0)
    token_type_ids = tf.expand_dims(encoded["token_type_ids"], 0)

    outputs = model([input_ids, attention_mask, token_type_ids])
    return labels[np.argmax(tf.nn.softmax(outputs[0], axis=1).numpy()[0])]
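classification_report, accuracy_score, and confusion_matrix are imported above but never used. A minimal follow-up sketch, not part of the committed file and assuming model.predict returns an output object with a logits array (as recent transformers versions do), for summarizing the test predictions already stored in y_pred and y_actual:

# Sketch (assumption: y_pred.logits holds the raw class scores from model.predict)
y_pred_labels = np.argmax(y_pred.logits, axis=1)  # predicted class index per tweet
print(accuracy_score(y_actual, y_pred_labels))  # overall accuracy
print(confusion_matrix(y_actual, y_pred_labels))  # 2x2 confusion matrix
print(classification_report(y_actual, y_pred_labels, target_names=labels))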