rxxnzz committed
Commit 804a5d9 · verified · 1 Parent(s): 537e3aa

Update test.py

Files changed (1)
  1. test.py +181 -181
test.py CHANGED
@@ -1,182 +1,182 @@
-
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- import pickle
- import nltk
- import tensorflow as tf
- import re
- import nltk
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize, sent_tokenize
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
- import seaborn as sns
- from transformers import BertTokenizer
- from transformers import TFBertForSequenceClassification
- nltk.download('punkt')
- nltk.download('stopwords')
-
- df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")
-
- df.rename(columns={
-     'Sentiment': 'sentiment',
-     'Pasangan Calon': 'calon',
-     'Text Tweet': 'text'
- }, inplace=True)
-
- df.dropna(inplace=True)
-
- # preprocessing
- def clean_text(text):
-     text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
-     text = re.sub(r"@\S+", "", text)  # remove mentions
-     text = re.sub(r"#\S+", "", text)  # remove hashtags
-     text = re.sub(r"\d+", "", text)  # remove numbers
-     text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
-     text = re.sub(r"(.)\1{2,}", r"\1", text)  # collapse repeated characters
-     text = text.strip()  # strip leading/trailing whitespace
-     text = text.lower()  # convert to lowercase
-     return text
-
- stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
- stopword_pilkada.columns = ['stopword']
-
- stop_words = set(stopwords.words('indonesian'))
- additional_sw = set(stopword_pilkada.stopword.values)
- stop_words = stop_words.union(additional_sw)
-
- def remove_stopwords(text):
-     word_tokens = word_tokenize(text)
-     filtered_sentence = [w for w in word_tokens if w not in stop_words]
-     return " ".join(filtered_sentence)
-
- def preprocess_text(text):
-     text = clean_text(text)
-     text = remove_stopwords(text)
-     return text
-
- text_to_process = "sangat gak bagus pak ahok"
- processed_text = preprocess_text(text_to_process)
- print(processed_text)
-
- df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
- df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
-
- print("Train Data Size: ", len(df_train))  # 70%
- print("Validation Data Size: ", len(df_val))  # 15%
- print("Test Data Size: ", len(df_test))  # 15%
-
- PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
- tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
- vocab = tokenizer.get_vocab()
-
- # check the token-length distribution to choose the maximum token length
- token_lens = []
-
- for txt in df["text"]:
-     tokens = tokenizer.encode(txt)
-     token_lens.append(len(tokens))
-
- MAX_LEN = 60
-
- df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
- df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
-
- def encode_sentence(sent):
-     return tokenizer.encode_plus(
-         sent,
-         add_special_tokens=True,
-         padding='max_length',
-         truncation='longest_first',
-         max_length=MAX_LEN,
-         return_attention_mask=True,
-         return_token_type_ids=True
-     )
-
- def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
-     return {
-         "input_ids": input_ids,
-         "attention_mask": attention_masks,
-         "token_type_ids": token_type_ids,
-     }, label
-
- def encode_dataset(ds, limit=-1):
-     input_ids_list = []
-     attention_mask_list = []
-     token_type_ids_list = []
-     label_list = []
-
-     for index, row in ds.iterrows():
-         if limit > 0 and index >= limit:
-             break
-
-         input_ids, attention_mask, token_type_ids = encode_sentence(row["text"])["input_ids"],\
-             encode_sentence(row["text"])["attention_mask"],\
-             encode_sentence(row["text"])["token_type_ids"]
-         label = row["sentiment"]
-
-         input_ids_list.append(input_ids)
-         attention_mask_list.append(attention_mask)
-         token_type_ids_list.append(token_type_ids)
-         label_list.append(label)
-
-     return tf.data.Dataset.from_tensor_slices((
-         input_ids_list,
-         attention_mask_list,
-         token_type_ids_list,
-         label_list
-     )).map(map_example_to_dict)
-
- EPOCH = 5
- BATCH_SIZE = 42
- LEARNING_RATE = 1e-5
-
- df_train_shuffled = df_train.sample(frac=1, random_state=42)
- train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
- val_data = encode_dataset(df_val).batch(BATCH_SIZE)
- test_data = encode_dataset(df_test).batch(BATCH_SIZE)
-
- model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
-
- model.summary()
-
- optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
- loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-
- model.compile(optimizer, loss=loss, metrics=[metric])
-
- history = model.fit(
-     train_data,
-     epochs=EPOCH,
-     batch_size=BATCH_SIZE,
-     validation_data=val_data
- )
-
-
- # Convert string labels to numeric format for the test dataset
- df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})
-
- # Create the test_data with the updated DataFrame
- test_data = encode_dataset(df_test).batch(BATCH_SIZE)
-
- # Evaluate the model
- model.evaluate(test_data)
-
- y_pred = model.predict(test_data)
- y_actual = np.concatenate([y for x, y in test_data], axis=0)
-
- labels = ["negative", "positive"]
-
- def predict(text):
-     input_ids, attention_mask, token_type_ids = encode_sentence(text)["input_ids"],\
-         encode_sentence(text)["attention_mask"],\
-         encode_sentence(text)["token_type_ids"]
-     input_ids = tf.expand_dims(input_ids, 0)
-     attention_mask = tf.expand_dims(attention_mask, 0)
-     token_type_ids = tf.expand_dims(token_type_ids, 0)
-
-     outputs = model([input_ids, attention_mask, token_type_ids])
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import pickle
+ import nltk
+ import tensorflow as tf
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
+ import seaborn as sns
+ from transformers import BertTokenizer
+ from transformers import TFBertForSequenceClassification
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")
+
+ df.rename(columns={
+     'Sentiment': 'sentiment',
+     'Pasangan Calon': 'calon',
+     'Text Tweet': 'text'
+ }, inplace=True)
+
+ df.dropna(inplace=True)
+
+ # preprocessing
+ def clean_text(text):
+     text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
+     text = re.sub(r"@\S+", "", text)  # remove mentions
+     text = re.sub(r"#\S+", "", text)  # remove hashtags
+     text = re.sub(r"\d+", "", text)  # remove numbers
+     text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
+     text = re.sub(r"(.)\1{2,}", r"\1", text)  # collapse repeated characters
+     text = text.strip()  # strip leading/trailing whitespace
+     text = text.lower()  # convert to lowercase
+     return text
+
+ stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
+ stopword_pilkada.columns = ['stopword']
+
+ stop_words = set(stopwords.words('indonesian'))
+ additional_sw = set(stopword_pilkada.stopword.values)
+ stop_words = stop_words.union(additional_sw)
+
+ def remove_stopwords(text):
+     word_tokens = word_tokenize(text)
+     filtered_sentence = [w for w in word_tokens if w not in stop_words]
+     return " ".join(filtered_sentence)
+
+ def preprocess_text(text):
+     text = clean_text(text)
+     text = remove_stopwords(text)
+     return text
+
+ text_to_process = "sangat gak bagus pak ahok"
+ processed_text = preprocess_text(text_to_process)
+ print(processed_text)
+
+ df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
+ df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
+
+ print("Train Data Size: ", len(df_train))  # 70%
+ print("Validation Data Size: ", len(df_val))  # 15%
+ print("Test Data Size: ", len(df_test))  # 15%
+
+ PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
+ tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
+ vocab = tokenizer.get_vocab()
+
+ # check the token-length distribution to choose the maximum token length
+ token_lens = []
+
+ for txt in df["text"]:
+     tokens = tokenizer.encode(txt)
+     token_lens.append(len(tokens))
+
+ MAX_LEN = 60
+
+ df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
+ df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
+
+ def encode_sentence(sent):
+     return tokenizer.encode_plus(
+         sent,
+         add_special_tokens=True,
+         padding='max_length',
+         truncation='longest_first',
+         max_length=MAX_LEN,
+         return_attention_mask=True,
+         return_token_type_ids=True
+     )
+
+ def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
+     return {
+         "input_ids": input_ids,
+         "attention_mask": attention_masks,
+         "token_type_ids": token_type_ids,
+     }, label
+
+ def encode_dataset(ds, limit=-1):
+     input_ids_list = []
+     attention_mask_list = []
+     token_type_ids_list = []
+     label_list = []
+
+     for index, row in ds.iterrows():
+         if limit > 0 and index >= limit:
+             break
+
+         input_ids, attention_mask, token_type_ids = encode_sentence(row["text"])["input_ids"],\
+             encode_sentence(row["text"])["attention_mask"],\
+             encode_sentence(row["text"])["token_type_ids"]
+         label = row["sentiment"]
+
+         input_ids_list.append(input_ids)
+         attention_mask_list.append(attention_mask)
+         token_type_ids_list.append(token_type_ids)
+         label_list.append(label)
+
+     return tf.data.Dataset.from_tensor_slices((
+         input_ids_list,
+         attention_mask_list,
+         token_type_ids_list,
+         label_list
+     )).map(map_example_to_dict)
+
+ EPOCH = 20
+ BATCH_SIZE = 42
+ LEARNING_RATE = 1e-5
+
+ df_train_shuffled = df_train.sample(frac=1, random_state=42)
+ train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
+ val_data = encode_dataset(df_val).batch(BATCH_SIZE)
+ test_data = encode_dataset(df_test).batch(BATCH_SIZE)
+
+ model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
+
+ model.summary()
+
+ optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
+ loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+ metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+
+ model.compile(optimizer, loss=loss, metrics=[metric])
+
+ history = model.fit(
+     train_data,
+     epochs=EPOCH,
+     batch_size=BATCH_SIZE,
+     validation_data=val_data
+ )
+
+
+ # Convert string labels to numeric format for the test dataset
+ df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})
+
+ # Create the test_data with the updated DataFrame
+ test_data = encode_dataset(df_test).batch(BATCH_SIZE)
+
+ # Evaluate the model
+ model.evaluate(test_data)
+
+ y_pred = model.predict(test_data)
+ y_actual = np.concatenate([y for x, y in test_data], axis=0)
+
+ labels = ["negative", "positive"]
+
+ def predict(text):
+     input_ids, attention_mask, token_type_ids = encode_sentence(text)["input_ids"],\
+         encode_sentence(text)["attention_mask"],\
+         encode_sentence(text)["token_type_ids"]
+     input_ids = tf.expand_dims(input_ids, 0)
+     attention_mask = tf.expand_dims(attention_mask, 0)
+     token_type_ids = tf.expand_dims(token_type_ids, 0)
+
+     outputs = model([input_ids, attention_mask, token_type_ids])
  return labels[np.argmax(tf.nn.softmax(outputs[0], axis=1).numpy()[0])]
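For reference, a minimal usage sketch of the helpers defined in test.py (not part of the commit itself); it assumes the script has already run so that `preprocess_text`, `predict`, and the fine-tuned `model` are in scope, and it reuses the example sentence from the script:

    # Hypothetical inference example using the functions defined above in test.py.
    sample = "sangat gak bagus pak ahok"        # same example sentence as in the script
    cleaned = preprocess_text(sample)           # apply the same cleaning/stopword removal used for training
    print(predict(cleaned))                     # prints "negative" or "positive"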