In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.4 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 26.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [37]:
import tensorflow as tf
import json
from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification

config = AutoConfig.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased', id2label={"0": "negative","1": "positive"}, 
                                    label2id={"negative": 0,"positive": 1})
tokenizer = AutoTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')
model = TFAutoModelForSequenceClassification.from_pretrained("malay-huggingface/bert-tiny-bahasa-cased", from_pt=True, config=config)

# config = AutoConfig.from_pretrained('malay-huggingface/bert-base-bahasa-cased', id2label={"0": "negative","1": "positive"}, 
#                                     label2id={"negative": 0,"positive": 1})

# tokenizer = AutoTokenizer.from_pretrained("malay-huggingface/bert-base-bahasa-cased")
# model = TFAutoModelForSequenceClassification.from_pretrained("malay-huggingface/bert-base-bahasa-cased", from_pt=True, config=config)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import pandas as pd

In [5]:
sentiment_df = pd.read_csv("https://raw.githubusercontent.com/huseinzol05/malaya/master/finetune/sentiment-data-v2.csv")
sentiment_df

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"
...,...,...
3680,Positive,Jelas pembangkang buat tuduhan untuk mengeliru...
3681,Positive,demokrasi adalah kuasa rakyat di mana pegawai ...
3682,Positive,"Selain dapat menyelesaikan isu beg berat, peng..."
3683,Positive,Hospital Langkawi buat masa ini hanya dapat me...


In [6]:
sentiment_df["label"] = sentiment_df["label"].map({'Positive': 1, 'Negative': 0})

positive_df = pd.read_csv("https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-positive-translated.txt", names=["text"])
positive_df["label"] = 1

negative_df = pd.read_csv("https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-negative-translated.txt", names=["text"])
negative_df["label"] = 0

In [7]:
amazon_df = pd.read_json("https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-amazon.json", orient='index').T
yelp_df = pd.read_json("https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-yelp.json", orient='index').T
imdb_df = pd.read_json("https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-imdb.json", orient='index').T

def process_json_df(df):
  positive_df = df[["positive"]].dropna()
  positive_df.columns = ["text"]
  positive_df["label"] = 1

  negative_df = df[["negative"]].dropna()
  negative_df.columns = ["text"]
  negative_df["label"] = 0

  return pd.concat([positive_df, negative_df])

In [8]:
# df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)
# df = pd.concat([sentiment_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)
df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)

df

Unnamed: 0,label,text
0,0,Lebih-lebih lagi dengan kemudahan internet da...
1,1,boleh memberi teguran kepada parti tetapi perl...
2,0,Adalah membingungkan mengapa masyarakat Cina b...
3,1,Kami menurunkan defisit daripada 6.7 peratus p...
4,0,"Ini masalahnya. Bukan rakyat, tetapi sistem"
...,...,...
16720,0,"dalam satu perkataan, ia memalukan."
16721,0,Saya tidak pernah keluar dari filem dengan pan...
16722,0,saya hanya bosan menonton jessica lange mengam...
16723,0,semua dalam satu penghinaan terhadap kecerdasa...


In [9]:
from sklearn.model_selection import train_test_split

# sentences = sarcasm_df["headline"].tolist()
# labels = sarcasm_df["is_sarcastic"].tolist()


sentences = df["text"].tolist()
labels = df["label"].tolist()

training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(sentences, labels, train_size=0.8, random_state=1)

print(len(training_sentences))
print(len(validation_sentences))

13380
3345


In [10]:
train_encodings = tokenizer(training_sentences, truncation=True, padding=True)
val_encodings = tokenizer(validation_sentences, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    training_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    validation_labels
))

In [12]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# mc = ModelCheckpoint('best_model', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=10,
          batch_size=16,
          callbacks=[es],
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping


<keras.callbacks.History at 0x7efdb1594e10>

In [14]:
model.save_pretrained("model")

In [38]:
tokenizer.save_pretrained("tokenize")

('tokenize/tokenizer_config.json',
 'tokenize/special_tokens_map.json',
 'tokenize/vocab.txt',
 'tokenize/added_tokens.json',
 'tokenize/tokenizer.json')

In [16]:
#### Load saved model and run predict function

In [17]:
loaded_model = TFAutoModelForSequenceClassification.from_pretrained("model")

Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_13']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [18]:
from transformers import pipeline

pipe = pipeline('text-classification', model=loaded_model, tokenizer=tokenizer)

In [30]:
pipe(["Saya gembira kerana saya boleh meluangkan masa bersama keluarga.", "Cikgu Azam adalah yang terbaik!", "Terima kasih, pertolongan anda adalah amat dihargai"])

[{'label': 'positive', 'score': 0.9960972666740417},
 {'label': 'positive', 'score': 0.9960286617279053},
 {'label': 'positive', 'score': 0.9795612692832947}]

In [29]:
pipe(["I'm happy to spend time with my family", "Mr Azam is the best!", "Thank you, your help is much appreciated", "Thank you, I appreciate your help"])

[{'label': 'positive', 'score': 0.9666869640350342},
 {'label': 'positive', 'score': 0.9939473867416382},
 {'label': 'negative', 'score': 0.949023425579071},
 {'label': 'positive', 'score': 0.7437461018562317}]

In [32]:
pipe(["Sikap tidak peduli dia menyebabkan ibu bapa dia geram", "Saya sangat benci warna merah", "Cis! Dompet aku hilang!"])

[{'label': 'negative', 'score': 0.9914922118186951},
 {'label': 'negative', 'score': 0.9830396771430969},
 {'label': 'negative', 'score': 0.9941385984420776}]

In [34]:
pipe(["His don't care attitude causes much strife to his parents", "I hate red color", "Gah! My Wallet is missing!"])

[{'label': 'negative', 'score': 0.9114706516265869},
 {'label': 'positive', 'score': 0.9896261692047119},
 {'label': 'negative', 'score': 0.9341222047805786}]

In [21]:
def predict_sentiment(sentence):
  predict_input = tokenizer.encode(sentence,
                                  truncation=True,
                                  padding=True,
                                  return_tensors="tf")

  tf_output = loaded_model.predict(predict_input)[0]
  tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]

  sentiment = 0 if tf_prediction[0] > tf_prediction[1] else 1
  print(tf_prediction)
  return sentiment

In [22]:
predict_sentiment("gembira")

[0.0143008  0.98569924]


1

In [23]:
predict_sentiment("marah")

[0.57475716 0.4252428 ]


0