import pandas as pd |
import nltk |
nltk.download("punkt") |
from nltk import tokenize |
import time |
from sklearn.model_selection import train_test_split |
from transformers import BertConfig, BertTokenizer, TFBertModel |
from tensorflow.keras.preprocessing.sequence import pad_sequences |
from tensorflow import convert_to_tensor |
from tensorflow.keras.layers import Input, Dense |
from tensorflow.keras.initializers import TruncatedNormal |
from tensorflow.keras.models import Model |
from tensorflow.keras.optimizers import Adam |
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall |
# Location of the HDF5 file holding the dataset; it is loaded below with
# ``pandas.read_hdf``.  Replace the placeholder with a real path.
DATA_PATH = "..."

# Prefix that the end of the script prepends (by plain string concatenation)
# to the names of the files it writes: the saved model and the evaluation
# stats.  The script used this name without ever defining it, so a run
# crashed with a NameError only after training had already finished.
# Keep the trailing separator when pointing this at another directory.
SAVE_MODEL_TO = "./"

tab = pd.read_hdf(DATA_PATH)
def data_to_values(dataframe):
    """Pull the raw abstracts and labels out of a dataframe.

    Reads the ``Abstract`` and ``Label`` columns of *dataframe* and returns
    their ``.values`` as an ``(abstracts, labels)`` pair.
    """
    return dataframe.Abstract.values, dataframe.Label.values
def tokenize_abstracts(abstracts):
    """Mark up every abstract with the '[CLS]' and '[SEP]' tokens.

    Each abstract is split into sentences with NLTK's ``sent_tokenize``.
    The marked-up text starts with ``"[CLS] "`` and every sentence is
    followed by ``" [SEP] "``.  One string is returned per abstract, in
    the order the abstracts were given.
    """

    def mark_up(text):
        # Sentence order is preserved; each sentence carries its own
        # trailing separator token.
        sentences = tokenize.sent_tokenize(text)
        return "[CLS] " + "".join(
            sentence + " [SEP] " for sentence in sentences
        )

    return [mark_up(text) for text in abstracts]
# Module-level tokenizer shared by the helper functions below; it is loaded
# once, when this statement runs, from the pretrained checkpoint named here.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-uncased",
)
def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Split marked-up abstracts into tokens with the shared tokenizer.

    Every text in *t_abstracts* is passed to the module-level
    ``tokenizer`` ('bert-base-multilingual-uncased') and the resulting
    token list is cut down to at most *max_len* tokens.  Returns one
    token list per text.
    """
    token_lists = []
    for text in t_abstracts:
        tokens = tokenizer.tokenize(text)
        # Anything beyond max_len tokens is dropped from the end.
        token_lists.append(tokens[:max_len])
    return token_lists
def convert_to_ids(b_t_abstracts):
    """Map token lists to vocabulary IDs.

    Each token list in *b_t_abstracts* is handed to the module-level
    tokenizer's ``convert_tokens_to_ids``; the per-abstract results are
    returned as a list in the same order.
    """
    id_lists = []
    for tokens in b_t_abstracts:
        id_lists.append(tokenizer.convert_tokens_to_ids(tokens))
    return id_lists
def abstracts_to_ids(abstracts):
    """Turn raw abstracts into lists of vocabulary IDs.

    Chains the three preprocessing steps: special-token mark-up
    (``tokenize_abstracts``), tokenization with the default length limit
    (``b_tokenize_abstracts``) and the token-to-ID lookup
    (``convert_to_ids``).
    """
    return convert_to_ids(
        b_tokenize_abstracts(
            tokenize_abstracts(abstracts)
        )
    )
def pad_ids(input_ids, max_len=512):
    """Bring every ID sequence to exactly *max_len* entries.

    Delegates to Keras' ``pad_sequences``: both padding and truncation
    are applied at the end of a sequence ("post"), and the result uses
    the "long" dtype.
    """
    padding_options = {
        "maxlen": max_len,
        "dtype": "long",
        "truncating": "post",
        "padding": "post",
    }
    return pad_sequences(input_ids, **padding_options)
def create_attention_masks(inputs):
    """Build one attention mask per ID sequence.

    A mask holds ``1.0`` at every position whose ID is greater than 0 and
    ``0.0`` everywhere else.  The masks are returned as a list of lists of
    floats, parallel to *inputs*.
    """
    return [
        [float(token_id > 0) for token_id in id_sequence]
        for id_sequence in inputs
    ]
def create_model(num_labels=17, max_len=512):
    """Build and compile the multilingual-BERT multi-label classifier.

    The pretrained 'bert-base-multilingual-uncased' model is loaded with
    both dropout probabilities set to 0.2.  Its main layer is wired into a
    Keras functional model that takes token IDs and attention masks, and a
    sigmoid ``Dense`` layer with one unit per label is applied to the
    second output of the BERT layer.  The model is compiled with Adam,
    binary cross-entropy and precision/recall metrics.

    Args:
        num_labels: Number of target labels.  Used both for the BERT
            config and for the size of the output layer, so the two can
            no longer drift apart.  Defaults to 17, the previously
            hard-coded value.
        max_len: Length of the ID and mask sequences fed to the model.
            Defaults to 512, the previously hard-coded value, which
            matches the default of ``pad_ids``.

    Returns:
        The compiled Keras ``Model`` named ``aurora_sdg_mbert_multilabel``.
    """
    pretrained_name = "bert-base-multilingual-uncased"

    config = BertConfig.from_pretrained(
        pretrained_name,
        num_labels=num_labels,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2,
    )
    bert = TFBertModel.from_pretrained(pretrained_name, config=config)
    # The first sub-layer of the loaded model is called directly on the
    # Keras inputs below.
    bert_layer = bert.layers[0]

    # ``shape`` must be a tuple: the original ``(512)`` is just the integer
    # 512, which strict Keras versions refuse to interpret as a shape.
    input_ids_layer = Input(
        shape=(max_len,),
        name="input_ids",
        dtype="int32",
    )
    input_attention_masks_layer = Input(
        shape=(max_len,),
        name="attention_masks",
        dtype="int32",
    )

    bert_model = bert_layer(
        input_ids_layer,
        input_attention_masks_layer,
    )

    # Sigmoid (not softmax) output: each label is scored independently,
    # which pairs with the binary cross-entropy loss below.
    # NOTE(review): index 1 is presumably the pooled sequence
    # representation of the Hugging Face BERT layer -- confirm against the
    # installed transformers version.
    target_layer = Dense(
        units=num_labels,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid",
    )(bert_model[1])

    model = Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="aurora_sdg_mbert_multilabel",
    )

    # NOTE(review): ``decay`` is a legacy Keras optimizer argument; newer
    # Keras releases may reject it -- confirm against the installed version.
    optimizer = Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0,
    )
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[Precision(), Recall()],
    )
    return model
# --- Preprocessing: raw abstracts -> padded ID sequences -------------------
abstracts, labels = data_to_values(tab)
ids = abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to ids.")

padded_ids = pad_ids(ids)
print("Sequences padded.")

# --- Splitting: 30 % is held out, then halved into validation and test -----
train_inputs, temp_inputs, train_labels, temp_labels = train_test_split(
    padded_ids,
    labels,
    random_state=1993,
    test_size=0.3,
)
validation_inputs, test_inputs, validation_labels, test_labels = train_test_split(
    temp_inputs,
    temp_labels,
    random_state=1993,
    test_size=0.5,
)
print("Data splited into train, validation, test sets.")

# Masks are derived from the padded ID arrays before those are turned into
# tensors.
train_masks, validation_masks, test_masks = map(
    create_attention_masks,
    (train_inputs, validation_inputs, test_inputs),
)
print("Attention masks created.")

# --- Tensor conversion -----------------------------------------------------
train_inputs, validation_inputs, test_inputs = map(
    convert_to_tensor,
    (train_inputs, validation_inputs, test_inputs),
)
print("Inputs converted to tensors.")

train_labels, validation_labels, test_labels = map(
    convert_to_tensor,
    (train_labels, validation_labels, test_labels),
)
print("Labels converted to tensors.")

train_masks, validation_masks, test_masks = map(
    convert_to_tensor,
    (train_masks, validation_masks, test_masks),
)
print("Masks converted to tensors.")

# --- Training --------------------------------------------------------------
model = create_model()
print("Model initialized.")

history = model.fit(
    x=[train_inputs, train_masks],
    y=train_labels,
    batch_size=16,
    epochs=4,
    validation_data=(
        [validation_inputs, validation_masks],
        validation_labels,
    ),
)

model.save(SAVE_MODEL_TO + "mbert_multilabel.h5")
print("Model saved.")

# --- Evaluation on the held-out test split ---------------------------------
test_score = model.evaluate(
    x=[test_inputs, test_masks],
    y=test_labels,
    batch_size=8,
)
print("Model tested.")

# The values returned by ``evaluate`` are written to a spreadsheet without
# an index column.
stats = pd.DataFrame(test_score)
stats.to_excel(SAVE_MODEL_TO + "mbert_multilabel_stats.xlsx", index=False)
print("Stats saved.")