# compAnIonv1.py
# Environment setup for Spark (commented out; not required for inference)
import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = '/home/ubuntu/spark-3.5.1-bin-hadoop3'
#pip install transformers==4.31.0 -q
# pandas is imported but not used directly in this inference script
import pandas as pd
# Install spark-nlp
#!pip install spark-nlp
#import sparknlp
#from sparknlp.base import *
#from sparknlp.annotator import *
#from sparknlp.common import *
#from pyspark.sql.functions import *
#from pyspark.sql.functions import lit
#from pyspark.sql.window import Window
#from pyspark.sql.types import *
#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
#from pyspark.mllib.evaluation import MulticlassMetrics
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StandardScaler, VectorAssembler, Imputer, OneHotEncoder, StringIndexer
#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel, TrainValidationSplit, TrainValidationSplitModel
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#from pyspark.ml.linalg import Vectors, VectorUDT
#import pyspark.pandas as ps
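# The commented-out PySpark / Spark NLP imports above appear to come from the
# original training pipeline; none of them are needed for this inference script.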
# Import TensorFlow and the Hugging Face BERT model and tokenizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer
from transformers import TFBertModel
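# Note: the commented pip line near the top pins transformers==4.31.0, the
# library that provides BertTokenizer and TFBertModel; TensorFlow 2.x is required.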
MAX_SEQUENCE_LENGTH = 400  # maximum WordPiece tokens per conversation (BERT's hard limit is 512)
def create_bert_classification_model(bert_model,
                                     num_train_layers=0,
                                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                                     num_filters=(100, 100, 50, 25),
                                     kernel_sizes=(3, 4, 5, 10),
                                     hidden_size=200,
                                     hidden2_size=100,
                                     dropout=0.1,
                                     learning_rate=0.001,
                                     label_smoothing=0.03):
"""
Build a simple classification model with BERT. Use the Pooler Output or CLS for classification purposes
"""
    if num_train_layers == 0:
        # Freeze all layers of the pre-trained BERT model
        bert_model.trainable = False
    elif num_train_layers == 12:
        # Train all layers of the BERT model
        bert_model.trainable = True
    else:
        # Restrict training to the num_train_layers outermost transformer layers.
        # TFBertModel weight names contain "layer_._<n>", so matching on the
        # "_<n>" substring keeps only those encoder blocks trainable.
        retrain_layers = []
        for retrain_layer_number in range(num_train_layers):
            layer_code = '_' + str(11 - retrain_layer_number)
            retrain_layers.append(layer_code)
        #print('retrain layers: ', retrain_layers)
        for w in bert_model.weights:
            if not any([x in w.name for x in retrain_layers]):
                #print('freezing: ', w)
                w._trainable = False
    # Three parallel inputs matching the BERT tokenizer's output dictionary
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask')
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}
    bert_out = bert_model(bert_inputs)

    # Candidate sentence representations (only cnn_token feeds the head below)
    pooler_token = bert_out[1]                               # pooler output
    cls_token = bert_out[0][:, 0, :]                         # [CLS] token embedding
    bert_out_avg = tf.math.reduce_mean(bert_out[0], axis=1)  # mean-pooled tokens
    cnn_token = bert_out[0]                                  # full token sequence

    # Kim-style text CNN: one Conv1D branch per kernel size, max-pooled and concatenated
    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(cnn_token)
        conv_layer = tf.keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)
    conv_output = tf.keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)
    # Classification head: two dense + dropout layers, then a sigmoid output
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(conv_output)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    hidden = tf.keras.layers.Dense(hidden2_size, activation='relu', name='hidden_layer2')(hidden)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[classification])
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 # Focal loss down-weights easy examples; class balancing
                                 # and label smoothing help when classes are imbalanced
                                 loss=tf.keras.losses.BinaryFocalCrossentropy(
                                     gamma=2.0, from_logits=False, apply_class_balancing=True,
                                     label_smoothing=label_smoothing
                                 ),
                                 metrics=['accuracy']
                                 )
    return classification_model
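
# Usage sketch (commented out): partially unfreezing the top two encoder blocks.
# The num_train_layers logic above assumes the 12-layer bert-base naming scheme
# ("layer_._11" down to "layer_._0") in the TFBertModel weight names.
#partial_model = create_bert_classification_model(bert_model, num_train_layers=2)
#print(sum(int(w.trainable) for w in partial_model.weights), 'trainable weight tensors')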

def f_one_or_zero(x):
    """Threshold a sigmoid probability at 0.5 to produce a hard 0/1 label."""
    return 1 if x > 0.5 else 0

def run_inference_model(conversations):
    # Tokenize conversations with the BERT tokenizer (pad/truncate to a fixed length)
    tokenized_input = tokenizer(conversations,
                                max_length=MAX_SEQUENCE_LENGTH,
                                truncation=True,
                                padding='max_length',
                                return_tensors='tf')
    bert_inputs = [tokenized_input.input_ids,
                   tokenized_input.token_type_ids,
                   tokenized_input.attention_mask]
    # Predict the probability of the positive class for each conversation
    y_pred = inference_model.predict(bert_inputs)
    #prediction = f_one_or_zero(y_pred)
    return y_pred
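
# Usage sketch (commented out): run_inference_model returns an (n, 1) array of
# sigmoid probabilities, so flatten before thresholding.
#probs = run_inference_model(["example conversation text"])
#labels = [f_one_or_zero(p) for p in probs.flatten()]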

model_checkpoint = "bert-base-uncased"
# Step 1: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# Step 2: Load the pre-trained BERT model
bert_model = TFBertModel.from_pretrained(model_checkpoint)
# Step 3: Create the custom classification model on top of the pre-trained encoder
inference_model = create_bert_classification_model(bert_model=bert_model)
# Step 4: Load the saved fine-tuned weights into the inference model
save_path = 'bert_cnn_ensemble_resample_uncased_mdl.h5'
inference_model.load_weights(save_path)
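
# Minimal smoke test (illustrative only; the sample text below is an assumption,
# not project data):
if __name__ == '__main__':
    sample = ["Hey, I haven't heard from you all week. Is everything okay?"]
    probabilities = run_inference_model(sample)
    score = float(probabilities[0][0])
    print('P(positive class):', score)
    print('Predicted label:', f_one_or_zero(score))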