# compAnIonv1.py
# Setup environment for Spark
import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = '/home/ubuntu/spark-3.5.1-bin-hadoop3'
#pip install transformers==4.31.0 -q
# importing pandas as pd
import pandas as pd

# Spark NLP / PySpark imports (commented out; not used by this inference script)
#!pip install spark-nlp
#import sparknlp
#from sparknlp.base import *
#from sparknlp.annotator import *
#from sparknlp.common import *
#from pyspark.sql.functions import *
#from pyspark.sql.functions import lit
#from pyspark.sql.window import Window
#from pyspark.sql.types import *
#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
#from pyspark.mllib.evaluation import MulticlassMetrics
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StandardScaler, VectorAssembler, Imputer, OneHotEncoder, StringIndexer
#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel, TrainValidationSplit, TrainValidationSplitModel
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#from pyspark.ml.linalg import Vectors, VectorUDT
#import pyspark.pandas as ps

# Import Tensorflow and BERT models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer
from transformers import TFBertModel

MAX_SEQUENCE_LENGTH = 400  # maximum number of BERT tokens per input conversation

def create_bert_classification_model(bert_model,
                                     num_train_layers=0,
                                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                                     num_filters=(100, 100, 50, 25),
                                     kernel_sizes=(3, 4, 5, 10),
                                     hidden_size=200,
                                     hidden2_size=100,
                                     dropout=0.1,
                                     learning_rate=0.001,
                                     label_smoothing=0.03):
    """
    Build a simple classification model with BERT. Use the Pooler Output or CLS for classification purposes
    """
    if num_train_layers == 0:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False

    elif num_train_layers == 12:
        # Train all layers of the BERT model
        bert_model.trainable = True

    else:
        # Restrict training to the num_train_layers outer transformer layers
        retrain_layers = []

        for retrain_layer_number in range(num_train_layers):

            layer_code = '_' + str(11 - retrain_layer_number)
            retrain_layers.append(layer_code)


        #print('retrain layers: ', retrain_layers)

        for w in bert_model.weights:
            # Freeze every weight whose name does not belong to one of the
            # `num_train_layers` outermost encoder blocks (e.g. '_11', '_10').
            if not any(x in w.name for x in retrain_layers):
                #print('freezing: ', w)
                w._trainable = False

    # Use the max_sequence_length argument (rather than the module-level
    # constant) so the parameter actually takes effect.
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask')
                                    
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Alternative sentence representations (computed for reference; only
    # `cnn_token`, the full token sequence, feeds the CNN head below).
    pooler_token = bert_out[1]                               # pooler output
    cls_token = bert_out[0][:, 0, :]                         # [CLS] embedding
    bert_out_avg = tf.math.reduce_mean(bert_out[0], axis=1)  # mean-pooled tokens
    cnn_token = bert_out[0]                                  # full token sequence

    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(cnn_token)
        conv_layer = tf.keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)

    conv_output = tf.keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)

    # classification layer
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(conv_output)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    hidden = tf.keras.layers.Dense(hidden2_size, activation='relu', name='hidden_layer2')(hidden)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    
    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 # LOSS FUNCTION
                                 loss=tf.keras.losses.BinaryFocalCrossentropy(
                                   gamma=2.0, from_logits=False, apply_class_balancing=True, label_smoothing=label_smoothing
                                 ),
                                 # METRIC FUNCTIONS
                                 metrics=['accuracy']
                                 )
    return classification_model
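
# An illustrative sketch (not part of the original pipeline): building a
# variant of the model with the two outermost encoder blocks unfrozen.
# `bert_model` is assumed to be the bert-base-uncased checkpoint loaded
# further below; a smaller learning rate is typical when fine-tuning BERT.
#
#   partially_trainable_model = create_bert_classification_model(
#       bert_model=bert_model,
#       num_train_layers=2,    # unfreezes encoder blocks `_10` and `_11`
#       learning_rate=3e-5,
#   )
#   partially_trainable_model.summary()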


def f_one_or_zero(x):
    """Threshold a sigmoid probability at 0.5 to get a hard 0/1 label."""
    return 1 if x > 0.5 else 0

def run_inference_model(conversations):
    """Tokenize raw conversation strings and return model probabilities."""
    # Tokenize conversations with the BERT tokenizer
    tokenized_input = tokenizer(conversations,
                                max_length=MAX_SEQUENCE_LENGTH,
                                truncation=True,
                                padding='max_length',
                                return_tensors='tf')
    bert_inputs = [tokenized_input.input_ids,
                   tokenized_input.token_type_ids,
                   tokenized_input.attention_mask]

    # Apply the model to the tokenized batch
    y_pred = inference_model.predict(bert_inputs)
    #prediction = f_one_or_zero(y_pred)
    return y_pred
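
# Example usage (an illustrative sketch; `run_inference_model` relies on the
# module-level `tokenizer` and `inference_model` initialized in the steps
# below, so it can only be called after those steps have run):
#
#   probs = run_inference_model(["I can't take this anymore."])
#   labels = [f_one_or_zero(p) for p in probs[:, 0]]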



model_checkpoint = "bert-base-uncased"
# Step 1: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# Step 2: Load the pretrained BERT model
bert_model = TFBertModel.from_pretrained(model_checkpoint)
# Step 3: Create the custom classification model on top of the pretrained BERT
inference_model = create_bert_classification_model(bert_model=bert_model)
# Step 4: Load the saved fine-tuned weights into the inference model
save_path = 'bert_cnn_ensemble_resample_uncased_mdl.h5'
inference_model.load_weights(save_path)
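
# A minimal smoke test (an illustrative addition, assuming the weight file
# above is present and compatible with the architecture built here):
if __name__ == "__main__":
    sample_conversations = ["hey, how was your day?"]
    probabilities = run_inference_model(sample_conversations)
    # Threshold the sigmoid probabilities into hard 0/1 labels
    labels = [f_one_or_zero(p) for p in probabilities[:, 0]]
    print(probabilities, labels)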