Casio991ms committed on
Commit
94f0dd7
1 Parent(s): 23442bf

added inference file

Files changed (1)
app.py  +770 -4
app.py CHANGED
@@ -1,7 +1,773 @@
  import gradio as gr

- def greet(input):
-     return "Hello " + input + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ # -*- coding: utf-8 -*-
+ """MWP_Solver_-_Transformer_with_Multi-head_Attention_Block (1).ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1Tn_j0k8EJ7ny_h7Pjm0stJhNMG4si_y_
+ """
+
+ ! pip install -q gradio
+
+ import pandas as pd
+ import re
+ import os
+ import time
+ import random
+ import numpy as np
+
+ import tensorflow as tf
+ import matplotlib.pyplot as plt
+ import matplotlib.ticker as ticker
+ from sklearn.model_selection import train_test_split
+
+ import pickle
+
+ import spacy
+
+ from nltk.translate.bleu_score import corpus_bleu
+
  import gradio as gr

+ ! wget -nc "https://docs.google.com/uc?export=download&id=1Y8Ee4lUs30BAfFtL3d3VjwChmbDG7O6H" -O data_final.pkl
+ ! wget -nc --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1gAQVaxg_2mNcr8qwx0J2UwpkvoJgLu6a' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1gAQVaxg_2mNcr8qwx0J2UwpkvoJgLu6a" -O checkpoints.zip && rm -rf /tmp/cookies.txt
+
+ ! unzip -n "/content/checkpoints.zip" -d "./"
+
+ nlp = spacy.load("en_core_web_sm")
+
+ tf.__version__
+
+ with open('data_final.pkl', 'rb') as f:
+     df = pickle.load(f)
+
+ df.shape
+
+ df.head()
+
+ input_exps = list(df['Question'].values)
+
+ def convert_eqn(eqn):
+     '''
+     Add a space between every character in the equation string.
+     Eg: 'x = 23 + 88' becomes 'x = 2 3 + 8 8'
+     '''
+     elements = list(eqn)
+     return ' '.join(elements)
+
+ target_exps = list(df['Equation'].apply(lambda x: convert_eqn(x)).values)
+
+ # Input: Word problem
+ input_exps[:5]
+
+ # Target: Equation
+ target_exps[:5]
+
+ len(pd.Series(input_exps)), len(pd.Series(input_exps).unique())
+
+ len(pd.Series(target_exps)), len(pd.Series(target_exps).unique())
+
+ def preprocess_input(sentence):
+     '''
+     For the word problem, convert everything to lowercase, add spaces around all
+     punctuations and digits, and remove any extra spaces.
+     '''
+     sentence = sentence.lower().strip()
+     sentence = re.sub(r"([?.!,’])", r" \1 ", sentence)
+     sentence = re.sub(r"([0-9])", r" \1 ", sentence)
+     sentence = re.sub(r'[" "]+', " ", sentence)
+     sentence = sentence.rstrip().strip()
+     return sentence
+
+ def preprocess_target(sentence):
+     '''
+     For the equation, convert it to lowercase and remove extra spaces
+     '''
+     sentence = sentence.lower().strip()
+     return sentence
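+ # Illustrative example (not in the original notebook) of what preprocess_input
+ # produces with the regex rules above:
+ #   preprocess_input('Nafis had 31 raspberries.')  ->  'nafis had 3 1 raspberries .'
+ # i.e. each digit becomes its own token and punctuation is spaced out.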
+
+ preprocessed_input_exps = list(map(preprocess_input, input_exps))
+ preprocessed_target_exps = list(map(preprocess_target, target_exps))
+
+ preprocessed_input_exps[:5]
+
+ preprocessed_target_exps[:5]
+
+ def tokenize(lang):
+     '''
+     Tokenize the given list of strings and return the tokenized output
+     along with the fitted tokenizer.
+     '''
+     lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
+     lang_tokenizer.fit_on_texts(lang)
+     tensor = lang_tokenizer.texts_to_sequences(lang)
+     return tensor, lang_tokenizer
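+ # Sketch of the expected behaviour (illustrative, not from the original file):
+ # the Keras Tokenizer assigns integer ids from 1 upwards by frequency (0 is
+ # reserved for padding), so tokenize(['x = 2 3', 'y = 2 3']) would return
+ # something like ([[4, 1, 2, 3], [5, 1, 2, 3]], <fitted tokenizer>).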
+
+ input_tensor, inp_lang_tokenizer = tokenize(preprocessed_input_exps)
+
+ len(inp_lang_tokenizer.word_index)
+
+ target_tensor, targ_lang_tokenizer = tokenize(preprocessed_target_exps)
+
+ old_len = len(targ_lang_tokenizer.word_index)
+
+ def append_start_end(x, last_int):
+     '''
+     Add integers for start and end tokens for input/target exps
+     '''
+     l = []
+     l.append(last_int + 1)
+     l.extend(x)
+     l.append(last_int + 2)
+     return l
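+ # Quick illustration (assumed values): with a vocabulary of 100 tokens,
+ # append_start_end([4, 9, 2], 100) returns [101, 4, 9, 2, 102], i.e. ids 101
+ # and 102 act as the <start> and <end> markers for that sequence.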
+
+ input_tensor_list = [append_start_end(i, len(inp_lang_tokenizer.word_index)) for i in input_tensor]
+ target_tensor_list = [append_start_end(i, len(targ_lang_tokenizer.word_index)) for i in target_tensor]
+
+ # Pad all sequences such that they are of equal length
+ input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor_list, padding='post')
+ target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor_list, padding='post')
+
+ input_tensor
+
+ target_tensor
+
+ # Here we increase the vocabulary size of the target by adding a few extra
+ # vocabulary words (which will not actually be used); otherwise the small
+ # vocab size causes issues downstream in the network.
+ keys = [str(i) for i in range(10, 51)]
+ for i, k in enumerate(keys):
+     targ_lang_tokenizer.word_index[k] = len(targ_lang_tokenizer.word_index) + i + 4
+
+ len(targ_lang_tokenizer.word_index)
+
+ # Creating training and validation sets
+ input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
+                                                                                                  target_tensor,
+                                                                                                  test_size=0.05,
+                                                                                                  random_state=42)
+
+ len(input_tensor_train)
+
+ len(input_tensor_val)
+
+ BUFFER_SIZE = len(input_tensor_train)
+ BATCH_SIZE = 64
+ steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
+ dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
+ dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
+ num_layers = 4
+ d_model = 128
+ dff = 512
+ num_heads = 8
+ input_vocab_size = len(inp_lang_tokenizer.word_index) + 3
+ target_vocab_size = len(targ_lang_tokenizer.word_index) + 3
+ dropout_rate = 0.0
+
+ example_input_batch, example_target_batch = next(iter(dataset))
+ example_input_batch.shape, example_target_batch.shape
+
+ # We provide positional information about the data to the model,
+ # otherwise each sentence would be treated as a Bag of Words
+ def get_angles(pos, i, d_model):
+     angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+     return pos * angle_rates
+
+ def positional_encoding(position, d_model):
+     angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                             np.arange(d_model)[np.newaxis, :],
+                             d_model)
+
+     # apply sin to even indices in the array; 2i
+     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+     # apply cos to odd indices in the array; 2i+1
+     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+     pos_encoding = angle_rads[np.newaxis, ...]
+
+     return tf.cast(pos_encoding, dtype=tf.float32)
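+ # Shape sketch (illustrative, not from the original notebook): for example,
+ # positional_encoding(50, 128) returns a tensor of shape (1, 50, 128), which
+ # can be broadcast-added to a batch of embeddings of shape
+ # (batch, seq_len, 128) for any seq_len <= 50.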
+
+ # mask all elements that are not words (i.e. padding) so they are not treated as input
+ def create_padding_mask(seq):
+     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
+
+     # add extra dimensions to add the padding
+     # to the attention logits.
+     return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
+
+ def create_look_ahead_mask(size):
+     mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+     return mask
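+ # Illustrative values (not from the original file): create_look_ahead_mask(3)
+ # evaluates to [[0., 1., 1.], [0., 0., 1.], [0., 0., 0.]]; a 1 marks a masked
+ # position, so token i may only attend to tokens at positions <= i.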
+
+ dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+ def scaled_dot_product_attention(q, k, v, mask):
+     matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
+
+     # scale matmul_qk
+     dk = tf.cast(tf.shape(k)[-1], tf.float32)
+     scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+     # add the mask to the scaled tensor.
+     if mask is not None:
+         scaled_attention_logits += (mask * -1e9)
+
+     # softmax is normalized on the last axis (seq_len_k) so that the scores
+     # add up to 1.
+     attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+
+     output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
+
+     return output, attention_weights
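+ # Shape sketch under assumed sizes (illustration only): with q of shape
+ # (1, 8, 5, 16), k and v of shape (1, 8, 7, 16) and mask=None, output has
+ # shape (1, 8, 5, 16) and attention_weights has shape (1, 8, 5, 7), with each
+ # row of attention_weights summing to 1 after the softmax.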
+
+ class MultiHeadAttention(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadAttention, self).__init__()
+         self.num_heads = num_heads
+         self.d_model = d_model
+
+         assert d_model % self.num_heads == 0
+
+         self.depth = d_model // self.num_heads
+
+         self.wq = tf.keras.layers.Dense(d_model)
+         self.wk = tf.keras.layers.Dense(d_model)
+         self.wv = tf.keras.layers.Dense(d_model)
+
+         self.dense = tf.keras.layers.Dense(d_model)
+
+     def split_heads(self, x, batch_size):
+         """Split the last dimension into (num_heads, depth).
+         Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
+         """
+         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+         return tf.transpose(x, perm=[0, 2, 1, 3])
+
+     def call(self, v, k, q, mask):
+         batch_size = tf.shape(q)[0]
+
+         q = self.wq(q)  # (batch_size, seq_len, d_model)
+         k = self.wk(k)  # (batch_size, seq_len, d_model)
+         v = self.wv(v)  # (batch_size, seq_len, d_model)
+
+         q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
+         k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
+         v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+
+         # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
+         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
+         scaled_attention, attention_weights = scaled_dot_product_attention(
+             q, k, v, mask)
+
+         scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
+
+         concat_attention = tf.reshape(scaled_attention,
+                                       (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+
+         output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
+
+         return output, attention_weights
+
+ def point_wise_feed_forward_network(d_model, dff):
+     return tf.keras.Sequential([
+         tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+         tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
+     ])
+
+ class EncoderLayer(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads, dff, rate=0.1):
+         super(EncoderLayer, self).__init__()
+
+         self.mha = MultiHeadAttention(d_model, num_heads)
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+
+         # normalize data per feature instead of per batch
+         self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+         self.dropout1 = tf.keras.layers.Dropout(rate)
+         self.dropout2 = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, training, mask):
+         # Multi-head attention layer
+         attn_output, _ = self.mha(x, x, x, mask)
+         attn_output = self.dropout1(attn_output, training=training)
+         # add a residual connection to avoid the vanishing gradient problem
+         out1 = self.layernorm1(x + attn_output)
+
+         # Feedforward layer
+         ffn_output = self.ffn(out1)
+         ffn_output = self.dropout2(ffn_output, training=training)
+         # add a residual connection to avoid the vanishing gradient problem
+         out2 = self.layernorm2(out1 + ffn_output)
+         return out2
+
+ class Encoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                  maximum_position_encoding, rate=0.1):
+         super(Encoder, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
+         self.pos_encoding = positional_encoding(maximum_position_encoding,
+                                                 self.d_model)
+
+         # Create encoder layers (count: num_layers)
+         self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
+                            for _ in range(num_layers)]
+
+         self.dropout = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, training, mask):
+
+         seq_len = tf.shape(x)[1]
+
+         # adding embedding and position encoding.
+         x = self.embedding(x)
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         x += self.pos_encoding[:, :seq_len, :]
+
+         x = self.dropout(x, training=training)
+
+         for i in range(self.num_layers):
+             x = self.enc_layers[i](x, training, mask)
+
+         return x
+
+ class DecoderLayer(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads, dff, rate=0.1):
+         super(DecoderLayer, self).__init__()
+
+         self.mha1 = MultiHeadAttention(d_model, num_heads)
+         self.mha2 = MultiHeadAttention(d_model, num_heads)
+
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+
+         self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+         self.dropout1 = tf.keras.layers.Dropout(rate)
+         self.dropout2 = tf.keras.layers.Dropout(rate)
+         self.dropout3 = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, enc_output, training,
+              look_ahead_mask, padding_mask):
+
+         # Masked multi-head attention layer (padding + look-ahead)
+         attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
+         attn1 = self.dropout1(attn1, training=training)
+         # again add a residual connection
+         out1 = self.layernorm1(attn1 + x)
+
+         # Masked multi-head attention layer (only padding),
+         # with the encoder output as Key and Value, and the previous layer's output as Query
+         attn2, attn_weights_block2 = self.mha2(
+             enc_output, enc_output, out1, padding_mask)
+         attn2 = self.dropout2(attn2, training=training)
+         # again add a residual connection
+         out2 = self.layernorm2(attn2 + out1)
+
+         # Feedforward layer
+         ffn_output = self.ffn(out2)
+         ffn_output = self.dropout3(ffn_output, training=training)
+         # again add a residual connection
+         out3 = self.layernorm3(ffn_output + out2)
+         return out3, attn_weights_block1, attn_weights_block2
+
+ class Decoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
+                  maximum_position_encoding, rate=0.1):
+         super(Decoder, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
+         self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
+
+         # Create decoder layers (count: num_layers)
+         self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
+                            for _ in range(num_layers)]
+         self.dropout = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, enc_output, training,
+              look_ahead_mask, padding_mask):
+
+         seq_len = tf.shape(x)[1]
+         attention_weights = {}
+
+         x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
+
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+
+         x += self.pos_encoding[:, :seq_len, :]
+
+         x = self.dropout(x, training=training)
+
+         for i in range(self.num_layers):
+             x, block1, block2 = self.dec_layers[i](x, enc_output, training,
+                                                    look_ahead_mask, padding_mask)
+
+             # store attention weights; they can be used for visualization while translating
+             attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
+             attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
+
+         return x, attention_weights
+
+ class Transformer(tf.keras.Model):
+     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                  target_vocab_size, pe_input, pe_target, rate=0.1):
+         super(Transformer, self).__init__()
+
+         self.encoder = Encoder(num_layers, d_model, num_heads, dff,
+                                input_vocab_size, pe_input, rate)
+
+         self.decoder = Decoder(num_layers, d_model, num_heads, dff,
+                                target_vocab_size, pe_target, rate)
+
+         self.final_layer = tf.keras.layers.Dense(target_vocab_size)
+
+     def call(self, inp, tar, training, enc_padding_mask,
+              look_ahead_mask, dec_padding_mask):
+
+         # Pass the input to the encoder
+         enc_output = self.encoder(inp, training, enc_padding_mask)
+
+         # Pass the encoder output to the decoder
+         dec_output, attention_weights = self.decoder(
+             tar, enc_output, training, look_ahead_mask, dec_padding_mask)
+
+         # Pass the decoder output to the last linear layer
+         final_output = self.final_layer(dec_output)
+
+         return final_output, attention_weights
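+ # Shape note (illustrative): for inp of shape (batch, inp_seq_len) and a
+ # target prefix tar of shape (batch, tar_seq_len), final_output has shape
+ # (batch, tar_seq_len, target_vocab_size): one vector of logits over the
+ # equation vocabulary per target position.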
+
+ class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+     def __init__(self, d_model, warmup_steps=4000):
+         super(CustomSchedule, self).__init__()
+
+         self.d_model = d_model
+         self.d_model = tf.cast(self.d_model, tf.float32)
+
+         self.warmup_steps = warmup_steps
+
+     def __call__(self, step):
+         arg1 = tf.math.rsqrt(step)
+         arg2 = step * (self.warmup_steps ** -1.5)
+
+         return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
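+ # This is the learning-rate schedule from "Attention Is All You Need":
+ #   lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
+ # i.e. linear warmup for the first warmup_steps steps, then inverse-square-root
+ # decay, with the peak learning rate reached at step == warmup_steps.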
+
+ learning_rate = CustomSchedule(d_model)
+
+ # Adam optimizer with a custom learning rate
+ optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
+                                      epsilon=1e-9)
+
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+     from_logits=True, reduction='none')
+
+ def loss_function(real, pred):
+     # Apply a mask to paddings (0)
+     mask = tf.math.logical_not(tf.math.equal(real, 0))
+     loss_ = loss_object(real, pred)
+
+     mask = tf.cast(mask, dtype=loss_.dtype)
+     loss_ *= mask
+
+     return tf.reduce_mean(loss_)
+
+ train_loss = tf.keras.metrics.Mean(name='train_loss')
+ train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+     name='train_accuracy')
+
+ transformer = Transformer(num_layers, d_model, num_heads, dff,
+                           input_vocab_size, target_vocab_size,
+                           pe_input=input_vocab_size,
+                           pe_target=target_vocab_size,
+                           rate=dropout_rate)
+
+ def create_masks(inp, tar):
+     # Encoder padding mask
+     enc_padding_mask = create_padding_mask(inp)
+
+     # Decoder padding mask
+     dec_padding_mask = create_padding_mask(inp)
+
+     # Look-ahead mask (for hiding the rest of the sequence in the 1st decoder attention layer)
+     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
+     dec_target_padding_mask = create_padding_mask(tar)
+     combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
+
+     return enc_padding_mask, combined_mask, dec_padding_mask
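+ # Shape sketch (illustrative): for inp of shape (batch, inp_len) and tar of
+ # shape (batch, tar_len), enc_padding_mask and dec_padding_mask have shape
+ # (batch, 1, 1, inp_len), while combined_mask broadcasts to
+ # (batch, 1, tar_len, tar_len) and hides both padding and future positions.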
+
+ # drive_root = '/gdrive/My Drive/'
+ drive_root = './'
+
+ checkpoint_dir = os.path.join(drive_root, "checkpoints")
+ checkpoint_dir = os.path.join(checkpoint_dir, "training_checkpoints/moops_transfomer")
+
+ print("Checkpoints directory is", checkpoint_dir)
+ if os.path.exists(checkpoint_dir):
+     print("Checkpoints folder already exists")
+ else:
+     print("Creating a checkpoints directory")
+     os.makedirs(checkpoint_dir)
+
+
+ checkpoint = tf.train.Checkpoint(transformer=transformer,
+                                  optimizer=optimizer)
+
+ ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
+
+ latest = ckpt_manager.latest_checkpoint
+ latest
+
+ if latest:
+     epoch_num = int(latest.split('/')[-1].split('-')[-1])
+     checkpoint.restore(latest)
+     print('Latest checkpoint restored!!')
+ else:
+     epoch_num = 0
+
+ epoch_num
+
+ # EPOCHS = 17
+
+ # def train_step(inp, tar):
+ #     tar_inp = tar[:, :-1]
+ #     tar_real = tar[:, 1:]
+
+ #     enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
+
+ #     with tf.GradientTape() as tape:
+ #         predictions, _ = transformer(inp, tar_inp,
+ #                                      True,
+ #                                      enc_padding_mask,
+ #                                      combined_mask,
+ #                                      dec_padding_mask)
+ #         loss = loss_function(tar_real, predictions)
+
+ #     gradients = tape.gradient(loss, transformer.trainable_variables)
+ #     optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
+
+ #     train_loss(loss)
+ #     train_accuracy(tar_real, predictions)
+
+ # for epoch in range(epoch_num, EPOCHS):
+ #     start = time.time()
+
+ #     train_loss.reset_states()
+ #     train_accuracy.reset_states()
+
+ #     # inp -> question, tar -> equation
+ #     for (batch, (inp, tar)) in enumerate(dataset):
+ #         train_step(inp, tar)
+
+ #         if batch % 50 == 0:
+ #             print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
+ #                 epoch + 1, batch, train_loss.result(), train_accuracy.result()))
+
+ #     ckpt_save_path = ckpt_manager.save()
+ #     print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
+ #                                                         ckpt_save_path))
+
+ #     print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
+ #                                                         train_loss.result(),
+ #                                                         train_accuracy.result()))
+
+ #     print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
+
+ def evaluate(inp_sentence):
+     start_token = [len(inp_lang_tokenizer.word_index) + 1]
+     end_token = [len(inp_lang_tokenizer.word_index) + 2]
+
+     # inp_sentence is the word problem, hence adding the start and end tokens;
+     # out-of-vocabulary words fall back to an arbitrary in-vocabulary token ('john')
+     inp_sentence = start_token + [inp_lang_tokenizer.word_index.get(i, inp_lang_tokenizer.word_index['john']) for i in preprocess_input(inp_sentence).split(' ')] + end_token
+     encoder_input = tf.expand_dims(inp_sentence, 0)
+
+     # start with the equation's start token
+     decoder_input = [old_len + 1]
+     output = tf.expand_dims(decoder_input, 0)
+
+     for i in range(MAX_LENGTH):
+         enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
+             encoder_input, output)
+
+         predictions, attention_weights = transformer(encoder_input,
+                                                      output,
+                                                      False,
+                                                      enc_padding_mask,
+                                                      combined_mask,
+                                                      dec_padding_mask)
+
+         # select the last word from the seq_len dimension
+         predictions = predictions[:, -1:, :]
+         predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
+
+         # return the result if the predicted_id is equal to the end token
+         if predicted_id == old_len + 2:
+             return tf.squeeze(output, axis=0), attention_weights
+
+         # concatenate the predicted_id to the output, which is given to the decoder
+         # as its input.
+         output = tf.concat([output, predicted_id], axis=-1)
+     return tf.squeeze(output, axis=0), attention_weights
+
+ # def plot_attention_weights(attention, sentence, result, layer):
+ #     fig = plt.figure(figsize=(16, 8))
+
+ #     sentence = preprocess_input(sentence)
+
+ #     attention = tf.squeeze(attention[layer], axis=0)
+
+ #     for head in range(attention.shape[0]):
+ #         ax = fig.add_subplot(2, 4, head + 1)
+
+ #         # plot the attention weights
+ #         ax.matshow(attention[head][:-1, :], cmap='viridis')
+
+ #         fontdict = {'fontsize': 10}
+
+ #         ax.set_xticks(range(len(sentence.split(' ')) + 2))
+ #         ax.set_yticks(range(len([targ_lang_tokenizer.index_word[i] for i in list(result.numpy())
+ #                                  if i < len(targ_lang_tokenizer.word_index) and i not in [0, old_len+1, old_len+2]]) + 3))
+
+ #         ax.set_ylim(len([targ_lang_tokenizer.index_word[i] for i in list(result.numpy())
+ #                          if i < len(targ_lang_tokenizer.word_index) and i not in [0, old_len+1, old_len+2]]), -0.5)
+
+ #         ax.set_xticklabels(
+ #             ['<start>'] + sentence.split(' ') + ['<end>'],
+ #             fontdict=fontdict, rotation=90)
+
+ #         ax.set_yticklabels([targ_lang_tokenizer.index_word[i] for i in list(result.numpy())
+ #                             if i < len(targ_lang_tokenizer.word_index) and i not in [0, old_len+1, old_len+2]],
+ #                            fontdict=fontdict)
+
+ #         ax.set_xlabel('Head {}'.format(head + 1))
+
+ #     plt.tight_layout()
+ #     plt.show()
+
+ MAX_LENGTH = 40
+
+ def translate(sentence, plot=''):
+     result, attention_weights = evaluate(sentence)
+
+     # use the result tokens to convert the prediction into a list of characters
+     # (not including padding (0) and the hard-coded equation start/end token ids 46 and 47)
+     predicted_sentence = [targ_lang_tokenizer.index_word[i] for i in list(result.numpy()) if (i < len(targ_lang_tokenizer.word_index) and i not in [0, 46, 47])]
+
+     # optionally plot attention (requires the plot_attention_weights helper above)
+     if plot:
+         plot_attention_weights(attention_weights, sentence, result, plot)
+
+     # print('Input: {}'.format(sentence))
+     return ''.join(predicted_sentence)
+
+ # def evaluate_results(inp_sentence):
+ #     start_token = [len(inp_lang_tokenizer.word_index) + 1]
+ #     end_token = [len(inp_lang_tokenizer.word_index) + 2]
+
+ #     # inp_sentence is the word problem, hence adding the start and end tokens
+ #     inp_sentence = start_token + list(inp_sentence.numpy()[0]) + end_token
+
+ #     encoder_input = tf.expand_dims(inp_sentence, 0)
+
+ #     decoder_input = [old_len + 1]
+ #     output = tf.expand_dims(decoder_input, 0)
+
+ #     for i in range(MAX_LENGTH):
+ #         enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
+ #             encoder_input, output)
+
+ #         # predictions.shape == (batch_size, seq_len, vocab_size)
+ #         predictions, attention_weights = transformer(encoder_input,
+ #                                                      output,
+ #                                                      False,
+ #                                                      enc_padding_mask,
+ #                                                      combined_mask,
+ #                                                      dec_padding_mask)
+
+ #         # select the last word from the seq_len dimension
+ #         predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
+
+ #         predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
+
+ #         # return the result if the predicted_id is equal to the end token
+ #         if predicted_id == old_len + 2:
+ #             return tf.squeeze(output, axis=0), attention_weights
+
+ #         # concatenate the predicted_id to the output, which is given to the decoder
+ #         # as its input.
+ #         output = tf.concat([output, predicted_id], axis=-1)
+
+ #     return tf.squeeze(output, axis=0), attention_weights
+
+ # dataset_val = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)
+ # dataset_val = dataset_val.batch(1, drop_remainder=True)
+
+ # y_true = []
+ # y_pred = []
+ # acc_cnt = 0
+
+ # a = 0
+ # for (inp_val_batch, target_val_batch) in iter(dataset_val):
+ #     a += 1
+ #     if a % 100 == 0:
+ #         print(a)
+ #         print("Accuracy count: ", acc_cnt)
+ #         print('------------------')
+ #     target_sentence = ''
+ #     for i in target_val_batch.numpy()[0]:
+ #         if i not in [0, old_len+1, old_len+2]:
+ #             target_sentence += (targ_lang_tokenizer.index_word[i] + ' ')
+
+ #     y_true.append([target_sentence.split(' ')[:-1]])
+
+ #     result, _ = evaluate_results(inp_val_batch)
+ #     predicted_sentence = [targ_lang_tokenizer.index_word[i] for i in list(result.numpy()) if (i < len(targ_lang_tokenizer.word_index) and i not in [0, old_len+1, old_len+2])]
+ #     y_pred.append(predicted_sentence)
+
+ #     if target_sentence.split(' ')[:-1] == predicted_sentence:
+ #         acc_cnt += 1
+
+ # len(y_true), len(y_pred)
+
+ # print('Corpus BLEU score of the model: ', corpus_bleu(y_true, y_pred))
+
+ # print('Accuracy of the model: ', acc_cnt/len(input_tensor_val))
+
+ check_str = ' '.join([inp_lang_tokenizer.index_word[i] for i in input_tensor_val[242] if i not in [0,
+                                                                                                    len(inp_lang_tokenizer.word_index) + 1,
+                                                                                                    len(inp_lang_tokenizer.word_index) + 2]])
+
+ check_str
+
+ translate(check_str)
+
+ # 'victor had some car . john took 3 0 from him . now victor has 6 8 car . how many car victor had originally ?'
+ translate('Nafis had 31 raspberry . He slice each raspberry into 19 slices . How many raspberry slices did Denise make?')

+ interface = gr.Interface(
+     fn=translate,
+     inputs='text',
+     outputs='text',
+     examples=[
+         ['Denise had 31 raspberry. He slice each raspberry into 19 slices. How many raspberry slices did Denise make?'],
+         ['Cynthia snap up 14 bags of blueberry. how many blueberry in each bag? If total 94 blueberry Cynthia snap up.'],
+         ['Donald had some Watch. Jonathan gave him 7 more. Now Donald has 18 Watch. How many Watch did Donald have initially?']
+     ],
+     theme='grass',
+     title='Mathbot',
+     description='Enter a simple math word problem and our AI will try to predict an expression to solve it. Mathbot occasionally makes mistakes. Feel free to press "flag" if you encounter such a scenario.',
+ )
+ interface.launch()