nnmthuw committed on
Commit
9c2ca71
·
1 Parent(s): 209515a

commit all

Browse files
app.py CHANGED
@@ -5,17 +5,15 @@ import pickle
5
  import torch
6
  import torch.nn as nn
7
  from torchtext.transforms import PadTransform
8
- from torch.utils.data import Dataset, DataLoader
9
  from torch.nn import functional as F
10
  from tqdm import tqdm
11
- from underthesea import word_tokenize, text_normalize
12
 
13
  # Build Vocabulary
14
- # device = 'cuda' if torch.cuda.is_available() else 'cpu'
15
  device = "cpu"
16
 
17
  # Build Vocabulary
18
- MAX_LENGTH = 15
19
  class Vocabulary:
20
  """The Vocabulary class is used to record words, which are used to convert
21
  text to numbers and vice versa.
@@ -75,14 +73,22 @@ class Vocabulary:
75
  def preprocessing_sent(self, sent, lang="en"):
76
  """Preprocessing a sentence (depend on language english or vietnamese)
77
  @param sent (str)
78
- @param lang (str)
79
  """
80
 
81
  # Lowercase sentence and remove space at beginning and ending
82
  sent = sent.lower().strip()
83
 
 
 
 
 
 
 
84
  # Remove unnecessary space
85
  sent = re.sub("(?<=\w)\.", " .", sent)
 
 
86
  sent = re.sub("(?<=\w),", " ,", sent)
87
  sent = re.sub("(?<=\w)\?", " ?", sent)
88
  sent = re.sub("(?<=\w)\!", " !", sent)
@@ -93,10 +99,12 @@ class Vocabulary:
93
  sent = re.sub("what's", "what is", sent)
94
  sent = re.sub("who's", "who is", sent)
95
  sent = re.sub("which's", "which is", sent)
 
 
 
 
96
 
97
  sent = re.sub("i'm", "i am", sent)
98
- # Dont know to preprocess with possessive case
99
- sent = re.sub("it's", "it is", sent)
100
  sent = re.sub("'re ", " are ", sent)
101
  sent = re.sub("'ve ", " have ", sent)
102
  sent = re.sub("'ll ", " will ", sent)
@@ -115,7 +123,8 @@ class Vocabulary:
115
  else:
116
  # Package underthesea.text_normalize support to normalize vietnamese
117
  sent = text_normalize(sent)
118
-
 
119
  return sent.strip()
120
 
121
  def tokenize_corpus(self, corpus, disable=False):
@@ -165,40 +174,51 @@ class Vocabulary:
165
  return corpus
166
 
167
 
168
- with open("vocab_source.pkl", "rb") as file:
169
  VOCAB_SOURCE = pickle.load(file)
170
- with open("vocab_target.pkl", "rb") as file:
171
  VOCAB_TARGET = pickle.load(file)
172
 
173
  input_embedding = torch.zeros((len(VOCAB_SOURCE), 100))
174
  output_embedding = torch.zeros((len(VOCAB_TARGET), 100))
175
 
176
 
177
- def create_input_emb_layer():
178
- num_embeddings, embedding_dim = input_embedding.size()
 
 
 
 
179
  emb_layer = nn.Embedding(num_embeddings, embedding_dim)
 
180
  emb_layer.weight.requires_grad = False
 
181
  return emb_layer, embedding_dim
182
 
183
- def create_output_emb_layer():
184
- num_embeddings, embedding_dim = output_embedding.size()
 
 
 
 
185
  emb_layer = nn.Embedding(num_embeddings, embedding_dim)
 
186
  emb_layer.weight.requires_grad = False
 
187
  return emb_layer, embedding_dim
188
 
189
 
190
- class EncoderRNN(nn.Module):
191
  def __init__(self, input_dim, hidden_dim, dropout = 0.1):
192
  """ Encoder RNN
193
  @param input_dim (int): size of vocab_souce
194
  @param hidden_dim (int)
195
  @param dropout (float): dropout ratio of layer drop out
196
  """
197
- super(EncoderRNN, self).__init__()
198
  self.hidden_dim = hidden_dim
199
- #self.embedding = nn.Embedding(input_dim, hidden_dim)
200
- # Đổi thành input embedding
201
- self.embedding, self.embedding_dim = create_input_emb_layer()
202
  self.gru = nn.GRU(self.embedding_dim, hidden_dim, batch_first=True)
203
  self.dropout = nn.Dropout(dropout)
204
 
@@ -207,7 +227,6 @@ class EncoderRNN(nn.Module):
207
  output, hidden = self.gru(embedded)
208
  return output, hidden
209
 
210
-
211
  class BahdanauAttention(nn.Module):
212
  def __init__(self, hidden_size):
213
  """ Bahdanau Attention
@@ -227,20 +246,21 @@ class BahdanauAttention(nn.Module):
227
 
228
  return context, weights
229
 
230
- class AttnDecoderRNN(nn.Module):
231
- def __init__(self, hidden_size, output_size, dropout_p=0.1):
232
  """ Decoder RNN using Attention
233
  @param hidden_size (int)
234
  @param output_size (int): size of vocab_target
235
  @param dropout (float): dropout ratio of layer drop out
236
  """
237
- super(AttnDecoderRNN, self).__init__()
238
- self.embedding, self.embedding_dim = create_output_emb_layer()
 
239
  self.fc = nn.Linear(self.embedding_dim, hidden_size)
240
  self.attention = BahdanauAttention(hidden_size)
241
  self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
242
  self.out = nn.Linear(hidden_size, output_size)
243
- self.dropout = nn.Dropout(dropout_p)
244
 
245
  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
246
  batch_size = encoder_outputs.size(0)
@@ -293,13 +313,13 @@ OUTPUT_DIM = len(VOCAB_TARGET)
293
  HID_DIM = 512
294
 
295
  # Load our Model Translation
296
- ENCODER = EncoderRNN(INPUT_DIM, HID_DIM)
297
- ENCODER.load_state_dict(torch.load("hid512_encoder_att_epoch_20.pt"))
298
- DECODER = AttnDecoderRNN(HID_DIM, OUTPUT_DIM)
299
- DECODER.load_state_dict(torch.load("hid512_decoder_att_epoch_20.pt"))
300
 
301
 
302
- def evaluate(encoder, decoder, sentence, vocab_source, vocab_target, disable=False):
303
  encoder.eval()
304
  decoder.eval()
305
  with torch.no_grad():
@@ -326,12 +346,14 @@ def evaluate(encoder, decoder, sentence, vocab_source, vocab_target, disable=Fal
326
  return decoded_words, decoder_attn
327
 
328
 
329
- def my_translate_model(sentence):
330
- output_words, _ = evaluate(
331
- ENCODER, DECODER, sentence, VOCAB_SOURCE, VOCAB_TARGET, disable=True
332
- )
 
 
333
 
334
- return " ".join(output_words[1:-1]).capitalize()
335
 
336
 
337
  def envit5_translation(text):
@@ -339,44 +361,44 @@ def envit5_translation(text):
339
  text,
340
  max_length=512,
341
  early_stopping=True,
342
- )[0][
343
- "translation_text"
344
- ][3:]
345
  return res
346
 
347
 
348
  def translation(text):
349
- output1 = my_translate_model(text)
 
 
 
350
  output2 = envit5_translation(text)
351
- #output3 = finetune_BERT(text)
352
 
353
  return (output1, output2)
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
- examples = [["Input: Hello guys"],
357
- ["Output: Xin chào các bạn"]]
358
-
359
- demo = gr.Interface(
360
- theme = gr.themes.Base(),
361
- fn=translation,
362
- title="Co Gai Mo Duong",
363
- description="""
364
- ## Machine Translation: English to Vietnamese
365
- """,
366
- examples=examples,
367
- inputs=[
368
- gr.Textbox(
369
- lines=5, placeholder="Enter text", label="Input"
370
- )
371
- ],
372
- outputs=[
373
- gr.Textbox(
374
- "text", label="Our Machine Translation"
375
- ),
376
- gr.Textbox(
377
- "text", label="VietAI Machine Translation"
378
- )
379
- ]
380
- )
381
-
382
- demo.launch(share = True)
 
5
  import torch
6
  import torch.nn as nn
7
  from torchtext.transforms import PadTransform
 
8
  from torch.nn import functional as F
9
  from tqdm import tqdm
10
+ from underthesea import text_normalize
11
 
12
  # Build Vocabulary
 
13
  device = "cpu"
14
 
15
  # Build Vocabulary
16
+ MAX_LENGTH = 20
17
  class Vocabulary:
18
  """The Vocabulary class is used to record words, which are used to convert
19
  text to numbers and vice versa.
 
73
  def preprocessing_sent(self, sent, lang="en"):
74
  """Preprocessing a sentence (depend on language english or vietnamese)
75
  @param sent (str)
76
+ @param lang (str)
77
  """
78
 
79
  # Lowercase sentence and remove space at beginning and ending
80
  sent = sent.lower().strip()
81
 
82
+ # Replace HTML character entities with the characters they encode
83
+ sent = re.sub("&apos;", "'", sent)
84
+ sent = re.sub("&quot;", '"', sent)
85
+ sent = re.sub("&#91;", "[", sent)
86
+ sent = re.sub("&#93;", "]", sent)
87
+
88
  # Put a space between a word and the period that follows it
89
  sent = re.sub("(?<=\w)\.", " .", sent)
90
+
91
+ # Normalizing the distance between tokens (word and punctuation)
92
  sent = re.sub("(?<=\w),", " ,", sent)
93
  sent = re.sub("(?<=\w)\?", " ?", sent)
94
  sent = re.sub("(?<=\w)\!", " !", sent)
 
99
  sent = re.sub("what's", "what is", sent)
100
  sent = re.sub("who's", "who is", sent)
101
  sent = re.sub("which's", "which is", sent)
102
+ sent = re.sub("who's", "who is", sent)
103
+ sent = re.sub("here's", "here is", sent)
104
+ sent = re.sub("there's", "there is", sent)
105
+ sent = re.sub("it's", "it is", sent)
106
 
107
  sent = re.sub("i'm", "i am", sent)
 
 
108
  sent = re.sub("'re ", " are ", sent)
109
  sent = re.sub("'ve ", " have ", sent)
110
  sent = re.sub("'ll ", " will ", sent)
 
123
  else:
124
  # Package underthesea.text_normalize support to normalize vietnamese
125
  sent = text_normalize(sent)
126
+ if not sent.endswith(('.', '!', '?')):
127
+ sent = sent + ' .'
128
  return sent.strip()
129
 
130
  def tokenize_corpus(self, corpus, disable=False):
 
174
  return corpus
175
 
176
 
177
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Vocabulary objects for the source and target languages.
VOCAB_SOURCE = _load_pickle("vocab_source_final.pkl")
VOCAB_TARGET = _load_pickle("vocab_target_final.pkl")

# All-zero embedding matrices: one row per vocabulary entry, 100 columns.
input_embedding = torch.zeros(len(VOCAB_SOURCE), 100)
output_embedding = torch.zeros(len(VOCAB_TARGET), 100)
185
 
186
def create_input_emb_layer(pretrained=False):
    """Build the frozen embedding layer for the source vocabulary.

    @param pretrained (bool): when True the layer wraps the module-level
        `input_embedding` matrix; otherwise it wraps a fresh all-zero matrix
        of shape (len(VOCAB_SOURCE), 100).
    @return (nn.Embedding, int): the embedding layer and its embedding size
    """
    if pretrained:
        weights_matrix = input_embedding
    else:
        weights_matrix = torch.zeros((len(VOCAB_SOURCE), 100))

    # from_pretrained wraps the matrix directly, so no throw-away random
    # initialisation is done and nothing is assigned through `.data`;
    # freeze=True sets requires_grad to False on the weight.
    emb_layer = nn.Embedding.from_pretrained(weights_matrix, freeze=True)

    return emb_layer, emb_layer.embedding_dim
197
 
198
def create_output_emb_layer(pretrained=False):
    """Build the frozen embedding layer for the target vocabulary.

    @param pretrained (bool): when True the layer wraps the module-level
        `output_embedding` matrix; otherwise it wraps a fresh all-zero matrix
        of shape (len(VOCAB_TARGET), 100).
    @return (nn.Embedding, int): the embedding layer and its embedding size
    """
    if pretrained:
        weights_matrix = output_embedding
    else:
        weights_matrix = torch.zeros((len(VOCAB_TARGET), 100))

    # from_pretrained wraps the matrix directly, so no throw-away random
    # initialisation is done and nothing is assigned through `.data`;
    # freeze=True sets requires_grad to False on the weight.
    emb_layer = nn.Embedding.from_pretrained(weights_matrix, freeze=True)

    return emb_layer, emb_layer.embedding_dim
209
 
210
 
211
+ class EncoderAtt(nn.Module):
212
  def __init__(self, input_dim, hidden_dim, dropout = 0.1):
213
  """ Encoder RNN
214
  @param input_dim (int): size of vocab_souce
215
  @param hidden_dim (int)
216
  @param dropout (float): dropout ratio of layer drop out
217
  """
218
+ super(EncoderAtt, self).__init__()
219
  self.hidden_dim = hidden_dim
220
+ # Using pretrained Embedding
221
+ self.embedding, self.embedding_dim = create_input_emb_layer(True)
 
222
  self.gru = nn.GRU(self.embedding_dim, hidden_dim, batch_first=True)
223
  self.dropout = nn.Dropout(dropout)
224
 
 
227
  output, hidden = self.gru(embedded)
228
  return output, hidden
229
 
 
230
  class BahdanauAttention(nn.Module):
231
  def __init__(self, hidden_size):
232
  """ Bahdanau Attention
 
246
 
247
  return context, weights
248
 
249
+ class DecoderAtt(nn.Module):
250
+ def __init__(self, hidden_size, output_size, dropout=0.1):
251
  """ Decoder RNN using Attention
252
  @param hidden_size (int)
253
  @param output_size (int): size of vocab_target
254
  @param dropout (float): dropout ratio of layer drop out
255
  """
256
+ super(DecoderAtt, self).__init__()
257
+ # Using pretrained Embedding
258
+ self.embedding, self.embedding_dim = create_output_emb_layer(True)
259
  self.fc = nn.Linear(self.embedding_dim, hidden_size)
260
  self.attention = BahdanauAttention(hidden_size)
261
  self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
262
  self.out = nn.Linear(hidden_size, output_size)
263
+ self.dropout = nn.Dropout(dropout)
264
 
265
  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
266
  batch_size = encoder_outputs.size(0)
 
313
  HID_DIM = 512
314
 
315
  # Load our Model Translation
316
+ ENCODER = EncoderAtt(INPUT_DIM, HID_DIM)
317
+ #ENCODER.load_state_dict(torch.load("hid512_encoder_att_epoch_20.pt", map_location=torch.device('cpu')))
318
+ DECODER = DecoderAtt(HID_DIM, OUTPUT_DIM)
319
+ #DECODER.load_state_dict(torch.load("hid512_decoder_att_epoch_20.pt", map_location=torch.device('cpu')))
320
 
321
 
322
+ def evaluate_final_model(encoder, decoder, sentence, vocab_source, vocab_target, disable=False):
323
  encoder.eval()
324
  decoder.eval()
325
  with torch.no_grad():
 
346
  return decoded_words, decoder_attn
347
 
348
 
349
def my_translation(sentence):
    """Translate a sentence with the attention encoder/decoder pair.

    @param sentence (str): text to translate
    @return (str): the decoded words joined by single spaces, with the
        special tokens removed; `str.capitalize` upper-cases the first
        character and lower-cases the rest
    """
    # evaluate_final_model expects (encoder, decoder, sentence, ...), so the
    # sentence is the third argument, not the first.
    output_words, _ = evaluate_final_model(
        ENCODER, DECODER, sentence, VOCAB_SOURCE, VOCAB_TARGET, disable=True
    )

    # list.remove() returns None, drops only the first match and raises
    # ValueError when the token is absent, so filter the tokens out instead.
    special_tokens = {"<pad>", "<unk>", "<sos>", "<eos>"}
    words = [word for word in output_words if word not in special_tokens]

    # No [1:-1] slice here: the markers are already gone, and slicing would
    # drop the first and last translated words.
    return " ".join(words).capitalize()
357
 
358
 
359
  def envit5_translation(text):
 
361
  text,
362
  max_length=512,
363
  early_stopping=True,
364
+ )[0]["translation_text"][3:]
 
 
365
  return res
366
 
367
 
368
def translation(text):
    """Translate `text` and return one result per output textbox.

    @param text (str): sentence typed by the user
    @return (tuple[str, str]): (output of our model, output of
        envit5_translation)
    """
    # Strip first so trailing whitespace cannot hide existing end punctuation
    # ("Hello. " used to become "Hello. .").
    text = text.strip()
    if not text.endswith((".", "!", "?")):
        text += "."

    # TODO: call my_translation(text) here once it is re-enabled; until then
    # the first output is a fixed placeholder.
    output1 = "Something"
    output2 = envit5_translation(text)

    return (output1, output2)
376
 
377
if __name__ == "__main__":
    # One row per example and one column per input component. The interface
    # has a single input textbox, so each row holds exactly one English
    # sentence.
    examples = [["Hello guys"]]

    demo = gr.Interface(
        theme=gr.themes.Base(),
        fn=translation,
        title="Co Gai Mo Duong",
        description="""
        ## Machine Translation: English to Vietnamese
        """,
        examples=examples,
        inputs=[
            gr.Textbox(lines=5, placeholder="Enter text", label="Input"),
        ],
        # The first positional argument of gr.Textbox is its initial value,
        # so passing "text" pre-filled both output boxes with that word.
        outputs=[
            gr.Textbox(label="Our Machine Translation"),
            gr.Textbox(label="VietAI Machine Translation"),
        ],
    )

    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab_source.pkl → vocab_source_final.pkl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf38f3daacf3feb3b80cba2069210d5ac3b770c232233178f42434b709bba360
3
- size 659103
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470ea7a4a120e9c2274db2ad7f5b68241eb1cea444881852245013ef91f69106
3
+ size 682848
vocab_target.pkl → vocab_target_final.pkl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac7bd376478b2b3bbcfbeeccd5ced630340b95d3da5eab8d7c1c9e01d74b50d2
3
- size 228271
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:853cf7ecce86d078c1a8cf81b867f55454d1b7bf21679832fea8391711198c6f
3
+ size 250477