commit all

Changed files:
- app.py +362 -29
- flagged/log.csv +2 -0
- hid512_decoder_att_epoch_20.pt +3 -0
- hid512_encoder_att_epoch_20.pt +3 -0
- requirements.txt +5 -1
- temp.ipynb +569 -0
- vocab_source.pkl +3 -0
- vocab_target.pkl +3 -0
app.py
CHANGED
@@ -1,49 +1,382 @@

Old version (removed lines are prefixed with -, unchanged lines with a space; several long lines appear truncated in the diff view):

 import gradio as gr
-from transformers import pipeline

 envit5_translater = pipeline("translation", model="VietAI/envit5-translation")

 def envit5_translation(text):
     res = envit5_translater(
         text,
         max_length=512,
         early_stopping=True,
-    )[0][
     return res

-def my_translation(text):
-    return "My Translation"
-
-def finetune_BERT(text):
-    return "BERT"

 def translation(text):
-    output1 =
     output2 = envit5_translation(text)
-    output3 = finetune_BERT(text)
-
-    return (output1, output2
-
-
-
-
-Multi-domain Translation Between English and Vietnamese
-Using VietAI Translation
-</center>
-</p>
-"""
-examples = [
-    ["Dear God, thank you for granting us the evergreen garden of this world", "en->vi"],
-    ["Thuốc này đã bị cấm sử dụng trong ngành thú y tại Ấn Độ.", "vi->en"]
-]

 demo = gr.Interface(
     fn=translation,
     title="Co Gai Mo Duong",
-    description=
     examples=examples,
-    inputs=
-
-

-demo.launch()

New version:

import gradio as gr
from transformers import pipeline
import re
import pickle
import torch
import torch.nn as nn
from torchtext.transforms import PadTransform
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from tqdm import tqdm
from underthesea import word_tokenize, text_normalize

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = "cpu"

# Build Vocabulary
MAX_LENGTH = 15

class Vocabulary:
    """The Vocabulary class records words; it is used to convert
    text to numbers and vice versa.
    """

    def __init__(self, lang="vi"):
        self.lang = lang
        self.word2id = dict()
        self.word2id["<sos>"] = 0  # Start of Sentence Token
        self.word2id["<eos>"] = 1  # End of Sentence Token
        self.word2id["<unk>"] = 2  # Unknown Token
        self.word2id["<pad>"] = 3  # Pad Token
        self.sos_id = self.word2id["<sos>"]
        self.eos_id = self.word2id["<eos>"]
        self.unk_id = self.word2id["<unk>"]
        self.pad_id = self.word2id["<pad>"]
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.pad_transform = PadTransform(max_length=MAX_LENGTH, pad_value=self.pad_id)

    def __getitem__(self, word):
        """Return the ID of the word if it exists, else the ID of the unknown token
        @param word (str)
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """Return True if the word is in the Vocabulary, else False
        @param word (str)
        """
        return word in self.word2id

    def __len__(self):
        """Return the number of tokens (including the sos, eos, unk and pad tokens) in the Vocabulary"""
        return len(self.word2id)

    def lookup_tokens(self, word_indexes: list):
        """Return the list of words looked up by ID
        @param word_indexes (list(int))
        @return words (list(str))
        """
        return [self.id2word[word_index] for word_index in word_indexes]

    def add(self, word):
        """Add a word to the vocabulary
        @param word (str)
        @return index (int): index of the word just added
        """
        if word not in self:
            word_index = self.word2id[word] = len(self.word2id)
            self.id2word[word_index] = word
            return word_index
        else:
            return self[word]

    def preprocessing_sent(self, sent, lang="en"):
        """Preprocess a sentence (depends on the language, English or Vietnamese)
        @param sent (str)
        @param lang (str)
        """

        # Lowercase the sentence and strip leading/trailing spaces
        sent = sent.lower().strip()

        # Separate punctuation from the preceding word, then collapse repeated spaces
        sent = re.sub(r"(?<=\w)\.", " .", sent)
        sent = re.sub(r"(?<=\w),", " ,", sent)
        sent = re.sub(r"(?<=\w)\?", " ?", sent)
        sent = re.sub(r"(?<=\w)!", " !", sent)
        sent = re.sub(r" +", " ", sent)

        if (lang == "en") or (lang == "eng") or (lang == "english"):
            # Expand short forms
            sent = re.sub("what's", "what is", sent)
            sent = re.sub("who's", "who is", sent)
            sent = re.sub("which's", "which is", sent)

            sent = re.sub("i'm", "i am", sent)
            # Possessive case is left as-is
            sent = re.sub("it's", "it is", sent)
            sent = re.sub("'re ", " are ", sent)
            sent = re.sub("'ve ", " have ", sent)
            sent = re.sub("'ll ", " will ", sent)
            sent = re.sub("'d ", " would ", sent)

            sent = re.sub("aren't", "are not", sent)
            sent = re.sub("isn't", "is not", sent)
            sent = re.sub("don't", "do not", sent)
            sent = re.sub("doesn't", "does not", sent)
            sent = re.sub("wasn't", "was not", sent)
            sent = re.sub("weren't", "were not", sent)
            sent = re.sub("won't", "will not", sent)
            sent = re.sub("can't", "can not", sent)
            sent = re.sub("let's", "let us", sent)

        else:
            # underthesea.text_normalize normalizes Vietnamese text
            sent = text_normalize(sent)

        return sent.strip()

    def tokenize_corpus(self, corpus, disable=False):
        """Split the documents of the corpus into words
        @param corpus (list(str)): list of documents
        @param disable (bool): whether to silence the progress output
        @return tokenized_corpus (list(list(str))): list of words
        """
        if not disable:
            print("Tokenize the corpus...")
        tokenized_corpus = list()
        for document in tqdm(corpus, disable=disable):
            tokenized_document = ["<sos>"] + self.preprocessing_sent(document, self.lang).split(" ") + ["<eos>"]
            tokenized_corpus.append(tokenized_document)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False, disable=False):
        """Convert a corpus to a list of index tensors
        @param corpus (list(str) if is_tokenized==False else list(list(str)))
        @param is_tokenized (bool)
        @return indicies_corpus (list(tensor))
        """
        if is_tokenized:
            tokenized_corpus = corpus
        else:
            tokenized_corpus = self.tokenize_corpus(corpus, disable=disable)
        indicies_corpus = list()
        for document in tqdm(tokenized_corpus, disable=disable):
            indicies_document = torch.tensor(
                list(map(lambda word: self[word], document)), dtype=torch.int64
            )

            indicies_corpus.append(self.pad_transform(indicies_document))

        return indicies_corpus

    def tensor_to_corpus(self, tensor, disable=False):
        """Convert a list of index tensors to a list of tokenized documents
        @param indicies_corpus (list(tensor))
        @return corpus (list(list(str)))
        """
        corpus = list()
        for indicies in tqdm(tensor, disable=disable):
            document = list(map(lambda index: self.id2word[index.item()], indicies))
            corpus.append(document)

        return corpus
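
A minimal round-trip sketch of how this class is used, with a hypothetical toy vocabulary in place of the pickled ones loaded below:

# Hypothetical toy example; the real vocabularies come from vocab_source.pkl / vocab_target.pkl.
toy_vocab = Vocabulary(lang="en")
for w in "i am a student".split():
    toy_vocab.add(w)

tensors = toy_vocab.corpus_to_tensor(["I am a student"], disable=True)
print(tensors[0])  # <sos> ... <eos>, padded with <pad> up to MAX_LENGTH
print(toy_vocab.tensor_to_corpus(tensors, disable=True))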
168 |
+
with open("vocab_source.pkl", "rb") as file:
|
169 |
+
VOCAB_SOURCE = pickle.load(file)
|
170 |
+
with open("vocab_target.pkl", "rb") as file:
|
171 |
+
VOCAB_TARGET = pickle.load(file)
|
172 |
+
|
173 |
+
input_embedding = torch.zeros((len(VOCAB_SOURCE), 100))
|
174 |
+
output_embedding = torch.zeros((len(VOCAB_TARGET), 100))
|
175 |
+
|
176 |
+
|
177 |
+
def create_input_emb_layer():
|
178 |
+
num_embeddings, embedding_dim = input_embedding.size()
|
179 |
+
emb_layer = nn.Embedding(num_embeddings, embedding_dim)
|
180 |
+
emb_layer.weight.requires_grad = False
|
181 |
+
return emb_layer, embedding_dim
|
182 |
+
|
183 |
+
def create_output_emb_layer():
|
184 |
+
num_embeddings, embedding_dim = output_embedding.size()
|
185 |
+
emb_layer = nn.Embedding(num_embeddings, embedding_dim)
|
186 |
+
emb_layer.weight.requires_grad = False
|
187 |
+
return emb_layer, embedding_dim
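
Both factories create frozen, zero-initialized embedding layers; the real weights only arrive when the checkpoints are loaded further down, since load_state_dict overwrites the values without touching the requires_grad flag. A quick sanity check, assuming the encoder checkpoint below has already been loaded:

# Assumes ENCODER.load_state_dict(...) further down has run.
assert ENCODER.embedding.weight.requires_grad is False
print(ENCODER.embedding.weight.abs().sum())  # non-zero once checkpoint weights are in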

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        """Encoder RNN
        @param input_dim (int): size of vocab_source
        @param hidden_dim (int)
        @param dropout (float): dropout ratio of the dropout layer
        """
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        # self.embedding = nn.Embedding(input_dim, hidden_dim)
        # Changed to the pretrained input embedding
        self.embedding, self.embedding_dim = create_input_emb_layer()
        self.gru = nn.GRU(self.embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        output, hidden = self.gru(embedded)
        return output, hidden

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        """Bahdanau (additive) attention
        @param hidden_size (int)
        """
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights
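
The score here is the standard additive form score(q, k) = Va^T tanh(Wa q + Ua k). A small shape walk-through, with hypothetical batch and length sizes chosen purely for illustration:

# Hypothetical sizes: batch of 2, source length 15, hidden size 512.
attn = BahdanauAttention(512)
query = torch.randn(2, 1, 512)   # decoder hidden state, reshaped batch-first
keys = torch.randn(2, 15, 512)   # encoder outputs
context, weights = attn(query, keys)
print(context.shape)             # torch.Size([2, 1, 512])
print(weights.shape)             # torch.Size([2, 1, 15])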

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """Decoder RNN using attention
        @param hidden_size (int)
        @param output_size (int): size of vocab_target
        @param dropout_p (float): dropout ratio of the dropout layer
        """
        super(AttnDecoderRNN, self).__init__()
        self.embedding, self.embedding_dim = create_output_emb_layer()
        self.fc = nn.Linear(self.embedding_dim, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        embedded = self.dropout(self.fc(self.embedding(input)))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

# Load the VietAI translation pipeline
envit5_translater = pipeline("translation", model="VietAI/envit5-translation")

INPUT_DIM = len(VOCAB_SOURCE)
OUTPUT_DIM = len(VOCAB_TARGET)
HID_DIM = 512

# Load our translation model
ENCODER = EncoderRNN(INPUT_DIM, HID_DIM)
ENCODER.load_state_dict(torch.load("hid512_encoder_att_epoch_20.pt"))
DECODER = AttnDecoderRNN(HID_DIM, OUTPUT_DIM)
DECODER.load_state_dict(torch.load("hid512_decoder_att_epoch_20.pt"))
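
Since device is pinned to "cpu" above, checkpoints that were saved from CUDA tensors would fail to load as written; passing map_location is the usual guard. A hedged variant, in case the .pt files were saved on a GPU:

# Safe on CPU-only hosts regardless of where the checkpoint was saved.
ENCODER.load_state_dict(torch.load("hid512_encoder_att_epoch_20.pt", map_location=device))
DECODER.load_state_dict(torch.load("hid512_decoder_att_epoch_20.pt", map_location=device))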

def evaluate(encoder, decoder, sentence, vocab_source, vocab_target, disable=False):
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        input_tensor = (
            vocab_source.corpus_to_tensor([sentence], disable=disable)[0]
            .view(1, -1)
            .to(device)
        )

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(
            encoder_outputs, encoder_hidden
        )

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == vocab_target.eos_id:
                decoded_words.append("<eos>")
                break
            decoded_words.append(vocab_target.id2word[idx.item()])
    return decoded_words, decoder_attn


def my_translate_model(sentence):
    output_words, _ = evaluate(
        ENCODER, DECODER, sentence, VOCAB_SOURCE, VOCAB_TARGET, disable=True
    )

    return " ".join(output_words[1:-1]).capitalize()
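
For reference, the scratch notebook committed below exercises exactly this path: with the epoch-16 checkpoints it recorded my_translate_model("I hope you will be better") returning 'Tôi hy vọng các bạn sẽ có thể làm được giải pháp.', so outputs from the epoch-20 weights loaded here may differ.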

def envit5_translation(text):
    res = envit5_translater(
        text,
        max_length=512,
        early_stopping=True,
    )[0]["translation_text"][3:]  # drop the leading language tag emitted by envit5
    return res


def translation(text):
    output1 = my_translate_model(text)
    output2 = envit5_translation(text)
    # output3 = finetune_BERT(text)

    return (output1, output2)


examples = [["Input: Hello guys"],
            ["Output: Xin chào các bạn"]]

demo = gr.Interface(
    theme=gr.themes.Base(),
    fn=translation,
    title="Co Gai Mo Duong",
    description="""
    ## Machine Translation: English to Vietnamese
    """,
    examples=examples,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text", label="Input"
        )
    ],
    outputs=[
        gr.Textbox(
            "text", label="Our Machine Translation"
        ),
        gr.Textbox(
            "text", label="VietAI Machine Translation"
        )
    ]
)

demo.launch(share=True)
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
+Input,Our Machine Translation,VietAI Machine Translation,flag,username,timestamp
+Today is a beautiful day.,Hôm nay là một ngày đẹp đẹp, Hôm nay là một ngày đẹp trời.,,,2024-01-11 02:01:36.293799
hid512_decoder_att_epoch_20.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b13f49e00d60a51226db3a66e343ef3b73eccf06e0efe771cac417e1994a706
+size 40323250
hid512_encoder_att_epoch_20.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec38b650930515f30086a04a16285c88430ceed352cfbd52cc27e34b4283221a
+size 16096464
requirements.txt
CHANGED
@@ -2,4 +2,8 @@ transformers
 sentencepiece
 tokenizers
 torch
-gradio
+gradio
+re
+pickle
+torchtext
+underthesea
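
Note that re and pickle are Python standard-library modules, not PyPI packages, so pip cannot install them and the Space's build may fail on these two lines. A requirements list matching app.py's actual third-party imports would presumably be:

transformers
sentencepiece
tokenizers
torch
gradio
torchtext
underthesea
tqdm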
temp.ipynb
ADDED
@@ -0,0 +1,569 @@
The committed notebook (Python 3 kernel, Python 3.11.5, nbformat 4) is a scratchpad that mirrors app.py with epoch-16 checkpoints and MAX_LENGTH = 30. Its cells, with their recorded outputs:

Cell [1] (imports, identical to the import block of app.py):

import gradio as gr
from transformers import pipeline
import re
import pickle
import torch
import torch.nn as nn
from torchtext.transforms import PadTransform
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from tqdm import tqdm
from underthesea import word_tokenize, text_normalize

Output:
WARNING:tensorflow:From c:\Users\THU\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

Cell [7] (a stub of the Gradio interface wired to dummy outputs):

import gradio as gr

def translation(text):
    output1 = 1
    output2 = 2
    # output3 = finetune_BERT(text)

    return (output1, output2)


examples = [["Input: Hello guys"],
            ["Output: Xin chào các bạn"]]

demo = gr.Interface(
    theme=gr.themes.Base(),
    fn=translation,
    title="Co Gai Mo Duong",
    description="""
    ## Machine Translation: English to Vietnamese
    """,
    examples=examples,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text", label="Input"
        )
    ],
    outputs=[
        gr.Textbox(
            "text", label="Our Machine Translation"
        ),
        gr.Textbox(
            "text", label="VietAI Machine Translation"
        )
    ]
)

demo.launch(shared=True)  # "shared" is not a launch() keyword; the correct name is share=True

Output:
Running on local URL: http://127.0.0.1:7864
To create a public link, set `share=True` in `launch()`.
(followed by an inline iframe embedding the running app)

Cell [2]:

# Build Vocabulary
MAX_LENGTH = 30
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

Cell [3] defines the same Vocabulary class as app.py, with two differences: tokenize_corpus calls preprocessing_sent(document) without passing self.lang, and preprocessing_sent is an earlier draft that lowercases last and strips HTML entities left over in the corpus:

    def preprocessing_sent(self, sent, lang="en"):
        """Preprocess a sentence (depends on the language, English or Vietnamese)"""

        if (lang == "en") or (lang == "eng") or (lang == "english"):
            # Collapse repeated spaces
            sent = re.sub(" +", " ", sent)

            # Expand short forms
            sent = re.sub("'m ", "am ", sent)
            # Possessive case is left as-is
            sent = re.sub("'s ", "is ", sent)
            sent = re.sub("'re ", "are ", sent)
            sent = re.sub("'ve ", "have ", sent)
            sent = re.sub("'ll ", "will ", sent)
            sent = re.sub("'d ", "would ", sent)

            sent = re.sub("aren 't", "are not", sent)
            sent = re.sub("isn 't", "is not", sent)
            sent = re.sub("don 't", "do not", sent)
            sent = re.sub("doesn 't", "does not", sent)
            sent = re.sub("wasn 't", "was not", sent)
            sent = re.sub("weren 't", "were not", sent)
            sent = re.sub("won 't", "will not", sent)
            sent = re.sub("can 't", "can not", sent)
            sent = re.sub("let 's", "let us", sent)

        else:
            # underthesea.text_normalize normalizes Vietnamese text
            sent = text_normalize(sent)

        # Strip HTML entities left over in the corpus
        sent = re.sub("&apos;", "'", sent)
        sent = re.sub("&quot;", '"', sent)
        sent = re.sub("&#91;", "[", sent)
        sent = re.sub("&#93;", "]", sent)

        # Lowercase the sentence and strip leading/trailing spaces
        return sent.lower().strip()

Cell [4] (embedding factories with hard-coded sizes, then the model classes):

def create_input_emb_layer():
    num_embeddings, embedding_dim = 32998, 100
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight.requires_grad = False

    return emb_layer, embedding_dim

def create_output_emb_layer():
    num_embeddings, embedding_dim = 15405, 100
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight.requires_grad = False

    return emb_layer, embedding_dim

EncoderRNN, BahdanauAttention and AttnDecoderRNN follow, identical to the app.py definitions above except that the encoder uses dropout=0.2; the decoder keeps a commented-out nn.Embedding(output_size, hidden_size) line next to a note that it was changed to the pretrained output embedding.

An empty cell follows, then:

Cell [41] (loads the vocabularies and the epoch-16 checkpoints):

with open("vocab_source.pkl", "rb") as file:
    VOCAB_SOURCE = pickle.load(file)
with open("vocab_target.pkl", "rb") as file:
    VOCAB_TARGET = pickle.load(file)

INPUT_DIM = len(VOCAB_SOURCE)
OUTPUT_DIM = len(VOCAB_TARGET)
HID_DIM = 512

# Load our translation model
ENCODER = EncoderRNN(INPUT_DIM, HID_DIM)
ENCODER.load_state_dict(torch.load('encoder_att_epoch_16.pt'))
DECODER = AttnDecoderRNN(HID_DIM, OUTPUT_DIM)
DECODER.load_state_dict(torch.load('decoder_att_epoch_16.pt'))

Output: <All keys matched successfully>

Cell [42] (the same evaluate as app.py, plus a my_translate_model that appends a trailing period):

def my_translate_model(sentence):
    output_words, _ = evaluate(ENCODER, DECODER, sentence, VOCAB_SOURCE, VOCAB_TARGET, disable=True)

    return ' '.join(output_words[1:-1]).capitalize() + '.'

Cell [61]:

my_translate_model("I hope you will be better")

Output: 'Tôi hy vọng các bạn sẽ có thể làm được giải pháp.'

Cell [60] re-instantiates both models and reloads the epoch-16 checkpoints; output: <All keys matched successfully>

Cell [48]:

torch.load('decoder_att_epoch_16.pt').keys()

Output:
odict_keys(['embedding.weight', 'fc.weight', 'fc.bias', 'attention.Wa.weight', 'attention.Wa.bias', 'attention.Ua.weight', 'attention.Ua.bias', 'attention.Va.weight', 'attention.Va.bias', 'gru.weight_ih_l0', 'gru.weight_hh_l0', 'gru.bias_ih_l0', 'gru.bias_hh_l0', 'out.weight', 'out.bias'])

Cells [52] and [57] rebuild DECODER and reload 'decoder_att_epoch_16.pt' once more; both output: <All keys matched successfully>
vocab_source.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf38f3daacf3feb3b80cba2069210d5ac3b770c232233178f42434b709bba360
+size 659103
vocab_target.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac7bd376478b2b3bbcfbeeccd5ced630340b95d3da5eab8d7c1c9e01d74b50d2
+size 228271