{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hRdpoWePeYHn" }, "source": [ "## Importing Libraries and models" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0LBvFtYGCNgJ" }, "outputs": [], "source": [ "%%capture\n", "!pip install wandb" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zkZTzr7OCPBM" }, "outputs": [], "source": [ "import wandb" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "z4ZVrIumZcDt" }, "outputs": [], "source": [ "from __future__ import unicode_literals, print_function, division\n", "from io import open\n", "import unicodedata\n", "import string\n", "import re\n", "import random\n", "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "from torch import optim\n", "import torch.nn.functional as F\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "torch.cuda.empty_cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qwL09v65CIse", "outputId": "f1dcbc80-5110-48f9-d0c5-836a2daa05b4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cuda\n" ] } ], "source": [ "print(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "44xIRolL_T_d" }, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-XRMpx9eBzRK", "outputId": "177ee7ae-bb7d-46ea-9269-fa3aa045a89e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Y4zemXiyE6Fi" }, "outputs": [], "source": [ "class Lang:\n", " def __init__(self, name):\n", " self.name = name\n", " self.char2index = {'#': 0, '$': 1, '^': 2}\n", " self.char2count = {'#': 1, '$': 1, '^': 1}\n", " self.index2char = {0: '#', 1: '$', 2: '^'}\n", " self.n_chars = 3 # Count\n", " self.data = {}\n", " \n", "\n", " def addWord(self, word):\n", " for char in word:\n", " self.addChar(char)\n", "\n", " def addChar(self, char):\n", " if char not in self.char2index:\n", " self.char2index[char] = self.n_chars\n", " self.char2count[char] = 1\n", " self.index2char[self.n_chars] = char\n", " self.n_chars += 1\n", " else:\n", " self.char2count[char] += 1\n", "\n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dCR658yRvXpy" }, "outputs": [], "source": [ "# return max length of input and output words\n", "def maxLength(data):\n", " ip_mlen, op_mlen = 0, 0\n", "\n", " for i in range(len(data)):\n", " input = data[0][i]\n", " output = data[1][i]\n", " if(len(input)>ip_mlen):\n", " ip_mlen=len(input)\n", "\n", " if(len(output)>op_mlen):\n", " op_mlen=len(output)\n", "\n", " return ip_mlen, op_mlen" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IDGaCO8DkYpc" }, "outputs": [], "source": [ "import numpy\n", "input_shape = 0\n", "from torch.utils.data import TensorDataset, DataLoader\n", "def preprocess(data, input_lang, output_lang):\n", " maxlenInput, maxlenOutput = maxLength(data)\n", " # we use maxlenInput as 26 since it is the maximum of all input len\n", " maxlenInput = 26\n", " input = numpy.zeros((len(data), maxlenInput + 1))\n", " 
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "IDGaCO8DkYpc" }, "outputs": [], "source": [
 "import numpy\n",
 "from torch.utils.data import TensorDataset, DataLoader\n",
 "\n",
 "def preprocess(data, input_lang, output_lang):\n",
 "    maxlenInput, maxlenOutput = maxLength(data)\n",
 "    # fix maxlenInput at 26, the maximum input length across all splits,\n",
 "    # so train/val/test tensors share the same width\n",
 "    maxlenInput = 26\n",
 "    input = numpy.zeros((len(data), maxlenInput + 1))\n",
 "    output = numpy.zeros((len(data), maxlenOutput + 2))\n",
 "    unknown = input_lang.char2index['$']\n",
 "\n",
 "    for i in range(len(data)):\n",
 "        op = '^' + data[1][i]\n",
 "        ip = data[0][i].ljust(maxlenInput + 1, '#')\n",
 "        op = op.ljust(maxlenOutput + 2, '#')\n",
 "\n",
 "        for index, char in enumerate(ip):\n",
 "            if input_lang.char2index.get(char) is not None:\n",
 "                input[i][index] = input_lang.char2index[char]\n",
 "            else:\n",
 "                input[i][index] = unknown\n",
 "\n",
 "        for index, char in enumerate(op):\n",
 "            if output_lang.char2index.get(char) is not None:\n",
 "                output[i][index] = output_lang.char2index[char]\n",
 "            else:\n",
 "                output[i][index] = unknown\n",
 "\n",
 "    print(input.shape)\n",
 "    print(output.shape)\n",
 "\n",
 "    return TensorDataset(torch.from_numpy(input), torch.from_numpy(output))"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PdS5OXKxfdCX", "outputId": "178f1d73-5b0c-431d-ca9b-d9435b924c41" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(51200, 27)\n", "(51200, 22)\n", "(4096, 27)\n", "(4096, 22)\n", "(4096, 27)\n", "(4096, 22)\n" ] } ], "source": [
 "def loadData(lang):\n",
 "    train_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_train.csv\", header = None)\n",
 "    val_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_valid.csv\", header = None)\n",
 "    test_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_test.csv\", header = None)\n",
 "\n",
 "    input_lang = Lang('eng')\n",
 "    output_lang = Lang(lang)\n",
 "\n",
 "    # add the words to the respective languages\n",
 "    for i in range(len(train_df)):\n",
 "        input_lang.addWord(train_df[0][i])\n",
 "        output_lang.addWord(train_df[1][i])\n",
 "\n",
 "    # print(input_lang.char2index)\n",
 "    # print(input_lang.index2char)\n",
 "    trainDataset = preprocess(train_df, input_lang, output_lang)\n",
 "    testDataset = preprocess(test_df, input_lang, output_lang)\n",
 "    valDataset = preprocess(val_df, input_lang, output_lang)\n",
 "\n",
 "    return trainDataset, testDataset, valDataset, input_lang, output_lang\n",
 "\n",
 "\n",
 "trainData, testData, valData, ipLang, opLang = loadData('hin')\n"
] },
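{ "cell_type": "markdown", "metadata": {}, "source": [ "A hedged sketch of what `preprocess` does to one source word (illustrative only; `ipLang` comes from the cell above): characters map to their indices, anything unseen falls back to `$`, and `#` pads on the right to the fixed width of 27." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative only: encode a single word the way preprocess() does\n",
 "word = 'ghar'\n",
 "unk = ipLang.char2index['$']\n",
 "encoded = [ipLang.char2index.get(c, unk) for c in word.ljust(27, '#')]\n",
 "print(encoded)  # indices for g, h, a, r followed by 0s (the '#' pad)"
] },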
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SvmzS5Lt_Jnl", "outputId": "33defb60-5aee-46cb-e683-ee2df9e98436" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: W&B API key is configured. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wandb.login(key =\"\")" ] },
{ "cell_type": "markdown", "metadata": { "id": "Q1TioafYgICa" }, "source": [ "# seq2seq model" ] },
{ "cell_type": "markdown", "metadata": { "id": "svxssm9Havhb" }, "source": [ "## Encoder" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "YTwk8nKNcbkb" }, "outputs": [], "source": [
 "class EncoderRNN(nn.Module):\n",
 "    def __init__(self, input_size, hidden_size, embedding_size,  # input_size is the size of the input language dictionary\n",
 "                 num_layers, cell_type,\n",
 "                 bidirectional, dropout, batch_size):\n",
 "        super(EncoderRNN, self).__init__()\n",
 "        self.hidden_size = hidden_size  # size of a hidden state representation\n",
 "        self.num_layers = num_layers\n",
 "        self.bidirectional = True if bidirectional == 'Yes' else False\n",
 "        self.batch_size = batch_size\n",
 "        self.cell_type = cell_type\n",
 "        self.embedding_size = embedding_size\n",
 "\n",
 "        # this adds the embedding layer\n",
 "        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)\n",
 "        self.dropout = nn.Dropout(dropout)\n",
 "\n",
 "        # this adds the recurrent layer for the encoder\n",
 "        if self.cell_type == \"GRU\":\n",
 "            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n",
 "        elif self.cell_type == \"LSTM\":\n",
 "            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n",
 "        else:\n",
 "            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n",
 "\n",
 "    def forward(self, input, hidden):  # input shape (seq_len, batch_size); hidden is a tuple for LSTM, a single tensor otherwise\n",
 "        embedded = self.embedding(input.long()).view(-1, self.batch_size, self.embedding_size)\n",
 "        output = self.dropout(embedded)  # output shape (seq_len, batch_size, embedding_size)\n",
 "\n",
 "        output, hidden = self.rnn(output, hidden)  # for LSTM hidden is a tuple\n",
 "        if self.bidirectional:\n",
 "            # hidden has shape (num_layers * 2, batch, hidden) with the two directions\n",
 "            # of each layer adjacent, so group it as (num_layers, 2, batch, hidden)\n",
 "            # and average over the direction axis\n",
 "            if self.cell_type == \"LSTM\":\n",
 "                hidden_state = hidden[0].view(self.num_layers, 2, self.batch_size, self.hidden_size)\n",
 "                cell_state = hidden[1].view(self.num_layers, 2, self.batch_size, self.hidden_size)\n",
 "                hidden = ((hidden_state[:, 0] + hidden_state[:, 1]) / 2, (cell_state[:, 0] + cell_state[:, 1]) / 2)\n",
 "            else:\n",
 "                hidden = hidden.view(self.num_layers, 2, self.batch_size, self.hidden_size)\n",
 "                hidden = (hidden[:, 0] + hidden[:, 1]) / 2\n",
 "\n",
 "            # likewise average the forward and backward halves of the outputs\n",
 "            split_tensor = torch.split(output, self.hidden_size, dim=-1)\n",
 "            output = (split_tensor[0] + split_tensor[1]) / 2\n",
 "        return output, hidden\n",
 "\n",
 "    # initializing the initial hidden state for the encoder\n",
 "    def initHidden(self):\n",
 "        num_directions = 2 if self.bidirectional else 1\n",
 "        if self.cell_type == \"LSTM\":\n",
 "            return (torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, 
device=device),\n", " torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, device=device))\n", " else:\n", " return torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, device=device)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "J56aq1J6a07q" }, "source": [ "## Decoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "53ki6eJUH2u2" }, "outputs": [], "source": [ "class DecoderRNN(nn.Module):\n", " def __init__(self, hidden_size, output_size, embedding_size, num_layers, # output size is the size of output language dictionary\n", " cell_type, dropout, batch_size):\n", " super(DecoderRNN, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.num_layers = num_layers\n", " self.cell_type = cell_type.lower()\n", " self.batch_size = batch_size\n", " self.embedding_size=embedding_size\n", "\n", " self.embedding = nn.Embedding(output_size, embedding_size)\n", " # self.dropout = nn.Dropout(dropout)\n", " \n", " if self.cell_type == \"gru\":\n", " self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers)\n", " elif self.cell_type == \"lstm\":\n", " self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers)\n", " else:\n", " self.rnn = nn.RNN(embedding_size, hidden_size, num_layers=num_layers)\n", "\n", " self.out = nn.Linear(hidden_size, output_size)\n", " self.softmax = nn.LogSoftmax(dim=2)\n", "\n", " def forward(self, input, hidden): # input shape (1, batch_size)\n", " embedded = self.embedding(input.long()).view(-1, self.batch_size, self.embedding_size)\n", " # # shape (1, batch_size, embedding_size)\n", " output = F.relu(embedded)\n", " output, hidden = self.rnn(output, hidden) # output shape (1, batch_size, hidden_size)\n", " output = self.softmax(self.out(output)) # shape (1, batch_size, output_size)\n", " return output, hidden\n", "\n", " # not needed since hidden will be provided by the encoder" ] }, { "cell_type": "markdown", "metadata": { "id": "5JcQdylzI_Fc" }, "source": [ "## Attention Decoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R1Xysuv9I-Qr" }, "outputs": [], "source": [ "class AttentionDecoderRNN(nn.Module):\n", " def __init__(self, hidden_size, output_size, embedding_size, num_layers,\n", " cell_type, dropout, batch_size, max_length):\n", " super(AttentionDecoderRNN, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.num_layers = num_layers\n", " self.cell_type = cell_type\n", " self.batch_size = batch_size\n", " self.embedding_size = embedding_size\n", " self.max_length = max_length\n", " self.dropout = dropout\n", "\n", " self.embedding = nn.Embedding(output_size, embedding_size)\n", " self.dropout = nn.Dropout(self.dropout)\n", " self.attention = nn.Linear(hidden_size + embedding_size, self.max_length)\n", " self.attention_combine = nn.Linear(hidden_size + embedding_size, hidden_size)\n", "\n", " if self.cell_type == \"GRU\":\n", " self.rnn = nn.GRU(hidden_size, hidden_size, num_layers=num_layers)\n", " elif self.cell_type == \"LSTM\":\n", " self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers)\n", " else:\n", " self.rnn = nn.RNN(hidden_size, hidden_size, num_layers=num_layers)\n", "\n", " self.out = nn.Linear(hidden_size, output_size)\n", " self.softmax = nn.LogSoftmax(dim=2)\n", "\n", " def forward(self, input, hidden, encoder_outputs): #input shape (1, batch_size)\n", " embedded = self.embedding(input.long()).view(-1, self.batch_size, self.embedding_size) \n", " # embedded shape (1, 
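batch_size, embedding_size)\n",
 "        # (annotation) attention here is a learned score over source positions:\n",
 "        #   score = attention([embedded ; mean over layers of hidden])  -> (1, batch_size, max_length)\n",
 "        #   alpha = softmax(score); context = sum_j alpha_j * encoder_output_j  (the bmm below)\n",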
 "        embedded = F.relu(embedded)\n",
 "\n",
 "        # Compute attention scores\n",
 "        if self.cell_type == \"LSTM\":\n",
 "            attn_hidden = torch.mean(hidden[0], dim=0)\n",
 "        else:\n",
 "            attn_hidden = torch.mean(hidden, dim=0)\n",
 "        attn_scores = self.attention(torch.cat((embedded, attn_hidden.unsqueeze(0)), dim=2))  # attn_scores shape (1, batch_size, max_length)\n",
 "\n",
 "        attn_weights = F.softmax(attn_scores, dim=-1)  # attn_weights shape (1, batch_size, max_length)\n",
 "\n",
 "        # Apply attention weights to encoder outputs\n",
 "        attn_applied = torch.bmm(attn_weights.transpose(0, 1), encoder_outputs.transpose(0, 1))\n",
 "\n",
 "        # Combine attention output and embedded input\n",
 "        combined = torch.cat((embedded, attn_applied.transpose(0, 1)), dim=2)\n",
 "        combined = self.attention_combine(combined)\n",
 "        combined = F.relu(combined)  # shape (1, batch_size, hidden_size)\n",
 "\n",
 "        # Run through the RNN\n",
 "        output, hidden = self.rnn(combined, hidden)\n",
 "        # output shape: (1, batch_size, hidden_size)\n",
 "\n",
 "        # Pass through linear layer and softmax activation\n",
 "        output = self.out(output)  # shape: (1, batch_size, output_size)\n",
 "        output = self.softmax(output)\n",
 "        return output, hidden, attn_weights.transpose(0, 1)\n"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "LJ2Papj_jTX8" }, "outputs": [], "source": [] },
{ "cell_type": "markdown", "metadata": { "id": "658W9RARGEUf" }, "source": [ "# Helper functions" ] },
{ "cell_type": "markdown", "metadata": { "id": "q7fAgs5uQni_" }, "source": [ "## count matches" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "8fzy8U6_lbug" }, "outputs": [], "source": [
 "def count_exact_matches(pred, target):\n",
 "    \"\"\"\n",
 "    Counts the rows of pred that match the corresponding row of target exactly.\n",
 "    pred: tensor of shape (batch_size, seq_len-1)\n",
 "    target: tensor of shape (batch_size, seq_len-1)\n",
 "    \"\"\"\n",
 "    count = 0\n",
 "    for i in range(pred.shape[0]):\n",
 "        flag = True\n",
 "        for j in range(pred.shape[1]):\n",
 "            if target[i][j] != pred[i][j]:\n",
 "                flag = False\n",
 "                break\n",
 "\n",
 "        if flag:\n",
 "            count += 1\n",
 "\n",
 "    return count"
] },
{ "cell_type": "markdown", "metadata": { "id": "n4rGh7vuQqaa" }, "source": [ "## evaluation" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "zp6gvWmDlWoB" }, "outputs": [], "source": [
 "def evaluate(data, encoder, decoder, output_size, batch_size, hidden_size, num_layers_encoder, num_layers_decoder, cell_type, attention):\n",
 "    running_loss = 0\n",
 "    correct = 0\n",
 "\n",
 "    loader = DataLoader(data, batch_size=batch_size)\n",
 "    loss_fun = nn.CrossEntropyLoss(reduction=\"sum\")\n",
 "    seq_len = 0\n",
 "\n",
 "    atten_weights = torch.zeros(1, 21, 27).to(device)  # (1, output seq_len - 1, padded input len); needed to return the attention weights\n",
 "    predictions = torch.zeros(22-1, 1).to(device)  # seed column; the output seq_len is 22\n",
 "    with torch.no_grad():\n",
 "        for j, (x, y) in enumerate(loader):\n",
 "            loss = 0\n",
 "            encoder.eval()\n",
 "            decoder.eval()\n",
 "\n",
 "            x = x.to(device)\n",
 "            y = y.to(device)\n",
 "\n",
 "            x = x.T\n",
 "            y = y.T\n",
 "            seq_len = len(y)\n",
 "\n",
 "            encoder_hidden = encoder.initHidden()\n",
 "            encoder_output, encoder_hidden = encoder(x, encoder_hidden)\n",
 "\n",
 "            decoder_input = y[0]\n",
 "\n",
 "            # Handle different numbers of layers in the encoder and decoder\n",
 "            if num_layers_encoder != num_layers_decoder:\n",
 "                if num_layers_encoder < num_layers_decoder:\n",
 "                    remaining_layers = num_layers_decoder - num_layers_encoder\n",
 "\n",
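 "                    # (annotation) Bridging mismatched depths: e.g. a 2-layer encoder\n",
 "                    # feeding a 4-layer decoder keeps both encoder states and repeats\n",
 "                    # the top one twice, giving a (4, batch_size, hidden_size) initial state.\n",
 "                    # Copy all 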
encoder hidden layers and then repeat the top layer\n", " if cell_type == \"LSTM\":\n", " top_layer_hidden = (encoder_hidden[0][-1].unsqueeze(0), encoder_hidden[1][-1].unsqueeze(0))\n", " extra_hidden = (top_layer_hidden[0].repeat(remaining_layers, 1, 1), top_layer_hidden[1].repeat(remaining_layers, 1, 1))\n", " decoder_hidden = (torch.cat((encoder_hidden[0], extra_hidden[0]), dim=0), torch.cat((encoder_hidden[1], extra_hidden[1]), dim=0))\n", " else:\n", " top_layer_hidden = encoder_hidden[-1].unsqueeze(0) #top_layer_hidden shape (1, batch_size, hidden_size)\n", " extra_hidden = top_layer_hidden.repeat(remaining_layers, 1, 1)\n", " decoder_hidden = torch.cat((encoder_hidden, extra_hidden), dim=0)\n", "\n", " else:\n", " # Slice the hidden states of the encoder to match the decoder layers\n", " if cell_type == \"LSTM\":\n", " decoder_hidden = (encoder_hidden[0][-num_layers_decoder:], encoder_hidden[1][-num_layers_decoder:])\n", " else :\n", " decoder_hidden = encoder_hidden[-num_layers_decoder:]\n", " else:\n", " decoder_hidden = encoder_hidden\n", "\n", " pred=torch.zeros(len(y)-1, batch_size).to(device)\n", " atten_weight_default = torch.zeros(batch_size,1, 27).to(device)\n", " for k in range(1,len(y)):\n", " if attention == \"Yes\":\n", " \n", " decoder_output, decoder_hidden, atten_weight = decoder(decoder_input, decoder_hidden, encoder_output)\n", " atten_weight_default = torch.cat((atten_weight_default, atten_weight), dim = 1)\n", " else:\n", " decoder_output, decoder_hidden= decoder(decoder_input, decoder_hidden)\n", " max_prob, index = decoder_output.topk(1) # max_prob shape (1, batch_size, 1)\n", " decoder_output = torch.squeeze(decoder_output)\n", " loss += loss_fun(decoder_output, y[k].long())\n", " pred[k-1]= torch.squeeze(index)\n", " decoder_input = index\n", " if attention == \"Yes\":\n", " atten_weights = torch.cat((atten_weights, atten_weight_default[:, 1:, :]), dim = 0)\n", "\n", " running_loss += loss.item()\n", " correct += count_exact_matches(pred.T,y[1:,:].T)\n", " predictions = torch.cat((predictions, pred), dim=1)\n", "\n", " \n", " avg_loss = running_loss / (len(data) * seq_len)\n", " print(\"correct =\", correct)\n", " avg_acc = 100 * (correct / (len(data)))\n", " if attention == \"Yes\":\n", " return avg_loss, avg_acc, predictions, atten_weights[1:, :, :]\n", " else:\n", " return avg_loss, avg_acc, predictions\n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": { "id": "0SsnRWlgQmCI" }, "source": [ "# Training function" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PhDgsZG0QqPW" }, "outputs": [], "source": [ "def train(sweeps = True, test = False):\n", "\n", " if sweeps == False: \n", " configs = config_defaults # use the default configuration which has the best hyperparameters\n", " else:\n", " wandb.init(config= config_defaults, project='DL_assign_3') # if not test then run wandb sweeps\n", " configs=wandb.config\n", " \n", "\n", " learn_rate = configs['learn_rate']\n", " batch_size = configs['batch_size']\n", " hidden_size = configs['hidden_size']\n", " embedding_size = configs['embedding_size']\n", " num_layers_encoder = configs['num_layers_encoder']\n", " num_layers_decoder = configs['num_layers_decoder']\n", " cell_type = configs['cell_type']\n", " bidirectional = configs['bidirectional']\n", " dropout = configs['dropout']\n", " teach_ratio = configs['teach_ratio']\n", " epochs = configs['epochs']\n", " attention = configs['attention']\n", "\n", " if sweeps:\n", " 
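# name the run after its key hyperparameters, e.g.\n",
 "        # 'hidden_1024_batch_256_embed_size_32_dropout_0.4_cell_LSTM'\n",
 "        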
wandb.run.name = 'hidden_'+str(hidden_size)+'_batch_'+str(batch_size)+'_embed_size_'+str(embedding_size)+'_dropout_'+str(dropout)+'_cell_'+str(cell_type)\n",
 "\n",
 "    input_len = ipLang.n_chars\n",
 "    output_len = opLang.n_chars\n",
 "\n",
 "    encoder = EncoderRNN(input_len, hidden_size, embedding_size,\n",
 "                         num_layers_encoder, cell_type,\n",
 "                         bidirectional, dropout, batch_size)\n",
 "\n",
 "    if attention == \"Yes\":\n",
 "        decoder = AttentionDecoderRNN(hidden_size, output_len, embedding_size, num_layers_decoder,\n",
 "                                      cell_type, dropout, batch_size, 27)  # 27 = padded input length\n",
 "    else:\n",
 "        decoder = DecoderRNN(hidden_size, output_len, embedding_size, num_layers_decoder,\n",
 "                             cell_type, dropout, batch_size)  # dropout is unused in the vanilla decoder\n",
 "\n",
 "    train_loader = DataLoader(trainData, batch_size=batch_size, shuffle=True)\n",
 "    val_loader = DataLoader(valData, batch_size=batch_size, shuffle=True)\n",
 "\n",
 "    encoder_optimizer = optim.Adam(encoder.parameters(), learn_rate)\n",
 "    decoder_optimizer = optim.Adam(decoder.parameters(), learn_rate)\n",
 "    loss_fun = nn.CrossEntropyLoss(reduction=\"sum\")\n",
 "\n",
 "    encoder.to(device)\n",
 "    decoder.to(device)\n",
 "    seq_len = 0\n",
 "\n",
 "    # Initialize variables for early stopping\n",
 "    best_val_loss = float('inf')\n",
 "    patience = 5\n",
 "    epochs_without_improvement = 0\n",
 "\n",
 "    for i in range(epochs):\n",
 "        running_loss = 0.0\n",
 "        train_correct = 0\n",
 "\n",
 "        encoder.train()\n",
 "        decoder.train()\n",
 "\n",
 "        for j, (train_x, train_y) in enumerate(train_loader):\n",
 "            train_x = train_x.to(device)\n",
 "            train_y = train_y.to(device)\n",
 "\n",
 "            encoder_optimizer.zero_grad()\n",
 "            decoder_optimizer.zero_grad()\n",
 "\n",
 "            train_x = train_x.T\n",
 "            train_y = train_y.T\n",
 "            seq_len = len(train_y)\n",
 "            encoder_hidden = encoder.initHidden()\n",
 "            # for LSTM, encoder_hidden is a tuple of two (num_layers * num_directions, batch_size, hidden_size) tensors\n",
 "            encoder_output, encoder_hidden = encoder(train_x, encoder_hidden)\n",
 "            # encoder_hidden shape (num_layers, batch_size, hidden_size)\n",
 "\n",
 "            # now move to the decoder\n",
 "            decoder_input = train_y[0]  # shape (1, batch_size)\n",
 "\n",
 "            # Handle different numbers of layers in the encoder and decoder\n",
 "            if num_layers_encoder != num_layers_decoder:\n",
 "                if num_layers_encoder < num_layers_decoder:\n",
 "                    remaining_layers = num_layers_decoder - num_layers_encoder\n",
 "                    # Copy all encoder hidden layers and then repeat the top layer\n",
 "                    if cell_type == \"LSTM\":\n",
 "                        top_layer_hidden = (encoder_hidden[0][-1].unsqueeze(0), encoder_hidden[1][-1].unsqueeze(0))\n",
 "                        extra_hidden = (top_layer_hidden[0].repeat(remaining_layers, 1, 1), top_layer_hidden[1].repeat(remaining_layers, 1, 1))\n",
 "                        decoder_hidden = (torch.cat((encoder_hidden[0], extra_hidden[0]), dim=0), torch.cat((encoder_hidden[1], extra_hidden[1]), dim=0))\n",
 "                    else:\n",
 "                        top_layer_hidden = encoder_hidden[-1].unsqueeze(0)  # top_layer_hidden shape (1, batch_size, hidden_size)\n",
 "                        extra_hidden = top_layer_hidden.repeat(remaining_layers, 1, 1)\n",
 "                        decoder_hidden = torch.cat((encoder_hidden, extra_hidden), dim=0)\n",
 "\n",
 "                else:\n",
 "                    # Slice the hidden states of the encoder to match the decoder layers\n",
 "                    if cell_type == \"LSTM\":\n",
 "                        decoder_hidden = (encoder_hidden[0][-num_layers_decoder:], encoder_hidden[1][-num_layers_decoder:])\n",
 "                    else:\n",
 "                        decoder_hidden = encoder_hidden[-num_layers_decoder:]\n",
 "            else:\n",
 "                decoder_hidden = encoder_hidden\n",
 "\n",
 "            loss = 
0\n", " correct = 0\n", " \n", " for k in range(0, len(train_y)-1):\n", " \n", " if attention == \"Yes\":\n", " decoder_output, decoder_hidden, atten_weights = decoder(decoder_input, decoder_hidden, encoder_output)\n", " else:\n", " decoder_output, decoder_hidden= decoder(decoder_input, decoder_hidden) # decoder_output shape (1, batch_size, output_size)\n", "\n", " max_prob, index = decoder_output.topk(1) # max_prob shape (1, batch_size, 1)\n", " index = torch.squeeze(index) # shape (batch_size)\n", " decoder_output = torch.squeeze(decoder_output)\n", " loss += loss_fun(decoder_output, train_y[k+1].long())\n", " \n", " correct += (index == train_y[k+1]).sum().item()\n", "\n", " # Apply teacher forcing\n", " use_teacher_forcing = True if random.random() < teach_ratio else False\n", "\n", " if use_teacher_forcing:\n", " decoder_input = train_y[k+1]\n", " \n", " else:\n", " decoder_input = index\n", "\n", " running_loss += loss.item()\n", " train_correct += correct\n", " loss.backward()\n", " encoder_optimizer.step()\n", " decoder_optimizer.step()\n", " \n", "\n", " # find train loss and accuracy and print + log to wandb\n", " if attention == \"Yes\":\n", " _, train_accuracy,_, _ = evaluate(trainData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " else:\n", " _, train_accuracy,_= evaluate(trainData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " \n", " print(f\"epoch {i}, training loss {running_loss/(len(trainData)* seq_len)}, training accuracy {train_accuracy}\")\n", " if sweeps:\n", " wandb.log({\"epoch\": i, \"train_loss\": running_loss/(len(trainData)* seq_len), \"train_accuracy\": train_accuracy})\n", " \n", " # # find validation loss and accuracy and print + log to wandb\n", " if attention == \"Yes\":\n", " val_loss, val_accuracy,_, _ = evaluate(valData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " else:\n", " val_loss, val_accuracy,_ = evaluate(valData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " \n", " print(f\"epoch {i}, validation loss {val_loss}, validation accuracy {val_accuracy}\")\n", " if sweeps:\n", " wandb.log({\"val_loss\": val_loss, \"val_accuracy\": val_accuracy})\n", "\n", " # Check for early stopping\n", " if val_loss < best_val_loss:\n", " best_val_loss = val_loss\n", " epochs_without_improvement = 0\n", " # Save the model weights\n", " torch.save(encoder.state_dict(), 'best_encoder.pt')\n", " torch.save(decoder.state_dict(), 'best_decoder.pt')\n", " else:\n", " epochs_without_improvement += 1\n", " if epochs_without_improvement >= patience:\n", " print(\"Early stopping triggered. 
No improvement in validation loss.\")\n",
 "                break\n",
 "\n",
 "    # if testing mode is on, print the test accuracy\n",
 "    if test:\n",
 "        # Load the best model weights\n",
 "        encoder.load_state_dict(torch.load('best_encoder.pt'))\n",
 "        decoder.load_state_dict(torch.load('best_decoder.pt'))\n",
 "        if attention == \"Yes\":\n",
 "            _, test_accuracy, pred, atten_weights = evaluate(testData, encoder, decoder, output_len, batch_size, hidden_size, num_layers_encoder, num_layers_decoder, cell_type, attention)\n",
 "        else:\n",
 "            _, test_accuracy, pred = evaluate(testData, encoder, decoder, output_len, batch_size, hidden_size, num_layers_encoder, num_layers_decoder, cell_type, attention)\n",
 "        print(f\"test accuracy {test_accuracy}\")\n",
 "\n",
 "        if attention == \"Yes\":\n",
 "            return pred, atten_weights\n",
 "        else:\n",
 "            return pred\n"
] },
{ "cell_type": "markdown", "metadata": { "id": "nvyRJWUUbR2f" }, "source": [ "# Translating predictions to words\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Hd3zCTnSbSaL" }, "outputs": [], "source": [
 "def translate_prediction(input_dict, input, output_dict, pred, target):\n",
 "    '''pred arrives as (seq_len-1, 1 + dataset_size); the extra first column is the\n",
 "    seed column created in evaluate(). target arrives as (dataset_size, seq_len).\n",
 "    '''\n",
 "    pred = pred.T  # shape (1 + dataset_size, seq_len-1)\n",
 "    pred = pred[1:, :-1]  # drop the seed row and the final pad column\n",
 "    input = input[:, :-1]  # drop the trailing pad column\n",
 "    target = target[:, 1:-1]  # drop the leading '^' and the trailing pad column\n",
 "    print(f\"pred shape {pred.shape}, input shape {input.shape}, target shape {target.shape}\")\n",
 "    predictions = []\n",
 "    Input = []\n",
 "    Target = []\n",
 "    for i in range(len(pred)):\n",
 "        pred_word = \"\"\n",
 "        input_word = \"\"\n",
 "        target_word = \"\"\n",
 "\n",
 "        for j in range(pred.shape[1]):\n",
 "            # Ignore padding\n",
 "            if target[i][j].item() != 0:\n",
 "                pred_word += output_dict[pred[i][j].item()]\n",
 "                target_word += output_dict[target[i][j].item()]\n",
 "\n",
 "        for j in range(input.shape[1]):\n",
 "            if input[i][j].item() != 0:\n",
 "                input_word += input_dict[input[i][j].item()]\n",
 "\n",
 "        # Append the words to the respective lists\n",
 "        predictions.append(pred_word)\n",
 "        Input.append(input_word)\n",
 "        Target.append(target_word)\n",
 "\n",
 "    # Create a DataFrame\n",
 "    df = pd.DataFrame({\"input\": Input, \"predicted\": predictions, \"Actual\": Target})\n",
 "    return df\n"
] },
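{ "cell_type": "markdown", "metadata": {}, "source": [ "A shape sketch for the trimming above (illustrative only; the sizes mirror this notebook's test split: 22-step targets, 4096 words, plus the seed column from `evaluate`)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative only: why translate_prediction slices the way it does\n",
 "toy_pred = torch.zeros(21, 1 + 4096)  # (seq_len-1, seed column + n_words), as returned by evaluate()\n",
 "print(toy_pred.T[1:, :-1].shape)      # torch.Size([4096, 20]) -- matches the print below"
] },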
{ "cell_type": "markdown", "metadata": { "id": "8ETW0BG_Pa24" }, "source": [ "# Call train" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "pgGp7MoGzfPg" }, "outputs": [], "source": [ "# train(sweeps = False, test = True)" ] },
{ "cell_type": "markdown", "metadata": { "id": "MQPGy32rnD3V" }, "source": [ "# Running sweeps for models without Attention\n", "\n" ] },
{ "cell_type": "markdown", "metadata": { "id": "z_aYZvDD1OHU" }, "source": [ "## Sweep Config" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "SVv8bI-D1Q_I" }, "outputs": [], "source": [
 "sweep_config = {\n",
 "    'name': 'sweepDL',\n",
 "    'method': 'bayes',\n",
 "    'metric': {\n",
 "        'name': 'val_accuracy',\n",
 "        'goal': 'maximize'\n",
 "    },\n",
 "    'parameters': {\n",
 "        'learn_rate': {\n",
 "            'values': [0.01, 0.001, 0.0001]\n",
 "        },\n",
 "        'embedding_size': {\n",
 "            'values': [32, 64, 128, 256, 512, 1024]\n",
 "        },\n",
 "        'batch_size': {\n",
 "            'values': [16, 32, 64, 128, 256]\n",
 "        },\n",
 "        'hidden_size': {\n",
 "            'values': [32, 64, 128, 256, 512, 1024]\n",
 "        },\n",
 "        'teach_ratio': {\n",
 "            'values': [0.4, 0.5, 0.6]\n",
 "        },\n",
 "        'dropout': {\n",
 "            'values': [0, 0.2, 0.4]\n",
 "        },\n",
 "        'cell_type': {\n",
 "            'values': [\"RNN\", \"LSTM\", \"GRU\"]\n",
 "        },\n",
 "        'bidirectional': {\n",
 "            'values': [\"Yes\", \"No\"]\n",
 "        },\n",
 "        'num_layers_decoder': {\n",
 "            'values': [1, 2, 3, 4]\n",
 "        },\n",
 "        'num_layers_encoder': {\n",
 "            'values': [1, 2, 3, 4]\n",
 "        },\n",
 "        'epochs': {\n",
 "            'values': [10, 15, 20, 25, 30]\n",
 "        },\n",
 "        'attention': {\n",
 "            'values': [\"No\"]\n",
 "        }\n",
 "    }\n",
 "}\n",
 "config_defaults = {\n",
 "    'learn_rate': 0.001,\n",
 "    'embedding_size': 32,\n",
 "    'batch_size': 256,\n",
 "    'hidden_size': 1024,\n",
 "    'num_layers_encoder': 3,\n",
 "    'num_layers_decoder': 3,\n",
 "    'bidirectional': 'No',\n",
 "    'cell_type': \"LSTM\",\n",
 "    'teach_ratio': 0.6,\n",
 "    'dropout': 0.4,\n",
 "    'epochs': 15,\n",
 "    'attention': \"No\"\n",
 "}"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "4KxsOOpvr1oi" }, "outputs": [], "source": [
 "sweep_id = wandb.sweep(sweep_config, project=\"CS6910_Assignment_3\")\n",
 "wandb.agent(sweep_id, function=train)"
] },
{ "cell_type": "markdown", "metadata": { "id": "pKvBd5mKf0Hf" }, "source": [ "# Testing the Best Model (without Attention) on Test Data\n", "Set the default hyperparameters to the best values obtained from the sweep hyperparameter tuning" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "kMQvZjZl0q4U" }, "outputs": [], "source": [
 "config_defaults = {\n",
 "    'learn_rate': 0.001,\n",
 "    'embedding_size': 32,\n",
 "    'batch_size': 256,\n",
 "    'hidden_size': 1024,\n",
 "    'num_layers_encoder': 3,\n",
 "    'num_layers_decoder': 3,\n",
 "    'bidirectional': 'No',\n",
 "    'cell_type': \"LSTM\",\n",
 "    'teach_ratio': 0.6,\n",
 "    'dropout': 0.4,\n",
 "    'epochs': 15,\n",
 "    'attention': \"No\"\n",
 "}"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ygtFpEvp8jFU", "outputId": "1a71d3be-f17f-498c-8844-3c115c411f0a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "correct = 1490\n", "test accuracy 36.376953125\n" ] } ], "source": [ "pred = train(sweeps = False, test = True)" ] },
{ "cell_type": "markdown", "metadata": { "id": "hMf0OAuscOJx" }, "source": [ "# Saving the predictions of the vanilla model to a CSV file" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1cgUOUdsfzUB", "outputId": "8784a3aa-315e-476f-cced-c38ebb8434b3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pred shape torch.Size([4096, 20]), input shape torch.Size([4096, 26]), target shape torch.Size([4096, 20])\n" ] } ], "source": [
 "# save the predictions\n",
 "dataframe = translate_prediction(ipLang.index2char, testData[:][0], opLang.index2char, pred, testData[:][1])\n",
 "dataframe.to_csv(\"predictions.csv\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "ZZW-IEWZ5syU" }, "outputs": [], "source": [
 "import pandas as pd\n",
 "data = pd.read_csv(\"predictions.csv\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "2sOkc_0vmDlB", "outputId": "750d06b5-fee2-4eb8-d7e6-a7043cd0c15a" }, "outputs": [], "source": [ "data" ] },
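{ "cell_type": "markdown", "metadata": {}, "source": [ "A hedged cross-check (illustrative only): the word-level exact-match rate can be recomputed straight from the saved CSV, using the column names written by `translate_prediction`. It should agree with the test accuracy printed above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative only: recompute exact-match accuracy from predictions.csv\n",
 "exact = (data['predicted'] == data['Actual']).mean()\n",
 "print(f'exact-match rate: {100 * exact:.2f}%')"
] },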
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 142 }, "id": "AkG1vCpZ_vjG", "outputId": "d64b794c-d173-4871-80fc-93b8211ebedc" }, "outputs": [], "source": [
 "# We also want to log the prediction table to wandb\n",
 "wandb.init(project=\"CS6910_Assignment_3\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "MmKDX6V5_kGu" }, "outputs": [], "source": [
 "table = wandb.Table(dataframe=data)\n",
 "wandb.log({\"data\": table})"
] },
{ "cell_type": "markdown", "metadata": { "id": "FYMa5jTQRUaB" }, "source": [ "## Plotting the confusion matrix in wandb" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "YBaJZCIBRAGZ" }, "outputs": [], "source": [
 "import numpy as np\n",
 "# character-level confusion over the output alphabet: rows = true character, columns = predicted character\n",
 "CM = np.zeros((opLang.n_chars, opLang.n_chars))\n",
 "\n",
 "targets = testData[:][1][:, 1:]  # drop the leading '^'\n",
 "preds = pred.T[1:, :]            # drop the seed column added by evaluate()\n",
 "\n",
 "for i in range(targets.shape[0]):\n",
 "    for j in range(targets.shape[1]):\n",
 "        p = int(preds[i][j])\n",
 "        t = int(targets[i][j])\n",
 "        CM[t][p] += 1\n",
 "\n",
 "classes = []\n",
 "\n",
 "for i in range(len(CM)):\n",
 "    classes.append(opLang.index2char[i])\n",
 "\n",
 "percentages = 100 * (CM / np.sum(CM))\n",
 "\n",
 "# Define the text for each cell\n",
 "cell_text = []\n",
 "for i in range(len(classes)):\n",
 "    row_text = []\n",
 "    for j in range(len(classes)):\n",
 "        txt = f'Total {CM[i, j]:.0f} ({percentages[i, j]:.3f}%)'\n",
 "        if i == j:\n",
 "            txt = 'Correctly predicted ' + classes[i] + ': ' + txt\n",
 "        else:\n",
 "            txt = 'Predicted ' + classes[j] + ' for ' + classes[i] + ': ' + txt\n",
 "        row_text.append(txt)\n",
 "    cell_text.append(row_text)\n",
 "\n",
 "import plotly.graph_objs as go\n",
 "\n",
 "# Define the trace\n",
 "trace = go.Heatmap(z=percentages,\n",
 "                   x=classes,\n",
 "                   y=classes,\n",
 "                   colorscale='Blues',\n",
 "                   colorbar=dict(title='Percentage'),\n",
 "                   hovertemplate='%{text}',\n",
 "                   text=cell_text,\n",
 "                   )\n",
 "\n",
 "# Define the layout\n",
 "layout = go.Layout(title='Confusion Matrix',\n",
 "                   xaxis=dict(title='Predicted Character'),\n",
 "                   yaxis=dict(title='True Character'),\n",
 "                   )\n",
 "\n",
 "# Plot the figure\n",
 "fig = go.Figure(data=[trace], layout=layout)\n",
 "wandb.log({'confusion_matrix': (fig)})"
] },
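{ "cell_type": "markdown", "metadata": {}, "source": [ "A hedged follow-up sketch (illustrative only): the same matrix can be queried for the most-confused character pairs, i.e. the largest off-diagonal counts." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative only: top-5 off-diagonal confusions from the CM built above\n",
 "off = CM.copy()\n",
 "np.fill_diagonal(off, 0)\n",
 "for idx in np.argsort(off, axis=None)[::-1][:5]:\n",
 "    i, j = np.unravel_index(idx, off.shape)\n",
 "    print(f'true {opLang.index2char[i]} -> predicted {opLang.index2char[j]}: {int(off[i, j])}')"
] },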
{ "cell_type": "markdown", "metadata": { "id": "zfuv5FoA1wt2" }, "source": [ "# Running sweeps for models with Attention\n" ] },
{ "cell_type": "markdown", "metadata": { "id": "tsHS0PkNGHdV" }, "source": [ "## Sweep Config" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "HwCn-Ci5xkTb" }, "outputs": [], "source": [
 "sweep_config = {\n",
 "    'name': 'sweepDL',\n",
 "    'method': 'bayes',\n",
 "    'metric': {\n",
 "        'name': 'val_accuracy',\n",
 "        'goal': 'maximize'\n",
 "    },\n",
 "    'parameters': {\n",
 "        'learn_rate': {\n",
 "            'values': [0.01, 0.001, 0.0001]\n",
 "        },\n",
 "        'embedding_size': {\n",
 "            'values': [32, 64, 128, 256, 512, 1024]\n",
 "        },\n",
 "        'batch_size': {\n",
 "            'values': [16, 32, 64, 128, 256]\n",
 "        },\n",
 "        'hidden_size': {\n",
 "            'values': [32, 64, 128, 256, 512, 1024]\n",
 "        },\n",
 "        'teach_ratio': {\n",
 "            'values': [0.4, 0.5, 0.6]\n",
 "        },\n",
 "        'dropout': {\n",
 "            'values': [0, 0.2, 0.4]\n",
 "        },\n",
 "        'cell_type': {\n",
 "            'values': [\"RNN\", \"LSTM\", \"GRU\"]\n",
 "        },\n",
 "        'bidirectional': {\n",
 "            'values': [\"Yes\", \"No\"]\n",
 "        },\n",
 "        'num_layers_decoder': {\n",
 "            'values': [1, 2, 3, 4]\n",
 "        },\n",
 "        'num_layers_encoder': {\n",
 "            'values': [1, 2, 3, 4]\n",
 "        },\n",
 "        'epochs': {\n",
 "            'values': [10, 15, 20, 25, 30]\n",
 "        },\n",
 "        'attention': {\n",
 "            'values': [\"Yes\"]\n",
 "        }\n",
 "    }\n",
 "}\n",
 "config_defaults = {\n",
 "    'learn_rate': 0.001,\n",
 "    'embedding_size': 32,\n",
 "    'batch_size': 64,\n",
 "    'hidden_size': 1024,\n",
 "    'num_layers_encoder': 1,\n",
 "    'num_layers_decoder': 1,\n",
 "    'bidirectional': 'Yes',\n",
 "    'cell_type': \"LSTM\",\n",
 "    'teach_ratio': 0.5,\n",
 "    'dropout': 0.4,\n",
 "    'epochs': 20,\n",
 "    'attention': \"Yes\"\n",
 "}"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "3ADMwinqaQVF" }, "outputs": [], "source": [
 "sweep_id = wandb.sweep(sweep_config, project=\"CS6910_Assignment_3\")\n",
 "wandb.agent(sweep_id, function=train)\n",
 "# wandb.agent(sweep_id= \"xiyggu44\",function=train, project=\"CS6910_Assignment_3\")"
] },
{ "cell_type": "markdown", "metadata": { "id": "W7CYNChRGuGK" }, "source": [ "# Testing the Best Model (with Attention) on Test Data\n", "Set the default hyperparameters to the best values obtained from the sweep hyperparameter tuning" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "C9MUrsXu_Rr4" }, "outputs": [], "source": [
 "config_defaults = {\n",
 "    'learn_rate': 0.001,\n",
 "    'embedding_size': 32,\n",
 "    'batch_size': 64,\n",
 "    'hidden_size': 1024,\n",
 "    'num_layers_encoder': 1,\n",
 "    'num_layers_decoder': 1,\n",
 "    'bidirectional': 'Yes',\n",
 "    'cell_type': \"LSTM\",\n",
 "    'teach_ratio': 0.5,\n",
 "    'dropout': 0.4,\n",
 "    'epochs': 20,\n",
 "    'attention': \"Yes\"\n",
 "}"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "u7XAB4Q5Hpxj" }, "outputs": [], "source": [ "pred, atten_weights = train(sweeps = False, test = True)" ] },
{ "cell_type": "markdown", "metadata": { "id": "fld21YRZdRdG" }, "source": [ "# Saving the predictions of the Attention model to a CSV file" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BpDQ1mrydYWg", "outputId": "8784a3aa-315e-476f-cced-c38ebb8434b3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pred shape torch.Size([4096, 20]), input shape torch.Size([4096, 26]), target shape torch.Size([4096, 20])\n" ] } ], "source": [
 "# save the predictions\n",
 "dataframe = translate_prediction(ipLang.index2char, testData[:][0], opLang.index2char, pred, testData[:][1])\n",
 "dataframe.to_csv(\"predictions.csv\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "PKMYPZdtdbDh" }, "outputs": [], "source": [
 "import pandas as pd\n",
 "data = pd.read_csv(\"predictions.csv\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 142 }, "id": "8gCL1rXCdgYp", "outputId": "d64b794c-d173-4871-80fc-93b8211ebedc" }, "outputs": [], "source": [
 "# We also want to log the prediction table to wandb\n",
 "wandb.init(project=\"CS6910_Assignment_3\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "N1r2ownhdjbz" }, "outputs": [], "source": [
 "table = wandb.Table(dataframe=data)\n",
 "wandb.log({\"data\": table})"
] },
{ "cell_type": "markdown", "metadata": { "id": "LDP4KvWdFnIL" }, "source": [ "# Plotting the Attention HeatMaps" ] },
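{ "cell_type": "markdown", "metadata": {}, "source": [ "The heatmap cell below needs a Devanagari-capable font so the predicted characters render on the axes; it assumes `TiroDevanagariHindi-Regular.ttf` sits in the working directory. The check here is an illustrative guard, not part of the original pipeline." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative only: warn early if the font file the next cell expects is missing\n",
 "import os\n",
 "if not os.path.exists('TiroDevanagariHindi-Regular.ttf'):\n",
 "    print('TiroDevanagariHindi-Regular.ttf not found; upload it to the working directory first.')"
] },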
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "4WfJEdcgFmiI", "outputId": "ff266529-4345-4cdc-9860-11914b099052" }, "outputs": [], "source": [
 "import matplotlib.pyplot as plt\n",
 "import numpy as np\n",
 "from matplotlib.font_manager import FontProperties\n",
 "hindi_font = FontProperties(fname = 'TiroDevanagariHindi-Regular.ttf')\n",
 "# atten_weights has shape (dataset_size, output_seq_len-1, input_len) from evaluate();\n",
 "# pred holds the predicted index matrix and testData the padded inputs\n",
 "\n",
 "# Define the grid dimensions\n",
 "rows = int(np.ceil(np.sqrt(12)))\n",
 "cols = int(np.ceil(12 / rows))\n",
 "\n",
 "# Create a figure and subplots\n",
 "fig, axes = plt.subplots(rows, cols, figsize=(9, 9))\n",
 "\n",
 "for i, ax in enumerate(axes.flatten()):\n",
 "    if i < 12:\n",
 "        prediction = [opLang.index2char[j.item()] for j in pred.T[i+1]]  # column i+1 skips the seed column\n",
 "\n",
 "        pred_word = \"\"\n",
 "        input_word = \"\"\n",
 "\n",
 "        for j in range(len(prediction)):\n",
 "            # Ignore padding\n",
 "            if prediction[j] != '#':\n",
 "                pred_word += prediction[j]\n",
 "            else:\n",
 "                break\n",
 "        input_seq = [ipLang.index2char[j.item()] for j in testData[i][0]]\n",
 "\n",
 "        for j in range(len(input_seq)):\n",
 "            if input_seq[j] != '#':\n",
 "                input_word += input_seq[j]\n",
 "            else:\n",
 "                break\n",
 "        attn_weights = atten_weights[i, :len(pred_word), :len(input_word)].detach().cpu().numpy()\n",
 "        ax.imshow(attn_weights.T, cmap='hot', interpolation='nearest')\n",
 "        ax.xaxis.set_label_position('top')\n",
 "        ax.set_title(f'Example {i+1}')\n",
 "        ax.set_xlabel('Output predicted')\n",
 "        ax.set_ylabel('Input word')\n",
 "        ax.set_xticks(np.arange(len(pred_word)))\n",
 "        ax.set_xticklabels(pred_word, rotation = 90, fontproperties = hindi_font, fontdict={'fontsize':8})\n",
 "        ax.xaxis.tick_top()\n",
 "\n",
 "        ax.set_yticks(np.arange(len(input_word)))\n",
 "        ax.set_yticklabels(input_word, rotation=90)\n",
 "\n",
 "# Adjust the spacing between subplots\n",
 "plt.tight_layout()\n",
 "\n",
 "# Show the plot\n",
 "plt.show()\n",
 "wandb.init(project='CS6910_Assignment_3')\n",
 "\n",
 "# Convert the matplotlib figure to an image\n",
 "fig.canvas.draw()\n",
 "image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')\n",
 "image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))\n",
 "\n",
 "# Log the image in wandb\n",
 "wandb.log({\"attention_heatmaps\": [wandb.Image(image)]})"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "FnHR_oql6-S4" }, "outputs": [], "source": [] }
], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [ "hRdpoWePeYHn", "44xIRolL_T_d", "Q1TioafYgICa", "svxssm9Havhb", "J56aq1J6a07q", "5JcQdylzI_Fc", "658W9RARGEUf", "q7fAgs5uQni_", "n4rGh7vuQqaa", "0SsnRWlgQmCI", "nvyRJWUUbR2f", "8ETW0BG_Pa24", "MQPGy32rnD3V", "z_aYZvDD1OHU", "pKvBd5mKf0Hf", "FYMa5jTQRUaB", "zfuv5FoA1wt2", "W7CYNChRGuGK" ], "gpuType": "T4", "include_colab_link": true, "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }
"_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_3aa935a6db14483d8aaada58a84a3e47", "IPY_MODEL_eabcea7a8bbf42f6aaa3995c0dece721" ], "layout": "IPY_MODEL_b3b7711edb5542e08c53c4f37da10203" } }, "39a8a3a9b6f1495ea17fd1b3d86b67c0": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3aa935a6db14483d8aaada58a84a3e47": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_39a8a3a9b6f1495ea17fd1b3d86b67c0", "placeholder": "​", "style": "IPY_MODEL_18a8e2e817b947f9aad87b1ccaf96ea6", "value": "0.071 MB of 0.071 MB uploaded (0.000 MB deduped)\r" } }, "9b5bb4f7f4a846c28ab967b64107726e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b3b7711edb5542e08c53c4f37da10203": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": 
null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "da62d6e5ad0a462b98e1591d39038e1e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "eabcea7a8bbf42f6aaa3995c0dece721": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_da62d6e5ad0a462b98e1591d39038e1e", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_9b5bb4f7f4a846c28ab967b64107726e", "value": 1 } } } } }, "nbformat": 4, "nbformat_minor": 0 }