Count\n", " self.data = {}\n", " \n", "\n", " def addWord(self, word):\n", " for char in word:\n", " self.addChar(char)\n", "\n", " def addChar(self, char):\n", " if char not in self.char2index:\n", " self.char2index[char] = self.n_chars\n", " self.char2count[char] = 1\n", " self.index2char[self.n_chars] = char\n", " self.n_chars += 1\n", " else:\n", " self.char2count[char] += 1\n", "\n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dCR658yRvXpy" }, "outputs": [], "source": [ "# return max length of input and output words\n", "def maxLength(data):\n", " ip_mlen, op_mlen = 0, 0\n", "\n", " for i in range(len(data)):\n", " input = data[0][i]\n", " output = data[1][i]\n", " if(len(input)>ip_mlen):\n", " ip_mlen=len(input)\n", "\n", " if(len(output)>op_mlen):\n", " op_mlen=len(output)\n", "\n", " return ip_mlen, op_mlen" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IDGaCO8DkYpc" }, "outputs": [], "source": [ "import numpy\n", "input_shape = 0\n", "from torch.utils.data import TensorDataset, DataLoader\n", "def preprocess(data, input_lang, output_lang):\n", " maxlenInput, maxlenOutput = maxLength(data)\n", " # we use maxlenInput as 26 since it is the maximum of all input len\n", " maxlenInput = 26\n", " input = numpy.zeros((len(data), maxlenInput + 1))\n", " output = numpy.zeros((len(data), maxlenOutput + 2))\n", " maxlenInput, maxlenOutput = maxLength(data)\n", " unknown = input_lang.char2index['$']\n", "\n", " for i in range(len(data)):\n", " op = '^' + data[1][i]\n", " ip = data[0][i].ljust(maxlenInput + 1, '#')\n", " op = op.ljust(maxlenOutput + 2, '#')\n", " \n", "\n", " for index, char in enumerate(ip):\n", " if input_lang.char2index.get(char) is not None:\n", " input[i][index] = input_lang.char2index[char]\n", " else:\n", " input[i][index] = unknown\n", " \n", "\n", " \n", " for index, char in enumerate(op):\n", " if output_lang.char2index.get(char) is not None:\n", " output[i][index] = 
output_lang.char2index[char]\n", " else:\n", " output[i][index] = unknown \n", "\n", " print(input.shape)\n", " print(output.shape)\n", "\n", " return TensorDataset(torch.from_numpy(input), torch.from_numpy(output))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PdS5OXKxfdCX", "outputId": "178f1d73-5b0c-431d-ca9b-d9435b924c41" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(51200, 27)\n", "(51200, 22)\n", "(4096, 27)\n", "(4096, 22)\n", "(4096, 27)\n", "(4096, 22)\n" ] } ], "source": [ "def loadData(lang):\n", " train_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_train.csv\", header = None)\n", " val_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_valid.csv\", header = None)\n", " test_df = pd.read_csv(f\"drive/MyDrive/aksharantar_sampled/{lang}/{lang}_test.csv\", header = None)\n", "\n", " input_lang = Lang('eng')\n", " output_lang = Lang(lang)\n", " \n", " # add the words to the respective languages\n", " for i in range(len(train_df)):\n", " \n", " input_lang.addWord(train_df[0][i])\n", " output_lang.addWord(train_df[1][i])\n", "\n", " # print(input_lang.char2index)\n", " # print(input_lang.index2char)\n", " trainDataset = preprocess(train_df, input_lang, output_lang)\n", " testDataset = preprocess(test_df, input_lang, output_lang)\n", " valDataset = preprocess(val_df, input_lang, output_lang)\n", "\n", " return trainDataset, testDataset, valDataset, input_lang, output_lang\n", "\n", "\n", "trainData, testData, valData, ipLang, opLang = loadData('hin')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SvmzS5Lt_Jnl", "outputId": "33defb60-5aee-46cb-e683-ee2df9e98436" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: W&B API key is configured. 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wandb.login(key =\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "Q1TioafYgICa" }, "source": [ "# seq2seq model" ] }, { "cell_type": "markdown", "metadata": { "id": "svxssm9Havhb" }, "source": [ "## Encoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YTwk8nKNcbkb" }, "outputs": [], "source": [ "class EncoderRNN(nn.Module):\n", " def __init__(self, input_size, hidden_size, embedding_size, # input_size is size of input language dictionary\n", " num_layers, cell_type,\n", " bidirectional, dropout, batch_size) :\n", " super(EncoderRNN, self).__init__()\n", " self.hidden_size = hidden_size # size of an hidden state representation\n", " self.num_layers = num_layers \n", " self.bidirectional = True if bidirectional == 'Yes' else False\n", " self.batch_size = batch_size\n", " self.cell_type = cell_type\n", " self.embedding_size=embedding_size\n", "\n", " # this adds the embedding layer\n", " self.embedding = nn.Embedding(num_embeddings=input_size,embedding_dim= embedding_size)\n", " self.dropout = nn.Dropout(dropout)\n", "\n", " # this adds the Neural Network layer for the encoder\n", " if self.cell_type == \"GRU\":\n", " self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n", " elif self.cell_type == \"LSTM\":\n", " self.rnn = 
nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n", " else:\n", " self.rnn = nn.RNN(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, dropout=dropout)\n", "\n", " def forward(self, input, hidden): # input shape (seq_len, batch_size) hidden shape tuple for lstm, otherwise single\n", " embedded = self.embedding(input.long()).view(-1,self.batch_size, self.embedding_size)\n", " output = self.dropout(embedded) # output shape (seq_len, batch_size, embedding size)\n", "\n", " output, hidden = self.rnn(output, hidden) # for LSTM hidden is a tuple\n", " if self.bidirectional:\n", " if self.cell_type == \"LSTM\":\n", " hidden_state = hidden[0].resize(2,self.num_layers,self.batch_size,self.hidden_size)\n", " cell_state = hidden[1].resize(2,self.num_layers,self.batch_size,self.hidden_size)\n", " hidden = (torch.add(hidden_state[0],hidden_state[1])/2, torch.add(cell_state[0],cell_state[1])/2)\n", " else:\n", " hidden=hidden.resize(2,self.num_layers,self.batch_size,self.hidden_size)\n", " hidden=torch.add(hidden[0],hidden[1])/2\n", " \n", " split_tensor= torch.split(output, self.hidden_size, dim=-1)\n", " output=torch.add(split_tensor[0],split_tensor[1])/2\n", " return output, hidden\n", "\n", " # initializing the initial hidden state for the encoder\n", " def initHidden(self):\n", " num_directions = 2 if self.bidirectional else 1\n", " if self.cell_type == \"LSTM\":\n", " return (torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, device=device),\n", " torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, device=device))\n", " else:\n", " return torch.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size, device=device)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "J56aq1J6a07q" }, "source": [ "## Decoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "53ki6eJUH2u2" }, 
"outputs": [], "source": [ "class DecoderRNN(nn.Module):\n", " def __init__(self, hidden_size, output_size, embedding_size, num_layers, # output size is the size of output language dictionary\n", " cell_type, dropout, batch_size):\n", " super(DecoderRNN, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.num_layers = num_layers\n", " self.cell_type = cell_type.lower()\n", " self.batch_size = batch_size\n", " self.embedding_size=embedding_size\n", "\n", " self.embedding = nn.Embedding(output_size, embedding_size)\n", " # self.dropout = nn.Dropout(dropout)\n", " \n", " if self.cell_type == \"gru\":\n", " self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers)\n", " elif self.cell_type == \"lstm\":\n", " self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers)\n", " else:\n", " self.rnn = nn.RNN(embedding_size, hidden_size, num_layers=num_layers)\n", "\n", " self.out = nn.Linear(hidden_size, output_size)\n", " self.softmax = nn.LogSoftmax(dim=2)\n", "\n", " def forward(self, input, hidden): # input shape (1, batch_size)\n", " embedded = self.embedding(input.long()).view(-1, self.batch_size, self.embedding_size)\n", " # # shape (1, batch_size, embedding_size)\n", " output = F.relu(embedded)\n", " output, hidden = self.rnn(output, hidden) # output shape (1, batch_size, hidden_size)\n", " output = self.softmax(self.out(output)) # shape (1, batch_size, output_size)\n", " return output, hidden\n", "\n", " # not needed since hidden will be provided by the encoder" ] }, { "cell_type": "markdown", "metadata": { "id": "5JcQdylzI_Fc" }, "source": [ "## Attention Decoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R1Xysuv9I-Qr" }, "outputs": [], "source": [ "class AttentionDecoderRNN(nn.Module):\n", " def __init__(self, hidden_size, output_size, embedding_size, num_layers,\n", " cell_type, dropout, batch_size, max_length):\n", " super(AttentionDecoderRNN, self).__init__()\n", " self.hidden_size = 
hidden_size\n", " self.num_layers = num_layers\n", " self.cell_type = cell_type\n", " self.batch_size = batch_size\n", " self.embedding_size = embedding_size\n", " self.max_length = max_length\n", " self.dropout = dropout\n", "\n", " self.embedding = nn.Embedding(output_size, embedding_size)\n", " self.dropout = nn.Dropout(self.dropout)\n", " self.attention = nn.Linear(hidden_size + embedding_size, self.max_length)\n", " self.attention_combine = nn.Linear(hidden_size + embedding_size, hidden_size)\n", "\n", " if self.cell_type == \"GRU\":\n", " self.rnn = nn.GRU(hidden_size, hidden_size, num_layers=num_layers)\n", " elif self.cell_type == \"LSTM\":\n", " self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers)\n", " else:\n", " self.rnn = nn.RNN(hidden_size, hidden_size, num_layers=num_layers)\n", "\n", " self.out = nn.Linear(hidden_size, output_size)\n", " self.softmax = nn.LogSoftmax(dim=2)\n", "\n", " def forward(self, input, hidden, encoder_outputs): #input shape (1, batch_size)\n", " embedded = self.embedding(input.long()).view(-1, self.batch_size, self.embedding_size) \n", " # embedded shape (1, batch_size, embedding_size)\n", " embedded = F.relu(embedded)\n", "\n", " # Compute attention scores\n", " if self.cell_type == \"LSTM\":\n", " attn_hidden = torch.mean(hidden[0], dim=0)\n", " else:\n", " attn_hidden = torch.mean(hidden, dim = 0)\n", " attn_scores = self.attention(torch.cat((embedded, attn_hidden.unsqueeze(0)), dim=2)) # attn_scores shape (1, batch_size, max_length)\n", " \n", " attn_weights = F.softmax(attn_scores, dim=-1) # attn_scores shape (1, 16, 25)\n", " \n", "\n", " # Apply attention weights to encoder outputs\n", " attn_applied = torch.bmm(attn_weights.transpose(0, 1), encoder_outputs.transpose(0, 1))\n", " \n", " # Combine attention output and embedded input\n", " combined = torch.cat((embedded, attn_applied.transpose(0, 1)), dim=2)\n", " combined = self.attention_combine(combined)\n", " combined = F.relu(combined) # shape (1, 
batch_size, hidden_size)\n", "\n", " # Run through the RNN\n", " output, hidden = self.rnn(combined, hidden)\n", " # output shape: (1, batch_size, hidden_size)\n", "\n", " # Pass through linear layer and softmax activation\n", " output = self.out(output) # shape: (1, batch_size, output_size)\n", " output = self.softmax(output)\n", " return output, hidden, attn_weights.transpose(0, 1)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LJ2Papj_jTX8" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "658W9RARGEUf" }, "source": [ "# Helper functions" ] }, { "cell_type": "markdown", "metadata": { "id": "q7fAgs5uQni_" }, "source": [ "## count matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8fzy8U6_lbug" }, "outputs": [], "source": [ "def count_exact_matches(pred, target):\n", " \"\"\"\n", " Counts the number of rows in preds tensor that match exactly with each row in y tensor.\n", " pred: tensor of shape (batch_size, seq_len-1)\n", " y: tensor of shape (batch_size, seq_len-1)\n", " \"\"\"\n", " \n", " count=0;\n", " for i in range(pred.shape[0]):\n", " flag = True\n", " for j in range(pred.shape[1]):\n", " if(target[i][j]!=pred[i][j]):\n", " flag=False\n", " break;\n", " \n", " if(flag):\n", " count+=1;\n", " \n", " return count" ] }, { "cell_type": "markdown", "metadata": { "id": "n4rGh7vuQqaa" }, "source": [ "## evaluation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zp6gvWmDlWoB" }, "outputs": [], "source": [ "def evaluate(data,encoder, decoder,output_size,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention):\n", " \n", "\n", "\n", " running_loss = 0\n", " correct =0\n", " \n", " loader = DataLoader(data, batch_size=batch_size)\n", " loss_fun = nn.CrossEntropyLoss(reduction=\"sum\")\n", " seq_len = 0\n", "\n", " atten_weights = torch.zeros(1,21, 27).to(device) # required to return the attention weights\n", " 
predictions = torch.zeros(22-1, 1).to(device)\n", " with torch.no_grad():\n", " for j,(x,y) in enumerate(loader):\n", " loss=0\n", " encoder.eval()\n", " decoder.eval()\n", "\n", " x = x.to(device)\n", " y = y.to(device)\n", "\n", " x = x.T\n", " y = y.T\n", " seq_len = len(y)\n", " \n", " encoder_hidden=encoder.initHidden()\n", " encoder_output,encoder_hidden = encoder(x,encoder_hidden)\n", " \n", " \n", " decoder_input =y[0]\n", " \n", " # Handle different numbers of layers in the encoder and decoder\n", " if num_layers_encoder != num_layers_decoder:\n", " if num_layers_encoder < num_layers_decoder:\n", " remaining_layers = num_layers_decoder - num_layers_encoder\n", "\n", " # Copy all encoder hidden layers and then repeat the top layer\n", " if cell_type == \"LSTM\":\n", " top_layer_hidden = (encoder_hidden[0][-1].unsqueeze(0), encoder_hidden[1][-1].unsqueeze(0))\n", " extra_hidden = (top_layer_hidden[0].repeat(remaining_layers, 1, 1), top_layer_hidden[1].repeat(remaining_layers, 1, 1))\n", " decoder_hidden = (torch.cat((encoder_hidden[0], extra_hidden[0]), dim=0), torch.cat((encoder_hidden[1], extra_hidden[1]), dim=0))\n", " else:\n", " top_layer_hidden = encoder_hidden[-1].unsqueeze(0) #top_layer_hidden shape (1, batch_size, hidden_size)\n", " extra_hidden = top_layer_hidden.repeat(remaining_layers, 1, 1)\n", " decoder_hidden = torch.cat((encoder_hidden, extra_hidden), dim=0)\n", "\n", " else:\n", " # Slice the hidden states of the encoder to match the decoder layers\n", " if cell_type == \"LSTM\":\n", " decoder_hidden = (encoder_hidden[0][-num_layers_decoder:], encoder_hidden[1][-num_layers_decoder:])\n", " else :\n", " decoder_hidden = encoder_hidden[-num_layers_decoder:]\n", " else:\n", " decoder_hidden = encoder_hidden\n", "\n", " pred=torch.zeros(len(y)-1, batch_size).to(device)\n", " atten_weight_default = torch.zeros(batch_size,1, 27).to(device)\n", " for k in range(1,len(y)):\n", " if attention == \"Yes\":\n", " \n", " decoder_output, decoder_hidden, 
atten_weight = decoder(decoder_input, decoder_hidden, encoder_output)\n", " atten_weight_default = torch.cat((atten_weight_default, atten_weight), dim = 1)\n", " else:\n", " decoder_output, decoder_hidden= decoder(decoder_input, decoder_hidden)\n", " max_prob, index = decoder_output.topk(1) # max_prob shape (1, batch_size, 1)\n", " decoder_output = torch.squeeze(decoder_output)\n", " loss += loss_fun(decoder_output, y[k].long())\n", " pred[k-1]= torch.squeeze(index)\n", " decoder_input = index\n", " if attention == \"Yes\":\n", " atten_weights = torch.cat((atten_weights, atten_weight_default[:, 1:, :]), dim = 0)\n", "\n", " running_loss += loss.item()\n", " correct += count_exact_matches(pred.T,y[1:,:].T)\n", " predictions = torch.cat((predictions, pred), dim=1)\n", "\n", " \n", " avg_loss = running_loss / (len(data) * seq_len)\n", " print(\"correct =\", correct)\n", " avg_acc = 100 * (correct / (len(data)))\n", " if attention == \"Yes\":\n", " return avg_loss, avg_acc, predictions, atten_weights[1:, :, :]\n", " else:\n", " return avg_loss, avg_acc, predictions\n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": { "id": "0SsnRWlgQmCI" }, "source": [ "# Training function" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PhDgsZG0QqPW" }, "outputs": [], "source": [ "def train(sweeps = True, test = False):\n", "\n", " if sweeps == False: \n", " configs = config_defaults # use the default configuration which has the best hyperparameters\n", " else:\n", " wandb.init(config= config_defaults, project='DL_assign_3') # if not test then run wandb sweeps\n", " configs=wandb.config\n", " \n", "\n", " learn_rate = configs['learn_rate']\n", " batch_size = configs['batch_size']\n", " hidden_size = configs['hidden_size']\n", " embedding_size = configs['embedding_size']\n", " num_layers_encoder = configs['num_layers_encoder']\n", " num_layers_decoder = configs['num_layers_decoder']\n", " cell_type = configs['cell_type']\n", " bidirectional = 
configs['bidirectional']\n", " dropout = configs['dropout']\n", " teach_ratio = configs['teach_ratio']\n", " epochs = configs['epochs']\n", " attention = configs['attention']\n", "\n", " if sweeps:\n", " wandb.run.name='hidden_'+str(hidden_size)+'_batch_'+str(batch_size)+'_embed_size_'+str(embedding_size)+'_dropout_'+str(dropout)+'_cell_'+str(cell_type)\n", "\n", " input_len = ipLang.n_chars\n", " output_len = opLang.n_chars\n", " \n", " encoder = EncoderRNN(input_len, hidden_size, embedding_size, \n", " num_layers_encoder, cell_type,\n", " bidirectional, dropout, batch_size)\n", " \n", " if attention ==\"Yes\":\n", " decoder = AttentionDecoderRNN(hidden_size, output_len, embedding_size, num_layers_decoder, \n", " cell_type, dropout, batch_size, 27)\n", " else:\n", " decoder = DecoderRNN(hidden_size, output_len, embedding_size, num_layers_decoder, \n", " cell_type, dropout, batch_size)#dropout not used\n", " \n", " train_loader = DataLoader(trainData, batch_size=batch_size, shuffle=True)\n", " val_loader = DataLoader(valData, batch_size=batch_size, shuffle=True)\n", "\n", " encoder_optimizer=optim.Adam(encoder.parameters(),learn_rate)\n", " decoder_optimizer=optim.Adam(decoder.parameters(),learn_rate)\n", " loss_fun=nn.CrossEntropyLoss(reduction=\"sum\")\n", "\n", " encoder.to(device)\n", " decoder.to(device)\n", " seq_len = 0\n", "\n", " # Initialize variables for early stopping\n", " best_val_loss = float('inf')\n", " patience = 5\n", " epochs_without_improvement = 0\n", "\n", " for i in range(epochs):\n", " \n", " running_loss = 0.0\n", " train_correct = 0\n", "\n", " encoder.train()\n", " decoder.train()\n", "\n", " for j,(train_x,train_y) in enumerate(train_loader):\n", " train_x = train_x.to(device)\n", " train_y = train_y.to(device)\n", "\n", " encoder_optimizer.zero_grad()\n", " decoder_optimizer.zero_grad()\n", "\n", " train_x=train_x.T\n", " train_y=train_y.T\n", " # print(\"train_x.shapetrain_x.shape)\n", " seq_len = len(train_y)\n", " 
encoder_hidden=encoder.initHidden()\n", " # for LSTM encoder_hidden shape ((num_layers * num_directions, batch_size,hidden_size),(self.num_layers * num_directions, batch_size, hidden_size))\n", " encoder_output,encoder_hidden = encoder(train_x,encoder_hidden)\n", " # encoder_hidden shape (num_layers, batch_size, hidden_size)\n", " \n", " \n", " # lets move to the decoder\n", " decoder_input = train_y[0] # shape (1, batch_size)\n", " \n", " # Handle different numbers of layers in the encoder and decoder\n", " if num_layers_encoder != num_layers_decoder:\n", " if num_layers_encoder < num_layers_decoder:\n", " remaining_layers = num_layers_decoder - num_layers_encoder\n", " # Copy all encoder hidden layers and then repeat the top layer\n", " if cell_type == \"LSTM\":\n", " top_layer_hidden = (encoder_hidden[0][-1].unsqueeze(0), encoder_hidden[1][-1].unsqueeze(0))\n", " extra_hidden = (top_layer_hidden[0].repeat(remaining_layers, 1, 1), top_layer_hidden[1].repeat(remaining_layers, 1, 1))\n", " decoder_hidden = (torch.cat((encoder_hidden[0], extra_hidden[0]), dim=0), torch.cat((encoder_hidden[1], extra_hidden[1]), dim=0))\n", " else:\n", " top_layer_hidden = encoder_hidden[-1].unsqueeze(0) #top_layer_hidden shape (1, batch_size, hidden_size)\n", " extra_hidden = top_layer_hidden.repeat(remaining_layers, 1, 1)\n", " decoder_hidden = torch.cat((encoder_hidden, extra_hidden), dim=0)\n", " \n", " else:\n", " # Slice the hidden states of the encoder to match the decoder layers\n", " if cell_type == \"LSTM\":\n", " decoder_hidden = (encoder_hidden[0][-num_layers_decoder:], encoder_hidden[1][-num_layers_decoder:])\n", " else :\n", " decoder_hidden = encoder_hidden[-num_layers_decoder:]\n", " else:\n", " decoder_hidden = encoder_hidden\n", " \n", " loss = 0\n", " correct = 0\n", " \n", " for k in range(0, len(train_y)-1):\n", " \n", " if attention == \"Yes\":\n", " decoder_output, decoder_hidden, atten_weights = decoder(decoder_input, decoder_hidden, encoder_output)\n", " 
else:\n", " decoder_output, decoder_hidden= decoder(decoder_input, decoder_hidden) # decoder_output shape (1, batch_size, output_size)\n", "\n", " max_prob, index = decoder_output.topk(1) # max_prob shape (1, batch_size, 1)\n", " index = torch.squeeze(index) # shape (batch_size)\n", " decoder_output = torch.squeeze(decoder_output)\n", " loss += loss_fun(decoder_output, train_y[k+1].long())\n", " \n", " correct += (index == train_y[k+1]).sum().item()\n", "\n", " # Apply teacher forcing\n", " use_teacher_forcing = True if random.random() < teach_ratio else False\n", "\n", " if use_teacher_forcing:\n", " decoder_input = train_y[k+1]\n", " \n", " else:\n", " decoder_input = index\n", "\n", " running_loss += loss.item()\n", " train_correct += correct\n", " loss.backward()\n", " encoder_optimizer.step()\n", " decoder_optimizer.step()\n", " \n", "\n", " # find train loss and accuracy and print + log to wandb\n", " if attention == \"Yes\":\n", " _, train_accuracy,_, _ = evaluate(trainData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " else:\n", " _, train_accuracy,_= evaluate(trainData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " \n", " print(f\"epoch {i}, training loss {running_loss/(len(trainData)* seq_len)}, training accuracy {train_accuracy}\")\n", " if sweeps:\n", " wandb.log({\"epoch\": i, \"train_loss\": running_loss/(len(trainData)* seq_len), \"train_accuracy\": train_accuracy})\n", " \n", " # # find validation loss and accuracy and print + log to wandb\n", " if attention == \"Yes\":\n", " val_loss, val_accuracy,_, _ = evaluate(valData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " else:\n", " val_loss, val_accuracy,_ = evaluate(valData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " 
\n", " print(f\"epoch {i}, validation loss {val_loss}, validation accuracy {val_accuracy}\")\n", " if sweeps:\n", " wandb.log({\"val_loss\": val_loss, \"val_accuracy\": val_accuracy})\n", "\n", " # Check for early stopping\n", " if val_loss < best_val_loss:\n", " best_val_loss = val_loss\n", " epochs_without_improvement = 0\n", " # Save the model weights\n", " torch.save(encoder.state_dict(), 'best_encoder.pt')\n", " torch.save(decoder.state_dict(), 'best_decoder.pt')\n", " else:\n", " epochs_without_improvement += 1\n", " if epochs_without_improvement >= patience:\n", " print(\"Early stopping triggered. No improvement in validation loss.\")\n", " break\n", " \n", " \n", " # if testing mode is on print the test accuracy \n", " if test:\n", " # Load the best model weights\n", " encoder.load_state_dict(torch.load('best_encoder.pt'))\n", " decoder.load_state_dict(torch.load('best_decoder.pt'))\n", " if attention == \"Yes\":\n", " _, test_accuracy, pred, atten_weights = evaluate(testData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " else:\n", " _, test_accuracy, pred = evaluate(testData,encoder, decoder,output_len,batch_size,hidden_size,num_layers_encoder,num_layers_decoder, cell_type, attention)\n", " print(f\"test accuracy {test_accuracy}\")\n", "\n", " if attention == \"Yes\":\n", " return pred, atten_weights\n", " else:\n", " return pred\n", " " ] }, { "cell_type": "markdown", "metadata": { "id": "nvyRJWUUbR2f" }, "source": [ "# Translating predictions to words\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Hd3zCTnSbSaL" }, "outputs": [], "source": [ "def translate_prediction(input_dict , input, output_dict, pred,target):\n", " \n", " '''pred in shape of seq_len-1 * dataset_size\n", " target in shape datasize * seq_len-1\n", " '''\n", " pred = pred.T # shape datasize * seq len-1\n", " pred = pred[1:, :-1] # ignore last index of each row\n", " input = input[:, :-1] 
# ignore last index of each row\n", " target = target[:, 1:-1] # ignore last index of each row\n", " print(f\"pred shape {pred.shape}, input shape {input.shape}, target shape {target.shape}\")\n", " predictions = [] \n", " Input = [] \n", " Target = []\n", " for i in range(len(pred)):\n", " \n", " pred_word=\"\"\n", " input_word=\"\"\n", " target_word = \"\"\n", "\n", " for j in range(pred.shape[1]):\n", "\n", " # Ignore padding\n", " if(target[i][j].item() != 0):\n", " \n", " pred_word += output_dict[pred[i][j].item()]\n", " target_word += output_dict[target[i][j].item()]\n", " \n", " for j in range(input.shape[1]):\n", " \n", " if(input[i][j].item()!=0):\n", " \n", " input_word += input_dict[input[i][j].item()] \n", "\n", " # Append words in respective List\n", " \n", " predictions.append(pred_word)\n", " Input.append(input_word) \n", " Target.append(target_word) \n", "\n", " # Create a DataFrame\n", " df = pd.DataFrame({\"input\": Input, \"predicted\": predictions,\"Actual\":Target})\n", " return df\n", "\n", " " ] }, { "cell_type": "markdown", "metadata": { "id": "8ETW0BG_Pa24" }, "source": [ "#call train" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pgGp7MoGzfPg" }, "outputs": [], "source": [ "# train(sweeps = False, test = True)" ] }, { "cell_type": "markdown", "metadata": { "id": "MQPGy32rnD3V" }, "source": [ "# Running sweeps for models without Attention\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "z_aYZvDD1OHU" }, "source": [ "## Sweep Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SVv8bI-D1Q_I" }, "outputs": [], "source": [ "sweep_config = {\n", " 'name': 'sweepDL', \n", " 'method': 'bayes',\n", " 'metric': {\n", " 'name': 'val_accuracy',\n", " 'goal': 'maximize'\n", " },\n", " 'parameters': {\n", " \n", " 'learn_rate': {\n", " 'values': [0.01, 0.001, 0.001]\n", " },\n", " 'embedding_size': {\n", " 'values': [32, 64, 128, 256, 512, 1024]\n", " },\n", " 'batch_size':{\n", " 
'values':[16, 32, 64, 128, 256]\n", " },\n", " 'hidden_size':{\n", " 'values':[32, 64, 128, 256, 512, 1024]\n", " },\n", " 'teach_ratio':{\n", " 'values':[0.4, 0.5, 0.6]\n", " },\n", " 'dropout':{\n", " 'values':[0, 0.2, 0.4]\n", " },\n", " 'cell_type':{\n", " 'values':[\"RNN\", \"LSTM\", \"GRU\"]\n", " },\n", " 'bidirectional':{\n", " 'values' : [\"Yes\",\"No\"]\n", " },\n", " 'num_layers_decoder':{\n", " 'values': [1,2, 3, 4]\n", " },\n", " 'num_layers_encoder':{\n", " 'values': [1,2,3,4]\n", " },\n", " 'epochs':{\n", " 'values': [10, 15, 20, 25, 30]\n", " },\n", " 'attention':{\n", " 'values': [\"Yes\"]\n", " }\n", " \n", " }\n", "}\n", "config_defaults={\n", " 'learn_rate' : 0.001,\n", " 'embedding_size': 32,\n", " 'batch_size': 256,\n", " 'hidden_size' : 1024,\n", " 'num_layers_encoder': 3,\n", " 'num_layers_decoder': 3,\n", " 'bidirectional': 'No',\n", " 'cell_type': \"LSTM\",\n", " 'teach_ratio': 0.6,\n", " 'dropout': 0.4,\n", " 'epochs': 15,\n", " 'attention': \"No\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4KxsOOpvr1oi" }, "outputs": [], "source": [ "sweep_id=wandb.sweep(sweep_config, project=\"CS6910_Assignment_3\")\n", "wandb.agent(sweep_id,function=train)" ] }, { "cell_type": "markdown", "metadata": { "id": "pKvBd5mKf0Hf" }, "source": [ "# Testing the Best Model(without Attention) on Test Data \n", "Set the default hyperparameters to the best hyperparameters obtained from hyperparameter tuning with sweeps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kMQvZjZl0q4U" }, "outputs": [], "source": [ "config_defaults={\n", " 'learn_rate' : 0.001,\n", " 'embedding_size': 32,\n", " 'batch_size': 256,\n", " 'hidden_size' : 1024,\n", " 'num_layers_encoder': 3,\n", " 'num_layers_decoder': 3,\n", " 'bidirectional': 'No',\n", " 'cell_type': \"LSTM\",\n", " 'teach_ratio': 0.6,\n", " 'dropout': 0.4,\n", " 'epochs': 15,\n", " 'attention': \"No\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{ "colab": { "base_uri": "https://localhost:8080/" }, "id": "ygtFpEvp8jFU", "outputId": "1a71d3be-f17f-498c-8844-3c115c411f0a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "correct = 1490\n", "test accuracy 36.376953125\n" ] } ], "source": [ "pred= train(sweeps = False, test = True)" ] }, { "cell_type": "markdown", "metadata": { "id": "hMf0OAuscOJx" }, "source": [ "# Saving the predictions by Vanilla model in csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1cgUOUdsfzUB", "outputId": "8784a3aa-315e-476f-cced-c38ebb8434b3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pred shape torch.Size([4096, 20]), input shape torch.Size([4096, 26]), target shape torch.Size([4096, 20])\n" ] } ], "source": [ "# save the predictions\n", "dataframe = translate_prediction(ipLang.index2char, testData[:][0], opLang.index2char, pred, testData[:][1])\n", "dataframe.to_csv(\"predictions.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZZW-IEWZ5syU" }, "outputs": [], "source": [ "import pandas as pd\n", "data = pd.read_csv(\"predictions.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "2sOkc_0vmDlB", "outputId": "750d06b5-fee2-4eb8-d7e6-a7043cd0c15a" }, "outputs": [], "source": [ "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 142 }, "id": "AkG1vCpZ_vjG", "outputId": "d64b794c-d173-4871-80fc-93b8211ebedc" }, "outputs": [], "source": [ "# We also want to plot the prdiction table to wandb\n", "wandb.init(project=\"CS6910_Assignment_3\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MmKDX6V5_kGu" }, "outputs": [], "source": [ "table = wandb.Table(dataframe=data)\n", "wandb.log({\"data\": table})" ] }, { 
"cell_type": "markdown", "metadata": { "id": "FYMa5jTQRUaB" }, "source": [ "## Plotting the confusion matrix in wandB" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YBaJZCIBRAGZ" }, "outputs": [], "source": [ "import numpy as np\n", "CM = np.zeros((opLang.n_chars, ipLang.n_chars))\n", "\n", "for i in range(len(testData[1])):\n", " for j in range(testData[1].shape[1]):\n", " pred = int(pred[i][j])\n", " targ = int(testData[1][i][j])\n", " CM[pred][targ] += 1\n", "\n", "classes =[]\n", "\n", "for i in range(len(CM)):\n", " classes.append(opLang.index2char[i])\n", "\n", "percentages = 100 * (CM / np.sum(CM))\n", "\n", "# Define the text for each cell\n", "cell_text = []\n", "for i in range(len(classes)):\n", " row_text = []\n", " for j in range(len(classes)):\n", "\n", " txt = \"Total \"+f'{CM[i, j]}Per. ({percentages[i, j]:.3f})'\n", " if(i==j):\n", " txt =\"Correcty Predicted \" +classes[i]+\"\"+txt\n", " if(i!=j):\n", " txt =\"Predicted \" +classes[j]+\" For \"+classes[i]+\"\"+txt\n", " row_text.append(txt)\n", " cell_text.append(row_text)\n", "\n", "import plotly.graph_objs as go\n", "\n", "# Define the trace\n", "trace = go.Heatmap(z=percentages,\n", " x=classes,\n", " y=classes,\n", " colorscale='Blues',\n", " colorbar=dict(title='Percentage'),\n", " hovertemplate='%{text}%',\n", " text=cell_text,\n", " )\n", "\n", "# Define the layout\n", "layout = go.Layout(title='Confusion Matrix',\n", " xaxis=dict(title='Predicted Character'),\n", " yaxis=dict(title='True Character'),\n", " )\n", "\n", "# Plot the figure\n", "fig = go.Figure(data=[trace], layout=layout)\n", "wandb.log({'confusion_matrix': (fig)})" ] }, { "cell_type": "markdown", "metadata": { "id": "zfuv5FoA1wt2" }, "source": [ "# Running sweeps for models with Attention\n" ] }, { "cell_type": "markdown", "metadata": { "id": "tsHS0PkNGHdV" }, "source": [ "## Sweep Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HwCn-Ci5xkTb" }, "outputs": [], 
"source": [ "sweep_config = {\n", " 'name': 'sweepDL', \n", " 'method': 'bayes',\n", " 'metric': {\n", " 'name': 'val_accuracy',\n", " 'goal': 'maximize'\n", " },\n", " 'parameters': {\n", " \n", " 'learn_rate': {\n", " 'values': [0.01, 0.001, 0.001]\n", " },\n", " 'embedding_size': {\n", " 'values': [32, 64, 128, 256, 512, 1024]\n", " },\n", " 'batch_size':{\n", " 'values':[16, 32, 64, 128, 256]\n", " },\n", " 'hidden_size':{\n", " 'values':[32, 64, 128, 256, 512, 1024]\n", " },\n", " 'teach_ratio':{\n", " 'values':[0.4, 0.5, 0.6]\n", " },\n", " 'dropout':{\n", " 'values':[0, 0.2, 0.4]\n", " },\n", " 'cell_type':{\n", " 'values':[\"RNN\", \"LSTM\", \"GRU\"]\n", " },\n", " 'bidirectional':{\n", " 'values' : [\"Yes\",\"No\"]\n", " },\n", " 'num_layers_decoder':{\n", " 'values': [1,2, 3, 4]\n", " },\n", " 'num_layers_encoder':{\n", " 'values': [1,2,3,4]\n", " },\n", " 'epochs':{\n", " 'values': [10, 15, 20, 25, 30]\n", " },\n", " 'attention':{\n", " 'values': [\"Yes\"]\n", " }\n", " \n", " }\n", "}\n", "config_defaults={\n", " 'learn_rate' : 0.001,\n", " 'embedding_size': 32,\n", " 'batch_size': 64,\n", " 'hidden_size' : 1024,\n", " 'num_layers_encoder': 1,\n", " 'num_layers_decoder': 1,\n", " 'bidirectional': 'Yes',\n", " 'cell_type': \"LSTM\",\n", " 'teach_ratio': 0.5,\n", " 'dropout': 0.4,\n", " 'epochs': 20,\n", " 'attention': \"Yes\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3ADMwinqaQVF" }, "outputs": [], "source": [ "sweep_id=wandb.sweep(sweep_config, project=\"CS6910_Assignment_3\")\n", "wandb.agent(sweep_id,function=train)\n", "# wandb.agent(sweep_id= \"xiyggu44\",function=train, project=\"CS6910_Assignment_3\")" ] }, { "cell_type": "markdown", "metadata": { "id": "W7CYNChRGuGK" }, "source": [ "# Testing the Best Model(with Attention) on Test Data \n", "Set default hyperparameters to the best hyperparameters got from sweeps Hyperparamer tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": 
"C9MUrsXu_Rr4" }, "outputs": [], "source": [ "config_defaults={\n", " 'learn_rate' : 0.001,\n", " 'embedding_size': 32,\n", " 'batch_size': 64,\n", " 'hidden_size' : 1024,\n", " 'num_layers_encoder': 1,\n", " 'num_layers_decoder': 1,\n", " 'bidirectional': 'Yes',\n", " 'cell_type': \"LSTM\",\n", " 'teach_ratio': 0.5,\n", " 'dropout': 0.4,\n", " 'epochs': 20,\n", " 'attention': \"Yes\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "u7XAB4Q5Hpxj" }, "outputs": [], "source": [ "pred, atten_weights = train(sweeps = False, test = True)" ] }, { "cell_type": "markdown", "metadata": { "id": "fld21YRZdRdG" }, "source": [ "# Saving the predictions by Vanilla model in csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BpDQ1mrydYWg", "outputId": "8784a3aa-315e-476f-cced-c38ebb8434b3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pred shape torch.Size([4096, 20]), input shape torch.Size([4096, 26]), target shape torch.Size([4096, 20])\n" ] } ], "source": [ "# save the predictions\n", "dataframe = translate_prediction(ipLang.index2char, testData[:][0], opLang.index2char, pred, testData[:][1])\n", "dataframe.to_csv(\"predictions.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PKMYPZdtdbDh" }, "outputs": [], "source": [ "import pandas as pd\n", "data = pd.read_csv(\"predictions.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 142 }, "id": "8gCL1rXCdgYp", "outputId": "d64b794c-d173-4871-80fc-93b8211ebedc" }, "outputs": [], "source": [ "# We also want to plot the prdiction table to wandb\n", "wandb.init(project=\"CS6910_Assignment_3\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N1r2ownhdjbz" }, "outputs": [], "source": [ "table = wandb.Table(dataframe=data)\n", "wandb.log({\"data\": 
table})" ] }, { "cell_type": "markdown", "metadata": { "id": "LDP4KvWdFnIL" }, "source": [ "# Plotting the Attention Heatmaps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "1b0c5a6e21a349cba57322f850ad9f48", "3aa935a6db14483d8aaada58a84a3e47", "eabcea7a8bbf42f6aaa3995c0dece721", "b3b7711edb5542e08c53c4f37da10203", "39a8a3a9b6f1495ea17fd1b3d86b67c0", "18a8e2e817b947f9aad87b1ccaf96ea6", "da62d6e5ad0a462b98e1591d39038e1e", "9b5bb4f7f4a846c28ab967b64107726e" ] }, "id": "4WfJEdcgFmiI", "outputId": "ff266529-4345-4cdc-9860-11914b099052" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from matplotlib.font_manager import FontProperties\n", "tel_font = FontProperties(fname = 'TiroDevanagariHindi-Regular.ttf')\n", "# Assumes atten_weights has shape (batch_size, output_sequence_length, input_sequence_length) - it is sliced with three indices below; TODO confirm\n", "# and pred has shape (batch_size, output_sequence_length)\n", "# and the test inputs have shape (batch_size, input_sequence_length)\n", "\n", "# Define the grid dimensions\n", "rows = int(np.ceil(np.sqrt(12)))\n", "cols = int(np.ceil(12 / rows))\n", "\n", "# Create a figure and subplots\n", "fig, axes = plt.subplots(rows, cols, figsize=(9, 9))\n", "\n", "for i, ax in enumerate(axes.flatten()):\n", " if i < 12:\n", " prediction = [opLang.index2char[j.item()] for j in pred[i+1]]\n", " \n", " pred_word=\"\"\n", " input_word=\"\"\n", "\n", " for j in range(len(prediction)):\n", " # Ignore padding\n", " if(prediction[j] != '#'):\n", " pred_word += prediction[j]\n", " else : \n", " break\n", " input_seq = [ipLang.index2char[j.item()] for j in testData[i][0]]\n", " \n", " for j in range(len(input_seq)):\n", " if(input_seq[j] != '#'):\n", " input_word += input_seq[j]\n", " else : \n", " break\n", " attn_weights = atten_weights[i, :len(pred_word), :len(input_word)].detach().cpu().numpy()\n",
" ax.imshow(attn_weights.T, cmap='hot', interpolation='nearest')\n", " ax.xaxis.set_label_position('top')\n", " ax.set_title(f'Example {i+1}')\n", " ax.set_xlabel('Output predicted')\n", " ax.set_ylabel('Input word')\n", " ax.set_xticks(np.arange(len(pred_word)))\n", " ax.set_xticklabels(pred_word, rotation = 90, fontproperties = tel_font,fontdict={'fontsize':8})\n", " ax.xaxis.tick_top()\n", "\n", " ax.set_yticks(np.arange(len(input_word)))\n", " ax.set_yticklabels(input_word, rotation=90)\n", " \n", " \n", "\n", "# Adjust the spacing between subplots\n", "plt.tight_layout()\n", "\n", "# Show the plot\n", "plt.show()\n", "wandb.init(project='CS6910_Assignment_3')\n", "\n", "# Convert the matplotlib figure to an image\n", "fig.canvas.draw()\n", "image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')\n", "image = 