9wimu9
/

sinhala-bert-1.2

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6b7d6a9b-db7e-46b2-8ab8-d6914e18f1e1",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:34:42.903289Z",
+     "iopub.status.busy": "2023-07-10T17:34:42.902734Z",
+     "iopub.status.idle": "2023-07-10T17:34:53.910158Z",
+     "shell.execute_reply": "2023-07-10T17:34:53.909356Z",
+     "shell.execute_reply.started": "2023-07-10T17:34:42.903264Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+    "%pip install datasets transformers accelerate wandb -U -q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a2a349ee-2749-4490-9807-cdf18f428181",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:34:53.914661Z",
+     "iopub.status.busy": "2023-07-10T17:34:53.914431Z",
+     "iopub.status.idle": "2023-07-10T17:35:00.111736Z",
+     "shell.execute_reply": "2023-07-10T17:35:00.111000Z",
+     "shell.execute_reply.started": "2023-07-10T17:34:53.914639Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "  ········································\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import wandb\n",
+    "wandb.login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bce905bd-4fc3-4a6a-bd76-48bba9ebc1d9",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:35:00.113303Z",
+     "iopub.status.busy": "2023-07-10T17:35:00.112906Z",
+     "iopub.status.idle": "2023-07-10T17:35:00.118570Z",
+     "shell.execute_reply": "2023-07-10T17:35:00.117737Z",
+     "shell.execute_reply.started": "2023-07-10T17:35:00.113271Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "env: WANDB_PROJECT=sinhala_bert_v1.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "%env WANDB_PROJECT=sinhala_bert_v1.2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "2551a71d-2804-48ed-bf85-6e0aa94d47d8",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:35:00.120776Z",
+     "iopub.status.busy": "2023-07-10T17:35:00.120489Z",
+     "iopub.status.idle": "2023-07-10T17:35:00.124426Z",
+     "shell.execute_reply": "2023-07-10T17:35:00.123689Z",
+     "shell.execute_reply.started": "2023-07-10T17:35:00.120749Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The model and the tokenizer share one checkpoint identifier.\n",
+    "model_checkpoint = tokenizer_checkpoint = \"9wimu9/sinhala-bert-1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "137a18d4-6fc5-4bfe-bf3c-54f4bdef5f4b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:35:00.126040Z",
+     "iopub.status.busy": "2023-07-10T17:35:00.125473Z",
+     "iopub.status.idle": "2023-07-10T17:36:06.352581Z",
+     "shell.execute_reply": "2023-07-10T17:36:06.351983Z",
+     "shell.execute_reply.started": "2023-07-10T17:35:00.126013Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7d3d422d87f467f899071b2c4ed86b4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading readme:   0%|          | 0.00/608 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d0f8e126c22c4fd88dc49b3ad4cf2d22",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fd3367b878d54022a264d978e91f8b0e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3940cbda3f324a4dbd6ae25c81c89a0b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "234a689e98054963b50ccb99a09bbc76",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "544d17642acf492c92958c5348c5e3a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "06a99252f2e2433e951566252b3d5f22",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "79276ae55ce14e22bd45a0e5606fbd48",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "41c589dddbf14ac489f62f9bcf5a28ce",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cf5c2a8975bf4353992067eaeeea1743",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "139953de43e046fbb85bca9be7155d28",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c455b35acc6a4ee2a6c76c65e4a29d45",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dac6e8d5597544e883a13498159cddb5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "92fbb207ac494845af3d94bb542576cf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "28b3fa3e947b418a8c963429b9a1dd41",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "86caa15d02994bda8775768afbd85b94",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2076c346b559410bb151663a2707813c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d9ae84724e4e4073aded53ba05f53743",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55f54d71a81c43f5affe26ee2a5cdf6a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da771932e1ae443eb83a6882441593fb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2bb6366bdd4545f895d73c8e556f8e85",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5553d13a989c48d39df14955eb4701e2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "adedd757f27240c989029c243d5ee76d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c2e6754349e14f7da1cfd44e8cc23e11",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a242424a016a4f3f86e2e74683358b28",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "13deb04a7b294cbe8c5365aa5fb7f037",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7045bb0d6034a80b14fe5faca6dc8cf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "179036346c214ebd8f8286ad1c097455",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.1M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0ed65fd339cb4293b9d2a312e6012b08",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.1M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "27d5e5b89c114454b831fc3c4e3fb80b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "13e100995a514014b5f2033c045ecafa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split:   0%|          | 0/7310725 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21391aec64e5442e93395b4e4a0db3ea",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating test split:   0%|          | 0/406414 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aa8eba7dc1e749b4a9ba497d00715161",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating valid split:   0%|          | 0/405841 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2bc96a6692d543f1b4bda73c6c03d592",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 7310725\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 406414\n",
+       "    })\n",
+       "    valid: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 405841\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "lm_datasets = load_dataset('9wimu9/sinhala_30m_tokenized')\n",
+    "lm_datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e81c4c2a-d6e2-4a41-bcef-218c205d9544",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:37:42.118612Z",
+     "iopub.status.busy": "2023-07-10T17:37:42.117810Z",
+     "iopub.status.idle": "2023-07-10T17:37:48.570390Z",
+     "shell.execute_reply": "2023-07-10T17:37:48.569592Z",
+     "shell.execute_reply.started": "2023-07-10T17:37:42.118586Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RobertaConfig {\n",
+       "  \"_name_or_path\": \"/notebooks/roberta-large-pretrained-si\",\n",
+       "  \"architectures\": [\n",
+       "    \"RobertaForMaskedLM\"\n",
+       "  ],\n",
+       "  \"attention_probs_dropout_prob\": 0.1,\n",
+       "  \"bos_token_id\": 0,\n",
+       "  \"classifier_dropout\": null,\n",
+       "  \"eos_token_id\": 2,\n",
+       "  \"hidden_act\": \"gelu\",\n",
+       "  \"hidden_dropout_prob\": 0.1,\n",
+       "  \"hidden_size\": 1024,\n",
+       "  \"initializer_range\": 0.02,\n",
+       "  \"intermediate_size\": 4096,\n",
+       "  \"layer_norm_eps\": 1e-05,\n",
+       "  \"max_position_embeddings\": 514,\n",
+       "  \"model_type\": \"roberta\",\n",
+       "  \"num_attention_heads\": 16,\n",
+       "  \"num_hidden_layers\": 24,\n",
+       "  \"pad_token_id\": 1,\n",
+       "  \"position_embedding_type\": \"absolute\",\n",
+       "  \"transformers_version\": \"4.30.2\",\n",
+       "  \"type_vocab_size\": 1,\n",
+       "  \"use_cache\": true,\n",
+       "  \"vocab_size\": 12500\n",
+       "}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoConfig, AutoModelForMaskedLM\n",
+    "\n",
+    "# The architecture is read from a local directory; the model_checkpoint\n",
+    "# alternative is kept below for reference.\n",
+    "# config = AutoConfig.from_pretrained(model_checkpoint)\n",
+    "config_source = '/notebooks/roberta-large-pretrained-si'\n",
+    "config = AutoConfig.from_pretrained(config_source)\n",
+    "\n",
+    "# Build the masked-LM from the config object alone (no from_pretrained call on the model).\n",
+    "model = AutoModelForMaskedLM.from_config(config)\n",
+    "\n",
+    "# Show the loaded configuration as the cell output.\n",
+    "config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "065f3958-2b05-4a5a-8f05-b049c14fb5f0",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T09:32:55.634897Z",
+     "iopub.status.busy": "2023-07-10T09:32:55.634368Z",
+     "iopub.status.idle": "2023-07-10T09:32:55.640686Z",
+     "shell.execute_reply": "2023-07-10T09:32:55.640297Z",
+     "shell.execute_reply.started": "2023-07-10T09:32:55.634879Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# max_token_size=128\n",
+    "# use model architecture -> BERT large\n",
+    "# 24 layers, 1,024 dimensions, 16 heads, 4,096 hidden dimensions in the feed-forward layer, with pre-layer normalization\n",
+    "\n",
+    "\n",
+    "# We follow the optimization of RoBERTa (Liu et al., 2019) and use \n",
+    "# AdamW (Loshchilov and Hutter, 2019) with \n",
+    "# β1 = 0.9, β2 = 0.98, ε = 1e-6, \n",
+    "# weight decay of 0.01, dropout 0.1, and \n",
+    "# attention dropout 0.1.\n",
+    "\n",
+    "\n",
+    "# Hyperparameters\n",
+    "\n",
+    "# batch size -> 4k, 8k, and 16k (via gradient accumulation)\n",
+    "\n",
+    "# Warmup Proportion (wu) We determine the number of warmup steps as a proportion of the total number of steps. \n",
+    "# Specifically, we try 0%, 2%, 4%, and 6%, which all reflect significantly fewer warmup steps than in BERT.\n",
+    "\n",
+    "# Peak Learning Rate (lr) Our linear learning rate scheduler, \n",
+    "# which starts at 0, warms up to the peak learning rate, and then decays back to 0. We try 5e-4, 1e-3, and 2e-3\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "858cd60b-32c4-4c0f-859e-10a1ee3bf68e",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:37:48.572108Z",
+     "iopub.status.busy": "2023-07-10T17:37:48.571665Z",
+     "iopub.status.idle": "2023-07-10T17:37:48.610050Z",
+     "shell.execute_reply": "2023-07-10T17:37:48.609409Z",
+     "shell.execute_reply.started": "2023-07-10T17:37:48.572101Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Tokenizer is loaded from a local directory with sequences capped at 256 tokens;\n",
+    "# the tokenizer_checkpoint alternative is kept below for reference.\n",
+    "# tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint,model_max_length=256)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    '/notebooks/roberta-large-pretrained-si',\n",
+    "    model_max_length=256,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "5812f8da-3434-4ec8-a2e6-a6bdc30ecf72",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:38:51.772892Z",
+     "iopub.status.busy": "2023-07-10T17:38:51.772628Z",
+     "iopub.status.idle": "2023-07-10T17:38:51.777952Z",
+     "shell.execute_reply": "2023-07-10T17:38:51.777265Z",
+     "shell.execute_reply.started": "2023-07-10T17:38:51.772871Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RobertaTokenizerFast(name_or_path='/notebooks/roberta-large-pretrained-si', vocab_size=1868, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken(\"<mask>\", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0905ef8c-9faa-49d6-ad0a-06753ce856fa",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:37:49.993189Z",
+     "iopub.status.busy": "2023-07-10T17:37:49.992541Z",
+     "iopub.status.idle": "2023-07-10T17:37:49.996729Z",
+     "shell.execute_reply": "2023-07-10T17:37:49.996008Z",
+     "shell.execute_reply.started": "2023-07-10T17:37:49.993157Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Batch geometry and schedule knobs read by the training-arguments and scheduler cells.\n",
+    "per_device_train_batch_size = 400\n",
+    "gradient_accumulation_steps = 10  # 400 * 10 = 4000 sequences per optimizer step on one device\n",
+    "num_train_epochs = 1\n",
+    "warmup_rate = 0.01  # fraction of the optimizer steps spent warming up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "6056f333-46f9-4bea-a93d-423f3a1a959e",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:37:55.793688Z",
+     "iopub.status.busy": "2023-07-10T17:37:55.792933Z",
+     "iopub.status.idle": "2023-07-10T17:37:58.921474Z",
+     "shell.execute_reply": "2023-07-10T17:37:58.920666Z",
+     "shell.execute_reply.started": "2023-07-10T17:37:55.793660Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=model_checkpoint,\n",
+    "    # push_to_hub=True,\n",
+    "    # hub_model_id=\"sinhala-bert-v.1\",\n",
+    "    # Schedule: evaluate once per epoch, log every step, checkpoint every 25 steps (keep 3).\n",
+    "    num_train_epochs=num_train_epochs,\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    logging_steps=1,\n",
+    "    save_steps=25,\n",
+    "    save_total_limit=3,\n",
+    "    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training\n",
+    "    # Batching and memory: sizes come from the hyperparameter cell.\n",
+    "    per_device_train_batch_size=per_device_train_batch_size,  # 4000,8000,16000\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    gradient_checkpointing=True,\n",
+    "    fp16=True,\n",
+    "    no_cuda=False,\n",
+    "    # Metrics are reported to Weights & Biases.\n",
+    "    report_to=\"wandb\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7f6078f0-ba64-4509-ac8f-39dd0cd7fe04",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:38:00.867885Z",
+     "iopub.status.busy": "2023-07-10T17:38:00.867375Z",
+     "iopub.status.idle": "2023-07-10T17:38:00.876595Z",
+     "shell.execute_reply": "2023-07-10T17:38:00.875989Z",
+     "shell.execute_reply.started": "2023-07-10T17:38:00.867857Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(7310725, 1828, 18, 4000)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import get_polynomial_decay_schedule_with_warmup,AdamW,get_linear_schedule_with_warmup\n",
+    "import math,torch\n",
+    "\n",
+    "# Optimize only the parameters that require gradients.\n",
+    "params = [p for p in model.parameters() if p.requires_grad]\n",
+    "optimizer = torch.optim.AdamW(\n",
+    "    params,\n",
+    "    lr=2e-5,\n",
+    "    betas=(0.9, 0.98),\n",
+    "    eps=1e-6,\n",
+    "    weight_decay=0.01,\n",
+    ")\n",
+    "\n",
+    "# Sequences consumed per optimizer step on one device.\n",
+    "batch_size = per_device_train_batch_size * gradient_accumulation_steps\n",
+    "\n",
+    "# Optimizer steps per epoch, rounding the final partial batch up.\n",
+    "steps_per_epoch = math.ceil(lm_datasets[\"train\"].num_rows / batch_size)\n",
+    "num_training_steps = steps_per_epoch * num_train_epochs\n",
+    "# Warmup length is a fraction of the total step count, truncated to an integer.\n",
+    "num_warmup_steps = int(steps_per_epoch * warmup_rate * num_train_epochs)\n",
+    "\n",
+    "# Linear warmup followed by linear decay over the whole run.\n",
+    "scheduler = get_linear_schedule_with_warmup(\n",
+    "    optimizer,\n",
+    "    num_warmup_steps=num_warmup_steps,\n",
+    "    num_training_steps=num_training_steps,\n",
+    ")\n",
+    "\n",
+    "# Display the derived numbers as the cell output.\n",
+    "lm_datasets[\"train\"].num_rows,num_training_steps,num_warmup_steps,batch_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ebf14d20-e630-4961-a6d4-d9c8fa90e941",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T17:38:05.602802Z",
+     "iopub.status.busy": "2023-07-10T17:38:05.602191Z",
+     "iopub.status.idle": "2023-07-10T17:38:11.030425Z",
+     "shell.execute_reply": "2023-07-10T17:38:11.029681Z",
+     "shell.execute_reply.started": "2023-07-10T17:38:05.602778Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading package lists... Done\n",
+      "Building dependency tree       \n",
+      "Reading state information... Done\n",
+      "The following NEW packages will be installed:\n",
+      "  git-lfs\n",
+      "0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.\n",
+      "Need to get 3316 kB of archives.\n",
+      "After this operation, 11.1 MB of additional disk space will be used.\n",
+      "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]\n",
+      "Fetched 3316 kB in 1s (3375 kB/s)  \u001b[0m33m\u001b[33m\n",
+      "\n",
+      "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
+      "(Reading database ... 69943 files and directories currently installed.)\n",
+      "Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [  0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (2.9.2-1) ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (2.9.2-1) ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8Processing triggers for man-db (2.9.1-1) ...\n",
+      "\n",
+      "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
+     ]
+    }
+   ],
+   "source": [
+    "# -y answers the confirmation prompt so the cell cannot block waiting for input;\n",
+    "# apt-get is used because apt's command-line interface is not meant for scripted use.\n",
+    "!sudo apt-get install -y git-lfs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "632113ee-cdcf-45a9-a325-60eaaa1b5f1c",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:25:28.092991Z",
+     "iopub.status.busy": "2023-07-10T18:25:28.092179Z",
+     "iopub.status.idle": "2023-07-10T18:25:35.965867Z",
+     "shell.execute_reply": "2023-07-10T18:25:35.965309Z",
+     "shell.execute_reply.started": "2023-07-10T18:25:28.092953Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# from transformers import RobertaForMaskedLM\n",
+    "# model = RobertaForMaskedLM.from_pretrained(\"/notebooks/9wimu9/sinhala-bert-1/checkpoint-1625\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "969484c6-4035-4234-8ac7-209ab4a014bc",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:25:50.083080Z",
+     "iopub.status.busy": "2023-07-10T18:25:50.082571Z",
+     "iopub.status.idle": "2023-07-10T18:25:50.269795Z",
+     "shell.execute_reply": "2023-07-10T18:25:50.269277Z",
+     "shell.execute_reply.started": "2023-07-10T18:25:50.083058Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import DataCollatorForLanguageModeling,Trainer\n",
+    "\n",
+    "# Collator is built with the library defaults; only the tokenizer is supplied.\n",
+    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)\n",
+    "\n",
+    "# Train on the \"train\" split, evaluate on \"valid\", and hand the Trainer the\n",
+    "# optimizer/scheduler pair built earlier.\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    data_collator=data_collator,\n",
+    "    train_dataset=lm_datasets[\"train\"],\n",
+    "    eval_dataset=lm_datasets[\"valid\"],\n",
+    "    optimizers=(optimizer, scheduler),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "4c2f4490-b3bc-4ec6-bef1-2bd71933369a",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T15:10:13.622770Z",
+     "iopub.status.busy": "2023-07-10T15:10:13.622142Z",
+     "iopub.status.idle": "2023-07-10T15:10:13.625595Z",
+     "shell.execute_reply": "2023-07-10T15:10:13.625073Z",
+     "shell.execute_reply.started": "2023-07-10T15:10:13.622747Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "wandb.finish()\n",
+    "# wandb.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "17979cc2-2e66-4055-aabb-29d9ee90112d",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-08T07:31:19.523715Z",
+     "iopub.status.busy": "2023-07-08T07:31:19.523529Z",
+     "iopub.status.idle": "2023-07-08T07:31:20.383711Z",
+     "shell.execute_reply": "2023-07-08T07:31:20.382696Z",
+     "shell.execute_reply.started": "2023-07-08T07:31:19.523696Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# !rm -rf /notebooks/9wimu9/sinhala-bert-1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8bd0ab4-6412-4c0c-a215-a0c5cd5d4626",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T15:10:17.837648Z",
+     "iopub.status.busy": "2023-07-10T15:10:17.837138Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to <a href='https://wandb.me/wandb-init' target=\"_blank\">the W&B docs</a>."
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m9wimu9\u001b[0m (\u001b[33msinquad\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.15.5"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/notebooks/wandb/run-20230710_151033-wsjuqghz</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz' target=\"_blank\">classic-eon-6</a></strong> to <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1.2</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='1638' max='1827' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [1638/1827 1:55:38 < 3:16:54, 0.02 it/s, Epoch 0.90/1]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "trainer.train(resume_from_checkpoint = True)\n",
+    "wandb.finish()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:26:14.038132Z",
+     "iopub.status.busy": "2023-07-10T18:26:14.037456Z",
+     "iopub.status.idle": "2023-07-10T18:57:49.712287Z",
+     "shell.execute_reply": "2023-07-10T18:57:49.711640Z",
+     "shell.execute_reply.started": "2023-07-10T18:26:14.038103Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='50731' max='50731' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [50731/50731 31:35]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Perplexity: 78.33\n"
+     ]
+    }
+   ],
+   "source": [
+    "eval_results = trainer.evaluate()\n",
+    "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "f04eaadd-a13d-4651-ad14-91bcc01f92e1",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:58:07.077477Z",
+     "iopub.status.busy": "2023-07-10T18:58:07.076979Z",
+     "iopub.status.idle": "2023-07-10T18:58:07.082203Z",
+     "shell.execute_reply": "2023-07-10T18:58:07.081426Z",
+     "shell.execute_reply.started": "2023-07-10T18:58:07.077454Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'eval_loss': 4.360935211181641,\n",
+       " 'eval_runtime': 1895.6573,\n",
+       " 'eval_samples_per_second': 214.09,\n",
+       " 'eval_steps_per_second': 26.762}"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "d3417a50-f0a7-4cd7-bc3b-14106660be58",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:58:52.507374Z",
+     "iopub.status.busy": "2023-07-10T18:58:52.506748Z",
+     "iopub.status.idle": "2023-07-10T18:58:53.770508Z",
+     "shell.execute_reply": "2023-07-10T18:58:53.769992Z",
+     "shell.execute_reply.started": "2023-07-10T18:58:52.507341Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "trainer.save_model(\"path_to_save\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "6a3b42de-552c-41fc-a454-afe8a0bf567d",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T18:59:46.871782Z",
+     "iopub.status.busy": "2023-07-10T18:59:46.871272Z",
+     "iopub.status.idle": "2023-07-10T18:59:49.794057Z",
+     "shell.execute_reply": "2023-07-10T18:59:49.793583Z",
+     "shell.execute_reply.started": "2023-07-10T18:59:46.871761Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
+      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoModel \n",
+    "model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-10T19:01:49.192299Z",
+     "iopub.status.busy": "2023-07-10T19:01:49.191549Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "77a76086b50b43a2a0bb1cc869ef8e26",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model.push_to_hub('9wimu9/sinhala-bert-1.2')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "d4553ec7-1e38-4b44-8c5f-e46786cd3cfc",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T13:08:15.514124Z",
+     "iopub.status.busy": "2023-07-09T13:08:15.513517Z",
+     "iopub.status.idle": "2023-07-09T13:08:15.918801Z",
+     "shell.execute_reply": "2023-07-09T13:08:15.918326Z",
+     "shell.execute_reply.started": "2023-07-09T13:08:15.514097Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "api = HfApi()\n",
+    "files = ['tokenizer.json','training_args.bin','trainer.ipynb']\n",
+    "for file in files:\n",
+    "    api.upload_file(\n",
+    "        path_or_fileobj=\"/notebooks/path_to_save/\"+file,\n",
+    "        path_in_repo=file,\n",
+    "        repo_id=\"9wimu9/sinhala-bert-1.1\",\n",
+    "        repo_type=\"model\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1614503-df5d-454f-a81d-d96bb1899443",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "learning rate scheduler details can be found here:\n",
+    "https://dev.classmethod.jp/articles/huggingface-usage-scheluder-type/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd184295-1c0b-4625-a516-da417beb814f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bert hyper params\n",
+    "======================\n",
+    "β1 = 0.9,\n",
+    "β2 = 0.999, \n",
+    "ε = 1e-6\n",
+    "L2 weight decay = 0.01\n",
+    "learning rate = warmed up over the first 10k steps to a peak of 1e-4, then linearly decayed\n",
+    "dropout = 0.1\n",
+    "batch size = 256\n",
+    "training steps = 1M\n",
+    "max_token_length = 512\n",
+    "\n",
+    "roberta\n",
+    "============\n",
+    "β2 = 0.98 for large batch sizes\n",
+    "max_token_length = 512\n",
+    "batch size = 2k\n",
+    "lr = 7e-4\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}