{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import emoji\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-training-v1.tsv',sep='\\t')\n",
    "# test_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-testset-v1.tsv',sep='\\t')\n",
    "# augmented_df=pd.read_csv('augmented_data_offensive.csv')\n",
    "# selin_df=pd.read_csv('/DATA/sin-kaf/selin_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "outliers_df=pd.read_csv('/DATA/sin-kaf/cluster_outliers.csv')\n",
    "outliers_df=outliers_df.drop(['Unnamed: 0'], axis=1)\n",
    "outliers_df['subtask_a'] = outliers_df['subtas_a']\n",
    "outliers_df=outliers_df.drop(['subtas_a'], axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df=outliers_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# augmented_df=augmented_df.drop(['Unnamed: 0'], axis=1)\n",
    "# augmented_df = augmented_df.dropna()\n",
    "# train_df=pd.concat([train_df,augmented_df], axis=0)\n",
    "# train_df=pd.concat([train_df,test_df], axis=0)\n",
    "# train_df=train_df.drop(['id'], axis=1)\n",
    "data=train_df['tweet'].tolist()\n",
    "for i in range(len(data)):\n",
    "    data[i] = data[i].replace('@USER','')\n",
    "    data[i] = data[i].replace('#','')\n",
    "    data[i] = data[i].replace('$','')\n",
    "    data[i] = emoji.demojize(data[i])\n",
    "    \n",
    "train_df['tweet'] = data\n",
    "lab = LabelEncoder()\n",
    "train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])\n",
    "df = train_df[train_df.subtask_a != 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweet</th>\n",
       "      <th>subtask_a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41177</th>\n",
       "      <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41178</th>\n",
       "      <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41179</th>\n",
       "      <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41180</th>\n",
       "      <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41181</th>\n",
       "      <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41182 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   tweet  subtask_a\n",
       "0       en güzel uyuyan insan ödülü jeon jungkook'a g...          0\n",
       "1       Mekanı cennet olsun, saygılar sayın avukatımı...          0\n",
       "2      Kızlar aranızda kas yığını beylere düşenler ol...          0\n",
       "3      Biraz ders çalışayım. Tembellik ve uyku düşman...          0\n",
       "4          Trezeguet yerine El Sharawy daha iyi olmaz mı          0\n",
       "...                                                  ...        ...\n",
       "41177  Hil**adamlar kesinlikle kelimeleri anlamıyorla...          1\n",
       "41178  Böyle piçlerin çok erken ölmemelerini ve çok f...          1\n",
       "41179  Turgay denilen bu holigonda bir sorun yok, gur...          1\n",
       "41180  Umarım ülkenin düşük zekadan kurtulması ilgile...          1\n",
       "41181  CHP sandıkları bırakmaz, üzerine oturur, bir c...          1\n",
       "\n",
       "[41182 rows x 2 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df=pd.concat([train_df,selin_df], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweet</th>\n",
       "      <th>subtask_a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41177</th>\n",
       "      <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41178</th>\n",
       "      <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41179</th>\n",
       "      <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41180</th>\n",
       "      <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41181</th>\n",
       "      <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41182 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   tweet  subtask_a\n",
       "0       en güzel uyuyan insan ödülü jeon jungkook'a g...          0\n",
       "1       Mekanı cennet olsun, saygılar sayın avukatımı...          0\n",
       "2      Kızlar aranızda kas yığını beylere düşenler ol...          0\n",
       "3      Biraz ders çalışayım. Tembellik ve uyku düşman...          0\n",
       "4          Trezeguet yerine El Sharawy daha iyi olmaz mı          0\n",
       "...                                                  ...        ...\n",
       "41177  Hil**adamlar kesinlikle kelimeleri anlamıyorla...          1\n",
       "41178  Böyle piçlerin çok erken ölmemelerini ve çok f...          1\n",
       "41179  Turgay denilen bu holigonda bir sorun yok, gur...          1\n",
       "41180  Umarım ülkenin düşük zekadan kurtulması ilgile...          1\n",
       "41181  CHP sandıkları bırakmaz, üzerine oturur, bir c...          1\n",
       "\n",
       "[41182 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = df.sample(frac = 0.7, random_state = 200)\n",
    "df_2 = df.drop(train_df.index)\n",
    "test_df = df_2.sample(frac = 0.15, random_state = 200)\n",
    "val_df = df_2.drop(test_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_train = train_df.tweet.values\n",
    "label_train = train_df.subtask_a.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_test = test_df.tweet.values\n",
    "label_test = test_df.subtask_a.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_val = val_df.tweet.values\n",
    "label_val = val_df.subtask_a.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets.dataset_dict import DatasetDict\n",
    "from datasets import Dataset\n",
    "dataset={'train':Dataset.from_dict({'label':label_train,'text':text_train}),\n",
    "        'val':Dataset.from_dict({'label':label_val,'text':text_val}),\n",
    "        'test':Dataset.from_dict({'label':label_test,'text':text_test})\n",
    "        }\n",
    "dataset = DatasetDict(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\")\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\")\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",max_length=208,padding=\"max_length\",truncation=True,return_tensors=\"pt\",add_special_tokens=True,)\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"stage_f/pretrain_mlm_distilbert-base-turkish-cased\")\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5fba4c9671724e9a93d6ad14a1427345",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/28827 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2fff446f4f094d2fb66da549a49ad8a4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/10502 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "675f3b595b21489abaca01453c06db2c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1853 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenized_datasets = dataset.map(tokenize_function, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42)\n",
    "small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
       "    num_rows: 28827\n",
       "})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "small_train_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
       "    num_rows: 1853\n",
       "})"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "small_eval_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Overfit-GM/distilbert-base-turkish-cased-offensive and are newly initialized because the shapes did not match:\n",
      "- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated\n",
      "- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\",num_labels = 2)\n",
    "# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\",num_labels = 2)\n",
    "# model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import TrainingArguments\n",
    "\n",
    "training_args = TrainingArguments(output_dir=\"test_trainer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import numpy as np\n",
    "# import evaluate\n",
    "\n",
    "# # metric = evaluate.load(\"accuracy\")\n",
    "# # confusion_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")\n",
    "# # metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\", \"confusion_matrix\"])\n",
    "# metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import evaluate\n",
    "\n",
    "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
    "conf_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(eval_pred):\n",
    "    logits, labels = eval_pred\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    print(conf_matrix.compute(predictions=predictions, references=labels))\n",
    "    return metric.compute(predictions=predictions, references=labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import TrainingArguments, Trainer\n",
    "from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n",
    "\n",
    "training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\", num_train_epochs = 5, logging_dir ='TensorBoard',report_to ='mlflow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=small_train_dataset,\n",
    "    eval_dataset=small_eval_dataset,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a38121a009be4a0f90e30fc9c0cf49ed",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/18020 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.4638, 'learning_rate': 4.86126526082131e-05, 'epoch': 0.14}\n",
      "{'loss': 0.3886, 'learning_rate': 4.72253052164262e-05, 'epoch': 0.28}\n",
      "{'loss': 0.3893, 'learning_rate': 4.583795782463929e-05, 'epoch': 0.42}\n",
      "{'loss': 0.3594, 'learning_rate': 4.445061043285239e-05, 'epoch': 0.55}\n",
      "{'loss': 0.3547, 'learning_rate': 4.306326304106548e-05, 'epoch': 0.69}\n",
      "{'loss': 0.3384, 'learning_rate': 4.167591564927858e-05, 'epoch': 0.83}\n",
      "{'loss': 0.3498, 'learning_rate': 4.028856825749168e-05, 'epoch': 0.97}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "94ab139e1ebb482da2111517ad5a3a78",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/232 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'confusion_matrix': array([[966,  90],\n",
      "       [118, 679]])}\n",
      "{'eval_loss': 0.28741681575775146, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8671775223499362, 'eval_precision': 0.88296488946684, 'eval_recall': 0.8519447929736512, 'eval_runtime': 11.4928, 'eval_samples_per_second': 161.231, 'eval_steps_per_second': 20.186, 'epoch': 1.0}\n",
      "{'loss': 0.2449, 'learning_rate': 3.890122086570477e-05, 'epoch': 1.11}\n",
      "{'loss': 0.2178, 'learning_rate': 3.751387347391787e-05, 'epoch': 1.25}\n",
      "{'loss': 0.2431, 'learning_rate': 3.612652608213097e-05, 'epoch': 1.39}\n",
      "{'loss': 0.2261, 'learning_rate': 3.4739178690344064e-05, 'epoch': 1.53}\n",
      "{'loss': 0.2365, 'learning_rate': 3.3351831298557165e-05, 'epoch': 1.66}\n",
      "{'loss': 0.2169, 'learning_rate': 3.196448390677026e-05, 'epoch': 1.8}\n",
      "{'loss': 0.222, 'learning_rate': 3.0577136514983354e-05, 'epoch': 1.94}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "063c47c6cae0467194d4c0827e67c277",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/232 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'confusion_matrix': array([[900, 156],\n",
      "       [ 76, 721]])}\n",
      "{'eval_loss': 0.47509443759918213, 'eval_accuracy': 0.8747976254722072, 'eval_f1': 0.8614097968936678, 'eval_precision': 0.82212086659065, 'eval_recall': 0.904642409033877, 'eval_runtime': 11.6203, 'eval_samples_per_second': 159.462, 'eval_steps_per_second': 19.965, 'epoch': 2.0}\n",
      "{'loss': 0.146, 'learning_rate': 2.918978912319645e-05, 'epoch': 2.08}\n",
      "{'loss': 0.1163, 'learning_rate': 2.7802441731409544e-05, 'epoch': 2.22}\n",
      "{'loss': 0.1008, 'learning_rate': 2.641509433962264e-05, 'epoch': 2.36}\n",
      "{'loss': 0.0967, 'learning_rate': 2.502774694783574e-05, 'epoch': 2.5}\n",
      "{'loss': 0.1456, 'learning_rate': 2.3640399556048838e-05, 'epoch': 2.64}\n",
      "{'loss': 0.1178, 'learning_rate': 2.2253052164261932e-05, 'epoch': 2.77}\n",
      "{'loss': 0.1155, 'learning_rate': 2.0865704772475027e-05, 'epoch': 2.91}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4fa52dfbbae54cde8c627a237bed51bc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/232 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'confusion_matrix': array([[954, 102],\n",
      "       [106, 691]])}\n",
      "{'eval_loss': 0.5530020594596863, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8691823899371071, 'eval_precision': 0.8713745271122321, 'eval_recall': 0.8670012547051443, 'eval_runtime': 11.6026, 'eval_samples_per_second': 159.706, 'eval_steps_per_second': 19.996, 'epoch': 3.0}\n",
      "{'loss': 0.0879, 'learning_rate': 1.9478357380688125e-05, 'epoch': 3.05}\n",
      "{'loss': 0.0351, 'learning_rate': 1.8091009988901223e-05, 'epoch': 3.19}\n",
      "{'loss': 0.0501, 'learning_rate': 1.670366259711432e-05, 'epoch': 3.33}\n",
      "{'loss': 0.0425, 'learning_rate': 1.5316315205327412e-05, 'epoch': 3.47}\n",
      "{'loss': 0.0564, 'learning_rate': 1.392896781354051e-05, 'epoch': 3.61}\n",
      "{'loss': 0.05, 'learning_rate': 1.2541620421753608e-05, 'epoch': 3.75}\n",
      "{'loss': 0.034, 'learning_rate': 1.1154273029966705e-05, 'epoch': 3.88}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a9b754cd0e7641cb8d8023f28bc32a06",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/232 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'confusion_matrix': array([[966,  90],\n",
      "       [109, 688]])}\n",
      "{'eval_loss': 0.824292778968811, 'eval_accuracy': 0.8926065839179709, 'eval_f1': 0.8736507936507937, 'eval_precision': 0.884318766066838, 'eval_recall': 0.863237139272271, 'eval_runtime': 11.6185, 'eval_samples_per_second': 159.487, 'eval_steps_per_second': 19.968, 'epoch': 4.0}\n",
      "{'loss': 0.0354, 'learning_rate': 9.766925638179801e-06, 'epoch': 4.02}\n",
      "{'loss': 0.0165, 'learning_rate': 8.379578246392897e-06, 'epoch': 4.16}\n",
      "{'loss': 0.0119, 'learning_rate': 6.992230854605994e-06, 'epoch': 4.3}\n",
      "{'loss': 0.0145, 'learning_rate': 5.60488346281909e-06, 'epoch': 4.44}\n",
      "{'loss': 0.0169, 'learning_rate': 4.217536071032187e-06, 'epoch': 4.58}\n",
      "{'loss': 0.0132, 'learning_rate': 2.830188679245283e-06, 'epoch': 4.72}\n",
      "{'loss': 0.0232, 'learning_rate': 1.4428412874583796e-06, 'epoch': 4.86}\n",
      "{'loss': 0.0189, 'learning_rate': 5.549389567147614e-08, 'epoch': 4.99}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e66e5b59c6ba42ae9939f55dcda3c877",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/232 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'confusion_matrix': array([[955, 101],\n",
      "       [111, 686]])}\n",
      "{'eval_loss': 0.937654972076416, 'eval_accuracy': 0.8855909336211549, 'eval_f1': 0.8661616161616161, 'eval_precision': 0.8716645489199492, 'eval_recall': 0.8607277289836889, 'eval_runtime': 11.5644, 'eval_samples_per_second': 160.233, 'eval_steps_per_second': 20.062, 'epoch': 5.0}\n",
      "{'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=18020, training_loss=0.15528733040680712, metrics={'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0})"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# best case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4620503cb22c41a582c44a3d17fac2f6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/18825 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.4623, 'learning_rate': 4.867197875166003e-05, 'epoch': 0.13}\n",
      "{'loss': 0.3955, 'learning_rate': 4.734395750332006e-05, 'epoch': 0.27}\n",
      "{'loss': 0.3695, 'learning_rate': 4.601593625498008e-05, 'epoch': 0.4}\n",
      "{'loss': 0.368, 'learning_rate': 4.4687915006640105e-05, 'epoch': 0.53}\n",
      "{'loss': 0.3418, 'learning_rate': 4.335989375830013e-05, 'epoch': 0.66}\n",
      "{'loss': 0.3519, 'learning_rate': 4.203187250996016e-05, 'epoch': 0.8}\n",
      "{'loss': 0.3418, 'learning_rate': 4.070385126162019e-05, 'epoch': 0.93}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c81779b9a7eb43cfa29966957f13ec31",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.2548353374004364, 'eval_accuracy': 0.9013429752066116, 'eval_f1': 0.8737607402511566, 'eval_precision': 0.9218967921896792, 'eval_recall': 0.8304020100502513, 'eval_runtime': 12.1488, 'eval_samples_per_second': 159.357, 'eval_steps_per_second': 19.92, 'epoch': 1.0}\n",
      "{'loss': 0.2884, 'learning_rate': 3.9375830013280215e-05, 'epoch': 1.06}\n",
      "{'loss': 0.2136, 'learning_rate': 3.804780876494024e-05, 'epoch': 1.2}\n",
      "{'loss': 0.2422, 'learning_rate': 3.671978751660027e-05, 'epoch': 1.33}\n",
      "{'loss': 0.2105, 'learning_rate': 3.53917662682603e-05, 'epoch': 1.46}\n",
      "{'loss': 0.2203, 'learning_rate': 3.406374501992032e-05, 'epoch': 1.59}\n",
      "{'loss': 0.2455, 'learning_rate': 3.2735723771580345e-05, 'epoch': 1.73}\n",
      "{'loss': 0.2282, 'learning_rate': 3.140770252324037e-05, 'epoch': 1.86}\n",
      "{'loss': 0.2328, 'learning_rate': 3.00796812749004e-05, 'epoch': 1.99}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f83c5030d5c34216ba6422f2c22858ba",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.4118729829788208, 'eval_accuracy': 0.8982438016528925, 'eval_f1': 0.8763339610797238, 'eval_precision': 0.875784190715182, 'eval_recall': 0.8768844221105527, 'eval_runtime': 12.1691, 'eval_samples_per_second': 159.092, 'eval_steps_per_second': 19.886, 'epoch': 2.0}\n",
      "{'loss': 0.1086, 'learning_rate': 2.8751660026560427e-05, 'epoch': 2.12}\n",
      "{'loss': 0.1137, 'learning_rate': 2.742363877822045e-05, 'epoch': 2.26}\n",
      "{'loss': 0.1058, 'learning_rate': 2.609561752988048e-05, 'epoch': 2.39}\n",
      "{'loss': 0.1073, 'learning_rate': 2.4767596281540506e-05, 'epoch': 2.52}\n",
      "{'loss': 0.0953, 'learning_rate': 2.3439575033200534e-05, 'epoch': 2.66}\n",
      "{'loss': 0.1066, 'learning_rate': 2.2111553784860558e-05, 'epoch': 2.79}\n",
      "{'loss': 0.1152, 'learning_rate': 2.0783532536520585e-05, 'epoch': 2.92}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3c4d464cb3a340d4aa4f6a1a8e4d95b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.4992543160915375, 'eval_accuracy': 0.9039256198347108, 'eval_f1': 0.8831658291457286, 'eval_precision': 0.8831658291457286, 'eval_recall': 0.8831658291457286, 'eval_runtime': 12.145, 'eval_samples_per_second': 159.407, 'eval_steps_per_second': 19.926, 'epoch': 3.0}\n",
      "{'loss': 0.0761, 'learning_rate': 1.9455511288180613e-05, 'epoch': 3.05}\n",
      "{'loss': 0.0434, 'learning_rate': 1.812749003984064e-05, 'epoch': 3.19}\n",
      "{'loss': 0.0395, 'learning_rate': 1.6799468791500664e-05, 'epoch': 3.32}\n",
      "{'loss': 0.0516, 'learning_rate': 1.547144754316069e-05, 'epoch': 3.45}\n",
      "{'loss': 0.0344, 'learning_rate': 1.4143426294820719e-05, 'epoch': 3.59}\n",
      "{'loss': 0.0588, 'learning_rate': 1.2815405046480745e-05, 'epoch': 3.72}\n",
      "{'loss': 0.0323, 'learning_rate': 1.148738379814077e-05, 'epoch': 3.85}\n",
      "{'loss': 0.0574, 'learning_rate': 1.0159362549800798e-05, 'epoch': 3.98}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bf0675bd947c472bb221d755dc55a219",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.6084339618682861, 'eval_accuracy': 0.9121900826446281, 'eval_f1': 0.8933500627352573, 'eval_precision': 0.8922305764411027, 'eval_recall': 0.8944723618090452, 'eval_runtime': 11.9875, 'eval_samples_per_second': 161.502, 'eval_steps_per_second': 20.188, 'epoch': 4.0}\n",
      "{'loss': 0.0175, 'learning_rate': 8.831341301460823e-06, 'epoch': 4.12}\n",
      "{'loss': 0.0248, 'learning_rate': 7.503320053120851e-06, 'epoch': 4.25}\n",
      "{'loss': 0.0212, 'learning_rate': 6.175298804780877e-06, 'epoch': 4.38}\n",
      "{'loss': 0.0215, 'learning_rate': 4.847277556440903e-06, 'epoch': 4.52}\n",
      "{'loss': 0.0216, 'learning_rate': 3.51925630810093e-06, 'epoch': 4.65}\n",
      "{'loss': 0.0169, 'learning_rate': 2.1912350597609563e-06, 'epoch': 4.78}\n",
      "{'loss': 0.0199, 'learning_rate': 8.632138114209828e-07, 'epoch': 4.91}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0ac0cee28031479d9721321ec9c949a4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.6909418106079102, 'eval_accuracy': 0.9158057851239669, 'eval_f1': 0.8963763509218055, 'eval_precision': 0.9073359073359073, 'eval_recall': 0.885678391959799, 'eval_runtime': 12.1798, 'eval_samples_per_second': 158.952, 'eval_steps_per_second': 19.869, 'epoch': 5.0}\n",
      "{'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=18825, training_loss=0.15457879885892628, metrics={'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0})"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the fine-tuned model and run inference on a sample sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint directory to restore for inference.\n",
    "# NOTE(review): the path suggests step 16000 of a Trainer run - confirm this is the intended checkpoint.\n",
    "CHECKPOINT_DIR = '/DATA/sin-kaf/test_trainer/checkpoint-16000'\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original:  güzel kızz\n",
      "Token IDs: tensor([[   2, 2639, 2889, 1050,    3,    0,    0,    0,    0,    0,    0,    0,\n",
      "            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
      "            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
      "            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
      "            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
      "            0,    0,    0,    0]])\n",
      "Token IDs: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n"
     ]
    }
   ],
   "source": [
    "sent = 'güzel kızz'\n",
    "\n",
    "# Tokenize a single sentence into fixed-length tensors of shape (1, 64).\n",
    "# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and\n",
    "# `truncation=True` keeps longer inputs from exceeding `max_length`.\n",
    "encoded_dict = tokenizer.encode_plus(\n",
    "    sent,\n",
    "    add_special_tokens=True,\n",
    "    max_length=64,\n",
    "    padding='max_length',\n",
    "    truncation=True,\n",
    "    return_attention_mask=True,\n",
    "    return_tensors='pt',\n",
    ")\n",
    "\n",
    "# With return_tensors='pt' both entries already carry a leading batch dimension,\n",
    "# so no concatenation is needed for a single sentence.\n",
    "input_ids = encoded_dict['input_ids']\n",
    "attention_masks = encoded_dict['attention_mask']\n",
    "input_mask = attention_masks  # name consumed by the forward-pass cell\n",
    "\n",
    "print('Original: ', sent)\n",
    "print('Token IDs:', input_ids)\n",
    "print('Attention mask:', input_mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inference only: make sure dropout is disabled and skip building an autograd graph.\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    # Keyword arguments avoid relying on the positional order of the model's forward().\n",
    "    outputs = model(input_ids=input_ids, attention_mask=input_mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceClassifierOutput(loss=None, logits=tensor([[ 3.6835, -3.6147]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the raw SequenceClassifierOutput returned by the forward pass.\n",
    "outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(0)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Predicted class index for each input row. `dim=-1` reduces over the label axis;\n",
    "# without it argmax indexes the flattened tensor, which is wrong for batches > 1.\n",
    "torch.argmax(outputs['logits'], dim=-1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dlenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}