{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# dataset link (Turkish)\n", "# https://sites.google.com/site/offensevalsharedtask/more-datasets" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/neptune/internal/backends/hosted_client.py:51: NeptuneDeprecationWarning: The 'neptune-client' package has been deprecated and will be removed in the future. Install the 'neptune' package instead. For more, see https://docs.neptune.ai/setup/upgrading/\n", " from neptune.version import version as neptune_client_version\n", "/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:39: NeptuneDeprecationWarning: You're importing the Neptune client library via the deprecated `neptune.new` module, which will be removed in a future release. Import directly from `neptune` instead.\n", " from neptune import new as neptune\n" ] } ], "source": [ "import os\n", "import numpy as np\n", "import pandas as pd\n", "import pytorch_lightning as pl\n", "import random\n", "import torch\n", "import emoji\n", "\n", "\n", "import datetime\n", "import numpy as np\n", "import torch.optim as optim\n", "\n", "\n", "import torch.nn as nn\n", "\n", "from torch.utils.data import DataLoader,Dataset,random_split,TensorDataset ,RandomSampler, SequentialSampler\n", "from torchmetrics import Accuracy, F1Score \n", "from sklearn.preprocessing import LabelEncoder\n", "from pytorch_lightning.callbacks import EarlyStopping,ModelCheckpoint\n", "from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n", "from sklearn.model_selection import train_test_split\n", "\n", "from sklearn.preprocessing import LabelEncoder\n", "from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup" ] }, { "cell_type": 
"code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "device(type='cuda', index=0)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.\n", "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "device" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torch.cuda.is_available()" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) generators with one value.\n", "seed_val = 42\n", "random.seed(seed_val)\n", "np.random.seed(seed_val)\n", "torch.manual_seed(seed_val)\n", "torch.cuda.manual_seed_all(seed_val)" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Load dataset\n" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# The load must stay active: every later cell needs these frames on a fresh kernel.\n", "# Paths are relative to the notebook's working directory.\n", "DATA_DIR = 'SemEval-2020 dataset/offenseval2020-turkish/offenseval2020-turkish'\n", "train_raw_df = pd.read_csv(f'{DATA_DIR}/offenseval-tr-training-v1/offenseval-tr-training-v1.tsv', sep='\\t')\n", "test_df = pd.read_csv(f'{DATA_DIR}/offenseval-tr-testset-v1/offenseval-tr-testset-v1.tsv', sep='\\t')" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# ignore_index=True renumbers the combined rows 0..n-1. Without it both frames keep their own\n", "# 0-based labels, so a later label-based drop of a test row also removes the training row\n", "# that shares the label.\n", "# Building train_df from train_raw_df (instead of from itself) keeps this cell re-runnable.\n", "train_df = pd.concat([train_raw_df, test_df], axis=0, ignore_index=True).drop(columns=['id'])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "subtask_a\n", "NOT 25231\n", "OFF 6046\n", "Name: count, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df['subtask_a'].value_counts()" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def clean_tweet(text):\n", "    \"\"\"Remove '@USER', '#' and '$' from a tweet, then pass it through emoji.demojize.\"\"\"\n", "    for token in ('@USER', '#', '$'):\n", "        text = text.replace(token, '')\n", "    return emoji.demojize(text)\n", "\n", "\n", "data = [clean_tweet(tweet) for tweet in train_df['tweet'].tolist()]\n", "train_df['tweet'] = data" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Replace the string labels with integer codes; the fitted encoder is kept in `lab`.\n", "lab = LabelEncoder()\n", "train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "subtask_a\n", "0 25231\n", "1 6046\n", "2 3515\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df['subtask_a'].value_counts()" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Label 2 only appears after encoding (the raw counts above list just NOT and OFF), so it\n", "# presumably marks rows whose label was missing -- TODO confirm against lab.classes_.\n", "# Filter with a boolean mask rather than by index label so that only those rows are removed.\n", "train_df = train_df[train_df['subtask_a'] != 2].reset_index(drop=True)\n", "assert train_df['subtask_a'].isin([0, 1]).all()" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "train_df['subtask_a'].value_counts()" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | tweet | \n", "subtask_a | \n", "
---|---|---|
3515 | \n", "holstein ineği (alacalı siyah-beyaz inek, yani... | \n", "0 | \n", "
3516 | \n", "Haaaa. O zaman oylar Binali'ye demek. | \n", "0 | \n", "
3517 | \n", "Disk genel merkez yönetimine HDP'nin hiç etki... | \n", "0 | \n", "
3518 | \n", "Bir insanı zorla kaliteli yapamazsın. Sen elin... | \n", "0 | \n", "
3519 | \n", "Sus yaa açtım sonra korkudan telefon elimden ... | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "
31272 | \n", "Bu ödül sunan kızı kim giydirdiyse, kızın en b... | \n", "0 | \n", "
31273 | \n", "Bunu sana beddua olarak etmiyorum bunlar ilerd... | \n", "0 | \n", "
31274 | \n", "CHP'liler sandıkları bırakmıyor üstüne oturmuş... | \n", "1 | \n", "
31275 | \n", "karanlığın içinde yalnız kalsam ne oluuuuurr | \n", "0 | \n", "
31276 | \n", "Ne yalan söyleyeyim bu haftalıkta fitil olara... | \n", "0 | \n", "
27762 rows × 2 columns
\n", "