{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Evaluation of the Models"]},{"cell_type":"code","execution_count":73,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-05-06T20:02:03.387301Z","iopub.status.busy":"2024-05-06T20:02:03.386545Z","iopub.status.idle":"2024-05-06T20:02:03.412072Z","shell.execute_reply":"2024-05-06T20:02:03.411093Z","shell.execute_reply.started":"2024-05-06T20:02:03.387269Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["/kaggle/input/nlp-project-test/test.csv\n","/kaggle/input/train-augmented/train_augmented.csv\n","/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/config.json\n","/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/training_args.bin\n","/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/model.safetensors\n","/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/generation_config.json\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/config.json\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/trainer_state.json\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/training_args.bin\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/scheduler.pt\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/model.safetensors\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/optimizer.pt\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/rng_state.pth\n","/kaggle/input/turkish-t5/pytorch/v1.1/1/model/generation_config.json\n"]}],"source":["# # This Python 3 environment comes with many helpful analytics libraries installed\n","# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n","# # For example, here's several helpful packages to load\n","\n","# import numpy as np # linear algebra\n","# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","\n","# # Input data files are available in the read-only \"../input/\" directory\n","# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n","\n","# import os\n","# for dirname, _, filenames in os.walk('/kaggle/input'):\n","# for filename in filenames:\n","# print(os.path.join(dirname, filename))\n","\n","# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n","# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"]},{"cell_type":"markdown","metadata":{},"source":["## Importing Libraries\n"]},{"cell_type":"code","execution_count":74,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:04.080878Z","iopub.status.busy":"2024-05-06T20:02:04.080233Z","iopub.status.idle":"2024-05-06T20:02:04.085997Z","shell.execute_reply":"2024-05-06T20:02:04.084835Z","shell.execute_reply.started":"2024-05-06T20:02:04.080846Z"},"trusted":true},"outputs":[],"source":["import torch\n","from transformers import (\n"," AutoTokenizer,\n"," AutoModelForSeq2SeqLM,\n"," AutoModelForTokenClassification,\n"," pipeline)\n","\n","import pandas as pd"]},{"cell_type":"markdown","metadata":{},"source":["## Uploading Pretrained NER Model"]},{"cell_type":"code","execution_count":75,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:04.491063Z","iopub.status.busy":"2024-05-06T20:02:04.490671Z","iopub.status.idle":"2024-05-06T20:02:05.746532Z","shell.execute_reply":"2024-05-06T20:02:05.745397Z","shell.execute_reply.started":"2024-05-06T20:02:04.491034Z"},"trusted":true},"outputs":[],"source":["ner_model = AutoModelForTokenClassification.from_pretrained(\"akdeniz27/bert-base-turkish-cased-ner\",device_map = \"cuda:0\") # pretrained ner model \n","ner_tokenizer = AutoTokenizer.from_pretrained(\"akdeniz27/bert-base-turkish-cased-ner\") # pretrained ner tokenizer\n","ner = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy=\"first\") # ner pipeline\n"]},{"cell_type":"code","execution_count":76,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:05.749750Z","iopub.status.busy":"2024-05-06T20:02:05.748895Z","iopub.status.idle":"2024-05-06T20:02:05.754301Z","shell.execute_reply":"2024-05-06T20:02:05.753358Z","shell.execute_reply.started":"2024-05-06T20:02:05.749713Z"},"trusted":true},"outputs":[],"source":["model_dir = \"/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last\" # our model "]},{"cell_type":"code","execution_count":77,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:05.756941Z","iopub.status.busy":"2024-05-06T20:02:05.755806Z","iopub.status.idle":"2024-05-06T20:02:05.764492Z","shell.execute_reply":"2024-05-06T20:02:05.763371Z","shell.execute_reply.started":"2024-05-06T20:02:05.756876Z"},"trusted":true},"outputs":[],"source":["test_dataset_dir = \"/kaggle/input/nlp-project-test/test.csv\" # dataset location"]},{"cell_type":"code","execution_count":79,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:06.191431Z","iopub.status.busy":"2024-05-06T20:02:06.190727Z","iopub.status.idle":"2024-05-06T20:02:06.201149Z","shell.execute_reply":"2024-05-06T20:02:06.200218Z","shell.execute_reply.started":"2024-05-06T20:02:06.191389Z"},"trusted":true},"outputs":[],"source":["df_test = pd.read_csv(test_dataset_dir) # read dataset"]},{"cell_type":"markdown","metadata":{},"source":["## Testing NER Model Usage"]},{"cell_type":"code","execution_count":87,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:11.544600Z","iopub.status.busy":"2024-05-06T20:02:11.543964Z","iopub.status.idle":"2024-05-06T20:02:11.589457Z","shell.execute_reply":"2024-05-06T20:02:11.588309Z","shell.execute_reply.started":"2024-05-06T20:02:11.544567Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[{'entity_group': 'LOC',\n"," 'score': 0.57057905,\n"," 'word': 'franken',\n"," 'start': 285,\n"," 'end': 292},\n"," {'entity_group': 'PER',\n"," 'score': 0.990382,\n"," 'word': 'Joe',\n"," 'start': 383,\n"," 'end': 386},\n"," {'entity_group': 'PER',\n"," 'score': 0.98566425,\n"," 'word': 'Marley',\n"," 'start': 415,\n"," 'end': 421},\n"," {'entity_group': 'LOC',\n"," 'score': 0.9250223,\n"," 'word': 'Roma´ya',\n"," 'start': 545,\n"," 'end': 552},\n"," {'entity_group': 'PER',\n"," 'score': 0.9943376,\n"," 'word': 'Sammy´nin',\n"," 'start': 612,\n"," 'end': 621},\n"," {'entity_group': 'LOC',\n"," 'score': 0.9336442,\n"," 'word': 'Istanbul',\n"," 'start': 717,\n"," 'end': 725},\n"," {'entity_group': 'PER',\n"," 'score': 0.9977376,\n"," 'word': 'Abraham Lincoln',\n"," 'start': 775,\n"," 'end': 790}]"]},"execution_count":87,"metadata":{},"output_type":"execute_result"}],"source":["ner(df_test['Sentence'].values[167]) # testing ner pipeline"]},{"cell_type":"markdown","metadata":{},"source":["## Uploading Our Fine-Tuned Transformers Model"]},{"cell_type":"code","execution_count":82,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:07.439647Z","iopub.status.busy":"2024-05-06T20:02:07.438955Z","iopub.status.idle":"2024-05-06T20:02:08.592399Z","shell.execute_reply":"2024-05-06T20:02:08.591574Z","shell.execute_reply.started":"2024-05-06T20:02:07.439616Z"},"trusted":true},"outputs":[],"source":["model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, device_map=\"cuda:0\") # load model "]},{"cell_type":"code","execution_count":83,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:08.594453Z","iopub.status.busy":"2024-05-06T20:02:08.594143Z","iopub.status.idle":"2024-05-06T20:02:08.598606Z","shell.execute_reply":"2024-05-06T20:02:08.597684Z","shell.execute_reply.started":"2024-05-06T20:02:08.594427Z"},"trusted":true},"outputs":[],"source":["tokenizer_dir = \"Turkish-NLP/t5-efficient-small-turkish\" # tokenizer location "]},{"cell_type":"code","execution_count":84,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:08.600343Z","iopub.status.busy":"2024-05-06T20:02:08.600006Z","iopub.status.idle":"2024-05-06T20:02:08.608039Z","shell.execute_reply":"2024-05-06T20:02:08.606966Z","shell.execute_reply.started":"2024-05-06T20:02:08.600313Z"},"trusted":true},"outputs":[],"source":["device = torch.device('cuda:0') # device "]},{"cell_type":"code","execution_count":85,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:09.509859Z","iopub.status.busy":"2024-05-06T20:02:09.509503Z","iopub.status.idle":"2024-05-06T20:02:09.920290Z","shell.execute_reply":"2024-05-06T20:02:09.919467Z","shell.execute_reply.started":"2024-05-06T20:02:09.509822Z"},"trusted":true},"outputs":[],"source":["tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) # load tokenizer "]},{"cell_type":"markdown","metadata":{},"source":["## Adding the Special Tokens"]},{"cell_type":"code","execution_count":86,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:10.931548Z","iopub.status.busy":"2024-05-06T20:02:10.931164Z","iopub.status.idle":"2024-05-06T20:02:11.080444Z","shell.execute_reply":"2024-05-06T20:02:11.079391Z","shell.execute_reply.started":"2024-05-06T20:02:10.931519Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["q not in vocab\n","( not in vocab\n","° not in vocab\n","[ not in vocab\n","´ not in vocab\n","] not in vocab\n","{ not in vocab\n","} not in vocab\n","& not in vocab\n"]}],"source":["missing_tokens = [\"q\",\"(\",\"°\",\"[\",\"´\",\"]\",\"{\",\"}\",\"&\"] \n","for i in missing_tokens: \n"," if i not in tokenizer.vocab.keys():\n"," print(f\"{i} not in vocab\")"]},{"cell_type":"code","execution_count":89,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:02:17.127810Z","iopub.status.busy":"2024-05-06T20:02:17.126948Z","iopub.status.idle":"2024-05-06T20:02:17.133819Z","shell.execute_reply":"2024-05-06T20:02:17.132677Z","shell.execute_reply.started":"2024-05-06T20:02:17.127777Z"},"trusted":true},"outputs":[],"source":["for i in missing_tokens: \n"," tokenizer.add_tokens(i)"]},{"cell_type":"markdown","metadata":{},"source":["## Testing Function"]},{"cell_type":"code","execution_count":94,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:04:08.374259Z","iopub.status.busy":"2024-05-06T20:04:08.373829Z","iopub.status.idle":"2024-05-06T20:04:08.380610Z","shell.execute_reply":"2024-05-06T20:04:08.379473Z","shell.execute_reply.started":"2024-05-06T20:04:08.374218Z"},"trusted":true},"outputs":[],"source":["def generate_result(text):\n"," prefix = \"Correct diacritics for : \"\n"," postfix = \" \"\n"," text = prefix + text + postfix\n"," \n"," tokenizer.truncation_side = \"left\"\n"," batch = tokenizer(text, return_tensors='pt', max_length = 64, truncation = False).to(device)\n"," result = model.generate(**batch, max_new_tokens = 128)\n"," result = tokenizer.batch_decode(result)\n"," \n"," return str(result[0])"]},{"cell_type":"markdown","metadata":{},"source":["## Processing Functions before Testing"]},{"cell_type":"code","execution_count":182,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:36:12.529255Z","iopub.status.busy":"2024-05-06T20:36:12.528564Z","iopub.status.idle":"2024-05-06T20:36:12.543577Z","shell.execute_reply":"2024-05-06T20:36:12.542475Z","shell.execute_reply.started":"2024-05-06T20:36:12.529220Z"},"trusted":true},"outputs":[],"source":["# json\n","import re \n","\n","def ner_predict_mapping(text, threshold=0.3):\n"," result = ner(text)\n"," if len(result) == 0:\n"," return []\n"," else:\n"," special_words = [result[\"word\"] for result in result if result[\"score\"] > threshold]\n"," special_words_ = []\n"," for word_ in special_words:\n"," if word_.lower()[0] == \"i\":\n"," word_ = word_.replace(\"I\",\"İ\")\n"," if len(word_.split()) > 1:\n"," special_words_.extend(word_.split())\n"," else:\n"," special_words_.append(word_)\n"," \n"," return special_words_\n"," \n","def split_text_into_n_worded_chunks(text, n):\n"," words = text.split()\n"," chunks = []\n"," for i in range(0, len(words), n):\n"," chunks.append(' '.join(words[i:i+n]))\n"," last_chunk_words = len(words) % n\n"," if last_chunk_words != 0:\n"," chunks[-1] = ' '.join(words[-last_chunk_words:])\n"," return chunks\n","\n","def chunk_2(text):\n"," chunks = split_text_into_n_worded_chunks(text, 2)\n"," processed_chunks = [re.sub(r'([\"q(°\\[\\]{}&´])\\s+', r'\\1',generate_result(chunk)) for chunk in chunks] \n"," result = ' '.join(processed_chunks)\n"," return result.replace(\"\",\"\").replace(\"\",\"\").replace(\" \",\" \")\n","\n","def chunk_1(text): \n"," chunks = split_text_into_n_worded_chunks(text, 1)\n"," processed_chunks = [generate_result(chunk).replace(\" \",\"\") for chunk in chunks]\n"," result = ''.join(processed_chunks)\n"," return result.replace(\"\",\" \").replace(\"\",\"\")\n","\n","def process_text(text):\n"," words = ner_predict_mapping(text)\n"," two_chunk = chunk_2(text)\n"," one_chunk = chunk_1(text)\n"," if len(one_chunk.split()) != len(two_chunk.split()):\n"," for word in words: \n"," one_chunk = one_chunk.replace(word.lower().replace('i̇',\"i\"),word)\n"," return one_chunk\n"," else: \n"," for word in words: \n"," two_chunk = two_chunk.replace(word.lower().replace('i̇',\"i\"),word)\n"," return two_chunk"]},{"cell_type":"code","execution_count":191,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:36:51.027557Z","iopub.status.busy":"2024-05-06T20:36:51.027167Z","iopub.status.idle":"2024-05-06T20:59:13.613401Z","shell.execute_reply":"2024-05-06T20:59:13.612486Z","shell.execute_reply.started":"2024-05-06T20:36:51.027527Z"},"trusted":true},"outputs":[],"source":["df_test[\"Result\"] = df_test[\"Sentence\"].apply(process_text) # apply preprocessing to the dataset"]},{"cell_type":"code","execution_count":192,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:59:21.080714Z","iopub.status.busy":"2024-05-06T20:59:21.080065Z","iopub.status.idle":"2024-05-06T20:59:21.099328Z","shell.execute_reply":"2024-05-06T20:59:21.098441Z","shell.execute_reply.started":"2024-05-06T20:59:21.080684Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
IDSentenceResult
00tr ekonomi ve politika haberleri turkiye nin ...tr ekonomi ve politika haberleri türkiye nin ...
11uye girisiüye girişi
22son guncelleme 12:12son güncelleme 12:12
33Imrali Mit gorusmesi ihtiyac duyuldukca oluyorimralı Mit görüşmesi ihtiyaç duyuldukça oluyor
44Suriye deki silahli selefi muhalifler yeni ku...Suriye deki silahlı selefi muhalifler yeni ku...
............
11521152Yuregir Adana ilimize ait sirin bir ilcediryüreğir Adana ilimize ait şirin bir ilçedir
11531153yuze guluculugun at oynattigi bir aydinlar ort...yüze gülücülüğün at oynattığı bir aydınlar or...
11541154zavalli adami oracikta astilar ve hic kimse se...zavallı adamı oracıkta astılar ve hiç kimse s...
11551155zengin cocuklarina ariz munasebetsizlikler fak...zengin çocuklarına ariz münakaşsizlikler faki...
11561156senin acin hepimizin acisidirsenin açın hepimizin acısıdır
\n","

1157 rows × 3 columns

\n","
"],"text/plain":[" ID Sentence \\\n","0 0 tr ekonomi ve politika haberleri turkiye nin ... \n","1 1 uye girisi \n","2 2 son guncelleme 12:12 \n","3 3 Imrali Mit gorusmesi ihtiyac duyuldukca oluyor \n","4 4 Suriye deki silahli selefi muhalifler yeni ku... \n","... ... ... \n","1152 1152 Yuregir Adana ilimize ait sirin bir ilcedir \n","1153 1153 yuze guluculugun at oynattigi bir aydinlar ort... \n","1154 1154 zavalli adami oracikta astilar ve hic kimse se... \n","1155 1155 zengin cocuklarina ariz munasebetsizlikler fak... \n","1156 1156 senin acin hepimizin acisidir \n","\n"," Result \n","0 tr ekonomi ve politika haberleri türkiye nin ... \n","1 üye girişi \n","2 son güncelleme 12:12 \n","3 imralı Mit görüşmesi ihtiyaç duyuldukça oluyor \n","4 Suriye deki silahlı selefi muhalifler yeni ku... \n","... ... \n","1152 yüreğir Adana ilimize ait şirin bir ilçedir \n","1153 yüze gülücülüğün at oynattığı bir aydınlar or... \n","1154 zavallı adamı oracıkta astılar ve hiç kimse s... \n","1155 zengin çocuklarına ariz münakaşsizlikler faki... \n","1156 senin açın hepimizin acısıdır \n","\n","[1157 rows x 3 columns]"]},"execution_count":192,"metadata":{},"output_type":"execute_result"}],"source":["df_test"]},{"cell_type":"code","execution_count":193,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T20:59:27.981615Z","iopub.status.busy":"2024-05-06T20:59:27.981246Z","iopub.status.idle":"2024-05-06T20:59:28.001820Z","shell.execute_reply":"2024-05-06T20:59:28.000894Z","shell.execute_reply.started":"2024-05-06T20:59:27.981586Z"},"trusted":true},"outputs":[],"source":["df_test.to_csv(\"/kaggle/working/test_designed2.csv\") # save the result "]},{"cell_type":"code","execution_count":194,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T21:00:22.851347Z","iopub.status.busy":"2024-05-06T21:00:22.850931Z","iopub.status.idle":"2024-05-06T21:00:22.856811Z","shell.execute_reply":"2024-05-06T21:00:22.855676Z","shell.execute_reply.started":"2024-05-06T21:00:22.851317Z"},"trusted":true},"outputs":[],"source":["df_test['Sentence'] = df_test['Result'] # making the result the input for the competition"]},{"cell_type":"code","execution_count":195,"metadata":{"execution":{"iopub.execute_input":"2024-05-06T21:00:49.554221Z","iopub.status.busy":"2024-05-06T21:00:49.553096Z","iopub.status.idle":"2024-05-06T21:00:49.560833Z","shell.execute_reply":"2024-05-06T21:00:49.559888Z","shell.execute_reply.started":"2024-05-06T21:00:49.554186Z"},"trusted":true},"outputs":[],"source":["df_test = df_test.drop(columns= ['Result'])"]},{"cell_type":"markdown","metadata":{},"source":["## Saving the Test Results"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["df_test.to_csv('testv3.csv', index = False) # save the result"]}],"metadata":{"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"datasetId":4868624,"sourceId":8214383,"sourceType":"datasetVersion"},{"datasetId":4943387,"sourceId":8321889,"sourceType":"datasetVersion"},{"isSourceIdPinned":true,"modelInstanceId":36800,"sourceId":43823,"sourceType":"modelInstanceVersion"},{"isSourceIdPinned":true,"modelInstanceId":37084,"sourceId":44158,"sourceType":"modelInstanceVersion"}],"dockerImageVersionId":30699,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.19"}},"nbformat":4,"nbformat_minor":4}