{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "e68b6e6997844bf788a057f9c7feedfb": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_295e4080ccd64e48806a36b83e50ddfa", "IPY_MODEL_c4025862f06b412cb99165b67ad7daae", "IPY_MODEL_5ac369dab692489cb13cdb664c47fd96" ], "layout": "IPY_MODEL_434aa0b7bd76440d9b9b64d8b53133d3" } }, "295e4080ccd64e48806a36b83e50ddfa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9e2a1fea814f408ebb4d15db83b1130b", "placeholder": "​", "style": "IPY_MODEL_4a2f178864244d68bd915ee57379251d", "value": "Map: 100%" } }, "c4025862f06b412cb99165b67ad7daae": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", 
"description": "", "description_tooltip": null, "layout": "IPY_MODEL_7125f94d482a46999fd4dd3be1b3e87e", "max": 1148, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_96486cdef9714482a4ffa2aca1b3628b", "value": 1148 } }, "5ac369dab692489cb13cdb664c47fd96": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2364eb3ce5b345788902c5f9d316a00a", "placeholder": "​", "style": "IPY_MODEL_52f799ea10d4403cb18e33ba80d739d3", "value": " 1148/1148 [00:01<00:00, 781.70 examples/s]" } }, "434aa0b7bd76440d9b9b64d8b53133d3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, 
"width": null } }, "9e2a1fea814f408ebb4d15db83b1130b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4a2f178864244d68bd915ee57379251d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7125f94d482a46999fd4dd3be1b3e87e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "96486cdef9714482a4ffa2aca1b3628b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "2364eb3ce5b345788902c5f9d316a00a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "52f799ea10d4403cb18e33ba80d739d3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3e18acb6f1504f4dace716a96e8d90f4": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_953e7d76140e4ed2ade688ccd5467a75", "IPY_MODEL_3a70d75b4eb949598e7cb9430acfcf81", "IPY_MODEL_54719990ff1f40cb8fed06badb378d01" ], "layout": "IPY_MODEL_5d1be2eaa2c143bbbc35f7d0f33f64de" } }, "953e7d76140e4ed2ade688ccd5467a75": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_002c9d35efa54fccb875a08e7059997f", "placeholder": "​", "style": "IPY_MODEL_21dd8d7b7e5a4e27922ff1e3bec7745a", 
"value": "Map: 100%" } }, "3a70d75b4eb949598e7cb9430acfcf81": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_48abc963896a404886fbcf75b0b19bb9", "max": 287, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_87e3a17419334bf8b2448a8914f9d721", "value": 287 } }, "54719990ff1f40cb8fed06badb378d01": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f8303a91b4084791971947ca45c6b459", "placeholder": "​", "style": "IPY_MODEL_a878599cc49347a896c793f3c45914e3", "value": " 287/287 [00:00<00:00, 556.23 examples/s]" } }, "5d1be2eaa2c143bbbc35f7d0f33f64de": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "002c9d35efa54fccb875a08e7059997f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "21dd8d7b7e5a4e27922ff1e3bec7745a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "48abc963896a404886fbcf75b0b19bb9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "87e3a17419334bf8b2448a8914f9d721": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "f8303a91b4084791971947ca45c6b459": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a878599cc49347a896c793f3c45914e3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SRajt-tUH3ms", "outputId": "f6077695-1508-4b60-b33a-7a29f37b4c75" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.14.4)\n", "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (0.4.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Requirement already satisfied: 
huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.16.4)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.3.0)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n", "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.5)\n", "Requirement already satisfied: responses<0.19 in 
/usr/local/lib/python3.10/dist-packages (from evaluate) (0.18.0)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.2.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.7.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n" ] } ], "source": [ "! 
# %% Load the tokenizer used by every later tokenisation step
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# %% Load the essays, bin the IELTS band into three classes, split train/test
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split is identical after "Restart & Run All".
SEED = 42

# Right-inclusive bin edges over the raw band score:
#   (-inf, 5.0] -> class 0,  (5.0, 7.5] -> class 1,  (7.5, +inf) -> class 2
# For every band the original replace() chain listed (1, 3..9) this yields the
# same class. Bands it did NOT list (e.g. 2 or 2.5) used to pass through
# unchanged and were then cast to int, so a band-2 essay landed in class 2.
# Binning by threshold puts them in class 0 instead.
BAND_EDGES = [float("-inf"), 5.0, 7.5, float("inf")]
BAND_CLASSES = [0, 1, 2]

data = pd.read_csv("ielts_writing_dataset_new.csv")

# Fail loudly on missing bands instead of letting them reach the integer cast.
if data["label"].isna().any():
    raise ValueError("ielts_writing_dataset_new.csv contains rows with a missing label")

data["label"] = pd.cut(
    data["label"], bins=BAND_EDGES, labels=BAND_CLASSES
).astype(int)

train, test = train_test_split(data, test_size=0.2, random_state=SEED)

# %% Preview the first rows (label is now the class id, not the raw band)
data[:10]
It is t...\n", "2 0 The left chart shows the population change hap...\n", "3 1 Human beings are facing many challenges nowada...\n", "4 1 Information about the thousands of visits from...\n", "5 1 Whether countries should only invest facilitie...\n", "6 1 This graph depicts the changes in tourists vis...\n", "7 1 Sports is an essential part to most of us , so...\n", "8 2 The line graph illustrates the number of overs...\n", "9 2 International sports events require the most w..." ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labeltext
01Between 1995 and 2010, a study was conducted r...
11Poverty represents a worldwide crisis. It is t...
20The left chart shows the population change hap...
31Human beings are facing many challenges nowada...
41Information about the thousands of visits from...
51Whether countries should only invest facilitie...
61This graph depicts the changes in tourists vis...
71Sports is an essential part to most of us , so...
82The line graph illustrates the number of overs...
92International sports events require the most w...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", "\n", " \n", "
\n", "
# %% Wrap the pandas splits in a Hugging Face DatasetDict
import datasets
from datasets import Dataset, DatasetDict

# `train` / `test` stay pandas DataFrames: the original cell rebound them to
# Dataset objects, so running the cell a second time failed because
# Dataset.from_pandas was handed a Dataset. Building the DatasetDict in a single
# expression keeps this cell safe to re-run.
#
# preserve_index=False stops the shuffled DataFrame index from being stored as
# an extra "__index_level_0__" column, so no remove_columns() call is needed
# afterwards.
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train, preserve_index=False),
        "test": Dataset.from_pandas(test, preserve_index=False),
    }
)
dataset

# %% Inspect one held-out example
dataset["test"][0]
# %% Tokenisation helper
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Called through ``dataset.map(..., batched=True)`` in the next cell, so
    ``examples["text"]`` is presumably a list of essay strings (confirm against
    the ``datasets`` batching behaviour if this is reused elsewhere).

    Truncation is enabled; no padding is requested here.

    Returns whatever the tokenizer call returns for that input.
    """
    essays = examples["text"]
    encoded = tokenizer(essays, truncation=True)
    return encoded

# %% Tokenize both splits
tokenized_dataset = dataset.map(preprocess_function, batched=True)
"f8303a91b4084791971947ca45c6b459", "a878599cc49347a896c793f3c45914e3" ] }, "id": "0-Api6H3Mcqc", "outputId": "5fc02809-9cda-48da-9ac4-fe34f1742c22" }, "execution_count": 8, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Map: 0%| | 0/1148 [00:00=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.16.4)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.31.0)\n", "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.13.3)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.3.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (4.66.1)\n", "Requirement already satisfied: torch!=1.12.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.0.1+cu118)\n", "Requirement already satisfied: accelerate>=0.20.3 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.21.0)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.20.3->transformers[torch]) (5.9.5)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers[torch]) (2023.6.0)\n", "Requirement 
# %% Trainer subclass with an explicit cross-entropy loss
from torch import nn


class ClassificationTrainer(Trainer):
    """``Trainer`` variant that computes the classification loss itself.

    The loss is an unweighted ``nn.CrossEntropyLoss`` over the model's
    ``logits``. ``compute_loss`` is the place to add class weights later.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the cross-entropy loss.

        Args:
            model: The model to call; invoked as ``model(**inputs)``.
            inputs: Dict-like batch. The targets are taken from ``"labels"``,
                falling back to ``"label"``, and are removed from ``inputs``
                before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only ``loss``.
            num_items_in_batch: Accepted and ignored. Newer ``transformers``
                releases pass it to ``compute_loss``; the default keeps the
                method callable the old way.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            KeyError: If the batch has neither ``"labels"`` nor ``"label"``.
        """
        # Trainer collators normally deliver the targets as "labels"; the
        # original code only looked for "label" and would get None from them.
        labels = inputs.pop("labels", None)
        if labels is None:
            labels = inputs.pop("label", None)
        if labels is None:
            raise KeyError("batch contains neither 'labels' nor 'label'")

        outputs = model(**inputs)
        # The original called outputs.unsqueeze(1) here, but `outputs` is the
        # dict-like object read with .get("logits") on the next line, not a
        # tensor, so that call could only fail. It has been removed.
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss()
        # view(-1, C) / view(-1) keep the batch axis even for a batch of one;
        # squeeze() would collapse it and break the loss call.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# %% Configure and launch fine-tuning
training_args = TrainingArguments(
    output_dir="essayl0",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule so that
    # load_best_model_at_end can pick among the saved checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): this still instantiates the stock Trainer, so the
# ClassificationTrainer defined above is never exercised. Swap the class name
# here if the custom loss is meant to be used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n", " \n", " \n", " [1080/1080 29:09, Epoch 15/15]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracy
1No log0.6014370.752613
2No log0.4442180.860627
3No log0.5106110.815331
4No log0.7232150.766551
5No log0.5562840.850174
6No log0.7834230.794425
70.2758000.7359230.850174
80.2758000.6547910.878049
90.2758000.6335030.888502
100.2758001.1050060.783972
110.2758000.7101190.878049
120.2758000.7923140.839721
130.2758000.8634350.843206
140.0185000.8345550.843206
150.0185000.8648100.832753

" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=1080, training_loss=0.13700703542541576, metrics={'train_runtime': 1752.9066, 'train_samples_per_second': 9.824, 'train_steps_per_second': 0.616, 'total_flos': 4194210824632584.0, 'train_loss': 0.13700703542541576, 'epoch': 15.0})" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "# Archive the checkpoint selected by load_best_model_at_end; checkpoint-1080 is only\n", "# the final epoch and is used as a fallback when no best checkpoint was recorded.\n", "best_checkpoint = trainer.state.best_model_checkpoint or \"/content/essayl0/checkpoint-1080/\"\n", "!zip -r /content/checkpoint.zip {best_checkpoint}" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "s6wG4purBmfX", "outputId": "3363587c-a6e3-4a40-db80-73d6eaf26cf7" }, "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " adding: content/essayl0/checkpoint-1080/ (stored 0%)\n", " adding: content/essayl0/checkpoint-1080/special_tokens_map.json (deflated 42%)\n", " adding: content/essayl0/checkpoint-1080/rng_state.pth (deflated 28%)\n", " adding: content/essayl0/checkpoint-1080/vocab.txt (deflated 53%)\n", " adding: content/essayl0/checkpoint-1080/tokenizer.json (deflated 71%)\n", " adding: content/essayl0/checkpoint-1080/config.json (deflated 50%)\n", " adding: content/essayl0/checkpoint-1080/trainer_state.json (deflated 78%)\n", " adding: content/essayl0/checkpoint-1080/pytorch_model.bin (deflated 7%)\n", " adding: content/essayl0/checkpoint-1080/optimizer.pt (deflated 21%)\n", " adding: content/essayl0/checkpoint-1080/training_args.bin (deflated 48%)\n", " adding: content/essayl0/checkpoint-1080/tokenizer_config.json (deflated 43%)\n", " adding: content/essayl0/checkpoint-1080/scheduler.pt (deflated 49%)\n" ] } ] } ] }