{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"datalore":{"computation_mode":"JUPYTER","package_manager":"pip","base_environment":"default","packages":[],"report_row_ids":[],"version":3},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":61542,"databundleVersionId":7516023,"sourceType":"competition"},{"sourceId":2468672,"sourceType":"datasetVersion","datasetId":1455358},{"sourceId":6977472,"sourceType":"datasetVersion","datasetId":4005256}],"dockerImageVersionId":30648,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# LLM - Detect AI Generated Text","metadata":{"datalore":{"node_id":"Pa2yxouvTjqcDsgCYUzPjr","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"markdown","source":"This competition challenges participants to develop a machine learning model that can accurately detect whether an essay was written by a student or an LLM. The competition dataset comprises a mix of student-written essays and essays generated by a variety of LLMs.","metadata":{}},{"cell_type":"markdown","source":"**Evaluation:**\nSubmissions are evaluated on area under the ROC curve between the predicted probability and the observed target.","metadata":{}},{"cell_type":"markdown","source":"## 1. Import libraries","metadata":{"datalore":{"node_id":"ek6n1GGlnkNFGIJ0DB0C1g","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"!pip install datasets==2.15","metadata":{"datalore":{"node_id":"gLJbvMXVqhcKYPln9YBRIq","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:12:50.593215Z","iopub.execute_input":"2024-02-07T16:12:50.593867Z","iopub.status.idle":"2024-02-07T16:13:03.047070Z","shell.execute_reply.started":"2024-02-07T16:12:50.593836Z","shell.execute_reply":"2024-02-07T16:13:03.045794Z"},"trusted":true},"execution_count":44,"outputs":[{"name":"stderr","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","output_type":"stream"},{"name":"stdout","text":"Requirement already satisfied: datasets==2.15 in /opt/conda/lib/python3.10/site-packages (2.15.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (1.24.4)\nRequirement already satisfied: pyarrow>=8.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (11.0.0)\nRequirement already satisfied: pyarrow-hotfix in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (0.6)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (0.3.7)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (2.1.4)\nRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (2.31.0)\nRequirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (4.66.1)\nRequirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (3.4.1)\nRequirement already satisfied: multiprocess in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (0.70.15)\nRequirement already satisfied: fsspec<=2023.10.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15) (2023.10.0)\nRequirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (3.9.1)\nRequirement already satisfied: huggingface-hub>=0.18.0 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (0.20.3)\nRequirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (21.3)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets==2.15) (6.0.1)\nRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (23.2.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (6.0.4)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (1.9.3)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (1.4.1)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (1.3.1)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets==2.15) (4.0.3)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.18.0->datasets==2.15) (3.13.1)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.18.0->datasets==2.15) (4.9.0)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging->datasets==2.15) (3.1.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15) (3.6)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15) (2023.11.17)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets==2.15) (2.8.2)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets==2.15) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets==2.15) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets==2.15) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nimport re\n\nfrom transformers import PushToHubCallback\nfrom transformers import AutoTokenizer, DataCollatorWithPadding\nfrom transformers import TFAutoModelForSequenceClassification\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.optimizers import Adam\nimport tensorflow as tf\nfrom keras.callbacks import EarlyStopping\nimport datasets\n\nfrom sklearn.metrics import ConfusionMatrixDisplay\nfrom sklearn.metrics import classification_report, f1_score\n","metadata":{"datalore":{"node_id":"0tAJiErffFXiL5lrISxce7","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.049654Z","iopub.execute_input":"2024-02-07T16:13:03.049987Z","iopub.status.idle":"2024-02-07T16:13:03.059305Z","shell.execute_reply.started":"2024-02-07T16:13:03.049953Z","shell.execute_reply":"2024-02-07T16:13:03.058441Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import notebook_login\n\nnotebook_login()","metadata":{"execution":{"iopub.status.busy":"2024-02-07T16:13:03.060453Z","iopub.execute_input":"2024-02-07T16:13:03.060739Z","iopub.status.idle":"2024-02-07T16:13:03.097365Z","shell.execute_reply.started":"2024-02-07T16:13:03.060718Z","shell.execute_reply":"2024-02-07T16:13:03.096453Z"},"trusted":true},"execution_count":46,"outputs":[{"output_type":"display_data","data":{"text/plain":"VBox(children=(HTML(value='
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idprompt_idtextgeneratedtext_len
70482131f681This essay will analyze, discuss and prove one...11356
493606ec5420I think limiting car usage is a great idea for...01486
1115cbc48dd70Zroom! Cars have been developing for hundreds ...01492
74086fe4f181I strongly believe that the Electoral College ...11500
1337f81d371d1Dear, senator I believe the electoral college ...01595
..................
1082c3e2e9e50Driving is the primary way of transportation, ...06957
175223bbf180When limiting car usage the first thing that m...07190
326405242180As the global concern for the environment incr...07373
9715f7ea581Dear Senator, Concerning the topic of the meri...08033
821947b8cca1To tohe stoatoe and tohe stoatoe's countory, t...08436
\n

1378 rows × 5 columns

\n"},"metadata":{}}]},{"cell_type":"code","source":"train_essays['text_len'].describe()","metadata":{"datalore":{"node_id":"BpbeG8fdc0uRS2KG0Apoht","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.196036Z","iopub.execute_input":"2024-02-07T16:13:03.196404Z","iopub.status.idle":"2024-02-07T16:13:03.210475Z","shell.execute_reply.started":"2024-02-07T16:13:03.196371Z","shell.execute_reply":"2024-02-07T16:13:03.209616Z"},"trusted":true},"execution_count":50,"outputs":[{"execution_count":50,"output_type":"execute_result","data":{"text/plain":"count 1378.000000\nmean 3169.050798\nstd 920.588198\nmin 1356.000000\n25% 2554.250000\n50% 2985.500000\n75% 3623.750000\nmax 8436.000000\nName: text_len, dtype: float64"},"metadata":{}}]},{"cell_type":"code","source":"ax = sns.countplot(data=train_essays, x='generated')\nax.bar_label(ax.containers[0])\nplt.title('Distribution of texts');","metadata":{"datalore":{"node_id":"1j0urtcgAun9NZKxouvwdA","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.211689Z","iopub.execute_input":"2024-02-07T16:13:03.212001Z","iopub.status.idle":"2024-02-07T16:13:03.413478Z","shell.execute_reply.started":"2024-02-07T16:13:03.211978Z","shell.execute_reply":"2024-02-07T16:13:03.412536Z"},"trusted":true},"execution_count":51,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"code","source":"generated_essays = train_essays[train_essays['generated'] == 1]\ngenerated_essays","metadata":{"datalore":{"node_id":"PoYwJW5KtkraGRQiCmLFge","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.414733Z","iopub.execute_input":"2024-02-07T16:13:03.415006Z","iopub.status.idle":"2024-02-07T16:13:03.425880Z","shell.execute_reply.started":"2024-02-07T16:13:03.414982Z","shell.execute_reply":"2024-02-07T16:13:03.424958Z"},"trusted":true},"execution_count":52,"outputs":[{"execution_count":52,"output_type":"execute_result","data":{"text/plain":" id prompt_id text \\\n704 82131f68 1 This essay will analyze, discuss and prove one... \n740 86fe4f18 1 I strongly believe that the Electoral College ... \n1262 eafb8a56 0 Limiting car use causes pollution, increases c... \n\n generated text_len \n704 1 1356 \n740 1 1500 \n1262 1 1797 ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idprompt_idtextgeneratedtext_len
70482131f681This essay will analyze, discuss and prove one...11356
74086fe4f181I strongly believe that the Electoral College ...11500
1262eafb8a560Limiting car use causes pollution, increases c...11797
\n
"},"metadata":{}}]},{"cell_type":"code","source":"ax2 = sns.countplot(data=train_essays, x='prompt_id')\nax2.bar_label(ax2.containers[0])\nplt.title('Distribution of prompts');","metadata":{"datalore":{"node_id":"ylPh4tN2nxXxzTwLkufmz4","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.430932Z","iopub.execute_input":"2024-02-07T16:13:03.431331Z","iopub.status.idle":"2024-02-07T16:13:03.627755Z","shell.execute_reply.started":"2024-02-07T16:13:03.431287Z","shell.execute_reply":"2024-02-07T16:13:03.626923Z"},"trusted":true},"execution_count":53,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"markdown","source":"### Exploring test_essays","metadata":{"datalore":{"node_id":"qn2ISuRTmxMZ27LkJpEDDM","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"test_essays","metadata":{"datalore":{"node_id":"swH8B4pdgpa0wOVoBhfbal","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.628834Z","iopub.execute_input":"2024-02-07T16:13:03.629176Z","iopub.status.idle":"2024-02-07T16:13:03.638296Z","shell.execute_reply.started":"2024-02-07T16:13:03.629143Z","shell.execute_reply":"2024-02-07T16:13:03.637369Z"},"trusted":true},"execution_count":54,"outputs":[{"execution_count":54,"output_type":"execute_result","data":{"text/plain":" id prompt_id text\n0 0000aaaa 2 Aaa bbb ccc.\n1 1111bbbb 3 Bbb ccc ddd.\n2 2222cccc 4 CCC ddd eee.","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idprompt_idtext
00000aaaa2Aaa bbb ccc.
11111bbbb3Bbb ccc ddd.
22222cccc4CCC ddd eee.
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":"There is only dummy data in test_essays, after submission the text will be replaced with real text.","metadata":{"datalore":{"node_id":"eCoBMveMWE8eAs95Z2g9oA","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"markdown","source":"### Exploring Train Prompts data","metadata":{"datalore":{"node_id":"urZA7FLe9NfAUqNoKjObON","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"train_prompts","metadata":{"datalore":{"node_id":"jUDH4BTi5z6rvr59VFk343","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.639476Z","iopub.execute_input":"2024-02-07T16:13:03.639753Z","iopub.status.idle":"2024-02-07T16:13:03.654284Z","shell.execute_reply.started":"2024-02-07T16:13:03.639730Z","shell.execute_reply":"2024-02-07T16:13:03.653356Z"},"trusted":true},"execution_count":55,"outputs":[{"execution_count":55,"output_type":"execute_result","data":{"text/plain":" prompt_id prompt_name \\\n0 0 Car-free cities \n1 1 Does the electoral college work? \n\n instructions \\\n0 Write an explanatory essay to inform fellow ci... \n1 Write a letter to your state senator in which ... \n\n source_text \n0 # In German Suburb, Life Goes On Without Cars ... \n1 # What Is the Electoral College? by the Office... ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
prompt_idprompt_nameinstructionssource_text
00Car-free citiesWrite an explanatory essay to inform fellow ci...# In German Suburb, Life Goes On Without Cars ...
11Does the electoral college work?Write a letter to your state senator in which ...# What Is the Electoral College? by the Office...
\n
"},"metadata":{}}]},{"cell_type":"code","source":"train_prompts.iloc[0]['instructions']","metadata":{"datalore":{"node_id":"IhBtxcZceIE7zNVgxyRZml","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.655312Z","iopub.execute_input":"2024-02-07T16:13:03.655624Z","iopub.status.idle":"2024-02-07T16:13:03.665807Z","shell.execute_reply.started":"2024-02-07T16:13:03.655594Z","shell.execute_reply":"2024-02-07T16:13:03.664903Z"},"trusted":true},"execution_count":56,"outputs":[{"execution_count":56,"output_type":"execute_result","data":{"text/plain":"'Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.'"},"metadata":{}}]},{"cell_type":"markdown","source":"**Prompt_id = 0**\\\n**prompt_name = Car-free cities**\\\n'Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.'","metadata":{"datalore":{"node_id":"YBADwMdIO4pNpI8MUskJT7","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"train_prompts.iloc[1]['instructions']","metadata":{"datalore":{"node_id":"slJneLtnk2LCqrwBUMShys","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.666700Z","iopub.execute_input":"2024-02-07T16:13:03.666924Z","iopub.status.idle":"2024-02-07T16:13:03.676438Z","shell.execute_reply.started":"2024-02-07T16:13:03.666904Z","shell.execute_reply":"2024-02-07T16:13:03.675611Z"},"trusted":true},"execution_count":57,"outputs":[{"execution_count":57,"output_type":"execute_result","data":{"text/plain":"'Write a letter to your state senator in which you argue in favor of keeping the Electoral College or changing to election by popular vote for the president of the United States. Use the information from the texts in your essay. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to include a claim; address counterclaims; use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your response in the space provided.'"},"metadata":{}}]},{"cell_type":"markdown","source":"**Prompt_id = 1**\\\n**prompt_name = Does the electoral college work?**\\\n'Write a letter to your state senator in which you argue in favor of keeping the Electoral College or changing to election by popular vote for the president of the United States. Use the information from the texts in your essay. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to include a claim; address counterclaims; use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your response in the space provided.'","metadata":{"datalore":{"node_id":"KFNcrRCeP8O0rBaXUUMvFi","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"markdown","source":"## 3. Loading external dataset","metadata":{"datalore":{"node_id":"jD4ZTKf0EQNPKEX9ic8a5l","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"markdown","source":"Since there are only 3 AI generated essays, I need extra dataset with AI generated text.\\\nLuckily there is such data on Kaggle.","metadata":{"datalore":{"node_id":"Yy2tKqV4xjtf2WiwpLJNx5","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"external_essays = pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv')","metadata":{"datalore":{"node_id":"IoEuwVB0cfQC5DSC41JG0V","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:03.677497Z","iopub.execute_input":"2024-02-07T16:13:03.677760Z","iopub.status.idle":"2024-02-07T16:13:04.747705Z","shell.execute_reply.started":"2024-02-07T16:13:03.677738Z","shell.execute_reply":"2024-02-07T16:13:04.746628Z"},"trusted":true},"execution_count":58,"outputs":[]},{"cell_type":"code","source":"external_essays.head(10)","metadata":{"datalore":{"node_id":"ZNPA0op16zbCTzBLwFuwmD","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:04.748974Z","iopub.execute_input":"2024-02-07T16:13:04.749308Z","iopub.status.idle":"2024-02-07T16:13:04.761456Z","shell.execute_reply.started":"2024-02-07T16:13:04.749281Z","shell.execute_reply":"2024-02-07T16:13:04.760478Z"},"trusted":true},"execution_count":59,"outputs":[{"execution_count":59,"output_type":"execute_result","data":{"text/plain":" text label \\\n0 Phones\\n\\nModern humans today are always on th... 0 \n1 This essay will explain if drivers should or s... 0 \n2 Driving while the use of cellular devices\\n\\nT... 0 \n3 Phones & Driving\\n\\nDrivers should not be able... 0 \n4 Cell Phone Operation While Driving\\n\\nThe abil... 0 \n5 Cell phone use should not be legal while drivi... 0 \n6 Phones and Driving\\n\\nDriving is a good way to... 0 \n7 PHONES AND DRIVING\\n\\nIn this world in which w... 0 \n8 People are debating whether if drivers should ... 0 \n9 Texting and driving\\n\\nOver half of drivers in... 0 \n\n prompt_name source RDizzl3_seven \n0 Phones and driving persuade_corpus False \n1 Phones and driving persuade_corpus False \n2 Phones and driving persuade_corpus False \n3 Phones and driving persuade_corpus False \n4 Phones and driving persuade_corpus False \n5 Phones and driving persuade_corpus False \n6 Phones and driving persuade_corpus False \n7 Phones and driving persuade_corpus False \n8 Phones and driving persuade_corpus False \n9 Phones and driving persuade_corpus False ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
textlabelprompt_namesourceRDizzl3_seven
0Phones\\n\\nModern humans today are always on th...0Phones and drivingpersuade_corpusFalse
1This essay will explain if drivers should or s...0Phones and drivingpersuade_corpusFalse
2Driving while the use of cellular devices\\n\\nT...0Phones and drivingpersuade_corpusFalse
3Phones & Driving\\n\\nDrivers should not be able...0Phones and drivingpersuade_corpusFalse
4Cell Phone Operation While Driving\\n\\nThe abil...0Phones and drivingpersuade_corpusFalse
5Cell phone use should not be legal while drivi...0Phones and drivingpersuade_corpusFalse
6Phones and Driving\\n\\nDriving is a good way to...0Phones and drivingpersuade_corpusFalse
7PHONES AND DRIVING\\n\\nIn this world in which w...0Phones and drivingpersuade_corpusFalse
8People are debating whether if drivers should ...0Phones and drivingpersuade_corpusFalse
9Texting and driving\\n\\nOver half of drivers in...0Phones and drivingpersuade_corpusFalse
\n
"},"metadata":{}}]},{"cell_type":"code","source":"external_essays['text_len'] = external_essays['text'].apply(len)\nexternal_essays = external_essays.sort_values('text_len')\nexternal_essays","metadata":{"datalore":{"node_id":"HUEkPkK7SPKGYXJXY3XqOc","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:04.762834Z","iopub.execute_input":"2024-02-07T16:13:04.763088Z","iopub.status.idle":"2024-02-07T16:13:04.815853Z","shell.execute_reply.started":"2024-02-07T16:13:04.763066Z","shell.execute_reply":"2024-02-07T16:13:04.814908Z"},"trusted":true},"execution_count":60,"outputs":[{"execution_count":60,"output_type":"execute_result","data":{"text/plain":" text label \\\n41204 In recent years, there has been a growing trend 1 \n40767 Dear Senator,\\n\\nI am writing in support of k... 1 \n41168 Car usage has long been a significant factor ... 1 \n41167 Limiting car usage is a concept that has gain... 1 \n34960 Passage 1:\\n\\nPassage 2:\\n\\nPassage 3:\\n\\nPass... 1 \n... ... ... \n8895 The author did not do a good job at supporting... 0 \n19290 Dear Senator,\\n\\nI favoring of keeping the Ele... 0 \n1772 This passage is about a germany mom from the s... 0 \n2549 Imagen the streets with no cars empty with onl... 0 \n1517 if we look back at time in the united states y... 0 \n\n prompt_name source \\\n41204 Distance learning mistralai/Mistral-7B-Instruct-v0.1 \n40767 Does the electoral college work? mistralai/Mistral-7B-Instruct-v0.1 \n41168 Car-free cities mistralai/Mistral-7B-Instruct-v0.1 \n41167 Car-free cities mistralai/Mistral-7B-Instruct-v0.1 \n34960 Seeking multiple opinions falcon_180b_v1 \n... ... ... \n8895 Exploring Venus persuade_corpus \n19290 Does the electoral college work? persuade_corpus \n1772 Car-free cities persuade_corpus \n2549 Car-free cities persuade_corpus \n1517 Car-free cities persuade_corpus \n\n RDizzl3_seven text_len \n41204 False 48 \n40767 True 272 \n41168 True 273 \n41167 True 304 \n34960 False 314 \n... ... ... \n8895 True 9980 \n19290 True 10309 \n1772 True 11641 \n2549 True 18125 \n1517 True 18322 \n\n[44868 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
textlabelprompt_namesourceRDizzl3_seventext_len
41204In recent years, there has been a growing trend1Distance learningmistralai/Mistral-7B-Instruct-v0.1False48
40767Dear Senator,\\n\\nI am writing in support of k...1Does the electoral college work?mistralai/Mistral-7B-Instruct-v0.1True272
41168Car usage has long been a significant factor ...1Car-free citiesmistralai/Mistral-7B-Instruct-v0.1True273
41167Limiting car usage is a concept that has gain...1Car-free citiesmistralai/Mistral-7B-Instruct-v0.1True304
34960Passage 1:\\n\\nPassage 2:\\n\\nPassage 3:\\n\\nPass...1Seeking multiple opinionsfalcon_180b_v1False314
.....................
8895The author did not do a good job at supporting...0Exploring Venuspersuade_corpusTrue9980
19290Dear Senator,\\n\\nI favoring of keeping the Ele...0Does the electoral college work?persuade_corpusTrue10309
1772This passage is about a germany mom from the s...0Car-free citiespersuade_corpusTrue11641
2549Imagen the streets with no cars empty with onl...0Car-free citiespersuade_corpusTrue18125
1517if we look back at time in the united states y...0Car-free citiespersuade_corpusTrue18322
\n

44868 rows × 6 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"# Unique values in the columns\ncols_unique = ['label', 'prompt_name', 'source','RDizzl3_seven']\nfor col in cols_unique:\n print(external_essays[col].unique())","metadata":{"datalore":{"node_id":"jXqzMwojHCEJxPBqjoKLAw","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:04.816862Z","iopub.execute_input":"2024-02-07T16:13:04.817150Z","iopub.status.idle":"2024-02-07T16:13:04.831336Z","shell.execute_reply.started":"2024-02-07T16:13:04.817102Z","shell.execute_reply":"2024-02-07T16:13:04.830016Z"},"trusted":true},"execution_count":61,"outputs":[{"name":"stdout","text":"[1 0]\n['Distance learning' 'Does the electoral college work?' 'Car-free cities'\n 'Seeking multiple opinions' 'Summer projects'\n 'Facial action coding system' 'Mandatory extracurricular activities'\n 'Grades for extracurricular activities' '\"A Cowboy Who Rode the Waves\"'\n 'Cell phones at school' 'Community service' 'Exploring Venus'\n 'Driverless cars' 'The Face on Mars' 'Phones and driving']\n['mistralai/Mistral-7B-Instruct-v0.1' 'falcon_180b_v1' 'chat_gpt_moth'\n 'mistral7binstruct_v2' 'llama2_chat' 'persuade_corpus'\n 'mistral7binstruct_v1' 'cohere-command' 'llama_70b_v1' 'palm-text-bison1'\n 'kingki19_palm' 'darragh_claude_v7' 'darragh_claude_v6' 'train_essays'\n 'NousResearch/Llama-2-7b-chat-hf' 'radek_500' 'radekgpt4']\n[False True]\n","output_type":"stream"}]},{"cell_type":"code","source":"ax3 = sns.countplot(data=external_essays, x='label')\nax3.bar_label(ax3.containers[0])\nplt.title('Distribution of texts on external_essays');","metadata":{"datalore":{"node_id":"wLOfDfmnoYWEKy9skp1mZ8","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:04.834434Z","iopub.execute_input":"2024-02-07T16:13:04.834700Z","iopub.status.idle":"2024-02-07T16:13:05.017631Z","shell.execute_reply.started":"2024-02-07T16:13:04.834677Z","shell.execute_reply":"2024-02-07T16:13:05.016715Z"},"trusted":true},"execution_count":62,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"code","source":"external_essays = external_essays.rename(columns={'label': 'generated'})","metadata":{"datalore":{"node_id":"4IqXcSQ20yKEwIVt4aEkMU","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.018746Z","iopub.execute_input":"2024-02-07T16:13:05.019087Z","iopub.status.idle":"2024-02-07T16:13:05.026593Z","shell.execute_reply.started":"2024-02-07T16:13:05.019055Z","shell.execute_reply":"2024-02-07T16:13:05.025693Z"},"trusted":true},"execution_count":63,"outputs":[]},{"cell_type":"markdown","source":"**Concatenating the datasets**","metadata":{"datalore":{"node_id":"lNnLE86XV6G6LqNWNs5enI","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"df = pd.concat([external_essays[['text', 'generated']], train_essays[['text', 'generated']]], ignore_index=True)","metadata":{"datalore":{"node_id":"rAypYp2yNcT6NHZjqdot6E","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.027827Z","iopub.execute_input":"2024-02-07T16:13:05.028447Z","iopub.status.idle":"2024-02-07T16:13:05.041241Z","shell.execute_reply.started":"2024-02-07T16:13:05.028418Z","shell.execute_reply":"2024-02-07T16:13:05.040288Z"},"trusted":true},"execution_count":64,"outputs":[]},{"cell_type":"code","source":"# df = df.sample(frac=0.02, random_state=42)","metadata":{"execution":{"iopub.status.busy":"2024-02-07T16:13:05.042763Z","iopub.execute_input":"2024-02-07T16:13:05.043040Z","iopub.status.idle":"2024-02-07T16:13:05.051498Z","shell.execute_reply.started":"2024-02-07T16:13:05.043016Z","shell.execute_reply":"2024-02-07T16:13:05.050544Z"},"trusted":true},"execution_count":65,"outputs":[]},{"cell_type":"code","source":"ax4 = sns.countplot(data=df, x='generated')\nax4.bar_label(ax4.containers[0])\nplt.title('Distribution of Label');","metadata":{"datalore":{"node_id":"zGhGuMuLpxQUftCSQaishZ","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.052688Z","iopub.execute_input":"2024-02-07T16:13:05.052952Z","iopub.status.idle":"2024-02-07T16:13:05.253363Z","shell.execute_reply.started":"2024-02-07T16:13:05.052929Z","shell.execute_reply":"2024-02-07T16:13:05.252425Z"},"trusted":true},"execution_count":66,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"code","source":"df.head()","metadata":{"datalore":{"node_id":"ItJikExfedJp3LE8X7Srha","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.254735Z","iopub.execute_input":"2024-02-07T16:13:05.255590Z","iopub.status.idle":"2024-02-07T16:13:05.265677Z","shell.execute_reply.started":"2024-02-07T16:13:05.255533Z","shell.execute_reply":"2024-02-07T16:13:05.264692Z"},"trusted":true},"execution_count":67,"outputs":[{"execution_count":67,"output_type":"execute_result","data":{"text/plain":" text generated\n0 In recent years, there has been a growing trend 1\n1 Dear Senator,\\n\\nI am writing in support of k... 1\n2 Car usage has long been a significant factor ... 1\n3 Limiting car usage is a concept that has gain... 1\n4 Passage 1:\\n\\nPassage 2:\\n\\nPassage 3:\\n\\nPass... 1","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
textgenerated
0In recent years, there has been a growing trend1
1Dear Senator,\\n\\nI am writing in support of k...1
2Car usage has long been a significant factor ...1
3Limiting car usage is a concept that has gain...1
4Passage 1:\\n\\nPassage 2:\\n\\nPassage 3:\\n\\nPass...1
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":"**Clean text**","metadata":{"datalore":{"node_id":"8978uMqZOlfcnT3mHvQZp1","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"df.iloc[7][['text','generated']]","metadata":{"datalore":{"node_id":"3b3ir2r9QMkiwmxPnJ0E5l","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.267026Z","iopub.execute_input":"2024-02-07T16:13:05.267368Z","iopub.status.idle":"2024-02-07T16:13:05.279580Z","shell.execute_reply.started":"2024-02-07T16:13:05.267338Z","shell.execute_reply":"2024-02-07T16:13:05.278553Z"},"trusted":true},"execution_count":68,"outputs":[{"execution_count":68,"output_type":"execute_result","data":{"text/plain":"text After researching extensively into different p...\ngenerated 1\nName: 7, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"def clean_text(text):\n #delete non-alphanumeric characters\n text = re.sub(r\"[^A-Za-z0-9\\s]\", \"\", text)\n #delete extra whitespaces\n text = re.sub(r\"\\s+\", \" \", text)\n #convert to lowercase\n text = text.lower()\n\n return text","metadata":{"datalore":{"node_id":"N1DjNMEJW5CsHvDCHngtmn","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.280640Z","iopub.execute_input":"2024-02-07T16:13:05.280937Z","iopub.status.idle":"2024-02-07T16:13:05.289950Z","shell.execute_reply.started":"2024-02-07T16:13:05.280914Z","shell.execute_reply":"2024-02-07T16:13:05.289146Z"},"trusted":true},"execution_count":69,"outputs":[]},{"cell_type":"code","source":"df['text'] = df['text'].map(clean_text)","metadata":{"datalore":{"node_id":"qjp0S2zUgdmaJANVSOwwXv","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:05.291016Z","iopub.execute_input":"2024-02-07T16:13:05.291337Z","iopub.status.idle":"2024-02-07T16:13:16.672977Z","shell.execute_reply.started":"2024-02-07T16:13:05.291313Z","shell.execute_reply":"2024-02-07T16:13:16.672155Z"},"trusted":true},"execution_count":70,"outputs":[]},{"cell_type":"code","source":"df.iloc[7]['text']","metadata":{"execution":{"iopub.status.busy":"2024-02-07T16:13:16.678991Z","iopub.execute_input":"2024-02-07T16:13:16.679353Z","iopub.status.idle":"2024-02-07T16:13:16.685339Z","shell.execute_reply.started":"2024-02-07T16:13:16.679328Z","shell.execute_reply":"2024-02-07T16:13:16.684457Z"},"trusted":true},"execution_count":71,"outputs":[{"execution_count":71,"output_type":"execute_result","data":{"text/plain":"'after researching extensively into different potential career paths i have shortlisted my top 5 options i have read up on the various qualifications responsibilities and other important aspects of each one and weighed them up against my skills and experience now i am planning to discuss my top 5 with my parents and teachers to get their opinions on which one could potentially benefit me the best'"},"metadata":{}}]},{"cell_type":"markdown","source":"## 4. Tokenizing","metadata":{"datalore":{"node_id":"2Tt8LbIyZKzdQgMtl7VNhK","type":"MD","hide_input_from_viewers":true,"hide_output_from_viewers":true}}},{"cell_type":"code","source":"# checkpoint = '../input/transformers/distilbert-base-uncased'\ncheckpoint = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'","metadata":{"datalore":{"node_id":"SrkC7QpLgPn1I0cIx9UaA0","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:16.686480Z","iopub.execute_input":"2024-02-07T16:13:16.686807Z","iopub.status.idle":"2024-02-07T16:13:16.698985Z","shell.execute_reply.started":"2024-02-07T16:13:16.686776Z","shell.execute_reply":"2024-02-07T16:13:16.698299Z"},"trusted":true},"execution_count":72,"outputs":[]},{"cell_type":"code","source":"batch_size = 25","metadata":{"execution":{"iopub.status.busy":"2024-02-07T16:13:16.700077Z","iopub.execute_input":"2024-02-07T16:13:16.700702Z","iopub.status.idle":"2024-02-07T16:13:16.711065Z","shell.execute_reply.started":"2024-02-07T16:13:16.700670Z","shell.execute_reply":"2024-02-07T16:13:16.710157Z"},"trusted":true},"execution_count":73,"outputs":[]},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(checkpoint)","metadata":{"datalore":{"node_id":"2X59C8t23MfaAuhJcGqTAB","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:16.712051Z","iopub.execute_input":"2024-02-07T16:13:16.712349Z","iopub.status.idle":"2024-02-07T16:13:16.874897Z","shell.execute_reply.started":"2024-02-07T16:13:16.712325Z","shell.execute_reply":"2024-02-07T16:13:16.874159Z"},"trusted":true},"execution_count":74,"outputs":[]},{"cell_type":"code","source":"def tokenize_and_split(text):\n encoding = tokenizer(\n text[\"text\"],\n truncation=True,\n padding=True,\n max_length=512,\n return_overflowing_tokens=True,\n )\n # Extract mapping between new and old indices\n sample_map = encoding.pop(\"overflow_to_sample_mapping\")\n for key, values in text.items():\n encoding[key] = [values[i] for i in sample_map]\n return encoding","metadata":{"datalore":{"node_id":"Np01F6IxXptsTFbJEtQcVi","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:16.875924Z","iopub.execute_input":"2024-02-07T16:13:16.876226Z","iopub.status.idle":"2024-02-07T16:13:16.881595Z","shell.execute_reply.started":"2024-02-07T16:13:16.876202Z","shell.execute_reply":"2024-02-07T16:13:16.880686Z"},"trusted":true},"execution_count":75,"outputs":[]},{"cell_type":"code","source":"raw_ds = datasets.Dataset.from_pandas(df)\nraw_ds = raw_ds.train_test_split(test_size=0.2)\nraw_ds","metadata":{"datalore":{"node_id":"ok2iNWqKhS3Xr6BgnS8XKL","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:16.882783Z","iopub.execute_input":"2024-02-07T16:13:16.883139Z","iopub.status.idle":"2024-02-07T16:13:17.133386Z","shell.execute_reply.started":"2024-02-07T16:13:16.883084Z","shell.execute_reply":"2024-02-07T16:13:17.132340Z"},"trusted":true},"execution_count":76,"outputs":[{"execution_count":76,"output_type":"execute_result","data":{"text/plain":"DatasetDict({\n train: Dataset({\n features: ['text', 'generated'],\n num_rows: 36996\n })\n test: Dataset({\n features: ['text', 'generated'],\n num_rows: 9250\n })\n})"},"metadata":{}}]},{"cell_type":"code","source":"tokenized_ds = raw_ds.map(tokenize_and_split, batched=True)","metadata":{"datalore":{"node_id":"MDZBrC9YPCTvp51uU2TETh","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:17.134367Z","iopub.execute_input":"2024-02-07T16:13:17.136441Z","iopub.status.idle":"2024-02-07T16:13:57.609413Z","shell.execute_reply.started":"2024-02-07T16:13:17.136406Z","shell.execute_reply":"2024-02-07T16:13:57.608495Z"},"trusted":true},"execution_count":77,"outputs":[{"output_type":"display_data","data":{"text/plain":"Map: 0%| | 0/36996 [00:00 (tf.Tensor, tf.Tensor) \n : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) \nNew behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor}) \n : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) \n warnings.warn(\n","output_type":"stream"}]},{"cell_type":"code","source":"num_epochs = 2","metadata":{"execution":{"iopub.status.busy":"2024-02-07T16:13:57.783312Z","iopub.execute_input":"2024-02-07T16:13:57.783569Z","iopub.status.idle":"2024-02-07T16:13:57.787307Z","shell.execute_reply.started":"2024-02-07T16:13:57.783547Z","shell.execute_reply":"2024-02-07T16:13:57.786339Z"},"trusted":true},"execution_count":81,"outputs":[]},{"cell_type":"code","source":"model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, from_pt=True)","metadata":{"datalore":{"node_id":"N0KmKqUgQAhDA3T22hXvH0","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:57.788362Z","iopub.execute_input":"2024-02-07T16:13:57.788638Z","iopub.status.idle":"2024-02-07T16:13:58.918881Z","shell.execute_reply.started":"2024-02-07T16:13:57.788614Z","shell.execute_reply":"2024-02-07T16:13:58.917823Z"},"trusted":true},"execution_count":82,"outputs":[{"name":"stderr","text":"All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.\n\nAll the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,\n# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.\nnum_train_steps = len(tf_train_dataset) * num_epochs\nlr_scheduler = PolynomialDecay(\n initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps\n)\n\nopt = Adam(learning_rate=lr_scheduler)\n\nloss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n\nes = EarlyStopping(monitor='val_loss', patience=1, verbose=1, mode='auto', restore_best_weights=True)\n\ncallback = PushToHubCallback(\n \"LLM_generated_text_detector\", save_strategy=\"no\", tokenizer=tokenizer\n)","metadata":{"datalore":{"node_id":"KgCP6HcT4sIzhVUP361LJt","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:58.920321Z","iopub.execute_input":"2024-02-07T16:13:58.920593Z","iopub.status.idle":"2024-02-07T16:13:59.156543Z","shell.execute_reply.started":"2024-02-07T16:13:58.920568Z","shell.execute_reply":"2024-02-07T16:13:59.155652Z"},"trusted":true},"execution_count":83,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:131: FutureWarning: 'Repository' (from 'huggingface_hub.repository') is deprecated and will be removed from version '1.0'. Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete removal is only planned on next major release.\nFor more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.\n warnings.warn(warning_message, FutureWarning)\n/kaggle/working/LLM_generated_text_detector is already a clone of https://huggingface.co/Wintersmith/LLM_generated_text_detector. Make sure you pull the latest changes with `repo.git_pull()`.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.compile(optimizer=opt, loss=loss, metrics=[\"accuracy\"])","metadata":{"datalore":{"node_id":"FWsaXd4PpT0LElBgYMitIc","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:59.157601Z","iopub.execute_input":"2024-02-07T16:13:59.157864Z","iopub.status.idle":"2024-02-07T16:13:59.169580Z","shell.execute_reply.started":"2024-02-07T16:13:59.157840Z","shell.execute_reply":"2024-02-07T16:13:59.168698Z"},"trusted":true},"execution_count":84,"outputs":[]},{"cell_type":"code","source":"model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=num_epochs, callbacks=[es, callback])","metadata":{"datalore":{"node_id":"Q57EJS3MbWm5NXRe9pHyM6","type":"CODE","hide_input_from_viewers":true,"hide_output_from_viewers":true},"execution":{"iopub.status.busy":"2024-02-07T16:13:59.170764Z","iopub.execute_input":"2024-02-07T16:13:59.171079Z","iopub.status.idle":"2024-02-07T17:47:39.001846Z","shell.execute_reply.started":"2024-02-07T16:13:59.171049Z","shell.execute_reply":"2024-02-07T17:47:39.000824Z"},"trusted":true},"execution_count":85,"outputs":[{"name":"stdout","text":"Epoch 1/2\n1815/1815 [==============================] - 2812s 2s/step - loss: 0.0579 - accuracy: 0.9809 - val_loss: 0.0272 - val_accuracy: 0.9920\nEpoch 2/2\n1815/1815 [==============================] - 2790s 2s/step - loss: 0.0082 - accuracy: 0.9974 - val_loss: 0.0191 - val_accuracy: 0.9941\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Upload file tf_model.h5: 0%| | 1.00/256M [00:00 main\n\n","output_type":"stream"},{"execution_count":85,"output_type":"execute_result","data":{"text/plain":""},"metadata":{}}]},{"cell_type":"code","source":"#model.save_pretrained(\"/kaggle/working/model_trained\", saved_model=True)\n","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:47:39.003003Z","iopub.execute_input":"2024-02-07T17:47:39.003387Z","iopub.status.idle":"2024-02-07T17:47:39.007252Z","shell.execute_reply.started":"2024-02-07T17:47:39.003360Z","shell.execute_reply":"2024-02-07T17:47:39.006371Z"},"trusted":true},"execution_count":86,"outputs":[]},{"cell_type":"markdown","source":"## 6. Evaluation","metadata":{}},{"cell_type":"code","source":"preds = model.predict(tf_test_dataset)[\"logits\"]\n\ny_pred = np.argmax(preds, axis=1)\nprint(preds.shape, y_pred.shape)\n\ny_pred\n\n#y_true = np.concatenate([y for x, y in tf_test_dataset], axis=0)\n# https://stackoverflow.com/questions/56226621/how-to-extract-data-labels-back-from-tensorflow-dataset","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:47:39.008525Z","iopub.execute_input":"2024-02-07T17:47:39.009205Z","iopub.status.idle":"2024-02-07T17:51:20.348390Z","shell.execute_reply.started":"2024-02-07T17:47:39.009171Z","shell.execute_reply":"2024-02-07T17:51:20.347443Z"},"trusted":true},"execution_count":87,"outputs":[{"name":"stdout","text":"451/451 [==============================] - 221s 486ms/step\n(11267, 2) (11267,)\n","output_type":"stream"},{"execution_count":87,"output_type":"execute_result","data":{"text/plain":"array([1, 0, 0, ..., 1, 1, 0])"},"metadata":{}}]},{"cell_type":"code","source":"def get_probabilities(input_text):\n logits_pred = model.predict(input_text)['logits']\n probs = tf.nn.sigmoid(logits_pred)\n class_1_probability = probs[:, 1].numpy()\n return class_1_probability","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:51:20.349613Z","iopub.execute_input":"2024-02-07T17:51:20.349921Z","iopub.status.idle":"2024-02-07T17:51:20.355158Z","shell.execute_reply.started":"2024-02-07T17:51:20.349892Z","shell.execute_reply":"2024-02-07T17:51:20.354154Z"},"trusted":true},"execution_count":88,"outputs":[]},{"cell_type":"code","source":"y_prob = get_probabilities(tf_test_dataset)\ny_prob","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:51:20.356452Z","iopub.execute_input":"2024-02-07T17:51:20.356843Z","iopub.status.idle":"2024-02-07T17:54:59.830689Z","shell.execute_reply.started":"2024-02-07T17:51:20.356797Z","shell.execute_reply":"2024-02-07T17:54:59.829525Z"},"trusted":true},"execution_count":89,"outputs":[{"name":"stdout","text":"451/451 [==============================] - 219s 486ms/step\n","output_type":"stream"},{"execution_count":89,"output_type":"execute_result","data":{"text/plain":"array([0.9966794 , 0.01150257, 0.01455727, ..., 0.99643433, 0.99624926,\n 0.03235682], dtype=float32)"},"metadata":{}}]},{"cell_type":"code","source":"y_true = np.concatenate([y for x, y in tf_test_dataset], axis=0)","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:54:59.831848Z","iopub.execute_input":"2024-02-07T17:54:59.832147Z","iopub.status.idle":"2024-02-07T17:55:02.179651Z","shell.execute_reply.started":"2024-02-07T17:54:59.832104Z","shell.execute_reply":"2024-02-07T17:55:02.178681Z"},"trusted":true},"execution_count":90,"outputs":[]},{"cell_type":"code","source":"def evaluate_model(y_true, y_pred):\n print(ConfusionMatrixDisplay.from_predictions(y_true, y_pred))\n print(classification_report(y_true, y_pred))\n print('F1 score:', f1_score(y_true, y_pred))","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.181096Z","iopub.execute_input":"2024-02-07T17:55:02.181519Z","iopub.status.idle":"2024-02-07T17:55:02.186565Z","shell.execute_reply.started":"2024-02-07T17:55:02.181485Z","shell.execute_reply":"2024-02-07T17:55:02.185529Z"},"trusted":true},"execution_count":91,"outputs":[]},{"cell_type":"code","source":"evaluate_model(y_true=y_true, y_pred=y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.187803Z","iopub.execute_input":"2024-02-07T17:55:02.188185Z","iopub.status.idle":"2024-02-07T17:55:02.500352Z","shell.execute_reply.started":"2024-02-07T17:55:02.188150Z","shell.execute_reply":"2024-02-07T17:55:02.499174Z"},"trusted":true},"execution_count":92,"outputs":[{"name":"stdout","text":"\n precision recall f1-score support\n\n 0 1.00 0.99 1.00 7613\n 1 0.99 0.99 0.99 3654\n\n accuracy 0.99 11267\n macro avg 0.99 0.99 0.99 11267\nweighted avg 0.99 0.99 0.99 11267\n\nF1 score: 0.9908457439540921\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"markdown","source":"## 7. Submitting predictions on test set","metadata":{}},{"cell_type":"code","source":"test_essays\n","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.501599Z","iopub.execute_input":"2024-02-07T17:55:02.501910Z","iopub.status.idle":"2024-02-07T17:55:02.513693Z","shell.execute_reply.started":"2024-02-07T17:55:02.501884Z","shell.execute_reply":"2024-02-07T17:55:02.512625Z"},"trusted":true},"execution_count":93,"outputs":[{"execution_count":93,"output_type":"execute_result","data":{"text/plain":" id prompt_id text\n0 0000aaaa 2 Aaa bbb ccc.\n1 1111bbbb 3 Bbb ccc ddd.\n2 2222cccc 4 CCC ddd eee.","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idprompt_idtext
00000aaaa2Aaa bbb ccc.
11111bbbb3Bbb ccc ddd.
22222cccc4CCC ddd eee.
\n
"},"metadata":{}}]},{"cell_type":"code","source":"test_essays['text'] = test_essays['text'].map(clean_text)\n","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.516021Z","iopub.execute_input":"2024-02-07T17:55:02.516980Z","iopub.status.idle":"2024-02-07T17:55:02.537197Z","shell.execute_reply.started":"2024-02-07T17:55:02.516927Z","shell.execute_reply":"2024-02-07T17:55:02.535910Z"},"trusted":true},"execution_count":94,"outputs":[]},{"cell_type":"code","source":"raw_final_ds = datasets.Dataset.from_pandas(test_essays)\nraw_final_ds","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.538476Z","iopub.execute_input":"2024-02-07T17:55:02.538853Z","iopub.status.idle":"2024-02-07T17:55:02.556050Z","shell.execute_reply.started":"2024-02-07T17:55:02.538825Z","shell.execute_reply":"2024-02-07T17:55:02.554954Z"},"trusted":true},"execution_count":95,"outputs":[{"execution_count":95,"output_type":"execute_result","data":{"text/plain":"Dataset({\n features: ['id', 'prompt_id', 'text'],\n num_rows: 3\n})"},"metadata":{}}]},{"cell_type":"code","source":"tokenized_test_dataset = raw_final_ds.map(tokenize_and_split, batched=True)\ntokenized_test_dataset","metadata":{"execution":{"iopub.status.busy":"2024-02-07T17:55:02.557634Z","iopub.execute_input":"2024-02-07T17:55:02.558575Z","iopub.status.idle":"2024-02-07T17:55:02.611994Z","shell.execute_reply.started":"2024-02-07T17:55:02.558522Z","shell.execute_reply":"2024-02-07T17:55:02.610964Z"},"trusted":true},"execution_count":96,"outputs":[{"output_type":"display_data","data":{"text/plain":"Map: 0%| | 0/3 [00:00