Upload fine-tune-led (1).ipynb

Browse files

Files changed (1) hide show

fine-tune-led (1).ipynb +1 -0

fine-tune-led (1).ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30733,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n \n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-07-23T05:03:49.373068Z","iopub.execute_input":"2024-07-23T05:03:49.373486Z","iopub.status.idle":"2024-07-23T05:03:49.753583Z","shell.execute_reply.started":"2024-07-23T05:03:49.373456Z","shell.execute_reply":"2024-07-23T05:03:49.752465Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"import torch\nfrom datasets import load_dataset, load_metric\nfrom transformers import LEDTokenizer, LEDForConditionalGeneration, TrainingArguments, Trainer\n\n# Load the Quora dataset\ndataset = load_dataset('quora')\nprint(f\"Total size of the dataset: {len(dataset['train'])}\")\n\n# Use a subset of the dataset (50,000 samples)\nsubset_size = 20000\ntrain_size = int(0.8 * subset_size) # 80% for training\neval_size = 2000 # 20% for eval\n\n# Select subsets for training and evaluation\nsubset_dataset = dataset['train'].select(range(subset_size))\ntrain_dataset = subset_dataset.select(range(train_size))\neval_dataset = subset_dataset.select(range(train_size, train_size + eval_size))\n\nprint(f\"Size of train dataset: {len(train_dataset)}\")\nprint(f\"Size of eval dataset: {len(eval_dataset)}\")\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:03:52.633654Z","iopub.execute_input":"2024-07-23T05:03:52.634143Z","iopub.status.idle":"2024-07-23T05:04:01.562932Z","shell.execute_reply.started":"2024-07-23T05:03:52.634110Z","shell.execute_reply":"2024-07-23T05:04:01.561880Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"2024-07-23 05:03:56.977951: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n2024-07-23 05:03:56.978017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n2024-07-23 05:03:56.979634: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n/opt/conda/lib/python3.10/site-packages/datasets/load.py:1491: FutureWarning: The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora\nYou can avoid this message in future by passing the argument `trust_remote_code=True`.\nPassing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n warnings.warn(\n","output_type":"stream"},{"name":"stdout","text":"Total size of the dataset: 404290\nSize of train dataset: 16000\nSize of eval dataset: 2000\n","output_type":"stream"}]},{"cell_type":"code","source":"# Load the tokenizer and model\ntokenizer = LEDTokenizer.from_pretrained(\"allenai/led-base-16384\")\nmodel = LEDForConditionalGeneration.from_pretrained(\"allenai/led-base-16384\")\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:07.887385Z","iopub.execute_input":"2024-07-23T05:04:07.888683Z","iopub.status.idle":"2024-07-23T05:04:08.892374Z","shell.execute_reply.started":"2024-07-23T05:04:07.888644Z","shell.execute_reply":"2024-07-23T05:04:08.891549Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n return self.fget.__get__(instance, owner)()\n","output_type":"stream"}]},{"cell_type":"code","source":"max_input_length = 1024\nmax_output_length = 256\nbatch_size = 2\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:12.147447Z","iopub.execute_input":"2024-07-23T05:04:12.148504Z","iopub.status.idle":"2024-07-23T05:04:12.152953Z","shell.execute_reply.started":"2024-07-23T05:04:12.148464Z","shell.execute_reply":"2024-07-23T05:04:12.151833Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"def process_data_to_model_inputs(batch):\n # tokenize the inputs and labels\n inputs = tokenizer(\n [f\"paraphrase: {q['text'][0]}\" for q in batch['questions']],\n padding=\"max_length\",\n truncation=True,\n max_length=max_input_length,\n )\n outputs = tokenizer(\n [q['text'][1] for q in batch['questions']],\n padding=\"max_length\",\n truncation=True,\n max_length=max_output_length,\n )\n\n batch[\"input_ids\"] = inputs.input_ids\n batch[\"attention_mask\"] = inputs.attention_mask\n\n # create 0 global_attention_mask lists\n batch[\"global_attention_mask\"] = len(batch[\"input_ids\"]) * [\n [0 for _ in range(len(batch[\"input_ids\"][0]))]\n ]\n\n # since above lists are references, the following line changes the 0 index for all samples\n for global_attention_mask in batch[\"global_attention_mask\"]:\n global_attention_mask[0] = 1\n\n batch[\"labels\"] = outputs.input_ids\n\n # We have to make sure that the PAD token is ignored\n batch[\"labels\"] = [\n [-100 if token == tokenizer.pad_token_id else token for token in labels]\n for labels in batch[\"labels\"]\n ]\n\n return batch","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:14.232390Z","iopub.execute_input":"2024-07-23T05:04:14.233132Z","iopub.status.idle":"2024-07-23T05:04:14.241621Z","shell.execute_reply.started":"2024-07-23T05:04:14.233097Z","shell.execute_reply":"2024-07-23T05:04:14.240482Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train_dataset = train_dataset.map(\n process_data_to_model_inputs,\n batched=True,\n batch_size=batch_size,\n remove_columns=['questions', 'is_duplicate'],\n)","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:18.687416Z","iopub.execute_input":"2024-07-23T05:04:18.687792Z","iopub.status.idle":"2024-07-23T05:04:21.032241Z","shell.execute_reply.started":"2024-07-23T05:04:18.687761Z","shell.execute_reply":"2024-07-23T05:04:21.031386Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"eval_dataset = eval_dataset.map(\n process_data_to_model_inputs,\n batched=True,\n batch_size=batch_size,\n remove_columns=['questions', 'is_duplicate'],\n)","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:24.727815Z","iopub.execute_input":"2024-07-23T05:04:24.728538Z","iopub.status.idle":"2024-07-23T05:04:31.660424Z","shell.execute_reply.started":"2024-07-23T05:04:24.728501Z","shell.execute_reply":"2024-07-23T05:04:31.659421Z"},"trusted":true},"execution_count":7,"outputs":[{"output_type":"display_data","data":{"text/plain":"Map: 0%| | 0/2000 [00:00<?, ? examples/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d312ca8554964f5a9fdb722385b085ab"}},"metadata":{}}]},{"cell_type":"code","source":"train_dataset.set_format(\n type=\"torch\",\n columns=[\"input_ids\", \"attention_mask\", \"global_attention_mask\", \"labels\"],\n)","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:36.562700Z","iopub.execute_input":"2024-07-23T05:04:36.563098Z","iopub.status.idle":"2024-07-23T05:04:36.571070Z","shell.execute_reply.started":"2024-07-23T05:04:36.563067Z","shell.execute_reply":"2024-07-23T05:04:36.570284Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"eval_dataset.set_format(\n type=\"torch\",\n columns=[\"input_ids\", \"attention_mask\", \"global_attention_mask\", \"labels\"],\n)","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:40.067907Z","iopub.execute_input":"2024-07-23T05:04:40.068333Z","iopub.status.idle":"2024-07-23T05:04:40.074423Z","shell.execute_reply.started":"2024-07-23T05:04:40.068297Z","shell.execute_reply":"2024-07-23T05:04:40.073250Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"# Load the model\nled = LEDForConditionalGeneration.from_pretrained(\"allenai/led-base-16384\", gradient_checkpointing=True, use_cache=False)\n\n# set generate hyperparameters\nled.config.num_beams = 2\nled.config.max_length = 256\nled.config.min_length = 100\nled.config.length_penalty = 2.0\nled.config.early_stopping = True\nled.config.no_repeat_ngram_size = 3\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:04:45.507540Z","iopub.execute_input":"2024-07-23T05:04:45.508127Z","iopub.status.idle":"2024-07-23T05:04:46.339791Z","shell.execute_reply.started":"2024-07-23T05:04:45.508092Z","shell.execute_reply":"2024-07-23T05:04:46.338973Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"!pip install rouge_score\n!pip install sacrebleu","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:05:03.932316Z","iopub.execute_input":"2024-07-23T05:05:03.932699Z","iopub.status.idle":"2024-07-23T05:05:32.578791Z","shell.execute_reply.started":"2024-07-23T05:05:03.932665Z","shell.execute_reply":"2024-07-23T05:05:32.577611Z"},"trusted":true},"execution_count":12,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/pty.py:89: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n pid, fd = os.forkpty()\n","output_type":"stream"},{"name":"stdout","text":"Collecting rouge_score\n Downloading rouge_score-0.1.2.tar.gz (17 kB)\n Preparing metadata (setup.py) ... \u001b[?25ldone\n\u001b[?25hRequirement already satisfied: absl-py in /opt/conda/lib/python3.10/site-packages (from rouge_score) (1.4.0)\nRequirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (from rouge_score) (3.2.4)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rouge_score) (1.26.4)\nRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge_score) (1.16.0)\nBuilding wheels for collected packages: rouge_score\n Building wheel for rouge_score (setup.py) ... \u001b[?25ldone\n\u001b[?25h Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bfcc07a53e4b1f31a6a8ee90c55e46f6ec11e9505be57024294e793bd4ec8c63\n Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\nSuccessfully built rouge_score\nInstalling collected packages: rouge_score\nSuccessfully installed rouge_score-0.1.2\nCollecting sacrebleu\n Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.0/58.0 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hCollecting portalocker (from sacrebleu)\n Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)\nRequirement already satisfied: regex in /opt/conda/lib/python3.10/site-packages (from sacrebleu) (2023.12.25)\nRequirement already satisfied: tabulate>=0.8.9 in /opt/conda/lib/python3.10/site-packages (from sacrebleu) (0.9.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from sacrebleu) (1.26.4)\nRequirement already satisfied: colorama in /opt/conda/lib/python3.10/site-packages (from sacrebleu) (0.4.6)\nRequirement already satisfied: lxml in /opt/conda/lib/python3.10/site-packages (from sacrebleu) (5.2.2)\nDownloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.7/106.7 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)\nInstalling collected packages: portalocker, sacrebleu\nSuccessfully installed portalocker-2.10.1 sacrebleu-2.4.2\n","output_type":"stream"}]},{"cell_type":"code","source":"# Load rouge and bleu for evaluation\nrouge = load_metric(\"rouge\")\nbleu = load_metric(\"bleu\")\n\ndef compute_metrics(pred):\n labels_ids = pred.label_ids\n pred_ids = pred.predictions\n\n pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n labels_ids[labels_ids == -100] = tokenizer.pad_token_id\n label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)\n\n # Compute ROUGE score\n rouge_output = rouge.compute(\n predictions=pred_str, references=label_str, rouge_types=[\"rouge2\"]\n )[\"rouge2\"].mid\n\n # Compute BLEU score\n bleu_output = bleu.compute(predictions=[p.split() for p in pred_str], references=[[r.split()] for r in label_str])\n bleu_score = bleu_output[\"bleu\"]\n\n return {\n \"rouge2_precision\": round(rouge_output.precision, 4),\n \"rouge2_recall\": round(rouge_output.recall, 4),\n \"rouge2_fmeasure\": round(rouge_output.fmeasure, 4),\n \"bleu_score\": round(bleu_score, 4),\n }","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:06:08.932048Z","iopub.execute_input":"2024-07-23T05:06:08.932851Z","iopub.status.idle":"2024-07-23T05:06:10.026264Z","shell.execute_reply.started":"2024-07-23T05:06:08.932815Z","shell.execute_reply":"2024-07-23T05:06:10.025274Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/datasets/load.py:759: FutureWarning: The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.19.2/metrics/rouge/rouge.py\nYou can avoid this message in future by passing the argument `trust_remote_code=True`.\nPassing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n warnings.warn(\n/opt/conda/lib/python3.10/site-packages/datasets/load.py:759: FutureWarning: The repository for bleu contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.19.2/metrics/bleu/bleu.py\nYou can avoid this message in future by passing the argument `trust_remote_code=True`.\nPassing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n warnings.warn(\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0%| | 0.00/2.48k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aa182886b55849d19024c478e12a348a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading extra modules: 0%| | 0.00/1.55k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"206b8bb19dd5477c82d3533df7e5b63d"}},"metadata":{}}]},{"cell_type":"code","source":"# Enable fp16 apex training\nfrom transformers import AutoTokenizer, LEDForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments\n\ntraining_args = Seq2SeqTrainingArguments(\n predict_with_generate=True,\n evaluation_strategy=\"steps\",\n per_device_train_batch_size=batch_size,\n per_device_eval_batch_size=batch_size,\n output_dir=\"./results\",\n logging_steps=5,\n eval_steps=10,\n save_steps=10,\n save_total_limit=2,\n gradient_accumulation_steps=4,\n num_train_epochs=1,\n)\n\ntrainer = Seq2SeqTrainer(\n model=led,\n tokenizer=tokenizer,\n args=training_args,\n compute_metrics=compute_metrics,\n train_dataset=train_dataset,\n eval_dataset=eval_dataset,\n)\n\n\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:06:34.171545Z","iopub.execute_input":"2024-07-23T05:06:34.172370Z","iopub.status.idle":"2024-07-23T05:06:34.213338Z","shell.execute_reply.started":"2024-07-23T05:06:34.172332Z","shell.execute_reply":"2024-07-23T05:06:34.212322Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n warnings.warn(\n","output_type":"stream"}]},{"cell_type":"code","source":"trainer.train()","metadata":{"execution":{"iopub.status.busy":"2024-07-23T05:06:40.251235Z","iopub.execute_input":"2024-07-23T05:06:40.251626Z","iopub.status.idle":"2024-07-23T07:18:38.468025Z","shell.execute_reply.started":"2024-07-23T05:06:40.251595Z","shell.execute_reply":"2024-07-23T07:18:38.466440Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stderr","text":"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:","output_type":"stream"},{"output_type":"stream","name":"stdin","text":" ········································\n"},{"name":"stderr","text":"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Tracking run with wandb version 0.17.0"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Run data is saved locally in <code>/kaggle/working/wandb/run-20240723_050729-32yu7xxq</code>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Syncing run <strong><a href='https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface/runs/32yu7xxq' target=\"_blank\">./results</a></strong> to <a href='https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":" View project at <a href='https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface' target=\"_blank\">https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface</a>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":" View run at <a href='https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface/runs/32yu7xxq' target=\"_blank\">https://wandb.ai/rakshita-kulkarni3487-The%20Assigner/huggingface/runs/32yu7xxq</a>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n <div>\n \n <progress value='33' max='2000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 33/2000 2:10:47 < 138:19:12, 0.00 it/s, Epoch 0.02/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n <th>Rouge2 Precision</th>\n <th>Rouge2 Recall</th>\n <th>Rouge2 Fmeasure</th>\n <th>Bleu Score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>10</td>\n <td>2.115200</td>\n <td>2.294765</td>\n <td>0.040100</td>\n <td>0.385500</td>\n <td>0.071200</td>\n <td>0.025300</td>\n </tr>\n <tr>\n <td>20</td>\n <td>2.130700</td>\n <td>2.233369</td>\n <td>0.036800</td>\n <td>0.369100</td>\n <td>0.065600</td>\n <td>0.023500</td>\n </tr>\n <tr>\n <td>30</td>\n <td>2.254000</td>\n <td>2.244298</td>\n <td>0.043000</td>\n <td>0.374200</td>\n <td>0.074700</td>\n <td>0.028000</td>\n </tr>\n </tbody>\n</table><p>"},"metadata":{}},{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py:1283: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )\n warnings.warn(\nSome non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\nNon-default generation parameters: {'max_length': 256, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}\nSome non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\nNon-default generation parameters: {'max_length': 256, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}\nSome non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\nNon-default generation parameters: {'max_length': 256, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}\n","output_type":"stream"},{"traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)","Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1885\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1883\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1884\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1885\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2216\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2215\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2216\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2218\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2219\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2220\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2221\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2222\u001b[0m ):\n\u001b[1;32m 2223\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2224\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3250\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 3248\u001b[0m scaled_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[1;32m 3249\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 3250\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccelerator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3252\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\u001b[38;5;241m.\u001b[39mdetach() \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mgradient_accumulation_steps\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2125\u001b[0m, in \u001b[0;36mAccelerator.backward\u001b[0;34m(self, loss, **kwargs)\u001b[0m\n\u001b[1;32m 2123\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlomo_backward(loss, learning_rate)\n\u001b[1;32m 2124\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2125\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:492\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 484\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 485\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 491\u001b[0m )\n\u001b[0;32m--> 492\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:251\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 246\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 248\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 249\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 251\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 252\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 253\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 258\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n","\u001b[0;31mKeyboardInterrupt\u001b[0m: "],"ename":"KeyboardInterrupt","evalue":"","output_type":"error"}]},{"cell_type":"code","source":"\n# Save the model and tokenizer\nled.save_pretrained('/kaggle/working/led-paraphrase-model')\ntokenizer.save_pretrained('/kaggle/working/led-paraphrase-tokenizer')","metadata":{"execution":{"iopub.status.busy":"2024-07-23T07:19:42.813195Z","iopub.execute_input":"2024-07-23T07:19:42.813569Z","iopub.status.idle":"2024-07-23T07:19:44.307663Z","shell.execute_reply.started":"2024-07-23T07:19:42.813542Z","shell.execute_reply":"2024-07-23T07:19:44.306344Z"},"trusted":true},"execution_count":17,"outputs":[{"name":"stderr","text":"Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\nNon-default generation parameters: {'max_length': 256, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}\n","output_type":"stream"},{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"('/kaggle/working/led-paraphrase-tokenizer/tokenizer_config.json',\n '/kaggle/working/led-paraphrase-tokenizer/special_tokens_map.json',\n '/kaggle/working/led-paraphrase-tokenizer/vocab.json',\n '/kaggle/working/led-paraphrase-tokenizer/merges.txt',\n '/kaggle/working/led-paraphrase-tokenizer/added_tokens.json')"},"metadata":{}}]},{"cell_type":"code","source":"# Select a single example from the test dataset\nexample = subset_dataset[train_size] # You can change the index to select a different example\n\n# Prepare the input\ninput_text = f\"paraphrase: {example['questions']['text'][0]}\"\ninputs = tokenizer(input_text, padding=\"max_length\", truncation=True, max_length=512, return_tensors=\"pt\")\n\ninput_ids = inputs.input_ids.to(\"cuda\")\nattention_mask = inputs.attention_mask.to(\"cuda\")\nglobal_attention_mask = torch.zeros_like(attention_mask)\nglobal_attention_mask[:, 0] = 1\n\n# Generate the paraphrase\npredicted_ids = led.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)\nparaphrased_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\n\n# Display the original and paraphrased texts\nprint(\"Original:\", example['questions']['text'][0])\nprint(\"Paraphrased:\", paraphrased_text)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-23T07:23:10.743248Z","iopub.execute_input":"2024-07-23T07:23:10.743978Z","iopub.status.idle":"2024-07-23T07:23:12.142569Z","shell.execute_reply.started":"2024-07-23T07:23:10.743945Z","shell.execute_reply":"2024-07-23T07:23:12.141350Z"},"trusted":true},"execution_count":25,"outputs":[{"name":"stdout","text":"Original: If you could invent an item of clothing, what would it be?\nParaphrased: What would be the first item of clothing to be invented? What would it be? What are the first items of clothing that would be created? What is the next item?What is the most important item? What should be the most popular item? The first item to be invented? What? What will be the last item of clothes to be created in the first year?What should be first item for the first 1 year clothing inventors?What are the most common items? What could be the next?\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}