{
 "metadata": {
  "kernelspec": {
   "language": "python",
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.14",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kaggle": {
   "accelerator": "gpu",
   "dataSources": [],
   "dockerImageVersionId": 30762,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook",
   "isGpuEnabled": true
  }
 },
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Roman to Nepali transliteration: fine-tuning T5\n",
    "\n",
    "Fine-tunes `google-t5/t5-small` on the `syubraj/roman2nepali-transliteration` dataset, using the `FacebookAI/xlm-roberta-base` tokenizer. ",
    "Evaluation uses SacreBLEU, metrics are optionally logged to Weights & Biases, and the trained model is pushed to the Hugging Face Hub.\n",
    "\n",
    "Run the cells top to bottom on a fresh kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q transformers datasets evaluate sacrebleu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "# Interactive Hugging Face login; the token needs write access because the trainer pushes to the Hub.\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "DATASET_ID = \"syubraj/roman2nepali-transliteration\"\n",
    "\n",
    "# Keep the untouched download under its own name so later cells never overwrite it.\n",
    "raw_dataset = load_dataset(DATASET_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "SEED = 42\n",
    "TEST_SIZE = 0.02\n",
    "\n",
    "# Split from `raw_dataset` (not from `data` itself) so re-running this cell does not\n",
    "# re-split an already split dataset; the fixed seed makes the held-out set reproducible.\n",
    "data = raw_dataset[\"train\"].train_test_split(test_size=TEST_SIZE, seed=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"google-t5/t5-small\"\n",
    "# NOTE: the tokenizer comes from a different checkpoint than the model (presumably for\n",
    "# Devanagari coverage - TODO confirm). The model's embeddings are resized to this\n",
    "# vocabulary right after the model is loaded.\n",
    "tokenizer_checkpoint = \"FacebookAI/xlm-roberta-base\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "source_lang = 'roman'\n",
    "target_lang = 'nepali'\n",
    "prefix = \"translate Roman to Nepali: \"\n",
    "MAX_LENGTH = 30  # token cap applied to both inputs and targets\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    \"\"\"Tokenize one batch of examples for seq2seq training.\n",
    "\n",
    "    Args:\n",
    "        examples: a batch from `datasets.map(batched=True)`. Each entry of\n",
    "            examples[\"translation\"] is indexed with `source_lang` and `target_lang`\n",
    "            (assumes that schema - verify against the dataset card).\n",
    "\n",
    "    Returns:\n",
    "        The tokenizer output for the prefixed source texts, with the tokenized\n",
    "        targets attached via `text_target`; both are truncated to MAX_LENGTH tokens.\n",
    "    \"\"\"\n",
    "    inputs = [prefix + example[source_lang] for example in examples[\"translation\"]]\n",
    "    targets = [example[target_lang] for example in examples[\"translation\"]]\n",
    "    model_inputs = tokenizer(inputs, text_target=targets, max_length=MAX_LENGTH, truncation=True)\n",
    "    return model_inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "tokenized_data = data.map(preprocess_function, batched=True)\n",
    "print(\"Data mapping done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
    "\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)\n",
    "\n",
    "# The tokenizer was loaded from a different checkpoint, so its ids can fall outside the\n",
    "# model's original embedding table. Resize the embeddings to the tokenizer's vocabulary;\n",
    "# the embedding rows are then learned during fine-tuning.\n",
    "model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "# Align the special-token ids with the tokenizer, both for training (label shifting reads\n",
    "# model.config) and for generation (reads model.generation_config).\n",
    "# T5 convention: decoding starts from the pad token.\n",
    "for cfg in (model.config, model.generation_config):\n",
    "    cfg.pad_token_id = tokenizer.pad_token_id\n",
    "    cfg.eos_token_id = tokenizer.eos_token_id\n",
    "    cfg.decoder_start_token_id = tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForSeq2Seq\n",
    "\n",
    "# Pass the model instance (not the checkpoint name) so the collator can use the model\n",
    "# to prepare decoder inputs from the labels.\n",
    "data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "metric = evaluate.load(\"sacrebleu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def postprocess_text(preds, labels):\n",
    "    \"\"\"Strip surrounding whitespace and wrap each label in its own single-item list.\n",
    "\n",
    "    Returns:\n",
    "        (preds, labels) where preds is a list of strings and labels is a list of\n",
    "        one-element lists, i.e. one reference per prediction.\n",
    "    \"\"\"\n",
    "    preds = [pred.strip() for pred in preds]\n",
    "    labels = [[label.strip()] for label in labels]\n",
    "\n",
    "    return preds, labels\n",
    "\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    \"\"\"Decode predictions and labels, then score them with SacreBLEU.\n",
    "\n",
    "    Args:\n",
    "        eval_preds: a (predictions, labels) pair of token-id arrays; predictions may\n",
    "            be a tuple, in which case its first element is used.\n",
    "\n",
    "    Returns:\n",
    "        dict with \"bleu\" (SacreBLEU score) and \"gen_len\" (mean count of non-pad\n",
    "        tokens per prediction), both rounded to 4 decimals.\n",
    "    \"\"\"\n",
    "    preds, labels = eval_preds\n",
    "    if isinstance(preds, tuple):\n",
    "        preds = preds[0]\n",
    "\n",
    "    # -100 is the ignore index used for padded label positions and can also appear in\n",
    "    # gathered predictions; the tokenizer cannot decode it, so swap in the pad id first.\n",
    "    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)\n",
    "    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
    "\n",
    "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
    "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "\n",
    "    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)\n",
    "\n",
    "    result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    result = {\"bleu\": result[\"score\"]}\n",
    "\n",
    "    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]\n",
    "    result[\"gen_len\"] = np.mean(prediction_lens)\n",
    "    result = {k: round(v, 4) for k, v in result.items()}\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import wandb\n",
    "\n",
    "# The W&B key is read from Kaggle Secrets (label: wandb_api); it is never hardcoded here.\n",
    "try:\n",
    "    from kaggle_secrets import UserSecretsClient\n",
    "    user_secrets = UserSecretsClient()\n",
    "    api_key = user_secrets.get_secret(\"wandb_api\")\n",
    "    wandb.login(key=api_key)\n",
    "    anony = None\n",
    "    wandb_enabled = True\n",
    "except Exception as err:  # e.g. not running on Kaggle, secret not set, or key rejected\n",
    "    # `anony` is kept for compatibility; nothing else in this notebook reads it.\n",
    "    anony = \"must\"\n",
    "    # Training falls back to no experiment tracking instead of stalling on a login prompt.\n",
    "    wandb_enabled = False\n",
    "    print(f\"W&B login skipped ({type(err).__name__}).\")\n",
    "    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \\nGet your W&B access token from here: https://wandb.ai/authorize')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Target repository on the Hugging Face Hub for checkpoints and the final model.\n",
    "HUB_MODEL_ID = \"syubraj/romanized_english_2_nepali\"\n",
    "\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=\"/kaggle/working/romaneng2nep/\",\n",
    "    eval_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    weight_decay=0.01,\n",
    "    lr_scheduler_type=\"linear\",\n",
    "    save_total_limit=3,\n",
    "    num_train_epochs=1,\n",
    "    predict_with_generate=True,  # evaluate on generated sequences so BLEU can be computed\n",
    "    fp16=True,\n",
    "    seed=SEED,\n",
    "    report_to=\"wandb\" if wandb_enabled else \"none\",\n",
    "    push_to_hub=True,\n",
    "    hub_model_id=HUB_MODEL_ID,\n",
    ")\n",
    "\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_data[\"train\"],\n",
    "    eval_dataset=tokenized_data[\"test\"],\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The destination repo is set through `hub_model_id` in the training arguments;\n",
    "# the first positional argument of push_to_hub is the commit message, not a repo id.\n",
    "trainer.push_to_hub(commit_message=\"End of training\")"
   ]
  }
 ]
}