{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "load_dotenv()\n",
    "\n",
    "# Login to Hugging Face Hub\n",
    "login(token=os.getenv(\"HUGGINGFACE_TOKEN\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset\n",
    "Modifyify the dataset to fit the Gemma 2 prompt format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Repo card metadata block was not found. Setting CardData to empty.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'questionID': 0, 'questionTitle': 'Do I have too many issues for counseling?', 'questionText': 'I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?', 'questionLink': 'https://counselchat.com/questions/do-i-have-too-many-issues-for-counseling', 'topic': 'depression', 'therapistInfo': 'Jennifer MolinariHypnotherapist & Licensed Counselor', 'therapistURL': 'https://counselchat.com/therapists/jennifer-molinari', 'answerText': 'It is very common for\\xa0people to have multiple issues that they want to (and need to) address in counseling.\\xa0 I have had clients ask that same question and through more exploration, there is often an underlying fear that they\\xa0 \"can\\'t be helped\" or that they will \"be too much for their therapist.\" I don\\'t know if any of this rings true for you. But, most people have more than one problem in their lives and more often than not,\\xa0 people have numerous significant stressors in their lives.\\xa0 Let\\'s face it, life can be complicated! Therapists are completely ready and equipped to handle all of the issues small or large that a client presents in session. Most therapists over the first couple of sessions will help you prioritize the issues you are facing so that you start addressing the issues that are causing you the most distress.\\xa0 You can never have too many issues to address in counseling.\\xa0 All of the issues you mention above can be successfully worked through in counseling.', 'upvotes': 3, 'views': 1971}\n",
      "\n",
      " Dataset({\n",
      "    features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views'],\n",
      "    num_rows: 2775\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "dataset_name = \"nbertagnolli/counsel-chat\"\n",
    "dataset = load_dataset(dataset_name, split=\"train\",cache_dir=\".cache/\")\n",
    "\n",
    "# Print the first example from the dataset\n",
    "print(dataset[0])\n",
    "print(f\"\\n {dataset}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " \n",
      "### System:\n",
      "You are a Therapist Assistant, an LLM fine-tuned on Gemma 2 model by Google.\n",
      "You provide safe and responsible support to users while encouraging them to visit a mental health professional if needed. \n",
      "You are committed to promoting wellness, understanding, and support. Your responses should be clear, concise, and evidence-based, while maintaining a friendly and approachable tone.\n",
      "\n",
      "### User:\n",
      "I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n",
      "   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?\n",
      "\n",
      "### Response:\n",
      "It is very common for people to have multiple issues that they want to (and need to) address in counseling.  I have had clients ask that same question and through more exploration, there is often an underlying fear that they  \"can't be helped\" or that they will \"be too much for their therapist.\" I don't know if any of this rings true for you. But, most people have more than one problem in their lives and more often than not,  people have numerous significant stressors in their lives.  Let's face it, life can be complicated! Therapists are completely ready and equipped to handle all of the issues small or large that a client presents in session. Most therapists over the first couple of sessions will help you prioritize the issues you are facing so that you start addressing the issues that are causing you the most distress.  You can never have too many issues to address in counseling.  All of the issues you mention above can be successfully worked through in counseling.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "gemma_prompt = \"\"\" \n",
    "### System:\n",
    "You are a Therapist Assistant, an LLM fine-tuned on Gemma 2 model by Google.\n",
    "You provide safe and responsible support to users while encouraging them to visit a mental health professional if needed. \n",
    "You are committed to promoting wellness, understanding, and support. Your responses should be clear, concise, and evidence-based, while maintaining a friendly and approachable tone.\n",
    "\n",
    "### User:\n",
    "{}\n",
    "\n",
    "### Response:\n",
    "{}\n",
    "\"\"\"\n",
    "\n",
    "def format_prompts_func(example):\n",
    "    \"\"\"Formats questionText and answerText into the Gemma 2 prompt format.\"\"\"\n",
    "    question_texts = example[\"questionText\"]\n",
    "    answer_texts = example[\"answerText\"]\n",
    "    texts = []\n",
    "    for q, a in zip(question_texts, answer_texts):\n",
    "        text = gemma_prompt.format(q, a)\n",
    "        texts.append(text)\n",
    "\n",
    "    return {\"text\": texts}\n",
    "pass\n",
    "# Apply the formatting function to the dataset\n",
    "formatted_dataset = dataset.map(format_prompts_func, batched=True)\n",
    "print(formatted_dataset['text'][0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2220, 11) (555, 11)\n"
     ]
    }
   ],
   "source": [
    "dataset = formatted_dataset.train_test_split(test_size=0.2, seed=42)\n",
    "print(dataset['train'].shape, dataset['test'].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine tuning hyperpterparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    TrainingArguments,\n",
    ")\n",
    "from peft import LoraConfig\n",
    "from trl import SFTTrainer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "with open(\"hyperparams.yaml\", 'r') as file:\n",
    "    hyperparams = yaml.load(file, Loader=yaml.FullLoader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "compute_dtype = getattr(torch, hyperparams['bnb_4bit_compute_dtype'])\n",
    "\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=hyperparams['use_4bit'], # Activates 4-bit precision loading\n",
    "    bnb_4bit_quant_type=hyperparams['bnb_4bit_quant_type'], # nf4\n",
    "    bnb_4bit_compute_dtype=compute_dtype, # float16\n",
    "    bnb_4bit_use_double_quant=hyperparams['use_nested_quant'], # False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Setting BF16 to True\n"
     ]
    }
   ],
   "source": [
    "# Check GPU compatibility with bfloat16\n",
    "if compute_dtype == torch.float16 and hyperparams['use_4bit']:\n",
    "    major, _ = torch.cuda.get_device_capability()\n",
    "    if major >= 8:\n",
    "        print(\"Setting BF16 to True\")\n",
    "        hyperparams['bf16'] = True\n",
    "    else:\n",
    "        hyperparams['bf16'] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3a112598cc9d4adf99116a9b19074886",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    hyperparams['model_name'],\n",
    "    token=os.getenv(\"HUGGINGFACE_TOKEN\"),\n",
    "    quantization_config=bnb_config,\n",
    "    device_map=hyperparams['device_map'],\n",
    "    cache_dir=\".cache/\",\n",
    ")\n",
    "model.config.use_cache = False\n",
    "model.config.pretraining_tp = 1\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(hyperparams['model_name'], token=os.getenv(\"HUGGINGFACE_TOKEN\"), trust_remote_code=True, cache_dir=\".cache/\")\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load LoRA configuration\n",
    "peft_config = LoraConfig(\n",
    "    lora_alpha=hyperparams['lora_alpha'],\n",
    "    lora_dropout=hyperparams['lora_dropout'],\n",
    "    r=hyperparams['lora_r'],\n",
    "    bias=\"none\",\n",
    "    task_type=\"CAUSAL_LM\",\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\"gate_proj\", \"up_proj\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkausikremella\u001b[0m (\u001b[33mkausikremella-vit-ap\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: C:\\Users\\Nitin Kausik Remella\\_netrc\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "Tracking run with wandb version 0.18.7"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run data is saved locally in <code>f:\\TADBot\\Gemma2_2B\\wandb\\run-20241115_192539-7eelojfi</code>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Syncing run <strong><a href='https://wandb.ai/kausikremella-vit-ap/TADBot/runs/7eelojfi' target=\"_blank\">eager-morning-3</a></strong> to <a href='https://wandb.ai/kausikremella-vit-ap/TADBot' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br/>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View project at <a href='https://wandb.ai/kausikremella-vit-ap/TADBot' target=\"_blank\">https://wandb.ai/kausikremella-vit-ap/TADBot</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View run at <a href='https://wandb.ai/kausikremella-vit-ap/TADBot/runs/7eelojfi' target=\"_blank\">https://wandb.ai/kausikremella-vit-ap/TADBot/runs/7eelojfi</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainingArguments(\n",
       "_n_gpu=1,\n",
       "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
       "adafactor=False,\n",
       "adam_beta1=0.9,\n",
       "adam_beta2=0.999,\n",
       "adam_epsilon=1e-08,\n",
       "auto_find_batch_size=False,\n",
       "average_tokens_across_devices=False,\n",
       "batch_eval_metrics=False,\n",
       "bf16=True,\n",
       "bf16_full_eval=False,\n",
       "data_seed=None,\n",
       "dataloader_drop_last=False,\n",
       "dataloader_num_workers=0,\n",
       "dataloader_persistent_workers=False,\n",
       "dataloader_pin_memory=True,\n",
       "dataloader_prefetch_factor=None,\n",
       "ddp_backend=None,\n",
       "ddp_broadcast_buffers=None,\n",
       "ddp_bucket_cap_mb=None,\n",
       "ddp_find_unused_parameters=None,\n",
       "ddp_timeout=1800,\n",
       "debug=[],\n",
       "deepspeed=None,\n",
       "disable_tqdm=False,\n",
       "dispatch_batches=None,\n",
       "do_eval=True,\n",
       "do_predict=False,\n",
       "do_train=False,\n",
       "eval_accumulation_steps=None,\n",
       "eval_delay=0,\n",
       "eval_do_concat_batches=True,\n",
       "eval_on_start=False,\n",
       "eval_steps=0.2,\n",
       "eval_strategy=IntervalStrategy.STEPS,\n",
       "eval_use_gather_object=False,\n",
       "evaluation_strategy=None,\n",
       "fp16=False,\n",
       "fp16_backend=auto,\n",
       "fp16_full_eval=False,\n",
       "fp16_opt_level=O1,\n",
       "fsdp=[],\n",
       "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
       "fsdp_min_num_params=0,\n",
       "fsdp_transformer_layer_cls_to_wrap=None,\n",
       "full_determinism=False,\n",
       "gradient_accumulation_steps=2,\n",
       "gradient_checkpointing=False,\n",
       "gradient_checkpointing_kwargs=None,\n",
       "greater_is_better=None,\n",
       "group_by_length=True,\n",
       "half_precision_backend=auto,\n",
       "hub_always_push=False,\n",
       "hub_model_id=None,\n",
       "hub_private_repo=False,\n",
       "hub_strategy=HubStrategy.EVERY_SAVE,\n",
       "hub_token=<HUB_TOKEN>,\n",
       "ignore_data_skip=False,\n",
       "include_for_metrics=[],\n",
       "include_inputs_for_metrics=False,\n",
       "include_num_input_tokens_seen=False,\n",
       "include_tokens_per_second=False,\n",
       "jit_mode_eval=False,\n",
       "label_names=None,\n",
       "label_smoothing_factor=0.0,\n",
       "learning_rate=0.0002,\n",
       "length_column_name=length,\n",
       "load_best_model_at_end=False,\n",
       "local_rank=0,\n",
       "log_level=passive,\n",
       "log_level_replica=warning,\n",
       "log_on_each_node=True,\n",
       "logging_dir=./outputs/google/gemma-2-2b-it--health-bot-1731678943/logs,\n",
       "logging_first_step=False,\n",
       "logging_nan_inf_filter=True,\n",
       "logging_steps=50,\n",
       "logging_strategy=IntervalStrategy.STEPS,\n",
       "lr_scheduler_kwargs={},\n",
       "lr_scheduler_type=SchedulerType.CONSTANT,\n",
       "max_grad_norm=0.3,\n",
       "max_steps=-1,\n",
       "metric_for_best_model=None,\n",
       "mp_parameters=,\n",
       "neftune_noise_alpha=None,\n",
       "no_cuda=False,\n",
       "num_train_epochs=1,\n",
       "optim=OptimizerNames.PAGED_ADAMW,\n",
       "optim_args=None,\n",
       "optim_target_modules=None,\n",
       "output_dir=./outputs/google/gemma-2-2b-it--health-bot-1731678943,\n",
       "overwrite_output_dir=False,\n",
       "past_index=-1,\n",
       "per_device_eval_batch_size=2,\n",
       "per_device_train_batch_size=2,\n",
       "prediction_loss_only=False,\n",
       "push_to_hub=False,\n",
       "push_to_hub_model_id=None,\n",
       "push_to_hub_organization=None,\n",
       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
       "ray_scope=last,\n",
       "remove_unused_columns=True,\n",
       "report_to=['wandb'],\n",
       "restore_callback_states_from_checkpoint=False,\n",
       "resume_from_checkpoint=None,\n",
       "run_name=google/gemma-2-2b-it--health-bot-1731678943,\n",
       "save_on_each_node=False,\n",
       "save_only_model=False,\n",
       "save_safetensors=True,\n",
       "save_steps=50,\n",
       "save_strategy=IntervalStrategy.STEPS,\n",
       "save_total_limit=None,\n",
       "seed=42,\n",
       "skip_memory_metrics=True,\n",
       "split_batches=None,\n",
       "tf32=None,\n",
       "torch_compile=False,\n",
       "torch_compile_backend=None,\n",
       "torch_compile_mode=None,\n",
       "torch_empty_cache_steps=None,\n",
       "torchdynamo=None,\n",
       "tpu_metrics_debug=False,\n",
       "tpu_num_cores=None,\n",
       "use_cpu=False,\n",
       "use_ipex=False,\n",
       "use_legacy_prediction_loop=False,\n",
       "use_liger_kernel=False,\n",
       "use_mps_device=False,\n",
       "warmup_ratio=0.0,\n",
       "warmup_steps=5,\n",
       "weight_decay=0.001,\n",
       ")"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import wandb\n",
    "import time\n",
    "wandb.login(key=os.getenv(\"WANDB_API_KEY\"))\n",
    "run = wandb.init(\n",
    "    project='TADBot',\n",
    "    job_type=\"training\",\n",
    "    anonymous=\"allow\"\n",
    ")\n",
    "run_name = f\"{hyperparams['model_name']}--health-bot-{int(time.time())}\"\n",
    "\n",
    "# Set training parameters\n",
    "training_arguments = TrainingArguments(\n",
    "    output_dir=f\"./outputs/{run_name}\",\n",
    "    per_device_train_batch_size=hyperparams[\"per_device_train_batch_size\"],\n",
    "    per_device_eval_batch_size=hyperparams[\"per_device_eval_batch_size\"],\n",
    "    gradient_accumulation_steps=hyperparams[\"gradient_accumulation_steps\"],\n",
    "    optim=hyperparams[\"optimizer\"],\n",
    "    num_train_epochs=hyperparams[\"num_train_epochs\"],\n",
    "    eval_steps=hyperparams[\"eval_steps\"],\n",
    "    eval_strategy=hyperparams[\"eval_strategy\"],\n",
    "    save_steps=hyperparams[\"save_steps\"],\n",
    "    logging_steps=hyperparams[\"logging_steps\"],\n",
    "    logging_strategy=hyperparams[\"logging_strategy\"],\n",
    "    warmup_steps=hyperparams[\"warmup_steps\"],\n",
    "    learning_rate=float(hyperparams[\"learning_rate\"]),\n",
    "    weight_decay=hyperparams[\"weight_decay\"],\n",
    "    fp16=hyperparams[\"fp16\"],\n",
    "    bf16=hyperparams[\"bf16\"],\n",
    "    max_grad_norm=hyperparams[\"max_grad_norm\"],\n",
    "    max_steps=hyperparams[\"max_steps\"],\n",
    "    group_by_length=hyperparams[\"group_by_length\"],\n",
    "    lr_scheduler_type=hyperparams[\"lr_scheduler_type\"],\n",
    "    logging_dir=f\"./outputs/{run_name}/logs\",\n",
    "    report_to=\"wandb\",\n",
    "    run_name=run_name\n",
    ")\n",
    "training_arguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\TADBot\\.venv\\Lib\\site-packages\\huggingface_hub\\utils\\_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. Will not be supported from version '0.13.0'.\n",
      "\n",
      "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
      "  warnings.warn(message, FutureWarning)\n",
      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
      "  warnings.warn(\n",
      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
      "  warnings.warn(\n",
      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    train_dataset=dataset[\"train\"],\n",
    "    eval_dataset=dataset['test'],\n",
    "    peft_config=peft_config,\n",
    "    dataset_text_field=\"text\",\n",
    "    # formatting_func=format_prompts_fn,\n",
    "    max_seq_length=hyperparams[\"max_seq_length\"],\n",
    "    tokenizer=tokenizer,\n",
    "    args=training_arguments,\n",
    "    packing=hyperparams[\"packing\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine tuning the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b86eb0836dc64d2d929ef2f0b2f2bdf9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1544 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 2.4221, 'grad_norm': 0.682584822177887, 'learning_rate': 0.0002, 'epoch': 0.03}\n",
      "{'loss': 1.9163, 'grad_norm': 0.5597965121269226, 'learning_rate': 0.0002, 'epoch': 0.06}\n",
      "{'loss': 1.9249, 'grad_norm': 0.5598402619361877, 'learning_rate': 0.0002, 'epoch': 0.1}\n",
      "{'loss': 1.9756, 'grad_norm': 0.6536526679992676, 'learning_rate': 0.0002, 'epoch': 0.13}\n",
      "{'loss': 1.9548, 'grad_norm': 0.608141303062439, 'learning_rate': 0.0002, 'epoch': 0.16}\n",
      "{'loss': 1.8867, 'grad_norm': 0.4548989534378052, 'learning_rate': 0.0002, 'epoch': 0.19}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "626464ae6db34340a6c248c021b3dfc8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/767 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 1.902209997177124, 'eval_runtime': 305.3236, 'eval_samples_per_second': 5.021, 'eval_steps_per_second': 2.512, 'epoch': 0.2}\n",
      "{'loss': 1.9035, 'grad_norm': 0.43129104375839233, 'learning_rate': 0.0002, 'epoch': 0.23}\n",
      "{'loss': 1.8868, 'grad_norm': 0.49856260418891907, 'learning_rate': 0.0002, 'epoch': 0.26}\n",
      "{'loss': 1.7944, 'grad_norm': 0.4600728750228882, 'learning_rate': 0.0002, 'epoch': 0.29}\n",
      "{'loss': 1.8076, 'grad_norm': 0.5697025656700134, 'learning_rate': 0.0002, 'epoch': 0.32}\n",
      "{'loss': 1.8321, 'grad_norm': 0.7373968958854675, 'learning_rate': 0.0002, 'epoch': 0.36}\n",
      "{'loss': 1.9213, 'grad_norm': 0.5277324318885803, 'learning_rate': 0.0002, 'epoch': 0.39}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "43e8d2b139874448be3241192f492b13",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/767 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 1.852689266204834, 'eval_runtime': 71.3284, 'eval_samples_per_second': 21.492, 'eval_steps_per_second': 10.753, 'epoch': 0.4}\n",
      "{'loss': 1.8277, 'grad_norm': 0.5442835688591003, 'learning_rate': 0.0002, 'epoch': 0.42}\n",
      "{'loss': 1.7947, 'grad_norm': 0.4261704981327057, 'learning_rate': 0.0002, 'epoch': 0.45}\n",
      "{'loss': 1.8975, 'grad_norm': 0.43769732117652893, 'learning_rate': 0.0002, 'epoch': 0.49}\n",
      "{'loss': 1.8065, 'grad_norm': 0.6723660230636597, 'learning_rate': 0.0002, 'epoch': 0.52}\n",
      "{'loss': 1.6969, 'grad_norm': 0.7517312169075012, 'learning_rate': 0.0002, 'epoch': 0.55}\n",
      "{'loss': 1.7825, 'grad_norm': 0.5381327867507935, 'learning_rate': 0.0002, 'epoch': 0.58}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "700c8a3cdc694fd88e19ae4f442464d4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/767 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 1.81912362575531, 'eval_runtime': 71.971, 'eval_samples_per_second': 21.3, 'eval_steps_per_second': 10.657, 'epoch': 0.6}\n",
      "{'loss': 1.7915, 'grad_norm': 0.6141555309295654, 'learning_rate': 0.0002, 'epoch': 0.62}\n",
      "{'loss': 1.7635, 'grad_norm': 0.5057688355445862, 'learning_rate': 0.0002, 'epoch': 0.65}\n",
      "{'loss': 1.728, 'grad_norm': 0.49006038904190063, 'learning_rate': 0.0002, 'epoch': 0.68}\n",
      "{'loss': 1.8424, 'grad_norm': 0.4901270866394043, 'learning_rate': 0.0002, 'epoch': 0.71}\n",
      "{'loss': 1.8308, 'grad_norm': 0.6117296814918518, 'learning_rate': 0.0002, 'epoch': 0.74}\n",
      "{'loss': 1.8729, 'grad_norm': 0.5475451946258545, 'learning_rate': 0.0002, 'epoch': 0.78}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
      "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7d41852d6bad4d65bdf1c972c7c86547",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/767 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 1.786774754524231, 'eval_runtime': 71.1209, 'eval_samples_per_second': 21.555, 'eval_steps_per_second': 10.784, 'epoch': 0.8}\n",
      "{'loss': 1.6851, 'grad_norm': 0.4951877295970917, 'learning_rate': 0.0002, 'epoch': 0.81}\n",
      "{'loss': 1.7613, 'grad_norm': 1.3179290294647217, 'learning_rate': 0.0002, 'epoch': 0.84}\n",
      "{'loss': 1.8753, 'grad_norm': 0.45116502046585083, 'learning_rate': 0.0002, 'epoch': 0.87}\n",
      "{'loss': 1.7441, 'grad_norm': 0.550654411315918, 'learning_rate': 0.0002, 'epoch': 0.91}\n",
      "{'loss': 1.8054, 'grad_norm': 0.4832320511341095, 'learning_rate': 0.0002, 'epoch': 0.94}\n",
      "{'loss': 1.7869, 'grad_norm': 0.5937925577163696, 'learning_rate': 0.0002, 'epoch': 0.97}\n",
      "{'train_runtime': 1964.5956, 'train_samples_per_second': 3.145, 'train_steps_per_second': 0.786, 'train_loss': 1.846395028069847, 'epoch': 1.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=1544, training_loss=1.846395028069847, metrics={'train_runtime': 1964.5956, 'train_samples_per_second': 3.145, 'train_steps_per_second': 0.786, 'total_flos': 9905705513385984.0, 'train_loss': 1.846395028069847, 'epoch': 0.9996762706377469})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config.use_cache = False\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "47c032db65ce47c6921c3087916cf02f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <style>\n",
       "        .wandb-row {\n",
       "            display: flex;\n",
       "            flex-direction: row;\n",
       "            flex-wrap: wrap;\n",
       "            justify-content: flex-start;\n",
       "            width: 100%;\n",
       "        }\n",
       "        .wandb-col {\n",
       "            display: flex;\n",
       "            flex-direction: column;\n",
       "            flex-basis: 100%;\n",
       "            flex: 1;\n",
       "            padding: 10px;\n",
       "        }\n",
       "    </style>\n",
       "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>█▅▃▁</td></tr><tr><td>eval/runtime</td><td>█▁▁▁</td></tr><tr><td>eval/samples_per_second</td><td>▁███</td></tr><tr><td>eval/steps_per_second</td><td>▁███</td></tr><tr><td>train/epoch</td><td>▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███</td></tr><tr><td>train/global_step</td><td>▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███</td></tr><tr><td>train/grad_norm</td><td>▃▂▂▃▂▁▁▂▁▂▃▂▂▁▁▃▄▂▂▂▂▂▂▂▂█▁▂▁▂</td></tr><tr><td>train/learning_rate</td><td>▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁</td></tr><tr><td>train/loss</td><td>█▃▃▄▄▃▃▃▂▂▂▃▂▂▃▂▁▂▂▂▁▂▂▃▁▂▃▂▂▂</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>1.78677</td></tr><tr><td>eval/runtime</td><td>71.1209</td></tr><tr><td>eval/samples_per_second</td><td>21.555</td></tr><tr><td>eval/steps_per_second</td><td>10.784</td></tr><tr><td>total_flos</td><td>9905705513385984.0</td></tr><tr><td>train/epoch</td><td>0.99968</td></tr><tr><td>train/global_step</td><td>1544</td></tr><tr><td>train/grad_norm</td><td>0.59379</td></tr><tr><td>train/learning_rate</td><td>0.0002</td></tr><tr><td>train/loss</td><td>1.7869</td></tr><tr><td>train_loss</td><td>1.8464</td></tr><tr><td>train_runtime</td><td>1964.5956</td></tr><tr><td>train_samples_per_second</td><td>3.145</td></tr><tr><td>train_steps_per_second</td><td>0.786</td></tr></table><br/></div></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View run <strong style=\"color:#cdcd00\">eager-morning-3</strong> at: <a href='https://wandb.ai/kausikremella-vit-ap/TADBot/runs/7eelojfi' target=\"_blank\">https://wandb.ai/kausikremella-vit-ap/TADBot/runs/7eelojfi</a><br/> View project at: <a href='https://wandb.ai/kausikremella-vit-ap/TADBot' target=\"_blank\">https://wandb.ai/kausikremella-vit-ap/TADBot</a><br/>Synced 4 W&B file(s), 0 media file(s), 3 artifact file(s) and 0 other file(s)"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Find logs at: <code>.\\wandb\\run-20241115_192539-7eelojfi\\logs</code>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "wandb.finish()\n",
    "model.config.use_cache = True\n",
    "# Save the model\n",
    "trainer.model.save_pretrained(hyperparams[\"new_model_name\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "%tensorboard  --logdir Gemma2_2B\\\\results\\\\runs"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}