{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a6b7c9f9-db9d-4278-8e0c-192db80afb9b",
   "metadata": {},
   "source": [
    "### Importing Libraries\n",
    "\n",
    "This cell imports the necessary libraries for the project. `keras` is a high-level neural networks API, and `keras_nlp` provides additional tools and functionalities for natural language processing tasks using Keras.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "09958eb5-8363-47dd-b508-353d6e538827",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-08-29 18:01:15.929029: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2024-08-29 18:01:15.944717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "2024-08-29 18:01:15.963838: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "2024-08-29 18:01:15.969620: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2024-08-29 18:01:15.983677: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import keras\n",
    "import keras_nlp\n",
    "import tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de2731f6-422e-46bf-839d-ef8f474e7742",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import os\n",
    "\n",
    "# os.environ[\"KERAS_BACKEND\"] = \"jax\"  \n",
    "# # Avoid memory fragmentation on JAX backend.\n",
    "# os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"]=\"1.00\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8725e8ba-f1c3-4e5f-8bb8-1451e3a7a394",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Initialize an empty list to hold the processed data.\n",
    "data = []\n",
    "\n",
    "# Open and read the JSON file line by line.\n",
    "with open('/project/data/combined_dataset.json') as file:\n",
    "    for line in file:\n",
    "        features = json.loads(line)\n",
    "        \n",
    "        # Filter out examples without \"Context\".\n",
    "        if not features.get(\"Context\"):\n",
    "            continue\n",
    "        \n",
    "        # Format the example as a string.\n",
    "        template = \"Instruction:\\n{Context}\\n\\nResponse:\\n{Response}\"\n",
    "        formatted_example = template.format(**features)\n",
    "        \n",
    "        # Append the formatted example to the data list.\n",
    "        data.append(formatted_example)\n",
    "\n",
    "# Now data contains a list of formatted strings.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a26993cb-c8aa-42b1-943f-b6033d909336",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Set Kaggle API credentials\n",
    "os.environ[\"KAGGLE_USERNAME\"] = \"rogerkorantenng\"\n",
    "os.environ[\"KAGGLE_KEY\"] = \"9a33b6e88bcb6058b1281d777fa6808d\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "008c3f60-b0f6-4709-a7c7-836a6ea4f5cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(\"gemma_2b_en\")\n",
    "gemma_lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fcd0823-61b8-4037-863d-d81dfcd8dec1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the template with placeholders for 'instruction' and 'response'\n",
    "template = \"Instruction:\\n{instruction}\\n\\nResponse:\\n{response}\"\n",
    "\n",
    "# Create the prompt by formatting the template with actual data\n",
    "prompt = template.format(\n",
    "    instruction=\"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\\n   How can I change my feeling of being worthless to everyone?\",\n",
    "    response=\"\",\n",
    ")\n",
    "\n",
    "# Assuming gemma_lm is a language model that you're using to generate text\n",
    "print(gemma_lm.generate(prompt, max_length=256))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "834bffa8-4361-4ec9-8c3a-7403d3ce83c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gemma_lm.save(\"gemma_model.h5\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3af9301d-2623-4773-9f3b-07efcc788fc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enable LoRA for the model and set the LoRA rank to 4.\n",
    "gemma_lm.backbone.enable_lora(rank=10)\n",
    "gemma_lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab365ee3-da5f-4b5c-b00b-428005ff42e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tensorflow as tf\n",
    "import keras_nlp\n",
    "import keras\n",
    "import json\n",
    "\n",
    "# Set Kaggle API credentials\n",
    "os.environ[\"KAGGLE_USERNAME\"] = \"rogerkorantenng\"\n",
    "os.environ[\"KAGGLE_KEY\"] = \"9a33b6e88bcb6058b1281d777fa6808d\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae2db527-0964-491d-8fd6-0c746cbcae2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_compiled_model():\n",
    "    gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(\"gemma_2b_en\")\n",
    "    gemma_lm.summary()\n",
    "\n",
    "    gemma_lm.backbone.enable_lora(rank=2)\n",
    "    gemma_lm.summary()\n",
    "    \n",
    "    # Set the sequence length to 128 before using the model.\n",
    "    gemma_lm.preprocessor.sequence_length = 128\n",
    "    \n",
    "    # Use AdamW (a common optimizer for transformer models).\n",
    "    optimizer = keras.optimizers.AdamW(\n",
    "        learning_rate=5e-5,\n",
    "        weight_decay=0.01,\n",
    "    )\n",
    "    \n",
    "    # Exclude layernorm and bias terms from decay.\n",
    "    optimizer.exclude_from_weight_decay(var_names=[\"bias\", \"scale\"])\n",
    "    \n",
    "    gemma_lm.compile(\n",
    "        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
    "        optimizer=optimizer,\n",
    "        weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
    "    )\n",
    "\n",
    "    \n",
    "    return gemma_lm\n",
    "print(gemma_lm.)\n",
    "\n",
    "print(gemma_lm.summary())\n",
    "\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd11ce52-9eb8-4e48-a4da-fdc759e7f789",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataset():\n",
    "    # Initialize an empty list to hold the processed data.\n",
    "    data = []\n",
    "    \n",
    "    # Open and read the JSON file line by line.\n",
    "    with open('/project/data/HealthCareMagic-100k-en.jsonl') as file:\n",
    "        for line in file:\n",
    "            features = json.loads(line)\n",
    "            \n",
    "            # Filter out examples without \"Context\".\n",
    "            if not features.get(\"Context\"):\n",
    "                continue\n",
    "            \n",
    "            # Format the example as a string.\n",
    "            template = \"Instruction:\\n{Context}\\n\\nResponse:\\n{Response}\"\n",
    "            formatted_example = template.format(**features)\n",
    "            \n",
    "            # Append the formatted example to the data list.\n",
    "            data.append(formatted_example)\n",
    "    \n",
    "    return data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec94c5f-70b6-4683-95d4-53c1231f5a9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = get_compiled_model()\n",
    "\n",
    "# Get the dataset outside the strategy scope.\n",
    "data = get_dataset()\n",
    "\n",
    "# Fit the model using the data.\n",
    "model.fit(data, epochs=2, batch_size=0, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bfe053b2-a02f-4e2b-af04-48c28a00c20e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello\n"
     ]
    }
   ],
   "source": [
    "print('Hello')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35d93608-b1ef-44a1-a57b-4c1a3d3dbebb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}