{ "cells": [ { "cell_type": "markdown", "id": "a6b7c9f9-db9d-4278-8e0c-192db80afb9b", "metadata": {}, "source": [ "### Importing Libraries\n", "\n", "This cell imports the necessary libraries for the project. `keras` is a high-level neural networks API, and `keras_nlp` provides additional tools and functionalities for natural language processing tasks using Keras.\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "09958eb5-8363-47dd-b508-353d6e538827", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-08-29 18:01:15.929029: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2024-08-29 18:01:15.944717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2024-08-29 18:01:15.963838: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2024-08-29 18:01:15.969620: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2024-08-29 18:01:15.983677: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ],
   "source": [
    "import json\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import keras\n",
    "import keras_nlp\n",
    "import tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de2731f6-422e-46bf-839d-ef8f474e7742",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: run Keras on the JAX backend.\n",
    "# NOTE(review): KERAS_BACKEND is read when keras is first imported, so if this is\n",
    "# enabled it has to run BEFORE the `import keras` cell above - move it up first.\n",
    "# import os\n",
    "\n",
    "# os.environ[\"KERAS_BACKEND\"] = \"jax\"\n",
    "# # Avoid memory fragmentation on JAX backend.\n",
    "# os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"]=\"1.00\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8725e8ba-f1c3-4e5f-8bb8-1451e3a7a394",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Prompt layout applied to every record; built once instead of on every iteration.\n",
    "template = \"Instruction:\\n{Context}\\n\\nResponse:\\n{Response}\"\n",
    "\n",
    "# Formatted training strings, one per usable record.\n",
    "data = []\n",
    "\n",
    "# The file is parsed as JSON Lines: one JSON object per line.\n",
    "with open('/project/data/combined_dataset.json', encoding='utf-8') as file:\n",
    "    for line in file:\n",
    "        # json.loads raises on an empty string, so skip blank lines.\n",
    "        if not line.strip():\n",
    "            continue\n",
    "        features = json.loads(line)\n",
    "\n",
    "        # Filter out examples whose \"Context\" is missing or empty.\n",
    "        if not features.get(\"Context\"):\n",
    "            continue\n",
    "\n",
    "        # Raises KeyError if a kept record has no \"Response\" field.\n",
    "        data.append(template.format(**features))\n",
    "\n",
    "print(f\"Loaded {len(data)} formatted examples\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a26993cb-c8aa-42b1-943f-b6033d909336",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kaggle API credentials (presumably consumed by the preset download below).\n",
    "# Secrets must never be written into the notebook: take them from the environment\n",
    "# and only prompt, without echoing, for the ones that are missing.\n",
    "# NOTE(review): any key that was ever committed in this notebook should be treated\n",
    "# as compromised and regenerated in the Kaggle account settings.\n",
    "for var_name in (\"KAGGLE_USERNAME\", \"KAGGLE_KEY\"):\n",
    "    if not os.environ.get(var_name):\n",
    "        os.environ[var_name] = getpass(f\"{var_name}: \")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "008c3f60-b0f6-4709-a7c7-836a6ea4f5cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(\"gemma_2b_en\")\n",
    "gemma_lm.summary()"
   ]
  },
  { "cell_type": "code",
   "execution_count": null,
   "id": "8fcd0823-61b8-4037-863d-d81dfcd8dec1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt template with placeholders for 'instruction' and 'response'.\n",
    "template = \"Instruction:\\n{instruction}\\n\\nResponse:\\n{response}\"\n",
    "\n",
    "# The response slot is left empty so that the model writes the continuation.\n",
    "prompt = template.format(\n",
    "    instruction=\"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\\n I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\\n How can I change my feeling of being worthless to everyone?\",\n",
    "    response=\"\",\n",
    ")\n",
    "\n",
    "# Generate with the `gemma_lm` loaded above, before LoRA is enabled on it.\n",
    "print(gemma_lm.generate(prompt, max_length=256))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "834bffa8-4361-4ec9-8c3a-7403d3ce83c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gemma_lm.save(\"gemma_model.h5\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3af9301d-2623-4773-9f3b-07efcc788fc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enable LoRA on the backbone with rank 10 and show the updated parameter summary.\n",
    "gemma_lm.backbone.enable_lora(rank=10)\n",
    "gemma_lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab365ee3-da5f-4b5c-b00b-428005ff42e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports are repeated here so the training cells below can be run on their own.\n",
    "import json\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import keras\n",
    "import keras_nlp\n",
    "import tensorflow as tf\n",
    "\n",
    "# Kaggle API credentials: read from the environment, prompt (hidden input) only\n",
    "# when missing. Secrets must never be stored in the notebook itself.\n",
    "# NOTE(review): any key that was ever committed in this notebook should be treated\n",
    "# as compromised and regenerated in the Kaggle account settings.\n",
    "for var_name in (\"KAGGLE_USERNAME\", \"KAGGLE_KEY\"):\n",
    "    if not os.environ.get(var_name):\n",
    "        os.environ[var_name] = getpass(f\"{var_name}: \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae2db527-0964-491d-8fd6-0c746cbcae2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_compiled_model(\n",
    "    preset=\"gemma_2b_en\",\n",
    "    lora_rank=2,\n",
    "    sequence_length=128,\n",
    "    learning_rate=5e-5,\n",
    "    weight_decay=0.01,\n",
    "):\n",
    "    \"\"\"Load a Gemma causal LM, enable LoRA on its backbone and compile it.\n",
    "\n",
    "    Args:\n",
    "        preset: Preset name passed to `GemmaCausalLM.from_preset`.\n",
    "        lora_rank: Rank passed to `backbone.enable_lora`.\n",
    "        sequence_length: Value assigned to `preprocessor.sequence_length`.\n",
    "        learning_rate: Learning rate of the AdamW optimizer.\n",
    "        weight_decay: Weight decay of the AdamW optimizer.\n",
    "\n",
    "    Returns:\n",
    "        The compiled `GemmaCausalLM` instance.\n",
    "    \"\"\"\n",
    "    # Local name differs from the notebook-level `gemma_lm` to avoid shadowing it.\n",
    "    model = keras_nlp.models.GemmaCausalLM.from_preset(preset)\n",
    "    model.summary()\n",
    "\n",
    "    # Second summary shows the parameter counts after LoRA is enabled.\n",
    "    model.backbone.enable_lora(rank=lora_rank)\n",
    "    model.summary()\n",
    "\n",
    "    # Set the input sequence length before the model is used.\n",
    "    model.preprocessor.sequence_length = sequence_length\n",
    "\n",
    "    # Use AdamW (a common optimizer for transformer models).\n",
    "    optimizer = keras.optimizers.AdamW(\n",
    "        learning_rate=learning_rate,\n",
    "        weight_decay=weight_decay,\n",
    "    )\n",
    "\n",
    "    # Exclude layernorm and bias terms from decay.\n",
    "    optimizer.exclude_from_weight_decay(var_names=[\"bias\", \"scale\"])\n",
    "\n",
    "    model.compile(\n",
    "        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
    "        optimizer=optimizer,\n",
    "        weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
    "    )\n",
    "\n",
    "    return model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd11ce52-9eb8-4e48-a4da-fdc759e7f789",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataset(path='/project/data/HealthCareMagic-100k-en.jsonl'):\n",
    "    \"\"\"Read a JSON Lines file and return the records as formatted prompt strings.\n",
    "\n",
    "    Records whose \"Context\" field is missing or empty are skipped.\n",
    "    NOTE(review): assumes the records use \"Context\" / \"Response\" keys - confirm\n",
    "    against the actual file; with other key names every record is skipped.\n",
    "\n",
    "    Args:\n",
    "        path: Location of the JSON Lines file (one JSON object per line).\n",
    "\n",
    "    Returns:\n",
    "        A list of strings of the form\n",
    "        \"Instruction:\\\\n<Context>\\\\n\\\\nResponse:\\\\n<Response>\".\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If a kept record has no \"Response\" field.\n",
    "    \"\"\"\n",
    "    # Built once; the template does not change between records.\n",
    "    template = \"Instruction:\\n{Context}\\n\\nResponse:\\n{Response}\"\n",
    "    data = []\n",
    "\n",
    "    with open(path, encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            # json.loads raises on an empty string, so skip blank lines.\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            features = json.loads(line)\n",
    "\n",
    "            # Filter out examples without \"Context\".\n",
    "            if not features.get(\"Context\"):\n",
    "                continue\n",
    "\n",
    "            data.append(template.format(**features))\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec94c5f-70b6-4683-95d4-53c1231f5a9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = get_compiled_model()\n",
    "\n",
    "# Load the formatted training examples.\n",
    "data = 
get_dataset()\n",
    "\n",
    "# get_dataset() skips every record without a \"Context\" field, so stop here with a\n",
    "# clear message if nothing was loaded instead of failing inside fit().\n",
    "assert data, \"get_dataset() returned no examples; check the file path and its field names\"\n",
    "\n",
    "# Fit the model using the data. batch_size has to be a positive integer; 1 is the\n",
    "# smallest valid value - raise it if memory allows.\n",
    "model.fit(data, epochs=2, batch_size=1, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bfe053b2-a02f-4e2b-af04-48c28a00c20e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello\n"
     ]
    }
   ],
   "source": [
    "print('Hello')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35d93608-b1ef-44a1-a57b-4c1a3d3dbebb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}