Spaces:

lxe
/

lora-cerebras-gpt2.7b-alpaca-shortprompt

Runtime error

App Files Files Community

lxe committed on Mar 31, 2023

Commit

40fab0e

•

1 Parent(s): 12e7ebc

Initial Commit

Browse files

Files changed (3) hide show

app.py +227 -0
finetune.ipynb +1220 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import torch
+import gradio as gr
+import re
+import transformers
+import peft
+import traceback
+from queue import Queue
+from threading import Thread
+import gc
+CUDA_AVAILABLE = torch.cuda.is_available()
+device = torch.device("cuda" if CUDA_AVAILABLE else "cpu")
+tokenizer = transformers.AutoTokenizer.from_pretrained("cerebras/Cerebras-GPT-2.7B")
+tokenizer.pad_token_id = 0
+model = transformers.AutoModelForCausalLM.from_pretrained(
+    "cerebras/Cerebras-GPT-2.7B",
+    load_in_8bit=True,
+    torch_dtype=torch.float16,
+    device_map={'':0} if CUDA_AVAILABLE else 'auto',
+)
+model = peft.PeftModel.from_pretrained(
+    model,
+    'lxe/lora-cerebras-gpt2.7b-alpaca-shortprompt',
+    torch_dtype=torch.float16
+)
+model.half()
+# Streaming functionality taken from https://github.com/oobabooga/text-generation-webui/blob/master/modules/text_generation.py#L105
+class Stream(transformers.StoppingCriteria):
+    def __init__(self, callback_func=None):
+        self.callback_func = callback_func
+    def __call__(self, input_ids, scores) -> bool:
+        if self.callback_func is not None:
+            self.callback_func(input_ids[0])
+        return False
+class Iteratorize:
+    """
+    Transforms a function that takes a callback
+    into a lazy iterator (generator).
+    """
+    def __init__(self, func, kwargs={}, callback=None):
+        self.mfunc=func
+        self.c_callback=callback
+        self.q = Queue()
+        self.sentinel = object()
+        self.kwargs = kwargs
+        self.stop_now = False
+        def _callback(val):
+            if self.stop_now:
+                raise ValueError
+            self.q.put(val)
+        def gentask():
+            try:
+                ret = self.mfunc(callback=_callback, **self.kwargs)
+            except ValueError:
+                traceback.print_exc()
+                pass
+            except:
+                traceback.print_exc()
+                pass
+            clear_torch_cache()
+            self.q.put(self.sentinel)
+            if self.c_callback:
+                self.c_callback(ret)
+        self.thread = Thread(target=gentask)
+        self.thread.start()
+    def __iter__(self):
+        return self
+    def __next__(self):
+        obj = self.q.get(True,None)
+        if obj is self.sentinel:
+            raise StopIteration
+        else:
+            return obj
+    def __del__(self):
+        clear_torch_cache()
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_now = True
+        clear_torch_cache()
+def clear_torch_cache():
+    gc.collect()
+    if CUDA_AVAILABLE:
+        torch.cuda.empty_cache()
+def generate_text(
+    history,
+    max_new_tokens,
+    do_sample,
+    temperature,
+    top_p,
+    top_k,
+    repetition_penalty,
+    typical_p,
+    num_beams
+):
+    # Create a conversation context of the last 4 entries in the history
+    inp = ''.join([
+        f"Human: {h[0]}\n\nAssistant: {'' if h[1] is None else h[1]}\n\n" for h in history[-4:]
+    ]).strip()
+    input_ids = tokenizer.encode(
+        inp,
+        return_tensors='pt',
+        truncation=True,
+        add_special_tokens=False
+    ).to(device) # type: ignore
+    generate_params = {
+        "input_ids": input_ids,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "typical_p": typical_p,
+        "num_beams": num_beams,
+        "stopping_criteria": transformers.StoppingCriteriaList(),
+        "pad_token_id": tokenizer.pad_token_id,
+    }
+    def generate_with_callback(callback=None, **kwargs):
+        kwargs['stopping_criteria'].append(Stream(callback_func=callback))
+        clear_torch_cache()
+        with torch.no_grad():
+            model.generate(**kwargs) # type: ignore
+    def generate_with_streaming(**kwargs):
+        return Iteratorize(generate_with_callback, kwargs, callback=None)
+    with generate_with_streaming(**generate_params) as generator:
+        for output in generator:
+            new_tokens = len(output) - len(input_ids[0])
+            reply = tokenizer.decode(output[-new_tokens:], skip_special_tokens=True)
+            # If reply contains '^Human:' or '^Assistant:'
+            # then we have reached the end of the assistant's response
+            stop_re = re.compile(r'^(Human|Assistant):', re.MULTILINE)
+            if re.search(stop_re, reply):
+                reply = ''.join(reply.split('\n')[:-1])
+                history[-1][1] = reply.strip()
+                yield history
+                break
+            # if reply contains 'EOS' then we have reached the end of the conversation
+            if output[-1] in [tokenizer.eos_token_id]:
+                yield history
+                break
+            history[-1][1] = reply.strip()
+            yield history
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    ## Cerebras GPT-2.7B Alpcaca-Shortprompt LoRA Demo
+    This is a very fast and relatively coherent chatbot. It uses the [Cerebras-GPT-2.7B](https://huggingface.co/cerebras/Cerebras-GPT-2.7B), with a LoRA finetuned on the [Alpcaca Dataset]) dataset using a shorter prompt. The chatbok keeps a very short conversation context as well.
+    """)
+    with gr.Row():
+        with gr.Column():
+            chatbot = gr.Chatbot()
+            msg = gr.Textbox(value="How old is the Earth?", placeholder="Type a message...")
+            with gr.Row():
+                clear = gr.Button("Clear")
+        with gr.Column():
+            max_new_tokens = gr.Slider(0, 2048, 200, step=1, label="max_new_tokens")
+            do_sample = gr.Checkbox(True, label="do_sample")
+            with gr.Row():
+                with gr.Column():
+                    temperature = gr.Slider(0, 2, 0.5, step=0.01, label="temperature")
+                    top_p = gr.Slider(0, 1, 0.75, step=0.01, label="top_p")
+                    top_k = gr.Slider(0, 100, 80, step=1, label="top_k")
+                with gr.Column():
+                    repetition_penalty = gr.Slider(0, 10, 1.5, step=0.01, label="repetition_penalty")
+                    typical_p = gr.Slider(0, 1, 1, step=0.01, label="typical_p")
+                    num_beams = gr.Slider(0, 10, 1, step=1, label="num_beams")
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+    def fix_history(history):
+        update_history = False
+        for i, (user, bot) in enumerate(history):
+            if bot is None:
+                update_history = True
+                history[i][1] = "_silence_"
+        if update_history:
+            chatbot.update(history)
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        generate_text, inputs=[
+            chatbot,
+            max_new_tokens,
+            do_sample,
+            temperature,
+            top_p,
+            top_k,
+            repetition_penalty,
+            typical_p,
+            num_beams
+        ], outputs=[chatbot],
+    ).then(fix_history, chatbot)
+    clear.click(lambda: None, None, chatbot, queue=False)
+demo.queue().launch()

finetune.ipynb ADDED Viewed

	@@ -0,0 +1,1220 @@

+{
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "20b1e7bd",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "4e92fff5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import transformers\n",
+     "\n",
+     "tokenizer = transformers.AutoTokenizer.from_pretrained('cerebras/Cerebras-GPT-2.7B')\n",
+     "tokenizer.pad_token_id = 0"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 13,
+    "id": "77637440",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-8d265dbd6f34ccd3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1f5bceec2f7540f9b46c29f8074c4760",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/1 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "import datasets\n",
+     "dataset = datasets.load_dataset('json', data_files='alpaca_data_cleaned.json')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 14,
+    "id": "dc81310c",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['instruction', 'input', 'output'],\n",
+       "        num_rows: 51942\n",
+       "    })\n",
+       "})\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(dataset)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 15,
+    "id": "660f022e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "cutoff_len = 512\n",
+     "\n",
+     "def generate_prompt(entry):\n",
+     "    if entry['input']:\n",
+     "        return f\"User: {entry['instruction']}: {entry['input']}\\n\\nAssistant: {entry['output']}\"\n",
+     "    else:\n",
+     "        return f\"User: {entry['instruction']}\\n\\nAssistant: {entry['output']}\"\n",
+     "\n",
+     "def tokenize(item, add_eos_token=True):\n",
+     "    result = tokenizer(\n",
+     "        generate_prompt(item),\n",
+     "        truncation=True,\n",
+     "        max_length=cutoff_len,\n",
+     "        padding=False,\n",
+     "        return_tensors=None,\n",
+     "    )\n",
+     "\n",
+     "    if (\n",
+     "        result[\"input_ids\"][-1] != tokenizer.eos_token_id\n",
+     "        and len(result[\"input_ids\"]) < cutoff_len\n",
+     "        and add_eos_token\n",
+     "    ):\n",
+     "        result[\"input_ids\"].append(tokenizer.eos_token_id)\n",
+     "        result[\"attention_mask\"].append(1)\n",
+     "\n",
+     "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
+     "\n",
+     "    return result"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "id": "28bc5713",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "5ec872a8d87d49d79f0b9ed2f1946af1",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map:   0%|          | 0/41553 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3a2a0426dd664b5e892895cbd06fe02a",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map:   0%|          | 0/10389 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "train_val = dataset[\"train\"].train_test_split(test_size=0.2, shuffle=True, seed=42)\n",
+     "train_data = train_val[\"train\"].shuffle().map(tokenize)\n",
+     "val_data = train_val[\"test\"].shuffle().map(tokenize)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 18,
+    "id": "10d2fc55",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "===================================BUG REPORT===================================\n",
+       "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+       "================================================================================\n",
+       "CUDA SETUP: CUDA runtime path found: /root/miniconda3/envs/llama/lib/libcudart.so\n",
+       "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
+       "CUDA SETUP: Detected CUDA version 117\n",
+       "CUDA SETUP: Loading binary /root/miniconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
+      ]
+     }
+    ],
+    "source": [
+     "if 'model' in globals(): \n",
+     "    del model\n",
+     "    torch.cuda.empty_cache()\n",
+     "\n",
+     "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+     "    'cerebras/Cerebras-GPT-2.7B',    \n",
+     "    load_in_8bit=True,\n",
+     "    torch_dtype=torch.float16,\n",
+     "    device_map={'': 0}\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 19,
+    "id": "2fd1028c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import peft\n",
+     "\n",
+     "model = peft.prepare_model_for_int8_training(model)\n",
+     "\n",
+     "model = peft.get_peft_model(model, peft.LoraConfig(\n",
+     "    r=8,\n",
+     "    lora_alpha=16,\n",
+     "    # target_modules=[\"q_proj\", \"v_proj\"],\n",
+     "    target_modules=[\"c_attn\"],\n",
+     "    lora_dropout=0.05,\n",
+     "    bias=\"none\",\n",
+     "    task_type=\"CAUSAL_LM\",\n",
+     "))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 20,
+    "id": "deb33df4",
+    "metadata": {},
+    "outputs": [
+     {
+      "ename": "ValueError",
+      "evalue": "Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+       "\u001b[0;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:259\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    258\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 259\u001b[0m     response\u001b[39m.\u001b[39;49mraise_for_status()\n\u001b[1;32m    260\u001b[0m \u001b[39mexcept\u001b[39;00m HTTPError \u001b[39mas\u001b[39;00m e:\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1020\u001b[0m \u001b[39mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m     \u001b[39mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m)\n",
+       "\u001b[0;31mHTTPError\u001b[0m: 404 Client Error: Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json",
+       "\nThe above exception was the direct cause of the following exception:\n",
+       "\u001b[0;31mRepositoryNotFoundError\u001b[0m                   Traceback (most recent call last)",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:99\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m     98\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 99\u001b[0m     config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m    100\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    118\u001b[0m     kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1160\u001b[0m, in \u001b[0;36mhf_hub_download\u001b[0;34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, local_dir_use_symlinks, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)\u001b[0m\n\u001b[1;32m   1159\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1160\u001b[0m     metadata \u001b[39m=\u001b[39m get_hf_file_metadata(\n\u001b[1;32m   1161\u001b[0m         url\u001b[39m=\u001b[39;49murl,\n\u001b[1;32m   1162\u001b[0m         token\u001b[39m=\u001b[39;49mtoken,\n\u001b[1;32m   1163\u001b[0m         proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m   1164\u001b[0m         timeout\u001b[39m=\u001b[39;49metag_timeout,\n\u001b[1;32m   1165\u001b[0m     )\n\u001b[1;32m   1166\u001b[0m \u001b[39mexcept\u001b[39;00m EntryNotFoundError \u001b[39mas\u001b[39;00m http_error:\n\u001b[1;32m   1167\u001b[0m     \u001b[39m# Cache the non-existence of the file and raise\u001b[39;00m\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    118\u001b[0m     kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1501\u001b[0m, in \u001b[0;36mget_hf_file_metadata\u001b[0;34m(url, token, proxies, timeout)\u001b[0m\n\u001b[1;32m   1492\u001b[0m r \u001b[39m=\u001b[39m _request_wrapper(\n\u001b[1;32m   1493\u001b[0m     method\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mHEAD\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m   1494\u001b[0m     url\u001b[39m=\u001b[39murl,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1499\u001b[0m     timeout\u001b[39m=\u001b[39mtimeout,\n\u001b[1;32m   1500\u001b[0m )\n\u001b[0;32m-> 1501\u001b[0m hf_raise_for_status(r)\n\u001b[1;32m   1503\u001b[0m \u001b[39m# Return\u001b[39;00m\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:291\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    283\u001b[0m     message \u001b[39m=\u001b[39m (\n\u001b[1;32m    284\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mresponse\u001b[39m.\u001b[39mstatus_code\u001b[39m}\u001b[39;00m\u001b[39m Client Error.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    285\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    289\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39m make sure you are authenticated.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    290\u001b[0m     )\n\u001b[0;32m--> 291\u001b[0m     \u001b[39mraise\u001b[39;00m RepositoryNotFoundError(message, response) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m    293\u001b[0m \u001b[39melif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39m==\u001b[39m \u001b[39m400\u001b[39m:\n",
+       "\u001b[0;31mRepositoryNotFoundError\u001b[0m: 404 Client Error. (Request ID: Root=1-6424c7f5-7796bb54152221004f83dc73)\n\nRepository Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json.\nPlease make sure you specified the correct `repo_id` and `repo_type`.\nIf you are trying to access a private or gated repo, make sure you are authenticated.",
+       "\nDuring handling of the above exception, another exception occurred:\n",
+       "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+       "Cell \u001b[0;32mIn[20], line 5\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpeft\u001b[39;00m\n\u001b[1;32m      3\u001b[0m output_dir \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mlora-cerebras-gpt2.7b-alpaca\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 5\u001b[0m model \u001b[39m=\u001b[39m peft\u001b[39m.\u001b[39;49mPeftModel\u001b[39m.\u001b[39;49mfrom_pretrained(\n\u001b[1;32m      6\u001b[0m     model,\n\u001b[1;32m      7\u001b[0m     \u001b[39m# 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\u001b[39;49;00m\n\u001b[1;32m      8\u001b[0m     output_dir,\n\u001b[1;32m      9\u001b[0m     torch_dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat16\n\u001b[1;32m     10\u001b[0m )\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/peft_model.py:135\u001b[0m, in \u001b[0;36mPeftModel.from_pretrained\u001b[0;34m(cls, model, model_id, **kwargs)\u001b[0m\n\u001b[1;32m    132\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mmapping\u001b[39;00m \u001b[39mimport\u001b[39;00m MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING\n\u001b[1;32m    134\u001b[0m \u001b[39m# load the config\u001b[39;00m\n\u001b[0;32m--> 135\u001b[0m config \u001b[39m=\u001b[39m PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig\u001b[39m.\u001b[39;49mfrom_pretrained(model_id)\u001b[39m.\u001b[39mpeft_type]\u001b[39m.\u001b[39mfrom_pretrained(model_id)\n\u001b[1;32m    137\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(model, \u001b[39m\"\u001b[39m\u001b[39mhf_device_map\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m) \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    138\u001b[0m     remove_hook_from_submodules(model)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:101\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m     99\u001b[0m         config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m    100\u001b[0m     \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCan\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt find config.json at \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mpretrained_model_name_or_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m    103\u001b[0m loaded_attributes \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mfrom_json_file(config_file)\n\u001b[1;32m    105\u001b[0m config \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
+       "\u001b[0;31mValueError\u001b[0m: Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'"
+      ]
+     }
+    ],
+    "source": [
+     "import peft\n",
+     "\n",
+     "\n",
+     "\n",
+     "model = peft.PeftModel.from_pretrained(\n",
+     "    model,\n",
+     "    # 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\n",
+     "    output_dir,\n",
+     "    torch_dtype=torch.float16\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "id": "8ec93ed2",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "\n",
+     "\n",
+     "import os\n",
+     "import wandb \n",
+     "\n",
+     "output_dir = 'lora-cerebras-gpt2.7b-alpaca'\n",
+     "\n",
+     "use_wandb = True,\n",
+     "wandb_run_name = f\"{output_dir}-{wandb.util.generate_id()}\"\n",
+     "\n",
+     "# set the wandb project where this run will be logged\n",
+     "os.environ[\"WANDB_PROJECT\"]=output_dir\n",
+     "\n",
+     "# save your trained model checkpoint to wandb\n",
+     "os.environ[\"WANDB_LOG_MODEL\"]=\"true\"\n",
+     "\n",
+     "# turn off watch to log faster\n",
+     "os.environ[\"WANDB_WATCH\"]=\"false\"\n",
+     "\n",
+     "training_args = transformers.TrainingArguments(\n",
+     "    per_device_train_batch_size=16, \n",
+     "    gradient_accumulation_steps=8,  \n",
+     "    num_train_epochs=3,  \n",
+     "    learning_rate=1e-4, \n",
+     "    fp16=True,\n",
+     "    optim=\"adamw_torch\",\n",
+     "    logging_steps=10, \n",
+     "    evaluation_strategy=\"steps\",\n",
+     "    save_strategy=\"steps\",\n",
+     "    eval_steps=200,\n",
+     "    save_steps=200,\n",
+     "    output_dir=output_dir, \n",
+     "    save_total_limit=3,\n",
+     "\n",
+     "    report_to=\"wandb\" if use_wandb else None,\n",
+     "    run_name=wandb_run_name if use_wandb else None,\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 32,
+    "id": "2686ecf2",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "\n",
+        "    <div>\n",
+        "      \n",
+        "      <progress value='972' max='972' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+        "      [972/972 27:33, Epoch 2/3]\n",
+        "    </div>\n",
+        "    <table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        " <tr style=\"text-align: left;\">\n",
+        "      <th>Step</th>\n",
+        "      <th>Training Loss</th>\n",
+        "      <th>Validation Loss</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "  </tbody>\n",
+        "</table><p>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "Waiting for W&B process to finish... <strong style=\"color:green\">(success).</strong>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "<style>\n",
+        "    table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
+        "    .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
+        "    .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
+        "    </style>\n",
+        "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>█▄▂▁</td></tr><tr><td>eval/runtime</td><td>▅█▄▁</td></tr><tr><td>eval/samples_per_second</td><td>▄▁▅█</td></tr><tr><td>eval/steps_per_second</td><td>▄▁▅█</td></tr><tr><td>train/epoch</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███</td></tr><tr><td>train/global_step</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███</td></tr><tr><td>train/learning_rate</td><td>████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▂▂▂▂▁▁</td></tr><tr><td>train/loss</td><td>█▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁</td></tr><tr><td>train/total_flos</td><td>▁█</td></tr><tr><td>train/train_loss</td><td>█▁</td></tr><tr><td>train/train_runtime</td><td>█▁</td></tr><tr><td>train/train_samples_per_second</td><td>▁█</td></tr><tr><td>train/train_steps_per_second</td><td>▁█</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>1.69353</td></tr><tr><td>eval/runtime</td><td>213.477</td></tr><tr><td>eval/samples_per_second</td><td>48.666</td></tr><tr><td>eval/steps_per_second</td><td>6.085</td></tr><tr><td>train/epoch</td><td>3.0</td></tr><tr><td>train/global_step</td><td>972</td></tr><tr><td>train/learning_rate</td><td>0.0</td></tr><tr><td>train/loss</td><td>1.7007</td></tr><tr><td>train/total_flos</td><td>4.1553623137959936e+17</td></tr><tr><td>train/train_loss</td><td>0.29741</td></tr><tr><td>train/train_runtime</td><td>1642.1473</td></tr><tr><td>train/train_samples_per_second</td><td>75.912</td></tr><tr><td>train/train_steps_per_second</td><td>0.592</td></tr></table><br/></div></div>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        " View run <strong style=\"color:#cdcd00\">lora-cerebras-gpt2.7b-alpaca-jecyepye</strong> at: <a href='https://wandb.ai/lxelxe/lora-cerebras-gpt2.7b-alpaca/runs/3up74y7g' target=\"_blank\">https://wandb.ai/lxelxe/lora-cerebras-gpt2.7b-alpaca/runs/3up74y7g</a><br/>Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "Find logs at: <code>./wandb/run-20230329_232219-3up74y7g/logs</code>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "trainer = transformers.Trainer(\n",
+     "    model=model, \n",
+     "    train_dataset=train_data,\n",
+     "    eval_dataset=val_data,\n",
+     "    args=training_args, \n",
+     "    data_collator=transformers.DataCollatorForSeq2Seq(\n",
+     "        tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+     "    ),\n",
+     ")\n",
+     "\n",
+     "model.config.use_cache = False\n",
+     "result = trainer.train('lora-cerebras-gpt2.7b-alpaca/checkpoint-800')\n",
+     "model.save_pretrained(output_dir)\n",
+     "\n",
+     "wandb.finish()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 33,
+    "id": "27e9ad70",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "torch.float16\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        "PeftModelForCausalLM(\n",
+        "  (base_model): LoraModel(\n",
+        "    (model): GPT2LMHeadModel(\n",
+        "      (transformer): GPT2Model(\n",
+        "        (wte): Embedding(50257, 2560)\n",
+        "        (wpe): Embedding(2048, 2560)\n",
+        "        (drop): Dropout(p=0.0, inplace=False)\n",
+        "        (h): ModuleList(\n",
+        "          (0): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (1): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (2): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (3): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (4): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (5): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (6): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (7): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (8): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (9): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (10): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (11): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (12): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (13): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (14): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (15): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (16): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (17): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (18): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (19): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (20): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (21): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (22): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (23): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (24): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (25): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (26): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (27): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (28): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (29): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (30): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "          (31): GPT2Block(\n",
+        "            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (attn): GPT2Attention(\n",
+        "              (c_attn): MergedLinear(\n",
+        "                in_features=2560, out_features=7680, bias=True\n",
+        "                (lora_dropout): Dropout(p=0.05, inplace=False)\n",
+        "                (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
+        "                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
+        "              )\n",
+        "              (c_proj): Conv1D()\n",
+        "              (attn_dropout): Dropout(p=0.0, inplace=False)\n",
+        "              (resid_dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "            (mlp): GPT2MLP(\n",
+        "              (c_fc): Conv1D()\n",
+        "              (c_proj): Conv1D()\n",
+        "              (act): GELUActivation()\n",
+        "              (dropout): Dropout(p=0.0, inplace=False)\n",
+        "            )\n",
+        "          )\n",
+        "        )\n",
+        "        (ln_f): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
+        "      )\n",
+        "      (lm_head): CastOutputToFloat(\n",
+        "        (0): Linear(in_features=2560, out_features=50257, bias=False)\n",
+        "      )\n",
+        "    )\n",
+        "  )\n",
+        ")"
+       ]
+      },
+      "execution_count": 33,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "model.config\n",
+     "print(model.dtype)\n",
+     "\n",
+     "model.half()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 35,
+    "id": "9cca3b03",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+       "/root/miniconda3/envs/llama/lib/python3.10/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
+       "  warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Human: Can I run inference on my local machine?\n",
+       "Assistant: Yes, you can. You should be able to use the same model and data as your local machine for inference. The only difference is that you will need to download the necessary packages from the cloud or install them locally.\n"
+      ]
+     }
+    ],
+    "source": [
+     "text = \"Human: Can I run inference on my local machine?\\nAssistant:\"\n",
+     "\n",
+     "# Tokenize the prompt and move the ids to whatever device the model is on.\n",
+     "inputs = tokenizer(text, return_tensors=\"pt\")\n",
+     "input_ids = inputs[\"input_ids\"].to(model.device)\n",
+     "\n",
+     "# Sampling-based decoding. `early_stopping` is omitted on purpose: it only\n",
+     "# controls beam-based methods, and beam search is not enabled here.\n",
+     "generation_config = transformers.GenerationConfig(\n",
+     "    max_new_tokens=100,\n",
+     "    temperature=0.2,\n",
+     "    top_p=0.75,\n",
+     "    top_k=50,\n",
+     "    repetition_penalty=1.2,\n",
+     "    do_sample=True,\n",
+     "    pad_token_id=model.config.pad_token_id,\n",
+     "    eos_token_id=model.config.eos_token_id,\n",
+     ")\n",
+     "\n",
+     "# Inference only, so skip gradient tracking.\n",
+     "with torch.no_grad():\n",
+     "    output = model.generate(\n",
+     "        input_ids=input_ids,\n",
+     "        attention_mask=torch.ones_like(input_ids),\n",
+     "        generation_config=generation_config,\n",
+     "    )[0]\n",
+     "\n",
+     "# No `.cuda()` on the result: decoding does not need a GPU tensor, and the\n",
+     "# call would raise on a machine without CUDA.\n",
+     "result = tokenizer.decode(output, skip_special_tokens=True).strip()\n",
+     "print(result)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "be542e91",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+transformers
+bitsandbytes
+accelerate
+peft