{ "cells": [ { "cell_type": "markdown", "id": "9c3e4532", "metadata": { "papermill": { "duration": 1.064429, "end_time": "2023-10-23T04:10:32.617552", "exception": false, "start_time": "2023-10-23T04:10:31.553123", "status": "completed" }, "tags": [] }, "source": [ "# Train models using HuggingFace libraries\n", "\n", "This notebook takes parameters from a params.json file which is automatically\n", "created by Substratus K8s operator.\n", "\n", "The following parameters influence what happens in this notebook:\n", "- `dataset_urls`: A comma separated list of URLs. The URLs should point to\n", " json files that contain your training dataset. If unset a json or jsonl\n", " file should be present under the `/content/data/` directory.\n", "- `prompt_template`: The prompt template to use for training\n", "- `push_to_hub`: if this variable is set a repo id, then the trained\n", " model will get pushed to HuggingFace hub. For example,\n", " set it to \"substratusai/my-model\" to publish to substratusai HF org." ] }, { "cell_type": "code", "execution_count": 1, "id": "86ccd646", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:10:34.494504Z", "iopub.status.busy": "2023-10-23T04:10:34.493261Z", "iopub.status.idle": "2023-10-23T04:10:34.506648Z", "shell.execute_reply": "2023-10-23T04:10:34.506011Z" }, "papermill": { "duration": 0.898669, "end_time": "2023-10-23T04:10:34.508149", "exception": false, "start_time": "2023-10-23T04:10:33.609480", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithoutRetrieval-SchemaSplit-Train-80/resolve/main/WithoutRetrieval-SchemaSplit-Train-80.json',\n", " 'logging_steps': 50,\n", " 'modules_to_save': 'embed_tokens, lm_head',\n", " 'num_train_epochs': 3,\n", " 'per_device_eval_batch_size': 1,\n", " 'per_device_train_batch_size': 1,\n", " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided Schema to generate the GraphQL. 
The GraphQL should be valid for Weaviate.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## Answer\\n{output}\\n',\n", " 'push_to_hub': 'substratusai/wgql-WithoutRetrieval-SchemaSplit-Train-80',\n", " 'save_steps': 50,\n", " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", " 'warmup_steps': 100}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "from pathlib import Path\n", "\n", "params = {}\n", "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", " params = json.load(params_file)\n", "\n", "\n", "params" ] }, { "cell_type": "code", "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:10:36.304465Z", "iopub.status.busy": "2023-10-23T04:10:36.303766Z", "iopub.status.idle": "2023-10-23T04:10:39.687535Z", "shell.execute_reply": "2023-10-23T04:10:39.686882Z" }, "papermill": { "duration": 4.284256, "end_time": "2023-10-23T04:10:39.689024", "exception": false, "start_time": "2023-10-23T04:10:35.404768", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithoutRetrieval-SchemaSplit-Train-80/resolve/main/WithoutRetrieval-SchemaSplit-Train-80.json']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e9a6ea0ca5c047b1a8ad11457dcaa2e8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n" ] } ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "### Instruction:\n", "{prompt}\n", "### Response:\n", "{completion}\n", "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", "\n", "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", "if prompt[-len(eos_token):] != eos_token:\n", " prompt = prompt + eos_token\n", "\n", "print(prompt)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:14:32.183851Z", "iopub.status.busy": "2023-10-23T04:14:32.183550Z", "iopub.status.idle": "2023-10-23T04:14:33.043374Z", "shell.execute_reply": "2023-10-23T04:14:33.042525Z" }, "papermill": { "duration": 1.829206, "end_time": "2023-10-23T04:14:33.045328", "exception": false, "start_time": "2023-10-23T04:14:31.216122", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mon Oct 23 04:14:32 2023 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", "| | | MIG M. 
|\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\r\n", "| N/A 59C P0 31W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\r\n", "| N/A 64C P0 32W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\r\n", "| N/A 65C P0 32W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\r\n", "| N/A 62C P0 29W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", "+-----------------------------------------------------------------------------+\r\n", "| Processes: |\r\n", "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ "! nvidia-smi" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4d1e1795-c783-4ddf-999e-f1de19258928", "metadata": { "papermill": { "duration": 1.031477, "end_time": "2023-10-23T04:14:35.109265", "exception": false, "start_time": "2023-10-23T04:14:34.077788", "status": "completed" }, "tags": [] }, "source": [ "Prompt before fine tuning" ] }, { "cell_type": "code", "execution_count": 7, "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:14:42.797437Z", "iopub.status.busy": "2023-10-23T04:14:42.796639Z", "iopub.status.idle": "2023-10-23T04:14:42.819008Z", "shell.execute_reply": "2023-10-23T04:14:42.818263Z" }, "papermill": { "duration": 6.737466, "end_time": "2023-10-23T04:14:42.820457", "exception": false, "start_time": "2023-10-23T04:14:36.082991", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Dict\n", "# source: https://github.com/artidoro/qlora\n", "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", "\n", "def smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict: Dict,\n", " tokenizer: transformers.PreTrainedTokenizer,\n", " model: transformers.PreTrainedModel,\n", "):\n", " 
\"\"\"Resize tokenizer and embedding.\n", "\n", " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", " \"\"\"\n", " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", " model.resize_token_embeddings(len(tokenizer))\n", " if num_new_tokens > 0:\n", " input_embeddings_data = model.get_input_embeddings().weight.data\n", " output_embeddings_data = model.get_output_embeddings().weight.data\n", "\n", " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", "\n", " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", "\n", "if tokenizer._pad_token is None:\n", " smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", " tokenizer=tokenizer,\n", " model=model,\n", " )\n", "\n", "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", " # LLaMA tokenizer may not have correct special tokens set.\n", " # Check and add them if missing to prevent them from being parsed into different tokens.\n", " # Note that these are present in the vocabulary.\n", " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", " print('Adding special tokens.')\n", " tokenizer.add_special_tokens({\n", " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", " ),\n", " })\n", "\n", "tokenizer" ] }, { "cell_type": "code", "execution_count": 8, "id": "e78b510d", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:14:44.839307Z", "iopub.status.busy": "2023-10-23T04:14:44.838706Z", "iopub.status.idle": "2023-10-23T04:14:49.825169Z", "shell.execute_reply": "2023-10-23T04:14:49.824517Z" }, "papermill": { "duration": 6.202467, "end_time": "2023-10-23T04:14:50.025232", "exception": false, "start_time": "2023-10-23T04:14:43.822765", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9aae387275d64be9a5a634561fb9c78c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/3163 [00:00, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config2 = LoraConfig(\n", " r=16,\n", " lora_alpha=16,\n", " 
LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", "    target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", "    modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config = LoraConfig(\n", "    r=16,\n", "    lora_alpha=16,\n", "    lora_dropout=0.05,\n", "    bias=\"none\",\n", "    task_type=\"CAUSAL_LM\",\n", "    target_modules=target_modules,\n", "    modules_to_save=modules_to_save,\n", ")\n", "print(lora_config)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", "# add the LoRA adapter\n", "model = get_peft_model(model, lora_config)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:16:28.759999Z", "iopub.status.busy": "2023-10-23T04:16:28.759043Z", "iopub.status.idle": "2023-10-23T04:16:28.778449Z", "shell.execute_reply": "2023-10-23T04:16:28.777816Z" }, "papermill": { "duration": 1.003134, "end_time": "2023-10-23T04:16:28.780332", "exception": false, "start_time": "2023-10-23T04:16:27.777198", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "TrainingArguments(\n", "_n_gpu=4,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_backend=None,\n", "ddp_broadcast_buffers=None,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "dispatch_batches=None,\n", "do_eval=False,\n", "do_predict=False,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=None,\n", "evaluation_strategy=no,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=4,\n", "gradient_checkpointing=False,\n", "greater_is_better=None,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_always_push=False,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=<HUB_TOKEN>,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "include_tokens_per_second=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=3e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=False,\n", "local_rank=0,\n", "log_level=passive,\n", "log_level_replica=warning,\n", "log_on_each_node=True,\n", "logging_dir=/content/artifacts/checkpoints/runs/Oct23_04-16-28_wgqlg-withoutretrieval-schemasplit-train-80-v2-modeller-wk9wh,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=50,\n", "logging_strategy=steps,\n", "lr_scheduler_type=cosine,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=3.0,\n", "optim=paged_adamw_32bit,\n", "optim_args=None,\n", "output_dir=/content/artifacts/checkpoints,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=1,\n", "per_device_train_batch_size=1,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", 
"run_name=/content/artifacts/checkpoints,\n", "save_on_each_node=False,\n", "save_safetensors=False,\n", "save_steps=50,\n", "save_strategy=steps,\n", "save_total_limit=None,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_cpu=False,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.02,\n", "warmup_steps=100,\n", "weight_decay=0.0,\n", ")" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import parse_training_args\n", "\n", "training_args = parse_training_args(params)\n", "training_args" ] }, { "cell_type": "code", "execution_count": 11, "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:16:30.856558Z", "iopub.status.busy": "2023-10-23T04:16:30.845583Z", "iopub.status.idle": "2023-10-23T04:16:30.859550Z", "shell.execute_reply": "2023-10-23T04:16:30.858918Z" }, "papermill": { "duration": 1.039895, "end_time": "2023-10-23T04:16:30.861071", "exception": false, "start_time": "2023-10-23T04:16:29.821176", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# data = data[\"train\"].train_test_split(test_size=0.1)\n", "# data\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5bc91439-6108-445c-8f85-e6558c9f0677", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:16:32.873189Z", "iopub.status.busy": "2023-10-23T04:16:32.872448Z", "iopub.status.idle": "2023-10-23T04:16:33.145627Z", "shell.execute_reply": "2023-10-23T04:16:33.144802Z" }, "papermill": { "duration": 1.290055, "end_time": "2023-10-23T04:16:33.147320", "exception": false, "start_time": "2023-10-23T04:16:31.857265", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! mkdir -p {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 13, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T04:16:36.127583Z", "iopub.status.busy": "2023-10-23T04:16:36.126817Z", "iopub.status.idle": "2023-10-23T07:07:47.130996Z", "shell.execute_reply": "2023-10-23T07:07:47.130335Z" }, "papermill": { "duration": 10272.969761, "end_time": "2023-10-23T07:07:47.132555", "exception": false, "start_time": "2023-10-23T04:16:34.162794", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [2370/2370 2:51:03, Epoch 2/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
500.881200
1000.341200
1500.178000
2000.138400
2500.104300
3000.085100
3500.070900
4000.059100
4500.054200
5000.052800
5500.049400
6000.046500
6500.041700
7000.044300
7500.043600
8000.042000
8500.035900
9000.038100
9500.033700
10000.033300
10500.033800
11000.033500
11500.032800
12000.033500
12500.031600
13000.033600
13500.032900
14000.029600
14500.033000
15000.032800
15500.032300
16000.030600
16500.025900
17000.027000
17500.027400
18000.025700
18500.025400
19000.026400
19500.025500
20000.026300
20500.025600
21000.026500
21500.025600
22000.026000
22500.026500
23000.025700
23500.025800

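The `data_collator` used in the trainer cell's source below is `DataCollatorForLanguageModeling` with `mlm=False`, which pads each batch and copies `input_ids` into `labels` for causal-LM training. A small standalone illustration (not part of the original notebook):

```python
# Illustration: with mlm=False the collator builds causal-LM labels by
# cloning input_ids and masking padding positions with -100.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([tokenizer("show me all wines")])
print(batch["labels"])  # same ids as batch["input_ids"], pad tokens -> -100
```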
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=2370, training_loss=0.06678998734377607, metrics={'train_runtime': 10270.6027, 'train_samples_per_second': 0.924, 'train_steps_per_second': 0.231, 'total_flos': 2.160162583196713e+17, 'train_loss': 0.06678998734377607, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", "\n", "checkpoint_path = Path(\"/content/artifacts/checkpoints\")\n", "\n", "# Only set resume_from_checkpoint True when directory exists and contains files\n", "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": 14, "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:07:49.427665Z", "iopub.status.busy": "2023-10-23T07:07:49.427050Z", "iopub.status.idle": "2023-10-23T07:08:07.740366Z", "shell.execute_reply": "2023-10-23T07:08:07.739680Z" }, "papermill": { "duration": 19.377847, "end_time": "2023-10-23T07:08:07.742055", "exception": false, "start_time": "2023-10-23T07:07:48.364208", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "PeftModelForCausalLM(\n", " (base_model): LoraModel(\n", " (model): LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): ModulesToSaveWrapper(\n", " (original_module): Embedding(32001, 4096)\n", " (modules_to_save): ModuleDict(\n", " (default): Embedding(32001, 4096)\n", " )\n", " )\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (k_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (v_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): 
ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (o_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (up_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (down_proj): Linear(\n", " in_features=11008, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=11008, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): ModulesToSaveWrapper(\n", " (original_module): Linear(in_features=4096, out_features=32001, bias=False)\n", " (modules_to_save): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=32001, bias=False)\n", " )\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path_lora)\n", "model" ] }, { "cell_type": "code", "execution_count": 15, "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:08:09.719819Z", "iopub.status.busy": "2023-10-23T07:08:09.719055Z", "iopub.status.idle": "2023-10-23T07:08:09.968284Z", "shell.execute_reply": "2023-10-23T07:08:09.967347Z" }, "papermill": { "duration": 1.229019, "end_time": "2023-10-23T07:08:09.969828", "exception": false, "start_time": "2023-10-23T07:08:08.740809", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.2G\r\n", " 512 -rw-r--r-- 1 root 3003 88 Oct 23 07:07 README.md\r\n", "1.0K -rw-r--r-- 1 root 3003 550 Oct 23 07:08 adapter_config.json\r\n", "1.2G -rw-r--r-- 1 root 3003 1.2G Oct 23 07:07 adapter_model.bin\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: 
The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 16, "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:08:11.940246Z", "iopub.status.busy": "2023-10-23T07:08:11.939444Z", "iopub.status.idle": "2023-10-23T07:09:04.484842Z", "shell.execute_reply": "2023-10-23T07:09:04.484162Z" }, "papermill": { "duration": 54.728628, "end_time": "2023-10-23T07:09:05.635793", "exception": false, "start_time": "2023-10-23T07:08:10.907165", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32001, 4096)\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=32001, bias=False)\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = model.merge_and_unload().half()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:09:07.731540Z", "iopub.status.busy": "2023-10-23T07:09:07.730902Z", "iopub.status.idle": "2023-10-23T07:09:07.975280Z", "shell.execute_reply": "2023-10-23T07:09:07.974458Z" }, "papermill": { "duration": 1.355032, "end_time": "2023-10-23T07:09:07.976846", "exception": false, "start_time": "2023-10-23T07:09:06.621814", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\r\n", "drwxr-xr-x 1 root 3003 0 Oct 23 04:16 checkpoints\r\n", "drwxr-xr-x 1 root 3003 0 Oct 23 04:16 lora\r\n", "drwxr-xr-x 1 root 3003 0 Oct 23 04:10 src\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -l {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 18, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:09:09.990340Z", "iopub.status.busy": "2023-10-23T07:09:09.989655Z", "iopub.status.idle": "2023-10-23T07:11:33.903117Z", "shell.execute_reply": "2023-10-23T07:11:33.902350Z" }, "papermill": { "duration": 145.986999, "end_time": "2023-10-23T07:11:34.968252", "exception": false, "start_time": "2023-10-23T07:09:08.981253", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "('/content/artifacts/tokenizer_config.json',\n", " '/content/artifacts/special_tokens_map.json',\n", " '/content/artifacts/tokenizer.model',\n", " '/content/artifacts/added_tokens.json',\n", " '/content/artifacts/tokenizer.json')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:11:36.839690Z", "iopub.status.busy": "2023-10-23T07:11:36.838894Z", "iopub.status.idle": "2023-10-23T07:11:37.088096Z", "shell.execute_reply": "2023-10-23T07:11:37.087230Z" }, "papermill": { "duration": 1.198205, "end_time": "2023-10-23T07:11:37.089762", "exception": false, "start_time": "2023-10-23T07:11:35.891557", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 23 07:11 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 23 04:16 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 23 07:09 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 23 07:09 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 23 04:16 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 23 07:09 pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 23 07:11 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 23 07:11 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 23 07:11 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 23 04:10 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 23 07:11 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 23 07:11 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 23 07:11 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -lash {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 20, "id": "202a694a", "metadata": { "execution": { "iopub.execute_input": "2023-10-23T07:11:39.015703Z", "iopub.status.busy": "2023-10-23T07:11:39.014885Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-10-23T07:11:38.011529", "status": "running" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "06408c12de9a45139bdafb067bc717dd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model-00002-of-00002.bin: 0%| | 0.00/3.50G [00:00
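The export ends here, while the final cell is still uploading the merged weights to the Hub. Its source is not visible above; a minimal sketch of what such a push-to-hub cell typically looks like, assuming the `push_to_hub` parameter described at the top of the notebook (the exact cell contents are an assumption):

```python
# Hypothetical reconstruction of the truncated final cell: publish the merged
# model and tokenizer when `push_to_hub` holds a repo ID.
push_to_hub = params.get("push_to_hub")
if push_to_hub:
    model.push_to_hub(push_to_hub)      # e.g. "substratusai/wgql-WithoutRetrieval-SchemaSplit-Train-80"
    tokenizer.push_to_hub(push_to_hub)
```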