{ "cells": [ { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "import os\n", "from uuid import uuid4\n", "import pandas as pd\n", "\n", "from datasets import load_dataset\n", "import subprocess\n", "from transformers import AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "# from dotenv import load_dotenv,find_dotenv\n", "# load_dotenv(find_dotenv(),override=True)\n", "\n", "def max_token_len(dataset):\n", " max_seq_length = 0\n", " for row in dataset:\n", " tokens = len(tokenizer(row['text'])['input_ids'])\n", " if tokens > max_seq_length:\n", " max_seq_length = tokens\n", " return max_seq_length" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Max Length: 1000000000000000019884624838656\n" ] } ], "source": [ "# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n", "model_name = 'mistralai/Mistral-7B-v0.1'\n", "# model_name = 'distilbert-base-uncased'\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model_max_length = tokenizer.model_max_length\n", "print(\"Model Max Length:\", model_max_length)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max token length train: 1121\n", "Max token length validation: 38\n", "Block size: 2242\n" ] } ], "source": [ "# Load dataset\n", "dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'\n", "dataset=load_dataset(dataset_name)\n", "\n", "# Write dataset files into data directory\n", "data_directory = './fine_tune_data/'\n", "\n", "# Create the data directory if it doesn't exist\n", "os.makedirs(data_directory, exist_ok=True)\n", "\n", "# Write the train data to a CSV file\n", "train_data='train_data'\n", "train_filename = os.path.join(data_directory, train_data)\n", "dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)\n", "max_token_length_train=max_token_len(dataset['train'])\n", "print('Max token length train: '+str(max_token_length_train))\n", "\n", "# Write the validation data to a CSV file\n", "validation_data='validation_data'\n", "validation_filename = os.path.join(data_directory, validation_data)\n", "dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)\n", "max_token_length_validation=max_token_len(dataset['validation'])\n", "print('Max token length validation: '+str(max_token_length_validation))\n", " \n", "max_token_length=max(max_token_length_train,max_token_length_validation)\n", "if max_token_length > model_max_length:\n", " raise ValueError(\"Maximum token length exceeds model limits.\")\n", "block_size=2*max_token_length\n", "print('Block size: '+str(block_size))\n", "\n", "# Define project parameters\n", "username='ai-aerospace'\n", "project_name='./llms/'+'ams_data_train-100_'+str(uuid4())\n", "repo_name='ams-data-train-100-'+str(uuid4())" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'project_name': './llms/ams_data_train-100_6abb23dc-cb9d-428e-9079-e47deee0edd9', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-4601c8c8-0903-4f18-a6e8-1d2a40a697ce', 'train_data': 'train_data', 'validation_data': 'validation_data', 'data_directory': './fine_tune_data/', 'block_size': 2242, 'model_max_length': 1121, 'logging_steps': -1, 
'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}\n" ] } ], "source": [ "\"\"\"\n", "This set of parameters runs on a low memory gpu on hugging face spaces:\n", "{\n", " \"block_size\": 1024,\n", " \"model_max_length\": 2048,\n", " x\"use_flash_attention_2\": false,\n", " x\"disable_gradient_checkpointing\": false,\n", " \"logging_steps\": -1,\n", " \"evaluation_strategy\": \"epoch\",\n", " \"save_total_limit\": 1,\n", " \"save_strategy\": \"epoch\",\n", " x\"auto_find_batch_size\": false,\n", " \"mixed_precision\": \"fp16\",\n", " \"lr\": 0.00003,\n", " \"epochs\": 3,\n", " \"batch_size\": 2,\n", " \"warmup_ratio\": 0.1,\n", " \"gradient_accumulation\": 1,\n", " \"optimizer\": \"adamw_torch\",\n", " \"scheduler\": \"linear\",\n", " \"weight_decay\": 0,\n", " \"max_grad_norm\": 1,\n", " \"seed\": 42,\n", " \"apply_chat_template\": false,\n", " \"quantization\": \"int4\",\n", " \"target_modules\": \"\",\n", " x\"merge_adapter\": false,\n", " \"peft\": true,\n", " \"lora_r\": 16,\n", " \"lora_alpha\": 32,\n", " \"lora_dropout\": 0.05\n", "}\n", "\"\"\"\n", "\n", "model_params={\n", " \"project_name\": project_name,\n", " \"model_name\": model_name,\n", " \"repo_id\": username+'/'+repo_name,\n", " \"train_data\": train_data,\n", " \"validation_data\": validation_data,\n", " \"data_directory\": data_directory,\n", " \"block_size\": block_size,\n", " \"model_max_length\": max_token_length,\n", " \"logging_steps\": -1,\n", " \"evaluation_strategy\": \"epoch\",\n", " \"save_total_limit\": 1,\n", " \"save_strategy\": \"epoch\",\n", " \"mixed_precision\": \"fp16\",\n", " \"lr\": 0.00003,\n", " \"epochs\": 3,\n", " \"batch_size\": 2,\n", " \"warmup_ratio\": 0.1,\n", " \"gradient_accumulation\": 1,\n", " \"optimizer\": \"adamw_torch\",\n", " \"scheduler\": \"linear\",\n", " \"weight_decay\": 0,\n", " \"max_grad_norm\": 1,\n", " \"seed\": 42,\n", " \"quantization\": \"int4\",\n", " \"lora_r\": 16,\n", " \"lora_alpha\": 32,\n", " \"lora_dropout\": 0.05\n", "}\n", "for key, value in model_params.items():\n", " os.environ[key] = str(value)\n", "\n", "print(model_params)\n", "\n", "\n", "# Save parameters to environment variables\n", "# os.environ[\"project_name\"] = project_name\n", "# os.environ[\"model_name\"] = model_name\n", "# os.environ[\"repo_id\"] = username+'/'+repo_name\n", "# os.environ[\"train_data\"] = train_data \n", "# os.environ[\"validation_data\"] = validation_data\n", "# os.environ[\"data_directory\"] = data_directory" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "⚠️ WARNING | 2023-12-22 10:41:00 | autotrain.cli.run_dreambooth::14 - ❌ Some DreamBooth components are missing! Please run `autotrain setup` to install it. 
Ignore this warning if you are not using DreamBooth or running `autotrain setup` already.\n", "Traceback (most recent call last):\n", " File \"/home/dsmueller/Repositories/HuggingFace/autotrain-playground/.venv/bin/autotrain\", line 8, in \n", " sys.exit(main())\n", " ^^^^^^\n", " File \"/home/dsmueller/Repositories/HuggingFace/autotrain-playground/.venv/lib/python3.11/site-packages/autotrain/cli/autotrain.py\", line 47, in main\n", " command = args.func(args)\n", " ^^^^^^^^^^^^^^^\n", " File \"/home/dsmueller/Repositories/HuggingFace/autotrain-playground/.venv/lib/python3.11/site-packages/autotrain/cli/run_llm.py\", line 14, in run_llm_command_factory\n", " return RunAutoTrainLLMCommand(args)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/dsmueller/Repositories/HuggingFace/autotrain-playground/.venv/lib/python3.11/site-packages/autotrain/cli/run_llm.py\", line 473, in __init__\n", " raise ValueError(\"No GPU/MPS device found. LLM training requires an accelerator\")\n", "ValueError: No GPU/MPS device found. LLM training requires an accelerator\n" ] }, { "ename": "CalledProcessError", "evalue": "Command '\nautotrain llm --train --trainer sft --project_name ./llms/ams_data_train-100_6abb23dc-cb9d-428e-9079-e47deee0edd9 --model mistralai/Mistral-7B-v0.1 --data_path ./fine_tune_data/ --train_split train_data --valid_split validation_data --repo_id ai-aerospace/ams-data-train-100-4601c8c8-0903-4f18-a6e8-1d2a40a697ce --push_to_hub --token HUGGINGFACE_TOKEN --block_size 2242 --model_max_length 1121 --logging_steps -1 --evaluation_strategy epoch --save_total_limit 1 --save_strategy epoch --fp16 --lr 3e-05 --num_train_epochs 3 --train_batch_size 2 --warmup_ratio 0.1 --gradient_accumulation 1 --optimizer adamw_torch --scheduler linear --weight_decay 0 --max_grad_norm 1 --seed 42 --use_int4 --use-peft --lora_r 16 --lora_alpha 32 --lora_dropout 0.05\n' returned non-zero exit status 1.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[50], line 40\u001b[0m\n\u001b[1;32m 4\u001b[0m command\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124mautotrain llm --train \u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m --trainer sft \u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124m --lora_dropout \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_params[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlora_dropout\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;66;03m# Use subprocess.run() to execute the command\u001b[39;00m\n\u001b[0;32m---> 40\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/usr/lib/python3.11/subprocess.py:571\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, 
**kwargs)\u001b[0m\n\u001b[1;32m 569\u001b[0m retcode \u001b[38;5;241m=\u001b[39m process\u001b[38;5;241m.\u001b[39mpoll()\n\u001b[1;32m 570\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check \u001b[38;5;129;01mand\u001b[39;00m retcode:\n\u001b[0;32m--> 571\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CalledProcessError(retcode, process\u001b[38;5;241m.\u001b[39margs,\n\u001b[1;32m 572\u001b[0m output\u001b[38;5;241m=\u001b[39mstdout, stderr\u001b[38;5;241m=\u001b[39mstderr)\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CompletedProcess(process\u001b[38;5;241m.\u001b[39margs, retcode, stdout, stderr)\n", "\u001b[0;31mCalledProcessError\u001b[0m: Command '\nautotrain llm --train --trainer sft --project_name ./llms/ams_data_train-100_6abb23dc-cb9d-428e-9079-e47deee0edd9 --model mistralai/Mistral-7B-v0.1 --data_path ./fine_tune_data/ --train_split train_data --valid_split validation_data --repo_id ai-aerospace/ams-data-train-100-4601c8c8-0903-4f18-a6e8-1d2a40a697ce --push_to_hub --token HUGGINGFACE_TOKEN --block_size 2242 --model_max_length 1121 --logging_steps -1 --evaluation_strategy epoch --save_total_limit 1 --save_strategy epoch --fp16 --lr 3e-05 --num_train_epochs 3 --train_batch_size 2 --warmup_ratio 0.1 --gradient_accumulation 1 --optimizer adamw_torch --scheduler linear --weight_decay 0 --max_grad_norm 1 --seed 42 --use_int4 --use-peft --lora_r 16 --lora_alpha 32 --lora_dropout 0.05\n' returned non-zero exit status 1." ] } ], "source": [ "\n", "# Activate the .venv and execute the autotrain CLI\n", "# To see all parameters: autotrain llm --help\n", "# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft\n", "\n", "# The Hub token is read from the HUGGINGFACE_TOKEN environment variable (must be set before running)\n", "command=f\"\"\"\n", "autotrain llm --train \\\n", " --trainer sft \\\n", " --project_name {model_params['project_name']} \\\n", " --model {model_params['model_name']} \\\n", " --data_path {model_params['data_directory']} \\\n", " --train_split {model_params['train_data']} \\\n", " --valid_split {model_params['validation_data']} \\\n", " --repo_id {model_params['repo_id']} \\\n", " --push_to_hub \\\n", " --token {os.environ['HUGGINGFACE_TOKEN']} \\\n", " --block_size {model_params['block_size']} \\\n", " --model_max_length {model_params['model_max_length']} \\\n", " --logging_steps {model_params['logging_steps']} \\\n", " --evaluation_strategy {model_params['evaluation_strategy']} \\\n", " --save_total_limit {model_params['save_total_limit']} \\\n", " --save_strategy {model_params['save_strategy']} \\\n", " --fp16 \\\n", " --lr {model_params['lr']} \\\n", " --num_train_epochs {model_params['epochs']} \\\n", " --train_batch_size {model_params['batch_size']} \\\n", " --warmup_ratio {model_params['warmup_ratio']} \\\n", " --gradient_accumulation {model_params['gradient_accumulation']} \\\n", " --optimizer {model_params['optimizer']} \\\n", " --scheduler {model_params['scheduler']} \\\n", " --weight_decay {model_params['weight_decay']} \\\n", " --max_grad_norm {model_params['max_grad_norm']} \\\n", " --seed {model_params['seed']} \\\n", " --use_int4 \\\n", " --use-peft \\\n", " --lora_r {model_params['lora_r']} \\\n", " --lora_alpha {model_params['lora_alpha']} \\\n", " --lora_dropout {model_params['lora_dropout']}\n", "\"\"\"\n", "\n", "# Use subprocess.run() to execute the command\n", "subprocess.run(command, shell=True, check=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": 
{ "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }