{ "cells": [ { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "from trl import SFTTrainer\n", "from peft import LoraConfig, get_peft_model\n", "\n", "import os\n", "from uuid import uuid4\n", "import pandas as pd\n", "\n", "import subprocess\n", "import evaluate\n", "import transformers\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def max_token_len(dataset):\n", " max_seq_length = 0\n", " for row in dataset:\n", " tokens = len(tokenizer(row['text'])['input_ids'])\n", " if tokens > max_seq_length:\n", " max_seq_length = tokens\n", " return max_seq_length" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Max Length: 1000000000000000019884624838656\n" ] } ], "source": [ "# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n", "model_name = 'mistralai/Mistral-7B-v0.1'\n", "# model_name = 'distilbert-base-uncased'\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model_max_length = tokenizer.model_max_length\n", "print(\"Model Max Length:\", model_max_length)\n", "\n", "# dataset = load_dataset(\"imdb\", split=\"train\")\n", "dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'\n", "dataset = load_dataset(dataset_name)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max token length train: 1121\n", "Max token length validation: 38\n", "Block size: 2242\n", "{'project_name': './llms/ams_data_train-100_91a45e55-876a-4b93-a9e7-70d26238cd33', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-81dbb7fc-16f6-4870-a898-c1840e33430d', 'train_data': 'train_data', 'validation_data': 'validation_data', 'data_directory': './fine_tune_data/', 'block_size': 2242, 'model_max_length': 1121, 'logging_steps': -1, 'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}\n" ] } ], "source": [ "# Write dataset files into data directory\n", "data_directory = './fine_tune_data/'\n", "\n", "# Create the data directory if it doesn't exist\n", "os.makedirs(data_directory, exist_ok=True)\n", "\n", "# Write the train data to a CSV file\n", "train_data='train_data'\n", "train_filename = os.path.join(data_directory, train_data)\n", "dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)\n", "max_token_length_train=max_token_len(dataset['train'])\n", "print('Max token length train: '+str(max_token_length_train))\n", "\n", "# Write the validation data to a CSV file\n", "validation_data='validation_data'\n", "validation_filename = os.path.join(data_directory, validation_data)\n", "dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)\n", "max_token_length_validation=max_token_len(dataset['validation'])\n", "print('Max token length validation: '+str(max_token_length_validation))\n", " \n", "max_token_length=max(max_token_length_train,max_token_length_validation)\n", "# max_token_length=max_token_length_train\n", "if max_token_length > model_max_length:\n", " raise ValueError(\"Maximum token length exceeds model limits.\")\n", "block_size=2*max_token_length\n", "print('Block size: '+str(block_size))\n", "\n", "# Define project parameters\n", "username='ai-aerospace'\n", "project_name='./llms/'+'ams_data_train-100_'+str(uuid4())\n", "repo_name='ams-data-train-100-'+str(uuid4())\n", "\n", "model_params={\n", " \"project_name\": project_name,\n", " \"model_name\": model_name,\n", " \"repo_id\": username+'/'+repo_name,\n", " \"train_data\": train_data,\n", " \"validation_data\": validation_data,\n", " \"data_directory\": data_directory,\n", " \"block_size\": block_size,\n", " \"model_max_length\": max_token_length,\n", " \"logging_steps\": -1,\n", " \"evaluation_strategy\": \"epoch\",\n", " \"save_total_limit\": 1,\n", " \"save_strategy\": \"epoch\",\n", " \"mixed_precision\": \"fp16\",\n", " \"lr\": 0.00003,\n", " \"epochs\": 3,\n", " \"batch_size\": 2,\n", " \"warmup_ratio\": 0.1,\n", " \"gradient_accumulation\": 1,\n", " \"optimizer\": \"adamw_torch\",\n", " \"scheduler\": \"linear\",\n", " \"weight_decay\": 0,\n", " \"max_grad_norm\": 1,\n", " \"seed\": 42,\n", " \"quantization\": \"int4\",\n", " \"lora_r\": 16,\n", " \"lora_alpha\": 32,\n", " \"lora_dropout\": 0.05\n", "}\n", "for key, value in model_params.items():\n", " os.environ[key] = str(value)\n", "\n", "print(model_params)\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[27], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m args_custom\u001b[38;5;241m=\u001b[39m\u001b[43mtransformers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTrainingArguments\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mper_device_train_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mper_device_eval_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mgradient_accumulation_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgradient_accumulation\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mwarmup_ratio\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwarmup_ratio\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_train_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mepochs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mfp16\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogging_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlogging_steps\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43msave_total_limit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msave_total_limit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mevaluation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mevaluation_strategy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetric_for_best_model\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mf1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel_outputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogging_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel_outputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43moptim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43moptimizer\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_grad_norm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmax_grad_norm\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr_scheduler_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mscheduler\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# Args from medium article\u001b[39;00m\n\u001b[1;32m 22\u001b[0m args_medium\u001b[38;5;241m=\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mTrainingArguments(\n\u001b[1;32m 23\u001b[0m per_device_train_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m8\u001b[39m,\n\u001b[1;32m 24\u001b[0m per_device_eval_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m32\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 40\u001b[0m report_to\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwandb\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# enable logging to W&B\u001b[39;00m\n\u001b[1;32m 41\u001b[0m )\n", "File \u001b[0;32m:121\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha)\u001b[0m\n", "File \u001b[0;32m~/Repositories/HuggingFace/fine-tuning-playground/.venv/lib/python3.11/site-packages/transformers/training_args.py:1499\u001b[0m, in \u001b[0;36mTrainingArguments.__post_init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1488\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--optim adamw_torch_fused with --fp16 requires PyTorch>2.0\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1490\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1491\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1492\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m is_torch_available()\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp16 \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp16_full_eval)\n\u001b[1;32m 1498\u001b[0m ):\n\u001b[0;32m-> 1499\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1502\u001b[0m )\n\u001b[1;32m 1504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1505\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1506\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m is_torch_available()\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbf16 \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbf16_full_eval)\n\u001b[1;32m 1514\u001b[0m ):\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m (`--bf16_full_eval`) can only be used on CUDA, XPU (with IPEX), NPU or CPU/TPU/NeuronCore devices.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1518\u001b[0m )\n", "\u001b[0;31mValueError\u001b[0m: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX)." ] } ], "source": [ "args_custom=transformers.TrainingArguments(\n", " per_device_train_batch_size=model_params['batch_size'],\n", " per_device_eval_batch_size=model_params['batch_size'],\n", " gradient_accumulation_steps=model_params['gradient_accumulation'],\n", " warmup_ratio=model_params['warmup_ratio'],\n", " num_train_epochs=model_params['epochs'],\n", " learning_rate=model_params['lr'],\n", " fp16=True,\n", " logging_steps=model_params['logging_steps'],\n", " save_total_limit=model_params['save_total_limit'],\n", " evaluation_strategy=model_params['evaluation_strategy'],\n", " metric_for_best_model=\"f1\",\n", " output_dir='model_outputs',\n", " logging_dir='model_outputs',\n", " optim=model_params['optimizer'],\n", " max_grad_norm=model_params['max_grad_norm'],\n", " weight_decay=model_params['weight_decay'],\n", " lr_scheduler_type=model_params['scheduler']\n", ")\n", "\n", "# Args from medium article\n", "args_medium=transformers.TrainingArguments(\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=32,\n", " gradient_accumulation_steps=4,\n", " warmup_steps=100,\n", " max_steps=12276,\n", " learning_rate=2e-4,\n", " fp16=True,\n", " eval_steps= 1000,\n", " logging_steps=1000,\n", " save_steps=1000,\n", " evaluation_strategy=\"steps\",\n", " do_eval=True,\n", " load_best_model_at_end=True,\n", " metric_for_best_model=\"f1\",\n", " output_dir='model_outputs',\n", " logging_dir='model_outputs',\n", " remove_unused_columns =False, \n", " report_to='wandb' # enable logging to W&B\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### Start trainer\n", "# trainer = SFTTrainer(\n", "# model_name,\n", "# train_dataset=dataset,\n", "# dataset_text_field=\"text\",\n", "# max_seq_length=512,\n", "# )\n", "\n", "peft_config = LoraConfig(\n", " r=model_params['lora_r'],\n", " lora_alpha=model_params['lora_alpha'],\n", " lora_dropout=model_params['lora_dropout']\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the model\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", " load_in_4bit=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Setting up the LoRA model\n", "# import os\n", "# os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", "# from transformers import AutoModelForSequenceClassification\n", "# from peft import LoraConfig, get_peft_model, TaskType\n", "\n", "# MODEL =\"xlm-roberta-large\"\n", "\n", "# config = LoraConfig(\n", "# task_type=\"SEQ_CLS\",\n", "# r=16,\n", "# lora_alpha=16,\n", "# target_modules=[\"query\", \"value\"], # Targets the attention blocks in the model\n", "# lora_dropout=0.1,\n", "# bias=\"none\",\n", "# modules_to_save=[\"classifier\"],\n", "# )\n", "\n", "# model = AutoModelForSequenceClassification.from_pretrained(\n", "# MODEL,\n", "# num_labels=len(unique_subissues),\n", "# id2label=id2label,\n", "# label2id=label2id,\n", "# ignore_mismatched_sizes=True\n", "# ) \n", "\n", "lora_model = get_peft_model(model, peft_config)\n", "lora_model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# trainer = SFTTrainer(\n", "# model,\n", "# train_dataset=dataset,\n", "# dataset_text_field=\"text\",\n", "# peft_config=peft_config,\n", "# max_seq_length=model_params['model_max_length']\n", "# )\n", "\n", "# trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f1_metric = evaluate.load(\"f1\")\n", "recall_metric = evaluate.load(\"recall\")\n", "accuracy_metric = evaluate.load(\"accuracy\")\n", "precision_metric = evaluate.load(\"precision\")\n", "\n", "def compute_metrics(eval_pred):\n", " logits, labels = eval_pred\n", " predictions = np.argmax(logits, axis=-1)\n", " results = {}\n", " results.update(f1_metric.compute(predictions=predictions, references = labels, average=\"macro\"))\n", " results.update(recall_metric.compute(predictions=predictions, references = labels, average=\"macro\"))\n", " results.update(accuracy_metric.compute(predictions=predictions, references = labels))\n", " results.update(precision_metric.compute(predictions=predictions, references = labels, average=\"macro\"))\n", "\n", " return results\n", "\n", "# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details\n", "trainer = transformers.Trainer(\n", " model=lora_model,\n", " train_dataset=model_params['train_data'],\n", " eval_dataset=model_params['validation_data'],\n", " compute_metrics=compute_metrics,\n", " args=args_custom\n", ")\n", "trainer.train()" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }