import os
from uuid import uuid4
import pandas as pd
from datasets import load_dataset
import subprocess
from transformers import AutoTokenizer
### Read environment variables
# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)
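# If a .env file is used, the Hub token could be read here (a sketch; the
# HUGGINGFACE_TOKEN name matches the placeholder in the autotrain command below):
# hf_token = os.environ.get('HUGGINGFACE_TOKEN')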
### Functions
def max_token_len(dataset):
    """Return the longest tokenized length of any 'text' row in the dataset."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
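# Optional batched variant of max_token_len (a sketch, not used below); assumes
# a fast tokenizer, which accepts a list of strings in a single call.
def max_token_len_batched(dataset, batch_size=256):
    max_seq_length = 0
    texts = dataset['text']
    for i in range(0, len(texts), batch_size):
        batch_ids = tokenizer(texts[i:i + batch_size])['input_ids']
        max_seq_length = max(max_seq_length, max(len(ids) for ids in batch_ids))
    return max_seq_length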
### Model details
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
### Repo name, dataset initialization, and data directory
# Load dataset
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)
# Write dataset files into data directory
data_directory = './fine_tune_data/'
# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)
# Write the train data to a CSV file
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))
# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))
max_token_length = max(max_token_length_train, max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")
# Set the block size to twice the longest observed sequence
block_size = 2 * max_token_length
# Define project parameters
username = 'ai-aerospace'
project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())
### Set training params
model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,  # capped at the longest observed example rather than the tokenizer limit
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "target_modules": "",  # left blank here; the --target_modules flag below may need a value to parse correctly
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}
# Export parameters as environment variables (informational; the autotrain
# command below is passed each value explicitly as a flag)
for key, value in model_params.items():
    os.environ[key] = str(value)
### Feed into and run autotrain command
# Set .venv and execute the autotrain script
# To see all parameters: autotrain llm --help
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
command=f"""
autotrain llm --train \
--trainer sft \
--project_name {model_params['project_name']} \
--model {model_params['model_name']} \
--data_path {model_params['data_directory']} \
--train_split {model_params['train_data']} \
--valid_split {model_params['validation_data']} \
--repo_id {model_params['repo_id']} \
--push_to_hub \
--token HUGGINGFACE_TOKEN \
--block_size {model_params['block_size']} \
--model_max_length {model_params['model_max_length']} \
--logging_steps {model_params['logging_steps']} \
--evaluation_strategy {model_params['evaluation_strategy']} \
--save_total_limit {model_params['save_total_limit']} \
--save_strategy {model_params['save_strategy']} \
--fp16 \
--lr {model_params['lr']} \
--num_train_epochs {model_params['epochs']} \
--batch_size {model_params['batch_size']} \
--warmup_ratio {model_params['warmup_ratio']} \
--gradient_accumulation {model_params['gradient_accumulation']} \
--optimizer {model_params['optimizer']} \
--scheduler {model_params['scheduler']} \
--weight_decay {model_params['weight_decay']} \
--max_grad_norm {model_params['max_grad_norm']} \
--seed {model_params['seed']} \
--use_int4 \
--target_modules {model_params['target_modules']} \
--use-peft \
--lora_r {model_params['lora_r']} \
--lora_alpha {model_params['lora_alpha']} \
--lora_dropout {model_params['lora_dropout']}
"""
# Use subprocess.run() to execute the command
subprocess.run(command, shell=True, check=True)
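# A safer alternative (a sketch; behavior assumed equivalent) avoids shell=True
# by splitting the command into an argument list. shlex handles the
# backslash-newline continuations in the string above:
# import shlex
# subprocess.run(shlex.split(command), check=True)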