import os
from uuid import uuid4
import pandas as pd
from datasets import load_dataset
import subprocess
from transformers import AutoTokenizer

### Read environment variables
# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)

### Functions
def max_token_len(dataset):
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length

### Model details
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

### Repo name, dataset initialization, and data directory
# Load dataset
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))

max_token_length = max(max_token_length_train, max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")
block_size = 2 * max_token_length

# Define project parameters
username = 'ai-aerospace'
project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())

### Set training params
model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "target_modules": "",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}

for key, value in model_params.items():
    os.environ[key] = str(value)
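# Aside (assumption, not part of the original script): some Hugging Face fast
# tokenizers report a very large sentinel value (on the order of 1e30) for
# tokenizer.model_max_length when the checkpoint config leaves it unset, which
# would make the max-token-length check above pass trivially. A minimal guard
# sketch, with 4096 as an assumed fallback context length:
#
#   if model_max_length > 100_000:     # looks like an "unset" sentinel
#       model_max_length = 4096        # assumed fallback; adjust per model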
### Feed into and run autotrain command
# Set .venv and execute the autotrain script
# To see all parameters: autotrain llm --help
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft

command = f"""
autotrain llm --train \
  --trainer sft \
  --project_name {model_params['project_name']} \
  --model {model_params['model_name']} \
  --data_path {model_params['data_directory']} \
  --train_split {model_params['train_data']} \
  --valid_split {model_params['validation_data']} \
  --repo_id {model_params['repo_id']} \
  --push_to_hub \
  --token HUGGINGFACE_TOKEN \
  --block_size {model_params['block_size']} \
  --model_max_length {model_params['model_max_length']} \
  --logging_steps {model_params['logging_steps']} \
  --evaluation_strategy {model_params['evaluation_strategy']} \
  --save_total_limit {model_params['save_total_limit']} \
  --save_strategy {model_params['save_strategy']} \
  --fp16 \
  --lr {model_params['lr']} \
  --num_train_epochs {model_params['epochs']} \
  --batch_size {model_params['batch_size']} \
  --warmup_ratio {model_params['warmup_ratio']} \
  --gradient_accumulation {model_params['gradient_accumulation']} \
  --optimizer {model_params['optimizer']} \
  --scheduler {model_params['scheduler']} \
  --weight_decay {model_params['weight_decay']} \
  --max_grad_norm {model_params['max_grad_norm']} \
  --seed {model_params['seed']} \
  --use_int4 \
  --target_modules {model_params['target_modules']} \
  --use-peft \
  --lora_r {model_params['lora_r']} \
  --lora_alpha {model_params['lora_alpha']} \
  --lora_dropout {model_params['lora_dropout']}
"""

# Use subprocess.run() to execute the command
subprocess.run(command, shell=True, check=True)
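# Optional alternative (sketch, not part of the original script): Python removes
# backslash-newline pairs when it parses the f-string above, so the autotrain
# invocation collapses to a single line. It could therefore also be split into
# an argument list and run without shell=True:
#
#   import shlex
#   subprocess.run(shlex.split(command), check=True)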