import os
from uuid import uuid4
import pandas as pd

from datasets import load_dataset
import subprocess
from transformers import AutoTokenizer

### Read environment variables
# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)
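
# A minimal sketch for supplying the Hub token used by --push_to_hub below
# (assumes the token is exported as HUGGINGFACE_TOKEN in the shell or a .env
# file; the variable name is an assumption):
# hf_token = os.environ.get('HUGGINGFACE_TOKEN')
# if hf_token is None:
#     raise EnvironmentError('HUGGINGFACE_TOKEN is not set')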

### Functions
def max_token_len(dataset):
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
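
# Alternative sketch: the loop above tokenizes one row at a time; a batched
# pass via datasets.map is usually faster on large splits (assumes the same
# 'text' column; the helper name is hypothetical):
# def max_token_len_batched(ds):
#     lengths = ds.map(
#         lambda batch: {'n_tokens': [len(ids) for ids in tokenizer(batch['text'])['input_ids']]},
#         batched=True)
#     return max(lengths['n_tokens'])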

### Model details
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

### Repo name, dataset initialization, and data directory
# Load dataset
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))

max_token_length = max(max_token_length_train, max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError(
        f"Maximum token length ({max_token_length}) exceeds the model's "
        f"maximum sequence length ({model_max_length})."
    )
block_size = 2 * max_token_length
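# Note: block_size is twice the longest observed example; clamping it to the
# model's context window is an optional extra guard (an assumption, not part
# of the original flow):
# block_size = min(block_size, model_max_length)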

# Define project parameters
username = 'ai-aerospace'
project_name = './llms/ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())
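# Note: project_name and repo_name call uuid4() separately, so their suffixes
# will differ. If the two should share one run id, a single value could be
# reused (a hypothetical variant):
# run_id = str(uuid4())
# project_name = './llms/ams_data_train-100_' + run_id
# repo_name = 'ams-data-train-100-' + run_id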

### Set training params
model_params={
  "project_name": project_name,
  "model_name": model_name,
  "repo_id": username+'/'+repo_name,
  "train_data": train_data,
  "validation_data": validation_data,
  "data_directory": data_directory,
  "block_size": block_size,
  "model_max_length": max_token_length,
  "logging_steps": -1,
  "evaluation_strategy": "epoch",
  "save_total_limit": 1,
  "save_strategy": "epoch",
  "mixed_precision": "fp16",
  "lr": 0.00003,
  "epochs": 3,
  "batch_size": 2,
  "warmup_ratio": 0.1,
  "gradient_accumulation": 1,
  "optimizer": "adamw_torch",
  "scheduler": "linear",
  "weight_decay": 0,
  "max_grad_norm": 1,
  "seed": 42,
  "quantization": "int4",
  "target_modules": "",
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.05
}
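
# Note: "target_modules" is empty above; when substituted into the shell
# command below, the argument collapses to nothing, which the CLI parser may
# reject. A commonly cited choice for Mistral-style models (an assumption,
# verify for your setup) would be:
# model_params['target_modules'] = 'q_proj,v_proj'

# Optional sketch: persist the run configuration next to the data for
# reproducibility (the file name is an assumption):
# import json
# with open(os.path.join(data_directory, 'model_params.json'), 'w') as f:
#     json.dump(model_params, f, indent=2)
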
for key, value in model_params.items():
  os.environ[key] = str(value)

### Feed into and run autotrain command
# Set .venv and execute the autotrain script
# To see all parameters: autotrain llm --help
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
command = f"""
autotrain llm --train \
    --trainer sft \
    --project_name {model_params['project_name']} \
    --model {model_params['model_name']} \
    --data_path {model_params['data_directory']} \
    --train_split {model_params['train_data']} \
    --valid_split {model_params['validation_data']} \
    --repo_id {model_params['repo_id']} \
    --push_to_hub \
    --token HUGGINGFACE_TOKEN \
    --block_size {model_params['block_size']} \
    --model_max_length {model_params['model_max_length']} \
    --logging_steps {model_params['logging_steps']} \
    --evaluation_strategy {model_params['evaluation_strategy']} \
    --save_total_limit {model_params['save_total_limit']} \
    --save_strategy {model_params['save_strategy']} \
    --fp16 \
    --lr {model_params['lr']} \
    --num_train_epochs {model_params['epochs']} \
    --batch_size {model_params['batch_size']} \
    --warmup_ratio {model_params['warmup_ratio']} \
    --gradient_accumulation {model_params['gradient_accumulation']} \
    --optimizer {model_params['optimizer']} \
    --scheduler {model_params['scheduler']} \
    --weight_decay {model_params['weight_decay']} \
    --max_grad_norm {model_params['max_grad_norm']} \
    --seed {model_params['seed']} \
    --use_int4 \
    --target_modules {model_params['target_modules']} \
    --use-peft \
    --lora_r {model_params['lora_r']} \
    --lora_alpha {model_params['lora_alpha']} \
    --lora_dropout {model_params['lora_dropout']}
"""

# Use subprocess.run() to execute the command
subprocess.run(command, shell=True, check=True)
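
# Alternative sketch: shell=True could be avoided by passing an argument list.
# The trailing backslashes inside the f-string above are Python line
# continuations, so `command` is already a single line and splits cleanly:
# import shlex
# subprocess.run(shlex.split(command), check=True)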