|
import os

# HF_HOME has to be exported BEFORE transformers / datasets / huggingface_hub
# are imported: those libraries resolve their cache directories while they are
# being imported, so assigning the variable afterwards has no effect.
os.environ['HF_HOME'] = '/data/.huggingface'

import spaces
import glob
import gradio as gr
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from datasets import load_dataset, concatenate_datasets, load_from_disk, DatasetDict
import traceback
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import evaluate
from huggingface_hub import login
from peft import get_peft_model, LoraConfig
|
''' |
|
lora_config = LoraConfig( |
|
r=16, # Rank of the low-rank adaptation |
|
lora_alpha=32, # Scaling factor |
|
lora_dropout=0.1, # Dropout for LoRA layers |
|
bias="none" # Bias handling |
|
) |
|
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True) |
|
model = get_peft_model(model, lora_config) |
|
model.gradient_checkpointing_enable() |
|
model_save_path = '/data/lora_finetuned_model' # Specify your desired save path |
|
model.save_pretrained(model_save_path) |
|
''' |
|
|
|
def _checkpoint_dirs_newest_first(output_dir):
    """Return the ``checkpoint-<step>`` entries of *output_dir*, highest step first.

    Entries that do not match that exact pattern are ignored (so a stray
    ``checkpoint-tmp`` cannot break the integer parsing), and a missing
    directory yields an empty list.
    """
    import re

    if not os.path.isdir(output_dir):
        return []
    steps_by_name = {}
    for name in os.listdir(output_dir):
        match = re.fullmatch(r'checkpoint-(\d+)', name)
        if match:
            steps_by_name[name] = int(match.group(1))
    return sorted(steps_by_name, key=steps_by_name.get, reverse=True)


def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    """Run ONE stage of the staged tokenize-then-train pipeline and report it.

    Every call logs in to the Hugging Face Hub and then performs exactly one
    stage, chosen from the datasets already saved under ``/data``:

    1. ``/data/<hub_id>_train_dataset`` cannot be loaded: tokenize the first
       third of the train split and save it there.
       Returns ``'RUN AGAIN TO LOAD REST OF DATA'``.
    2. Neither the validation, ``train_dataset3`` nor test dataset exists:
       tokenize the second third plus the validation split, save
       ``train_dataset2`` and ``validation_dataset``.
       Returns ``'SECOND THIRD LOADED'``.
    3. ``validation_dataset`` exists but ``train_dataset3`` does not: tokenize
       the last third and save the full ``train_dataset3``.
       Returns ``'THIRD THIRD LOADED'``.
    4. ``train_dataset3`` exists but ``test_dataset`` does not: tokenize up to
       50 test examples and save them. Returns ``'TOKENS DONE'``.
    5. ``test_dataset`` exists: train on ``train_dataset3``, evaluating on
       ``validation_dataset`` (the saved test set only acts as the "ready"
       marker), resuming from the newest usable checkpoint, then push the
       model to the Hub. Returns ``'DONE!'``.

    Args:
        model: Seq2seq model instance handed to ``Trainer``.
        dataset_name: Hub dataset id; must provide ``text`` and ``target``
            columns (those are the columns tokenized here).
        hub_id: Target Hub repo id; also used to build the ``/data`` paths.
        api_key: Hugging Face access token passed to ``login``.
        num_epochs: Number of training epochs (cast to ``int``).
        batch_size: Per-device train/eval batch size (cast to ``int``).
        lr: Learning rate expressed in units of 1e-5.
        grad: Gradient accumulation steps (cast to ``int``).

    Returns:
        One of the status strings above, or ``"An error occurred: ..."`` with
        the traceback when anything raises; this function never raises.
    """
    try:
        import shutil

        torch.cuda.empty_cache()
        login(api_key.strip())

        dataset_name = dataset_name.strip()
        hub_id = hub_id.strip()
        output_dir = '/data/results'

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="steps",
            save_strategy='steps',
            learning_rate=lr * 1e-5,
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            # Previously the `grad` argument was accepted but silently ignored.
            gradient_accumulation_steps=int(grad),
            logging_dir='/data/logs',
            logging_steps=200,
            hub_model_id=hub_id,
            fp16=True,
            save_steps=200,
            save_total_limit=3,
        )

        tokenizer = AutoTokenizer.from_pretrained(
            'google/t5-efficient-tiny-nh8', use_fast=True, trust_remote_code=True
        )
        max_length = 512

        def tokenize_function(examples):
            """Tokenize a batch of ``text`` inputs and ``target`` labels."""
            model_inputs = tokenizer(
                examples['text'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # `text_target` replaces the deprecated `as_target_tokenizer()`
            # context manager.
            labels = tokenizer(
                text_target=examples['target'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # NOTE(review): labels keep their pad-token ids, so padding takes
            # part in the loss. Masking them with -100 would change the format
            # of datasets already saved on disk, so it is left as-is here.
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        def stage_path(suffix):
            """Return the on-disk location of one intermediate dataset."""
            return f'/data/{hub_id}_{suffix}'

        def tokenized_train_third(raw, index):
            """Tokenize the ``index``-th (0-based) third of ``raw['train']``."""
            train_split = raw['train']
            train_size = len(train_split)
            third_size = train_size // 3
            start = index * third_size
            # The last third also takes the remainder of the integer division.
            stop = train_size if index == 2 else start + third_size
            return train_split.select(range(start, stop)).map(tokenize_function, batched=True)

        # Stage 1 detection. Only this load is guarded: a failure in a later
        # stage must be reported, not mistaken for "first third is missing".
        try:
            load_from_disk(stage_path('train_dataset'))
            first_third_ready = True
        except Exception as e:
            print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
            first_third_ready = False

        if not first_third_ready:
            raw = load_dataset(dataset_name)
            tokenized_train_third(raw, 0).save_to_disk(stage_path('train_dataset'))
            print('DONE')
            return 'RUN AGAIN TO LOAD REST OF DATA'

        if os.access(stage_path('test_dataset'), os.R_OK):
            # Stage 5: everything is tokenized, build the trainer.
            train_dataset = load_from_disk(stage_path('train_dataset3'))
            eval_dataset = load_from_disk(stage_path('validation_dataset'))
            print("FOUND TEST")
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                # The collator used to be created and then never used.
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
            )
        elif os.access(stage_path('train_dataset3'), os.R_OK):
            # Stage 4: tokenize a small test sample (at most 50 rows).
            test_split = load_dataset(dataset_name)['test']
            sample = test_split.select(range(min(50, len(test_split))))
            sample.map(tokenize_function, batched=True).save_to_disk(stage_path('test_dataset'))
            return 'TOKENS DONE'
        elif os.access(stage_path('validation_dataset'), os.R_OK):
            # Stage 3: append the last third to the first two.
            print("FOUND VALIDATION")
            raw = load_dataset(dataset_name)
            first_two_thirds = load_from_disk(stage_path('train_dataset2'))
            full_train = concatenate_datasets([first_two_thirds, tokenized_train_third(raw, 2)])
            full_train.save_to_disk(stage_path('train_dataset3'))
            return 'THIRD THIRD LOADED'
        else:
            # Stage 2: append the second third and tokenize validation once.
            raw = load_dataset(dataset_name)
            second_third = tokenized_train_third(raw, 1)
            validation = raw['validation'].map(tokenize_function, batched=True)
            first_third = load_from_disk(stage_path('train_dataset'))
            concatenate_datasets([first_third, second_third]).save_to_disk(stage_path('train_dataset2'))
            validation.save_to_disk(stage_path('validation_dataset'))
            return 'SECOND THIRD LOADED'

        # Debug aid: show what each entry of the output directory contains.
        if os.path.isdir(output_dir):
            for entry in os.listdir(output_dir):
                entry_path = os.path.join(output_dir, entry)
                if os.path.isdir(entry_path):
                    print(f'{entry}: {os.listdir(entry_path)}')

        try:
            trainer.train(resume_from_checkpoint=True)
        except Exception as e:
            print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
            previous_checkpoints = _checkpoint_dirs_newest_first(output_dir)
            print(f'CHECKPOINTs: {previous_checkpoints}')

            # Drop checkpoints newest-first and retry after each removal, so
            # training resumes from the most recent checkpoint that works.
            resumed = False
            for check in previous_checkpoints:
                try:
                    print(f"Removing previous checkpoint {check}")
                    shutil.rmtree(os.path.join(output_dir, check))
                    trainer.train(resume_from_checkpoint=True)
                except Exception as retry_error:
                    print(f"An error occurred: {str(retry_error)}, TB: {traceback.format_exc()}")
                    continue
                resumed = True
                break

            if not resumed:
                print("No previous checkpoints found. Starting training from scratch.")
                trainer.train()

        # Push after every successful run, not only on the retry path.
        trainer.push_to_hub(commit_message="Training complete!")

    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"

    return 'DONE!'
|
|
|
@spaces.GPU
def test(text):
    """Summarize *text* with the published summarizer checkpoint.

    A fresh ``summarization`` pipeline is built on every call, using the same
    checkpoint id for both the model and the tokenizer.

    Args:
        text: Input handed straight to the pipeline.

    Returns:
        The pipeline's raw result, unmodified (presumably a list of dicts
        carrying the summary text; verify against the transformers docs).
    """
    from transformers import pipeline

    checkpoint = 'shorecode/t5-efficient-tiny-nh8-summarizer'
    # Generation limits forwarded to the pipeline call.
    generation_options = {
        'max_length': 500,
        'min_length': 40,
        'no_repeat_ngram_size': 2,
    }
    run_summary = pipeline(
        "summarization",
        model=checkpoint,
        tokenizer=checkpoint,
        clean_up_tokenization_spaces=True,
    )
    return run_summary(text, **generation_options)
|
|
|
|
|
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    """Load the base summarization model and run one ``fine_tune_model`` stage.

    This is the Gradio callback: its parameters map one-to-one onto the
    interface inputs and are forwarded unchanged.

    Args:
        dataset_name: Hub dataset id to tokenize / train on.
        hub_id: Hub repo id the trained model is pushed to.
        api_key: Hugging Face access token.
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size.
        lr: Learning rate in units of 1e-5.
        grad: Gradient accumulation steps.

    Returns:
        The status (or error) string produced by ``fine_tune_model``.
    """
    # NOTE(review): the previous body also defined an `initialize_weights`
    # helper and built a `LoraConfig`, but neither was ever applied to the
    # model; that dead code is removed. The model is fine-tuned in full.
    model = AutoModelForSeq2SeqLM.from_pretrained("tarekziade/wikipedia-summaries-t5-efficient-tiny")
    return fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
|
|
|
''' |
|
try: |
|
iface = gr.Interface( |
|
fn=test, |
|
inputs=[ |
|
gr.Textbox(label="Text to summarize:"), |
|
], |
|
outputs="text", |
|
title="Fine-Tune Hugging Face Model shorecode/t5-efficient-tiny-nh8-summarizer", |
|
description="This interface allows you to test shorecode/t5-efficient-tiny-nh8-summarizer." |
|
) |
|
|
|
# Launch the interface |
|
iface.launch() |
|
except Exception as e: |
|
print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}") |
|
''' |
|
|
|
# Build and launch the training UI. Any failure while constructing or serving
# the interface is printed with its traceback instead of crashing the process.
try:
    iface = gr.Interface(
        fn=run_train,
        inputs=[
            gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
            gr.Textbox(label="HF hub to push to after training"),
            # The token is a secret: mask it instead of echoing it in clear text.
            gr.Textbox(label="HF API token", type="password"),
            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
    )

    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
|
|