gradio-3 / app.py
Kevin Fink
dev
3a30063
raw
history blame
15.8 kB
import os

# HF_HOME has to be exported BEFORE any Hugging Face package is imported:
# huggingface_hub / transformers / datasets resolve their cache and token
# locations when they are first imported, so setting it afterwards (as this
# file used to do) leaves them pointing at the default home directory.
os.environ['HF_HOME'] = '/data/.huggingface'

# `spaces` stays the first third-party import, as in the original file.
import spaces
import glob
import gradio as gr
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from datasets import load_dataset, concatenate_datasets, load_from_disk, DatasetDict
import traceback
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import evaluate
from huggingface_hub import login
from peft import get_peft_model, LoraConfig
'''
lora_config = LoraConfig(
r=16, # Rank of the low-rank adaptation
lora_alpha=32, # Scaling factor
lora_dropout=0.1, # Dropout for LoRA layers
bias="none" # Bias handling
)
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
model_save_path = '/data/lora_finetuned_model' # Specify your desired save path
model.save_pretrained(model_save_path)
'''
def _mask_pad_tokens(label_rows, pad_token_id):
    """Return ``label_rows`` with every ``pad_token_id`` replaced by -100.

    -100 is the label value the cross-entropy loss skips, so positions that
    only exist because of ``padding='max_length'`` stop contributing to the
    training loss.
    """
    return [
        [token_id if token_id != pad_token_id else -100 for token_id in row]
        for row in label_rows
    ]


def _checkpoint_step(name):
    """Return the step of a ``checkpoint-<step>`` directory name, else ``None``."""
    prefix, dash, step = name.rpartition('-')
    if prefix == 'checkpoint' and dash and step.isdecimal():
        return int(step)
    return None


def _checkpoints_newest_first(output_dir):
    """List ``checkpoint-<step>`` entries of ``output_dir``, highest step first.

    A missing directory yields an empty list instead of raising, and entries
    that do not follow the ``checkpoint-<step>`` pattern are ignored.
    """
    if not os.path.isdir(output_dir):
        return []
    names = [name for name in os.listdir(output_dir) if _checkpoint_step(name) is not None]
    return sorted(names, key=_checkpoint_step, reverse=True)


def _stage_datasets(dataset_name, prefix, tokenize_function):
    """Advance the on-disk tokenization pipeline by exactly one stage.

    The dataset is tokenized over several invocations; the directories found
    under ``prefix`` decide which stage runs:

    1. no ``_train_dataset``       -> tokenize first third of ``train``
    2. ``_train_dataset`` only     -> add second third, tokenize ``validation``
    3. ``_validation_dataset``     -> add last third, save ``_train_dataset3``
    4. ``_train_dataset3``         -> tokenize 50 ``test`` rows (completion marker)
    5. ``_test_dataset``           -> everything is cached, load it

    The dataset is expected to expose ``train``, ``validation`` and ``test``
    splits.

    Returns:
        ``(status, train_dataset, eval_dataset)``. ``status`` is a message for
        the UI while data is still being prepared (the datasets are then
        ``None``); it is ``None`` once the cached datasets were loaded.
    """
    first_path = f'{prefix}_train_dataset'
    second_path = f'{prefix}_train_dataset2'
    full_path = f'{prefix}_train_dataset3'
    validation_path = f'{prefix}_validation_dataset'
    test_path = f'{prefix}_test_dataset'

    # Only the probe for the first cached shard is allowed to trigger stage 1.
    # Failures in later stages must surface to the caller instead of silently
    # re-tokenizing (and overwriting) the first third.
    try:
        first_third = load_from_disk(first_path)
    except Exception as e:
        print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
        dataset = load_dataset(dataset_name)
        third_size = len(dataset['train']) // 3
        dataset['train'] = dataset['train'].select(range(third_size))
        del dataset['test']
        del dataset['validation']
        tokenized = dataset.map(tokenize_function, batched=True)
        tokenized['train'].save_to_disk(first_path)
        print('DONE')
        return 'RUN AGAIN TO LOAD REST OF DATA', None, None

    if os.access(test_path, os.R_OK):
        print("FOUND TEST")
        # The tokenized validation split is what gets used for evaluation;
        # the test shard only marks that preparation has finished.
        return None, load_from_disk(full_path), load_from_disk(validation_path)

    if os.access(full_path, os.R_OK):
        dataset = load_dataset(dataset_name)
        dataset['test'] = dataset['test'].select(range(50))
        del dataset['train']
        del dataset['validation']
        tokenized = dataset.map(tokenize_function, batched=True)
        tokenized['test'].save_to_disk(test_path)
        return 'TOKENS DONE', None, None

    if os.access(validation_path, os.R_OK):
        dataset = load_dataset(dataset_name)
        train_size = len(dataset['train'])
        third_size = train_size // 3
        del dataset['test']
        del dataset['validation']
        print("FOUND VALIDATION")
        first_two_thirds = load_from_disk(second_path)
        dataset['train'] = dataset['train'].select(range(third_size * 2, train_size))
        tokenized = dataset.map(tokenize_function, batched=True)
        full_train = concatenate_datasets([first_two_thirds, tokenized['train']])
        full_train.save_to_disk(full_path)
        return 'THIRD THIRD LOADED', None, None

    dataset = load_dataset(dataset_name)
    third_size = len(dataset['train']) // 3
    dataset['train'] = dataset['train'].select(range(third_size, third_size * 2))
    del dataset['test']
    # One map() call tokenizes both remaining splits (train slice + validation).
    tokenized = dataset.map(tokenize_function, batched=True)
    two_thirds = concatenate_datasets([first_third, tokenized['train']])
    two_thirds.save_to_disk(second_path)
    # Saved last: the validation directory is the marker that enables stage 3.
    tokenized['validation'].save_to_disk(validation_path)
    return 'SECOND THIRD LOADED', None, None


def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    """Prepare the dataset in stages and, once it is cached, train ``model``.

    Each call either performs one tokenization stage (see ``_stage_datasets``)
    and returns a status message asking to be run again, or - when all shards
    exist under ``/data`` - trains with ``Trainer``, resuming from the newest
    usable checkpoint in ``/data/results``.

    Args:
        model: seq2seq model handed to ``Trainer``.
        dataset_name: Hub dataset id; rows need ``text`` and ``target`` columns.
        hub_id: target Hub repo id; also used as prefix for the cache
            directories under ``/data``.
        api_key: Hugging Face token used for ``login``.
        num_epochs: number of training epochs (cast to ``int``).
        batch_size: per-device train/eval batch size (cast to ``int``).
        lr: learning rate in units of 1e-5.
        grad: gradient accumulation value from the UI. Currently NOT applied
            (kept for interface compatibility).

    Returns:
        A status string: a stage message, ``'DONE!'`` after training, or
        ``"An error occurred: ..."`` with the traceback. Exceptions are never
        propagated to the caller.
    """
    try:
        torch.cuda.empty_cache()
        login(api_key.strip())
        hub_id = hub_id.strip()
        dataset_name = dataset_name.strip()

        training_args = TrainingArguments(
            output_dir='/data/results',
            eval_strategy="steps",
            save_strategy='steps',
            learning_rate=lr*0.00001,
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            logging_dir='/data/logs',
            logging_steps=200,
            hub_model_id=hub_id,
            fp16=True,
            save_steps=200,  # Save a checkpoint every 200 steps
            save_total_limit=3,
        )

        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8', use_fast=True, trust_remote_code=True)
        max_length = 512

        def tokenize_function(examples):
            # 'text' is the model input, 'target' the expected output.
            model_inputs = tokenizer(
                examples['text'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # `text_target=` replaces the deprecated as_target_tokenizer() context.
            labels = tokenizer(
                text_target=examples['target'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # Labels are padded to max_length; mask the padding so it is not
            # scored by the loss.
            model_inputs["labels"] = _mask_pad_tokens(labels["input_ids"], tokenizer.pad_token_id)
            return model_inputs

        status, train_dataset, eval_dataset = _stage_datasets(
            dataset_name, f'/data/{hub_id}', tokenize_function
        )
        if status is not None:
            return status

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        # Debug aid: show what each entry of the output directory contains.
        output_dir = training_args.output_dir
        if os.path.isdir(output_dir):
            for entry in os.listdir(output_dir):
                entry_path = os.path.join(output_dir, entry)
                if os.path.isdir(entry_path):
                    print(f'{entry}: {os.listdir(entry_path)}')

        try:
            trainer.train(resume_from_checkpoint=True)
        except Exception as e:
            print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
            import shutil
            # Resuming failed (no checkpoint yet, or the newest one is
            # unusable): drop checkpoints newest-first and retry after each.
            previous_checkpoints = _checkpoints_newest_first(output_dir)
            print(f'CHECKPOINTs: {previous_checkpoints}')
            for check in previous_checkpoints:
                print(f"Removing previous checkpoint {check}")
                try:
                    shutil.rmtree(os.path.join(output_dir, check))
                    trainer.train(resume_from_checkpoint=True)
                except Exception as retry_error:
                    print(f"Resume after removing {check} failed: {retry_error}")
                    continue
                # Kept outside the retry `try`: a failed upload must not be
                # mistaken for a broken checkpoint and trigger more deletions.
                trainer.push_to_hub(commit_message="Training complete!")
                return 'DONE!'
            print("No previous checkpoints found. Starting training from scratch.")
            trainer.train()
            # NOTE(review): only the recovery path above pushes to the Hub;
            # the other paths finish without uploading, as before.
    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'
@spaces.GPU
def test(text):
    """Summarize ``text`` with the published summarizer checkpoint.

    A fresh ``summarization`` pipeline is built on every call and its raw
    output is returned unchanged.
    """
    from transformers import pipeline

    checkpoint = 'shorecode/t5-efficient-tiny-nh8-summarizer'
    generation_options = {
        'max_length': 500,
        'min_length': 40,
        'no_repeat_ngram_size': 2,
    }
    summarizer = pipeline(
        "summarization",
        model=checkpoint,
        tokenizer=checkpoint,
        clean_up_tokenization_spaces=True,
    )
    return summarizer(text, **generation_options)
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    """Gradio entry point: load the base checkpoint and hand it to ``fine_tune_model``.

    Args:
        dataset_name: Hub dataset id typed into the UI.
        hub_id: Hub repo id the run is associated with.
        api_key: Hugging Face API token.
        num_epochs: number of epochs from the slider.
        batch_size: batch size from the slider.
        lr: learning rate in units of 1e-5.
        grad: gradient accumulation value from the slider.

    Returns:
        The status string produced by ``fine_tune_model``.
    """
    # LoRA wrapping (r=4, lora_alpha=8, lora_dropout=0.1, bias="none") is
    # currently disabled, so the full model is fine-tuned. The unused
    # LoraConfig object and the never-called weight initializer were removed.
    model = AutoModelForSeq2SeqLM.from_pretrained("tarekziade/wikipedia-summaries-t5-efficient-tiny")
    return fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
'''
try:
iface = gr.Interface(
fn=test,
inputs=[
gr.Textbox(label="Text to summarize:"),
],
outputs="text",
title="Fine-Tune Hugging Face Model shorecode/t5-efficient-tiny-nh8-summarizer",
description="This interface allows you to test shorecode/t5-efficient-tiny-nh8-summarizer."
)
# Launch the interface
iface.launch()
except Exception as e:
print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
'''
# Build the training UI and serve it; a startup failure is printed, not raised.
try:
    # One input component per positional argument of run_train, in order.
    training_inputs = [
        gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
        gr.Textbox(label="HF hub to push to after training"),
        gr.Textbox(label="HF API token"),
        gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
        gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
        gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
        gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
    ]
    iface = gr.Interface(
        fn=run_train,
        inputs=training_inputs,
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset.",
    )
    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")