import spaces
import gradio as gr
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from transformers.trainer_utils import get_last_checkpoint
from datasets import load_dataset, concatenate_datasets, load_from_disk
import traceback
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import os
import evaluate
from huggingface_hub import login
from peft import get_peft_model, LoraConfig

# Keep the Hugging Face cache on the /data volume so downloads are reused across runs.
os.environ['HF_HOME'] = '/data/.huggingface'

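# Disabled sketch: load a pretrained checkpoint, wrap it with LoRA, and save it under /data.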
'''
lora_config = LoraConfig(
    r=16,              # Rank of the low-rank adaptation
    lora_alpha=32,     # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none"        # Bias handling
)

model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()

model_save_path = '/data/lora_finetuned_model'  # Specify your desired save path
model.save_pretrained(model_save_path)
'''


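# Fine-tune the given (LoRA-wrapped) seq2seq model on a Hub dataset that provides
# 'text' and 'target' columns. Tokenized splits are cached under /data, training
# resumes from any checkpoint found under /data/results, and the trained model is
# pushed to the Hub repo given by hub_id.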
def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    try:
        # The tokenizer is needed both for tokenizing the dataset and for decoding
        # predictions inside compute_metrics, so load it up front.
        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
        metric = evaluate.load("rouge", cache_dir='/cache')

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            if isinstance(preds, tuple):
                preds = preds[0]
            # The plain Trainer passes logits here, so reduce them to token ids before decoding.
            if preds.ndim == 3:
                preds = np.argmax(preds, axis=-1)

            preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            result = {k: round(v * 100, 4) for k, v in result.items()}
            prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
            result["gen_len"] = np.mean(prediction_lens)
            return result

        login(api_key.strip())

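        # All checkpoints and logs are written under /data so they can persist across
        # restarts when the Space has persistent storage mounted.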
        training_args = TrainingArguments(
            output_dir='/data/results',
            eval_strategy="steps",
            save_strategy='steps',
            learning_rate=lr * 1e-5,  # the slider value is in units of 1e-5
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            load_best_model_at_end=True,
            # compute_metrics reports ROUGE, so select the best checkpoint by ROUGE-1
            # rather than a non-existent "accuracy" key.
            metric_for_best_model="rouge1",
            greater_is_better=True,
            logging_dir='/data/logs',
            logging_steps=10,
            hub_model_id=hub_id.strip(),
            fp16=True,
            save_steps=100,
            save_total_limit=3,
        )

        # Resume from the most recent checkpoint in the output directory, if one exists.
        last_checkpoint = None
        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is not None:
            print("Loading model from checkpoint...")
            model = AutoModelForSeq2SeqLM.from_pretrained(last_checkpoint)

        # Cap tokenized sequence length at the tokenizer's maximum (bounded at 512 in
        # case the tokenizer reports no finite limit).
        max_length = min(tokenizer.model_max_length, 512)
        # Pad each batch dynamically (and pad labels with -100) via the seq2seq collator.
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        try:
            # Reuse previously tokenized splits if they were cached on disk.
            tokenized_train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
            tokenized_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_train_dataset,
                eval_dataset=tokenized_test_dataset,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
            )
        except Exception:
            # No cached splits yet: download the dataset and tokenize it from scratch.
            dataset = load_dataset(dataset_name.strip())

            def tokenize_function(examples):
                # Tokenize inputs and targets; padding is left to the data collator.
                model_inputs = tokenizer(
                    examples['text'],
                    max_length=max_length,
                    truncation=True,
                )
                labels = tokenizer(
                    text_target=examples['target'],
                    max_length=max_length,
                    truncation=True,
                )
                model_inputs["labels"] = labels["input_ids"]
                return model_inputs

            tokenized_datasets = dataset.map(tokenize_function, batched=True)

            # Cache the tokenized splits so later runs can skip tokenization.
            tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
            tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_datasets['train'],
                eval_dataset=tokenized_datasets['test'],
                data_collator=data_collator,
                compute_metrics=compute_metrics,
            )

        # Train, resuming from the last checkpoint when one was found above.
        if last_checkpoint is not None:
            train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
        else:
            train_result = trainer.train()
        trainer.push_to_hub(commit_message="Training complete!")
    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'


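# Disabled helper: single-example inference with a fine-tuned checkpoint (a model_name
# would need to be defined before enabling it).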
'''
# Define Gradio interface
def predict(text):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return predictions.item()
'''


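# GPU entry point for the Gradio app: build a randomly initialized t5-efficient-tiny,
# attach LoRA adapters, and hand it to fine_tune_model.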
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    # Re-initialize a couple of feed-forward weights in the first encoder block.
    def initialize_weights(model):
        for name, param in model.named_parameters():
            if 'encoder.block.0.layer.0.DenseReluDense.wi.weight' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'encoder.block.0.layer.0.DenseReluDense.wo.weight' in name:
                torch.nn.init.kaiming_normal_(param.data)

    # Build the model from its config (random weights rather than pretrained ones).
    config = AutoConfig.from_pretrained("google/t5-efficient-tiny")
    model = AutoModelForSeq2SeqLM.from_config(config)
    initialize_weights(model)
    print(list(model.named_parameters()))
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none"
    )
    model = get_peft_model(model, lora_config)
    result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
    return result


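# Gradio UI: collects the dataset name, target Hub repo, API token, and training
# hyperparameters, then launches run_train.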
try:
    iface = gr.Interface(
        fn=run_train,
        inputs=[
            gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
            gr.Textbox(label="HF hub to push to after training"),
            gr.Textbox(label="HF API token"),
            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
    )
    '''
    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(label="Query"),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to test a fine-tuned Hugging Face model."
    )
    '''

    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")