|
--- |
|
license: apache-2.0 |
|
language: |
|
- en |
|
library_name: pythae |
|
tags: |
|
- music |
|
--- |
|
--- |
|
license: agpl-3.0 |
|
---
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
|
from datasets import load_dataset |
|
import numpy as np |
|
|
|
# Load the pretrained GPT-2 model and its tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token, so the tokenization step below
# (padding='max_length') would raise a ValueError. Reuse EOS as PAD,
# and keep the model config in sync so loss/generation ignore padding.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Load a custom dataset (CSV example); each file must have a 'text' column
dataset = load_dataset('csv', data_files={'train': 'path/to/train.csv', 'test': 'path/to/test.csv'})
|
|
|
# Dataset tokenization
def tokenize_function(examples):
    """Tokenize a batch's 'text' field to fixed-length (128) token ids.

    Relies on the module-level `tokenizer`; pads/truncates every example
    to exactly 128 tokens so batches are rectangular.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
|
|
|
# Apply the tokenizer to every split (train/test) in batched mode for speed.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
|
|
# Training configuration: 3 epochs, batch size 4 per device, checkpoint
# every 10k steps keeping at most 2 checkpoints, evaluate once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch"
)
|
|
|
# Metric function passed to the Trainer and called after each evaluation
def compute_metrics(eval_pred):
    """Compute token-level accuracy from a (logits, labels) pair.

    Parameters: eval_pred — tuple of (logits, labels) arrays; logits has a
    trailing vocabulary axis, labels matches logits' leading shape.
    Returns: dict with a single 'accuracy' float in [0, 1].

    Positions labeled -100 (the Hugging Face ignore index for padding)
    are excluded from the accuracy. The original code referenced an
    undefined `metric` object (NameError at eval time); the accuracy is
    now computed directly with numpy instead.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    mask = labels != -100
    if not mask.any():
        # No scorable tokens (everything masked) — avoid a NaN mean.
        return {'accuracy': 0.0}
    accuracy = float((predictions[mask] == labels[mask]).mean())
    return {'accuracy': accuracy}
|
|
|
# Build the Trainer, wiring together the model, the hyperparameters,
# the tokenized train/test splits, and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)
|
|
|
# Run fine-tuning (checkpoints go to ./results per TrainingArguments).
trainer.train()