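"""Fine-tune t5-small on a keyword-to-context CSV dataset, logging memory usage at each step."""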
import logging
import os

import psutil
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")


def log_gpu_memory():
    # Report the process RSS via psutil and, when a GPU is present, the allocated GPU memory.
    cpu_memory = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
    logger.info(f"Process memory (RSS): {cpu_memory:.2f} MB")
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024
        logger.info(f"GPU memory usage: {gpu_memory:.2f} MB")
    else:
        logger.info("GPU not available")


logger.info("Carregando modelo e tokenizer...") |
|
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") |
|
tokenizer = AutoTokenizer.from_pretrained("t5-small") |
|
log_gpu_memory() |
|
|
|
|
|
logger.info("Carregando dataset...") |
|
dataset = load_dataset('csv', data_files='../extract-csv/social_keywords.csv') |
|
log_gpu_memory() |
|
|
|
|
|
def preprocess_function(examples):
    # Inputs are the keywords; targets are their contexts.
    inputs = examples['keyword']
    targets = examples['context']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')

    # Tokenize the targets; `text_target=` supersedes the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding='max_length')

    # Replace padding token ids in the labels with -100 so they are ignored by the loss.
    model_inputs['labels'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]
    return model_inputs


logger.info("Pré-processando dataset...") |
|
train_dataset = dataset['train'].map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names) |
|
log_gpu_memory() |
|
|
|
print("Amostra de dados processados:") |
|
print(train_dataset[:5]) |
|
|
|
|
|
logger.info("Configurando argumentos de treinamento...") |
|
training_args = TrainingArguments( |
|
output_dir="./results", |
|
num_train_epochs=3, |
|
per_device_train_batch_size=16, |
|
save_steps=10_000, |
|
save_total_limit=2, |
|
logging_dir='./logs', |
|
logging_steps=100, |
|
) |
|
|
|
|
|
class MemoryMonitorTrainer(Trainer):
    """Trainer that logs memory usage every 100 optimization steps."""

    def training_step(self, model, inputs, *args, **kwargs):
        # Forward any extra arguments so the override stays compatible with
        # newer Trainer versions whose training_step takes additional parameters.
        loss = super().training_step(model, inputs, *args, **kwargs)
        if self.state.global_step % 100 == 0:
            log_gpu_memory()
        return loss


# The collator pads any remaining ragged examples; its label_pad_token_id defaults
# to -100, matching the label masking done in preprocess_function.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='max_length', max_length=128)

logger.info("Iniciando treinamento...") |
|
trainer = MemoryMonitorTrainer( |
|
model=model, |
|
args=training_args, |
|
train_dataset=train_dataset, |
|
tokenizer=tokenizer, |
|
data_collator=data_collator, |
|
) |
|
|
|
trainer.train()

logger.info("Training finished!")
log_gpu_memory()

output_dir = "./meu_modelo_treinado" |
|
logger.info(f"Salvando o modelo treinado em {output_dir}") |
|
model.save_pretrained(output_dir) |
|
tokenizer.save_pretrained(output_dir) |
|
log_gpu_memory() |
|
|
|
logger.info("Processo finalizado!") |