import torch
from torch.optim import AdamW  # transformers' own AdamW is deprecated; use the PyTorch optimizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from datasets import load_dataset

# Load the short-jokes dataset and keep a shuffled 20% subset for fine-tuning
dataset = load_dataset("ysharma/short_jokes")
train_data = dataset["train"]

twenty_percent_size = int(0.2 * len(train_data))
subset = train_data.shuffle(seed=42).select(range(twenty_percent_size))

# GPT-2 has no pad token, so reuse the end-of-sequence token for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    return tokenizer(examples["Joke"], padding="max_length", truncation=True, max_length=50)


tokenized_dataset = subset.map(tokenize_function, batched=True)
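
# Quick sanity check (illustrative addition, not part of the original script):
# every tokenized example should be padded or truncated to exactly 50 token ids.
print(len(tokenized_dataset[0]["input_ids"]))  # expected: 50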

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.train()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 100
total_steps = len(tokenized_dataset) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(num_epochs):
    for idx, batch in enumerate(tokenized_dataset):
        # Each example is treated as a batch of one; labels are the inputs
        # because GPT-2 shifts them internally for the causal LM loss
        inputs = torch.tensor(batch["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).unsqueeze(0).to(device)

        outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if idx % 100 == 0:
            print(f"Epoch: {epoch}, Batch: {idx}, Loss: {loss.item()}")

    # Save a checkpoint every five epochs
    if epoch % 5 == 0:
        save_directory = f"./trained_gpt2_jokes/{epoch}"
        model.save_pretrained(save_directory)
        tokenizer.save_pretrained(save_directory)
print("Training completed!") |
|
save_directory = "./trained_gpt2_jokes/final" |
|
model.save_pretrained(save_directory) |
|
tokenizer.save_pretrained(save_directory) |
|
|
|
print(f"Model and tokenizer saved to {save_directory}") |