import torch
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the PyTorch optimizer
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from datasets import load_dataset

# Load the jokes dataset
dataset = load_dataset("ysharma/short_jokes")

# Access the train split
train_data = dataset["train"]

# Shuffle the dataset and select 20% of the data
twenty_percent_size = int(0.2 * len(train_data))
subset = train_data.shuffle(seed=42).select(range(twenty_percent_size))

# Use GPT-2's tokenizer; GPT-2 has no pad token, so reuse the EOS token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the 20% subset
def tokenize_function(examples):
    return tokenizer(examples["Joke"], padding="max_length", truncation=True, max_length=50)

tokenized_dataset = subset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Batch the tokenized examples (batch size here is an adjustable default)
train_loader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.train()

# Training parameters
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 100
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(num_epochs):
    for idx, batch in enumerate(train_loader):
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Mask padding positions so they do not contribute to the loss
        labels = inputs.clone()
        labels[attention_mask == 0] = -100
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if idx % 100 == 0:
            print(f"Epoch: {epoch}, Batch: {idx}, Loss: {loss.item()}")
    # Save a checkpoint every 5 epochs
    if epoch % 5 == 0:
        save_directory = f"./trained_gpt2_jokes/{epoch}"
        model.save_pretrained(save_directory)
        tokenizer.save_pretrained(save_directory)

print("Training completed!")
save_directory = "./trained_gpt2_jokes/final"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print(f"Model and tokenizer saved to {save_directory}")
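
# A quick way to sanity-check the fine-tuned model is to reload the final checkpoint
# and sample a few jokes. This is a minimal sketch intended to be run separately after
# training has finished; the prompt and the sampling settings (top_k, top_p, max_length,
# num_return_sequences) are illustrative choices, not requirements of the script above.
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

device = "cuda" if torch.cuda.is_available() else "cpu"
save_directory = "./trained_gpt2_jokes/final"
tokenizer = GPT2Tokenizer.from_pretrained(save_directory)
model = GPT2LMHeadModel.from_pretrained(save_directory).to(device)
model.eval()

# Encode an example prompt and sample a few continuations
prompt = "Why did the"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3,
        pad_token_id=tokenizer.eos_token_id,
    )
for ids in output_ids:
    print(tokenizer.decode(ids, skip_special_tokens=True))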