ameerazam08 committed on
Commit
c6a61ed
1 Parent(s): 64c9def

Update train.py

Files changed (1)
  1. train.py +61 -0
train.py CHANGED
@@ -0,0 +1,61 @@
+ import torch
+ from torch.optim import AdamW  # transformers' own AdamW is deprecated
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
+ from datasets import load_dataset
+
+ # Load the jokes dataset
+ dataset = load_dataset("ysharma/short_jokes")
+ # Access the train split
+ train_data = dataset["train"]
+ # Shuffle the dataset and keep 20% of the examples
+ twenty_percent_size = int(0.2 * len(train_data))
+ subset = train_data.shuffle(seed=42).select(range(twenty_percent_size))
+
+ # Use GPT-2's tokenizer; GPT-2 has no pad token, so reuse the EOS token
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Tokenize the 20% subset
+ def tokenize_function(examples):
+     return tokenizer(examples["Joke"], padding="max_length", truncation=True, max_length=50)
+
+ tokenized_dataset = subset.map(tokenize_function, batched=True)
+
+ # Load GPT-2 model
+ model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
+ model.train()
+
+ # Training parameters
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+ optimizer = AdamW(model.parameters(), lr=5e-5)
+ num_epochs = 100
+ total_steps = len(tokenized_dataset) * num_epochs
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Training loop: one example per step, with the inputs reused as labels (causal LM)
+ for epoch in range(num_epochs):
+     for idx, batch in enumerate(tokenized_dataset):
+         inputs = torch.tensor(batch["input_ids"]).unsqueeze(0).to(device)
+         attention_mask = torch.tensor(batch["attention_mask"]).unsqueeze(0).to(device)
+         outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
+         loss = outputs.loss
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         optimizer.zero_grad()
+
+         if idx % 100 == 0:
+             print(f"Epoch: {epoch}, Batch: {idx}, Loss: {loss.item()}")
+     # Save a checkpoint every 5 epochs
+     if epoch % 5 == 0:
+         save_directory = f"./trained_gpt2_jokes/{epoch}"
+         model.save_pretrained(save_directory)
+         tokenizer.save_pretrained(save_directory)
+
+ print("Training completed!")
+ save_directory = "./trained_gpt2_jokes/final"
+ model.save_pretrained(save_directory)
+ tokenizer.save_pretrained(save_directory)
+
+ print(f"Model and tokenizer saved to {save_directory}")
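After training, the saved checkpoint can be loaded back for inference. Below is a minimal sketch (not part of the commit) that assumes the "./trained_gpt2_jokes/final" directory produced by the script above; the prompt text and sampling settings are arbitrary examples.

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned checkpoint saved by train.py (assumed path)
save_directory = "./trained_gpt2_jokes/final"
tokenizer = GPT2Tokenizer.from_pretrained(save_directory)
model = GPT2LMHeadModel.from_pretrained(save_directory)
model.eval()

# Encode a short prompt and sample a continuation
inputs = tokenizer("Why did the chicken", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))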