# Import libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

# Dataset that tokenizes, truncates, and pads one line of text per example
class TextDataset(Dataset):
    def __init__(self, file_path, block_size):
        self.block_size = block_size
        with open(file_path, 'r', encoding='utf-8') as f:
            self.examples = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        # GPT-2 has no padding token by default, so add one for fixed-length batches
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        self.special_tokens_dict = {'pad_token': '<PAD>'}
        self.num_added_toks = self.tokenizer.add_special_tokens(self.special_tokens_dict)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Tokenize one line, truncate to block_size, and right-pad with the pad token
        text = self.examples[idx]
        tokenized_text = self.tokenizer.encode(text)
        if len(tokenized_text) > self.block_size:
            tokenized_text = tokenized_text[:self.block_size]
        tokenized_text += [self.tokenizer.pad_token_id] * (self.block_size - len(tokenized_text))
        return torch.tensor(tokenized_text)

# Define training
def train():
    train_dataset = TextDataset('path/to/your/text/file.txt', block_size=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    model.resize_token_embeddings(len(train_dataset.tokenizer))  # account for the added '<PAD>' token
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    epochs = 5

    pad_id = train_dataset.tokenizer.pad_token_id

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch.to(device)
            # Exclude padding from attention and from the loss (-100 is ignored by the model's loss)
            attention_mask = (input_ids != pad_id).long()
            labels = input_ids.clone()
            labels[labels == pad_id] = -100

            optimizer.zero_grad()
            # GPT2LMHeadModel shifts labels internally, so inputs and labels share the same positions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')

    model.save_pretrained('finetuned_model')
    train_dataset.tokenizer.save_pretrained('finetuned_model')  # keep the tokenizer (with '<PAD>') alongside the model
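
# A minimal usage sketch (assumption, not part of the original script): load the model
# and tokenizer that train() saves under 'finetuned_model' and generate a continuation.
# The prompt and decoding settings below are illustrative defaults.
def generate_sample(prompt='Once upon a time', max_length=50):
    tokenizer = GPT2Tokenizer.from_pretrained('finetuned_model')
    model = GPT2LMHeadModel.from_pretrained('finetuned_model')
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)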

# Run training when executed as a script
if __name__ == '__main__':
    train()