# tyo-gpt2 / main.py
# Import libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
import os
# Define dataset and functions
class TextDataset(Dataset):
    def __init__(self, file_path, block_size):
        self.block_size = block_size
        with open(file_path, 'r', encoding='utf-8') as f:
            self.examples = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        self.special_tokens_dict = {'pad_token': '<PAD>'}
        self.num_added_toks = self.tokenizer.add_special_tokens(self.special_tokens_dict)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        text = self.examples[idx]
        tokenized_text = self.tokenizer.encode(text)
        if len(tokenized_text) > self.block_size:
            tokenized_text = tokenized_text[:self.block_size]
        tokenized_text += [self.tokenizer.pad_token_id] * (self.block_size - len(tokenized_text))
        return torch.tensor(tokenized_text)
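
# Illustrative sketch (not called anywhere in this script): build the dataset
# from the same placeholder path used in train() below and look at one item.
# Each item is a 1-D tensor of exactly block_size token ids, right-padded
# with the id of the added '<PAD>' token.
def _inspect_dataset_example():
    ds = TextDataset('path/to/your/text/file.txt', block_size=512)
    item = ds[0]
    print(item.shape, ds.tokenizer.pad_token_id)
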
# Define training
def train():
    train_dataset = TextDataset('path/to/your/text/file.txt', block_size=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    # Resize the embedding matrix to include the added '<PAD>' token
    model.resize_token_embeddings(len(train_dataset.tokenizer))
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # Padding positions are excluded from the loss
    criterion = torch.nn.CrossEntropyLoss(ignore_index=train_dataset.tokenizer.pad_token_id)
    epochs = 5
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            # Next-token objective: feed tokens 0..n-2, predict tokens 1..n-1.
            # The loss is computed manually below, so no labels are passed to
            # the model; an attention mask keeps it from attending to padding.
            inputs = batch[:, :-1]
            targets = batch[:, 1:]
            attention_mask = (inputs != train_dataset.tokenizer.pad_token_id).long()
            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            loss = criterion(outputs.logits.reshape(-1, outputs.logits.size(-1)), targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')
    # Save the finetuned weights and the extended tokenizer together so they
    # can be reloaded with matching vocabulary sizes
    model.save_pretrained('finetuned_model')
    train_dataset.tokenizer.save_pretrained('finetuned_model')
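
# Illustrative usage sketch (not part of the original training script): reload
# the finetuned weights for generation. Assumes train() has already run and
# that the extended tokenizer was saved to 'finetuned_model' alongside the
# model so the vocabulary sizes match. The decoding settings are examples only.
def generate_sample(prompt, max_new_tokens=50):
    tokenizer = GPT2Tokenizer.from_pretrained('finetuned_model')
    model = GPT2LMHeadModel.from_pretrained('finetuned_model')
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
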
# TRAIN THE MODEL!!!
if __name__ == '__main__':
    train()