import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel


class TextDataset(Dataset):
    """Newline-delimited text file -> fixed-length blocks of GPT-2 token ids."""

    def __init__(self, file_path, block_size):
        self.block_size = block_size
        with open(file_path, 'r', encoding='utf-8') as f:
            # Keep only non-empty, non-whitespace lines as training examples.
            self.examples = [line for line in f.read().splitlines() if line.strip()]

        # GPT-2 has no padding token by default, so register one explicitly.
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        self.tokenizer.add_special_tokens({'pad_token': '<PAD>'})

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Truncate to block_size, then right-pad so every example has the same length.
        tokenized_text = self.tokenizer.encode(self.examples[idx])[:self.block_size]
        tokenized_text += [self.tokenizer.pad_token_id] * (self.block_size - len(tokenized_text))
        return torch.tensor(tokenized_text)
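
# Quick sanity check (a sketch; 'sample.txt' is a placeholder path, not part of
# the original script): each item should be a 1-D tensor of exactly block_size
# token ids, right-padded with <PAD>:
#
#     ds = TextDataset('sample.txt', block_size=512)
#     print(len(ds), ds[0].shape)  # e.g. torch.Size([512])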


def train():
    train_dataset = TextDataset('path/to/your/text/file.txt', block_size=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    # The tokenizer gained a <PAD> token, so the embedding matrix must grow to match.
    model.resize_token_embeddings(len(train_dataset.tokenizer))
    model.to(device)

    # AdamW (Adam with decoupled weight decay) rather than plain Adam; it is the
    # usual choice for fine-tuning transformers.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    # Skip the loss on positions whose target is the pad token.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=train_dataset.tokenizer.pad_token_id)

    epochs = 5
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            # Next-token prediction: tokens 0..n-2 are inputs, tokens 1..n-1 are targets.
            # Don't also pass labels= to the model: it shifts labels internally, so
            # combining that with pre-shifted labels would misalign the loss.
            inputs, targets = batch[:, :-1], batch[:, 1:]
            attention_mask = (inputs != train_dataset.tokenizer.pad_token_id).long()
            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            # reshape (not view): the sliced targets tensor is non-contiguous.
            loss = criterion(outputs.logits.reshape(-1, outputs.logits.size(-1)),
                             targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')

    model.save_pretrained('finetuned_model')
    # Save the tokenizer too: the added <PAD> token must travel with the resized weights.
    train_dataset.tokenizer.save_pretrained('finetuned_model')
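

# After training, the checkpoint in 'finetuned_model' can be reloaded for inference.
# A minimal sketch: generate_sample, the prompt string, and the sampling settings
# below are illustrative additions, not part of the original script.
def generate_sample(prompt='Once upon a time', max_new_tokens=50):
    tokenizer = GPT2Tokenizer.from_pretrained('finetuned_model')
    model = GPT2LMHeadModel.from_pretrained('finetuned_model')
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,   # sample instead of greedy decoding
            top_p=0.9,        # nucleus sampling
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)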


if __name__ == '__main__':
    train()