pip install transformers datasets torch

from datasets import load_dataset

Load your custom dataset (plain-text files: each line of train.txt / test.txt becomes one example in the 'text' column)

# Map each split name to the text file that provides it.
dataset = load_dataset(
    'Hamses/EU_Regulation_261_2004',
    data_files={
        'train': 'train.txt',
        'test': 'test.txt',
    },
)

Load the GPT-2 tokenizer

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse the end-of-text token so that
# padding='max_length' during preprocessing does not raise.
tokenizer.pad_token = tokenizer.eos_token

Preprocess the dataset

def preprocess_function(examples):
    """Tokenize a batch of examples and attach causal-LM labels.

    Args:
        examples: Batch mapping supplied by ``dataset.map(..., batched=True)``;
            its ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output (padded to the maximum length and truncated)
        with an added ``'labels'`` entry. Labels copy ``input_ids`` except at
        padded positions, which are set to -100 so the loss ignores them.

    Note:
        ``padding='max_length'`` requires ``tokenizer.pad_token`` to be set;
        the stock GPT-2 tokenizer does not define one.
    """
    encoded = tokenizer(examples['text'], padding='max_length', truncation=True)
    # Trainer only gets a loss from the model when the batch carries labels.
    # For causal LM the labels are the inputs themselves (the model shifts
    # them internally); padding is masked out via the attention mask.
    encoded['labels'] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded


encoded_dataset = dataset.map(preprocess_function, batched=True)

from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

Load the GPT-2 model

# Pretrained 'gpt2' checkpoint with a language-modeling head; this is the
# model object that is fine-tuned and saved below.
model = GPT2LMHeadModel.from_pretrained('gpt2')

Define training arguments

training_args = TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and regularization.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

Initialize the Trainer

# Wire the model, its training configuration and both tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)

Train the model

# Fine-tune `model` on encoded_dataset['train'] using `training_args`.
trainer.train()

Evaluate the model

# Evaluate on the eval_dataset given to the Trainer and show the metrics.
results = trainer.evaluate()
print(results)

Save the model

# Store the model and tokenizer side by side so the directory can be
# reloaded later with from_pretrained('./gpt2-finetuned').
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')

Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train Hamses/EU_Regulation_261_2004