In [2]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import warnings
warnings.filterwarnings('ignore')

In [4]:
# function to read different file types:
def read_file(filepath):
    with open(filepath, 'r') as f:
        text = f.read()
    return text

text_file = read_file('object_detection.txt')

# remove access new lines:
text_file = re.sub(r'\n+','\n', text_file).strip()

#split the text into training and validation
train_fraction = 0.8
split_index = int(train_fraction * len(text_file))

train_text = text_file[:split_index]
val_text = text_file[split_index:]

In [5]:
train_text

"# Mask R-CNN for Object Detection and Segmentation\nThis is an implementation of [Mask R-CNN](https://arxiv.org/abs/1703.06870) on Python 3, Keras, and TensorFlow. The model generates bounding boxes and segmentation masks for each instance of an object in the image. It's based on Feature Pyramid Network (FPN) and a ResNet101 backbone.\n![Instance Segmentation Sample](assets/street.png)\nThe repository includes:\n* Source code of Mask R-CNN built on FPN and ResNet101.\n* Training code for MS COCO\n* Pre-trained weights for MS COCO\n* Jupyter notebooks to visualize the detection pipeline at every step\n* ParallelModel class for multi-GPU training\n* Evaluation on MS COCO metrics (AP)\n* Example of training on your own dataset\nThe code is documented and designed to be easy to extend. If you use it in your research, please consider citing this repository (bibtex below). If you work on 3D vision, you might find our recently released [Matterport3D](https://matterport.com/blog/2017/09/20/an

In [6]:
with open("train.txt", "w") as f:
    f.write(train_text)

with open("val.txt", "w") as f:
    f.write(val_text)

In [7]:
## Load pre- trained Tokenizer GP2Tokenizer
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
# Tokenize train data
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train.txt',
    block_size=128
)

# Tokenize val data
val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='val.txt',
    block_size=128
)
print(f'Train dataset length: {len(train_dataset)}; Val dataset length: {len(val_dataset)}')

Train dataset length: 23; Val dataset length: 6


In [9]:
# Data collator : used for creating batch using a list of dataset elements as inputs, apply processings like padding, random data augmentation

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, return_tensors = 'pt'
)
# tokenizer is created earlier
# loading GPT2 model transformer with language modelling head top
model = GPT2LMHeadModel.from_pretrained(checkpoint)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
# prompt: how upload the trained model 'trainer' on hugging face

from huggingface_hub import notebook_login

notebook_login()

trainer.push_to_hub("saman.khan.stats@gmail.com/GPT2-fineTuned")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sam1995/gpt_model/commit/18b9da2dcc172cb5ac13b9c1272988fd903bd5e0', commit_message='saman.khan.stats@gmail.com/GPT2-fineTuned', commit_description='', oid='18b9da2dcc172cb5ac13b9c1272988fd903bd5e0', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# Fine Tuning the model
model_output_path = '/content/gpt_model'

training_args = TrainingArguments(
    output_dir = model_output_path,
    overwrite_output_dir = True,
    per_device_train_batch_size = 4, # try with 2
    per_device_eval_batch_size = 4,  #  try with 2
    num_train_epochs = 100,
    save_steps = 1_000,
    save_total_limit = 2,
    logging_dir = './logs',
    )

# Train the model
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
)

trainer.train()

# Save the model
trainer.save_model(model_output_path)

# Save the tokenizer
tokenizer.save_pretrained(model_output_path)

Step,Training Loss
500,0.3705


('/content/gpt_model/tokenizer_config.json',
 '/content/gpt_model/special_tokens_map.json',
 '/content/gpt_model/vocab.json',
 '/content/gpt_model/merges.txt',
 '/content/gpt_model/added_tokens.json')

In [17]:
# use Gradio App to take input and generate output:

def generate_response(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        early_stopping=True
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response



In [20]:

import gradio as gr

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained(model_output_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

iface = gr.Interface(
    fn=lambda prompt: generate_response(model, tokenizer, prompt, 250),
    inputs = "textbox",
    outputs="text",
    title="GPT-2 Fine-Tuned Text Generator",
    description="Enter a prompt and the model will generate text based on your input."
)

iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fc087f7820fdecd3b1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


