poonehmousavi's picture
Update custom.py
4483827
raw
history blame contribute delete
No virus
3.16 kB
"""This lobe enables the integration of huggingface pretrained GPT2LMHeadModel model plus the expanding embedding layer for additional tokens like BOS, EOS and Speakers .
The HuggingFace `transformers` library needs to be installed:
https://huggingface.co/transformers/installation.html
Authors
* Pooneh Mousavi 2023
"""
import logging
from torch import Tensor
import torch
import torch.nn as nn
from speechbrain.lobes.models.huggingface_transformers.gpt import GPT
try:
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
except ImportError:
MSG = "Please install transformers from HuggingFace to use GPT2\n"
MSG += "E.G. run: pip install transformers"
raise ImportError(MSG)
logger = logging.getLogger(__name__)
class HuggingFaceGPT_expanded(GPT):
    """HuggingFace pretrained GPT model with an expanded embedding layer.

    On top of the parent ``GPT`` lobe, this class loads the matching
    ``GPT2Tokenizer``, registers the special tokens ``BOS``, ``EOS``,
    ``SPK_1`` and ``SPK_2``, and resizes the model token embeddings so that
    the newly added tokens have an embedding row.

    Source paper:
        https://life-extension.github.io/2020/05/27/GPT%E6%8A%80%E6%9C%AF%E5%88%9D%E6%8E%A2/language-models.pdf
    The HuggingFace ``transformers`` library needs to be installed:
        https://huggingface.co/transformers/installation.html

    The model can be finetuned. It will download automatically the model from
    HuggingFace or use a local path.

    Arguments
    ---------
    *args, **kwrds
        Forwarded unchanged to ``GPT.__init__``. The arguments below are the
        ones this file documents for the parent class.
    source : str
        HuggingFace hub name: e.g "gpt2". Also used to load the tokenizer.
        May be given by keyword or as the first positional argument.
    save_path : str
        Path (dir) of the downloaded model.
    freeze : bool (default: False)
        If True, the model is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.

    Example
    -------
    >>> model_hub = "gpt2"
    >>> save_path = "savedir"
    >>> model = HuggingFaceGPT_expanded(model_hub, save_path)
    >>> tokens = torch.tensor([[1, 1]])
    >>> tokens_type = torch.tensor([[1, 1]])
    >>> attention_mask = torch.tensor([[1, 1]])
    >>> outputs = model(tokens, tokens_type, attention_mask)
    """

    def __init__(self, *args, **kwrds) -> None:
        super().__init__(*args, **kwrds)

        # `source` may arrive by keyword or positionally (the docstring
        # example passes it positionally), so look in both places.
        # NOTE(review): assumes `source` is the first positional parameter
        # of GPT.__init__ -- confirm against the parent signature.
        if "source" in kwrds:
            source = kwrds["source"]
        elif args:
            source = args[0]
        else:
            # Same exception type the original raised when the key was absent.
            raise KeyError("source")

        # Load the tokenizer that matches the pretrained model.
        self.tokenizer = GPT2Tokenizer.from_pretrained(source, pad_token=None)

        # Special tokens marking sequence boundaries and the two speakers.
        bos_token = "BOS"
        eos_token = "EOS"
        system_token = "SPK_1"
        user_token = "SPK_2"

        attr_to_special_tokens = {
            "bos_token": bos_token,
            "eos_token": eos_token,
            "additional_special_tokens": [system_token, user_token],
        }

        # Register the tokens and grow the model embeddings accordingly.
        self.add_special_tokens_(attr_to_special_tokens)

    def add_special_tokens_(self, attr_to_special_token,) -> None:
        """Add special tokens to the tokenizer and resize the embeddings.

        Arguments
        ---------
        attr_to_special_token : dict
            Mapping passed to ``tokenizer.add_special_tokens``; here it holds
            the keys ``bos_token``, ``eos_token`` and
            ``additional_special_tokens``.
        """
        # Tokens that are already present are not added again.
        num_added_tokens = self.tokenizer.add_special_tokens(
            attr_to_special_token  # type: ignore
        )
        if num_added_tokens > 0:
            # Resize to the tokenizer's full size (base vocabulary plus every
            # added token). Using `len(tokenizer.encoder) + num_added_tokens`
            # would under-count when the tokenizer already carried added
            # tokens, since `encoder` only holds the base vocabulary.
            self.model.resize_token_embeddings(
                new_num_tokens=len(self.tokenizer)
            )