""" Transnormer configuration""" |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.utils import logging |
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class TransnormerConfig(PretrainedConfig):
    """Configuration class for Transnormer models.

    Stores the hyperparameters used to instantiate a Transnormer decoder,
    such as vocabulary size, embedding dimension, number of decoder layers
    and attention heads, normalization type, and GLU settings. Inherits the
    standard serialization utilities from `PretrainedConfig`.
    """

    model_type = "transnormer"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        # Special token ids.
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        # Vocabulary, caching, and weight initialization.
        vocab_size=64000,
        use_cache=True,
        init_std=0.02,
        # Decoder architecture.
        decoder_embed_dim=1024,
        decoder_layers=24,
        decoder_attention_heads=8,
        no_scale_embedding=False,
        add_bos_token=False,
        norm_type="simplermsnorm",
        # Use None instead of a mutable default list; replaced by [] below.
        linear_use_lrpe_list=None,
        hidden_dim=1024,
        linear_act_fun="silu",
        glu_dim=2816,
        bias=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Vocabulary, caching, and initialization settings.
        self.vocab_size = vocab_size
        self.use_cache = use_cache
        self.init_std = init_std

        # Decoder architecture.
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.no_scale_embedding = no_scale_embedding
        self.add_bos_token = add_bos_token
        self.norm_type = norm_type
        # Avoid sharing a mutable default list across config instances.
        self.linear_use_lrpe_list = (
            linear_use_lrpe_list if linear_use_lrpe_list is not None else []
        )
        self.hidden_dim = hidden_dim
        self.linear_act_fun = linear_act_fun
        self.glu_dim = glu_dim
        self.bias = bias
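

# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal example, assuming only the config class defined above: build the
# default configuration and a smaller variant, then serialize it to JSON.
# Because TransnormerConfig subclasses PretrainedConfig, save_pretrained()
# and from_pretrained() are also available for round-tripping to disk.
if __name__ == "__main__":
    # Default hyperparameters.
    config = TransnormerConfig()
    print(config.decoder_embed_dim, config.decoder_layers)

    # A hypothetical smaller variant; these values are illustrative and do
    # not correspond to an official released model size.
    small_config = TransnormerConfig(
        decoder_embed_dim=512,
        decoder_layers=12,
        decoder_attention_heads=4,
        hidden_dim=512,
        glu_dim=1408,
    )
    print(small_config.to_json_string())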