Qwen-7B-Chat-GPTQ / configuration_qwen.py
basiliskinstitute's picture
Upload 14 files
1edc3a9
raw
history blame
2.42 kB
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from transformers import PretrainedConfig
class QWenConfig(PretrainedConfig):
model_type = "qwen"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"max_position_embeddings": "n_positions",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=151851,
n_embd=4096,
n_layer=32,
n_head=32,
n_inner=None,
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
eos_token_id=151643,
apply_residual_connection_post_layernorm=False,
bf16=False,
fp16=False,
fp32=False,
kv_channels=128,
rotary_pct=1.0,
rotary_emb_base=10000,
use_dynamic_ntk=False,
use_logn_attn=False,
use_flash_attn=True,
ffn_hidden_size=22016,
no_bias=True,
tie_word_embeddings=False,
**kwargs,
):
self.eos_token_id = eos_token_id
super().__init__(
eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
)
self.vocab_size = vocab_size
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.apply_residual_connection_post_layernorm = (
apply_residual_connection_post_layernorm
)
self.bf16 = bf16
self.fp16 = fp16
self.fp32 = fp32
self.kv_channels = kv_channels
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.use_dynamic_ntk = use_dynamic_ntk
self.use_logn_attn = use_logn_attn
self.use_flash_attn = use_flash_attn
self.ffn_hidden_size = ffn_hidden_size
self.no_bias = no_bias
self.tie_word_embeddings = tie_word_embeddings