# Source path: ChatGLM-6B-Int4-API-OpenAI-Compatible/models/models--silver--chatglm-6b-int4-slim/snapshots/02e096b3805c579caf5741a6d8eddd5ba7a74e0d/configuration_chatglm.py
"""ChatGLM model configuration."""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

# Module-level logger obtained from the transformers logging utilities,
# keyed on this module's name.
logger = logging.get_logger(__name__)
class ChatGLMConfig(PretrainedConfig):
    r"""
    Configuration container for a [`~ChatGLMModel`].

    An instance holds the hyper-parameters that define a ChatGLM architecture. Calling the constructor with no
    arguments yields a configuration similar to that of
    [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b).

    The class derives from [`PretrainedConfig`]; see the documentation of [`PretrainedConfig`] for the behaviour
    it inherits.

    Args:
        vocab_size (`int`, *optional*, defaults to 130528):
            Vocabulary size of the ChatGLM-6B model, i.e. the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`~ChatGLMModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the encoder layers and the pooler layer.
        num_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether the model should return the last key/values attentions (not used by all models).
        bos_token_id (`int`, *optional*, defaults to 130004):
            Beginning-of-sequence token id. Stored on the instance and forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 130005):
            End-of-sequence token id. Stored on the instance and forwarded to [`PretrainedConfig`].
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id. Stored on the instance and forwarded to [`PretrainedConfig`].
        max_sequence_length (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        inner_hidden_size (`int`, *optional*, defaults to 16384):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        position_encoding_2d (`bool`, *optional*, defaults to `True`):
            Stored unchanged on the configuration. Presumably toggles the model's 2D position encoding; the
            consuming code is not part of this module, so confirm against the model implementation.
        quantization_bit (`int`, *optional*, defaults to 0):
            Stored unchanged on the configuration. Presumably the weight quantization bit width, with 0 meaning
            "not quantized"; confirm against the model implementation.
        quantization_embeddings (`bool`, *optional*, defaults to `False`):
            Stored unchanged on the configuration. Presumably controls whether embeddings are quantized as well;
            confirm against the model implementation.
        **kwargs:
            Any remaining keyword arguments are passed through to [`PretrainedConfig`].

    Example:

    ```python
    >>> from configuration_chatglm import ChatGLMConfig
    >>> from modeling_chatglm import ChatGLMModel

    >>> # Build a configuration with the ChatGLM-6B style defaults
    >>> configuration = ChatGLMConfig()

    >>> # Build a model from that configuration
    >>> model = ChatGLMModel(configuration)

    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```
    """

    model_type = "chatglm"

    def __init__(
        self,
        vocab_size=130528,
        hidden_size=4096,
        num_layers=28,
        num_attention_heads=32,
        layernorm_epsilon=1e-5,
        use_cache=False,
        bos_token_id=130004,
        eos_token_id=130005,
        pad_token_id=0,
        max_sequence_length=2048,
        inner_hidden_size=16384,
        position_encoding_2d=True,
        quantization_bit=0,
        quantization_embeddings=False,
        **kwargs
    ):
        # (attribute name, value) pairs, listed in the exact order in which
        # they are set on the instance. All of them are set before the
        # base-class initializer runs.
        instance_fields = (
            ("num_layers", num_layers),
            ("vocab_size", vocab_size),
            ("hidden_size", hidden_size),
            ("num_attention_heads", num_attention_heads),
            ("max_sequence_length", max_sequence_length),
            ("layernorm_epsilon", layernorm_epsilon),
            ("inner_hidden_size", inner_hidden_size),
            ("use_cache", use_cache),
            ("bos_token_id", bos_token_id),
            ("eos_token_id", eos_token_id),
            ("pad_token_id", pad_token_id),
            ("position_encoding_2d", position_encoding_2d),
            ("quantization_bit", quantization_bit),
            ("quantization_embeddings", quantization_embeddings),
        )
        for field_name, field_value in instance_fields:
            setattr(self, field_name, field_value)

        # The special token ids are additionally handed to the base class,
        # together with whatever extra keyword arguments the caller supplied.
        special_token_ids = {
            "pad_token_id": pad_token_id,
            "bos_token_id": bos_token_id,
            "eos_token_id": eos_token_id,
        }
        super().__init__(**special_token_ids, **kwargs)