""" HelpingAI model configuration"""

from transformers import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class HelpingAIConfig(PretrainedConfig):
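    """
    Configuration class for a HelpingAI model.

    Stores the hyperparameters needed to instantiate the model: vocabulary and
    hidden sizes, attention geometry, mixture-of-experts routing settings
    (`num_local_experts`, `num_experts_per_tok`), dropout rates, and rotary
    position-embedding options, including the optional `rope_scaling` dict.
    """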
    model_type = "HelpingAI"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50281,
        hidden_size=2560,
        num_hidden_layers=32,
        num_attention_heads=32,
        head_dim=256,
        num_local_experts=8,
        num_experts_per_tok=2,
        intermediate_size=6912,
        hidden_act="silu",
        hidden_dropout=0.0,
        attention_dropout=0.0,
        classifier_dropout=0.1,
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        layer_norm_eps=1e-5,
        use_cache=False,
        bos_token_id=50278,
        eos_token_id=50279,
        pad_token_id=50279,
        tie_word_embeddings=False,
        rope_pct=0.25,
        rope_theta=10000,
        partial_rotary_factor=0.25,
        use_qkv_bias=False,
        output_router_logits=False,
        router_aux_loss_coef=0.02,
        rope_scaling=None,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_local_experts = num_local_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.classifier_dropout = classifier_dropout
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings
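        # Note: `rope_pct` and `partial_rotary_factor` both describe the fraction
        # of each head's dimensions that receive rotary embeddings (0.25 by
        # default); `rope_pct` appears to be retained for compatibility with
        # older configs.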
        self.rope_pct = rope_pct
        self.rope_theta = rope_theta
        self.partial_rotary_factor = partial_rotary_factor
        self.use_qkv_bias = use_qkv_bias
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"`hidden_size` ({self.hidden_size}) must be divisible by "
                f"`num_attention_heads` ({self.num_attention_heads})."
            )

    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")