shunxing1234 committed
Commit d334358 (1 parent: 3d19935)
Update configuration_aquila.py
configuration_aquila.py CHANGED (+15 -0)
@@ -83,6 +83,7 @@ class AquilaConfig(PretrainedConfig):
         intermediate_size=11008,
         num_hidden_layers=32,
         num_attention_heads=32,
+        num_key_value_heads=None,
         hidden_act="silu",
         max_position_embeddings=2048,
         initializer_range=0.02,
@@ -91,7 +92,10 @@ class AquilaConfig(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
+        pretraining_tp=1,
         tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -99,11 +103,22 @@ class AquilaConfig(PretrainedConfig):
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+
         self.num_attention_heads = num_attention_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
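For context, a minimal sketch of how the new fields behave once this change is applied. The import path is an assumption (whatever local path exposes configuration_aquila.py); the defaults and the fallback behavior shown are taken from the diff above.

# Illustrative sketch only; assumes configuration_aquila.py is importable
# from the working directory (the import path is not part of this commit).
from configuration_aquila import AquilaConfig

# Leaving num_key_value_heads unset triggers the backward-compatibility
# branch added above: it falls back to num_attention_heads.
cfg = AquilaConfig()
assert cfg.num_key_value_heads == cfg.num_attention_heads  # 32 == 32

# Explicitly using fewer key/value heads than attention heads, plus the
# new RoPE and tensor-parallel fields introduced by this commit.
cfg_gqa = AquilaConfig(
    num_attention_heads=32,
    num_key_value_heads=8,
    rope_theta=10000.0,
    rope_scaling=None,  # left unset; this file does not define a scaling-dict format
    pretraining_tp=1,
)
print(cfg_gqa.num_key_value_heads)  # 8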