Update modeling_qwen.py
#1
by takatosp1 · opened
- modeling_qwen.py +7 -7
modeling_qwen.py CHANGED
@@ -154,7 +154,7 @@ class QWenAttention(nn.Module):
             if self.rotary_ndims is not None
             else self.hidden_size_per_attention_head
         )
-        self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
+        self.rotary_emb = QWenRotaryEmbedding(dim, base=config.rotary_emb_base)
 
         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.use_logn_attn = config.use_logn_attn
@@ -386,12 +386,12 @@ class QWenBlock(nn.Module):
         hidden_size = config.hidden_size
         self.bf16 = config.bf16
 
-        self.ln_1 = RMSNorm(
+        self.ln_1 = QWenRMSNorm(
             hidden_size,
             eps=config.layer_norm_epsilon,
         )
         self.attn = QWenAttention(config)
-        self.ln_2 = RMSNorm(
+        self.ln_2 = QWenRMSNorm(
             hidden_size,
             eps=config.layer_norm_epsilon,
         )
@@ -460,7 +460,7 @@ class QWenPreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, RMSNorm):
+        elif isinstance(module, QWenRMSNorm):
             module.weight.data.fill_(1.0)
 
         for name, p in module.named_parameters():
@@ -500,7 +500,7 @@ class QWenModel(QWenPreTrainedModel):
                 for i in range(config.num_hidden_layers)
             ]
         )
-        self.ln_f = RMSNorm(
+        self.ln_f = QWenRMSNorm(
             self.embed_dim,
             eps=config.layer_norm_epsilon,
         )
@@ -1041,7 +1041,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         )
 
 
-class RotaryEmbedding(torch.nn.Module):
+class QWenRotaryEmbedding(torch.nn.Module):
     def __init__(self, dim, base=10000):
         super().__init__()
         self.dim = dim
@@ -1104,7 +1104,7 @@ def apply_rotary_pos_emb(t, freqs):
     return torch.cat((t_, t_pass_), dim=-1).type_as(t)
 
 
-class RMSNorm(torch.nn.Module):
+class QWenRMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
         super().__init__()
         self.eps = eps