runningSnail
committed on
Commit
·
1e57a45
1
Parent(s):
6ad73d9
registration works
Browse files
- configuration_dolphin.py +6 -123
- modeling_dolphin.py +20 -22
configuration_dolphin.py
CHANGED
@@ -84,8 +84,8 @@ class DolphinConfig(PretrainedConfig):
|
|
84 |
|
85 |
def __init__(
|
86 |
self,
|
87 |
-
vocab_size=
|
88 |
-
hidden_size=
|
89 |
intermediate_size=22016,
|
90 |
num_hidden_layers=32,
|
91 |
num_attention_heads=32,
|
@@ -133,7 +133,7 @@ class DolphinConfig(PretrainedConfig):
|
|
133 |
)
|
134 |
|
135 |
encoder_config_dict = {
|
136 |
-
"_name_or_path": "
|
137 |
"add_cross_attention": False,
|
138 |
"architectures": ["Qwen2ForCausalLM"],
|
139 |
"attention_dropout": 0.0,
|
@@ -208,123 +208,6 @@ encoder_config_dict = {
|
|
208 |
"attn_implementation": None,
|
209 |
}
|
210 |
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
|
215 |
-
Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
216 |
-
with the defaults will yield a similar configuration to that of
|
217 |
-
Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
|
218 |
-
|
219 |
-
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
220 |
-
documentation from [`PretrainedConfig`] for more information.
|
221 |
-
|
222 |
-
|
223 |
-
Args:
|
224 |
-
vocab_size (`int`, *optional*, defaults to 151936):
|
225 |
-
Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
|
226 |
-
`inputs_ids` passed when calling [`Qwen2Model`]
|
227 |
-
hidden_size (`int`, *optional*, defaults to 4096):
|
228 |
-
Dimension of the hidden representations.
|
229 |
-
intermediate_size (`int`, *optional*, defaults to 22016):
|
230 |
-
Dimension of the MLP representations.
|
231 |
-
num_hidden_layers (`int`, *optional*, defaults to 32):
|
232 |
-
Number of hidden layers in the Transformer encoder.
|
233 |
-
num_attention_heads (`int`, *optional*, defaults to 32):
|
234 |
-
Number of attention heads for each attention layer in the Transformer encoder.
|
235 |
-
num_key_value_heads (`int`, *optional*, defaults to 32):
|
236 |
-
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
237 |
-
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
238 |
-
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
239 |
-
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
240 |
-
by meanpooling all the original heads within that group. For more details checkout [this
|
241 |
-
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
242 |
-
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
243 |
-
The non-linear activation function (function or string) in the decoder.
|
244 |
-
max_position_embeddings (`int`, *optional*, defaults to 32768):
|
245 |
-
The maximum sequence length that this model might ever be used with.
|
246 |
-
initializer_range (`float`, *optional*, defaults to 0.02):
|
247 |
-
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
248 |
-
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
249 |
-
The epsilon used by the rms normalization layers.
|
250 |
-
use_cache (`bool`, *optional*, defaults to `True`):
|
251 |
-
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
252 |
-
relevant if `config.is_decoder=True`.
|
253 |
-
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
254 |
-
Whether the model's input and output word embeddings should be tied.
|
255 |
-
rope_theta (`float`, *optional*, defaults to 10000.0):
|
256 |
-
The base period of the RoPE embeddings.
|
257 |
-
use_sliding_window (`bool`, *optional*, defaults to `False`):
|
258 |
-
Whether to use sliding window attention.
|
259 |
-
sliding_window (`int`, *optional*, defaults to 4096):
|
260 |
-
Sliding window attention (SWA) window size. If not specified, will default to `4096`.
|
261 |
-
max_window_layers (`int`, *optional*, defaults to 28):
|
262 |
-
The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
|
263 |
-
attention_dropout (`float`, *optional*, defaults to 0.0):
|
264 |
-
The dropout ratio for the attention probabilities.
|
265 |
-
|
266 |
-
```python
|
267 |
-
>>> from transformers import Qwen2Model, Qwen2Config
|
268 |
-
|
269 |
-
>>> # Initializing a Qwen2 style configuration
|
270 |
-
>>> configuration = Qwen2Config()
|
271 |
-
|
272 |
-
>>> # Initializing a model from the Qwen2-7B style configuration
|
273 |
-
>>> model = Qwen2Model(configuration)
|
274 |
-
|
275 |
-
>>> # Accessing the model configuration
|
276 |
-
>>> configuration = model.config
|
277 |
-
```"""
|
278 |
-
|
279 |
-
model_type = "qwen2"
|
280 |
-
keys_to_ignore_at_inference = ["past_key_values"]
|
281 |
-
|
282 |
-
def __init__(
|
283 |
-
self,
|
284 |
-
vocab_size=151936,
|
285 |
-
hidden_size=4096,
|
286 |
-
intermediate_size=22016,
|
287 |
-
num_hidden_layers=32,
|
288 |
-
num_attention_heads=32,
|
289 |
-
num_key_value_heads=32,
|
290 |
-
hidden_act="silu",
|
291 |
-
max_position_embeddings=32768,
|
292 |
-
initializer_range=0.02,
|
293 |
-
rms_norm_eps=1e-6,
|
294 |
-
use_cache=True,
|
295 |
-
tie_word_embeddings=False,
|
296 |
-
rope_theta=10000.0,
|
297 |
-
use_sliding_window=False,
|
298 |
-
sliding_window=4096,
|
299 |
-
max_window_layers=28,
|
300 |
-
attention_dropout=0.0,
|
301 |
-
encoder_config=None,
|
302 |
-
**kwargs,
|
303 |
-
):
|
304 |
-
self.vocab_size = vocab_size
|
305 |
-
self.max_position_embeddings = max_position_embeddings
|
306 |
-
self.hidden_size = hidden_size
|
307 |
-
self.intermediate_size = intermediate_size
|
308 |
-
self.num_hidden_layers = num_hidden_layers
|
309 |
-
self.num_attention_heads = num_attention_heads
|
310 |
-
self.use_sliding_window = use_sliding_window
|
311 |
-
self.sliding_window = sliding_window
|
312 |
-
self.max_window_layers = max_window_layers
|
313 |
-
|
314 |
-
# for backward compatibility
|
315 |
-
if num_key_value_heads is None:
|
316 |
-
num_key_value_heads = num_attention_heads
|
317 |
-
|
318 |
-
self.num_key_value_heads = num_key_value_heads
|
319 |
-
self.hidden_act = hidden_act
|
320 |
-
self.initializer_range = initializer_range
|
321 |
-
self.rms_norm_eps = rms_norm_eps
|
322 |
-
self.use_cache = use_cache
|
323 |
-
self.rope_theta = rope_theta
|
324 |
-
self.attention_dropout = attention_dropout
|
325 |
-
self.encoder_config = encoder_config
|
326 |
-
|
327 |
-
super().__init__(
|
328 |
-
tie_word_embeddings=tie_word_embeddings,
|
329 |
-
**kwargs,
|
330 |
-
)
|
|
|
84 |
|
85 |
def __init__(
|
86 |
self,
|
87 |
+
vocab_size=152064, # Updated to match the checkpoint
|
88 |
+
hidden_size=3584, # Updated to match the checkpoint
|
89 |
intermediate_size=22016,
|
90 |
num_hidden_layers=32,
|
91 |
num_attention_heads=32,
|
|
|
133 |
)
|
134 |
|
135 |
encoder_config_dict = {
|
136 |
+
"_name_or_path": "Qwen/Qwen2-0.5B",
|
137 |
"add_cross_attention": False,
|
138 |
"architectures": ["Qwen2ForCausalLM"],
|
139 |
"attention_dropout": 0.0,
|
|
|
208 |
"attn_implementation": None,
|
209 |
}
|
210 |
|
211 |
+
if __name__ == "__main__":
|
212 |
+
config = DolphinConfig(encoder_config=encoder_config_dict)
|
213 |
+
config.save_pretrained("dolphin-config")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modeling_dolphin.py
CHANGED
@@ -12,7 +12,7 @@ from transformers.models.qwen2.modeling_qwen2 import (
|
|
12 |
Qwen2PreTrainedModel, Qwen2Model, Qwen2RMSNorm
|
13 |
)
|
14 |
from transformers.modeling_attn_mask_utils import (
|
15 |
-
AttentionMaskConverter
|
16 |
)
|
17 |
from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
|
18 |
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
|
@@ -186,7 +186,7 @@ class DolphinModel(Qwen2PreTrainedModel):
|
|
186 |
Args:
|
187 |
config: DolphinModel
|
188 |
"""
|
189 |
-
config_class = DolphinConfig
|
190 |
|
191 |
def __init__(self, config: DolphinConfig):
|
192 |
super().__init__(config)
|
@@ -732,33 +732,30 @@ class DolphinForCausalLM(Qwen2PreTrainedModel):
|
|
732 |
)
|
733 |
return reordered_past
|
734 |
|
735 |
-
|
736 |
-
def inference_instruct(mycontext, device
|
737 |
import time
|
738 |
-
|
|
|
739 |
generated_token_ids = []
|
740 |
-
prompt = " <context>
|
741 |
-
print("input prompt: " + prompt)
|
742 |
-
print("input context: " + mycontext)
|
743 |
text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
|
744 |
input_ids = (
|
745 |
-
torch.tensor(
|
|
|
|
|
746 |
.unsqueeze(0)
|
747 |
.to(device)
|
748 |
)
|
749 |
-
# print(input_ids)
|
750 |
# to process the context
|
751 |
context_tokenized = tokenizer(
|
752 |
mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
|
753 |
return_tensors="pt",
|
754 |
)
|
755 |
context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
|
756 |
-
|
757 |
-
context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
|
758 |
-
print("length of context: " + str(context_token_count) + " tokens")
|
759 |
# We conduct a inference process
|
760 |
for i in range(context_token_count):
|
761 |
-
print(f"\rGenerating token {i+1}/{context_token_count}", end="")
|
762 |
next_token = (
|
763 |
model(
|
764 |
input_ids,
|
@@ -772,23 +769,24 @@ def inference_instruct(mycontext, device = "cuda:0"):
|
|
772 |
break
|
773 |
generated_token_ids.append(next_token.item())
|
774 |
input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
|
775 |
-
|
776 |
-
|
777 |
-
|
778 |
|
779 |
|
780 |
if __name__ == "__main__":
|
781 |
# Register your configuration and model
|
782 |
AutoConfig.register("dolphin", DolphinConfig)
|
783 |
AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
|
|
|
784 |
|
785 |
# Load the tokenizer and model
|
786 |
tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
|
787 |
-
model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
|
788 |
|
789 |
# Run inference example
|
790 |
mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
|
791 |
-
|
792 |
-
|
793 |
-
inference_instruct(mycontext,
|
794 |
-
|
|
|
12 |
Qwen2PreTrainedModel, Qwen2Model, Qwen2RMSNorm
|
13 |
)
|
14 |
from transformers.modeling_attn_mask_utils import (
|
15 |
+
AttentionMaskConverter
|
16 |
)
|
17 |
from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
|
18 |
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
|
|
|
186 |
Args:
|
187 |
config: DolphinModel
|
188 |
"""
|
189 |
+
# config_class = DolphinConfig
|
190 |
|
191 |
def __init__(self, config: DolphinConfig):
|
192 |
super().__init__(config)
|
|
|
732 |
)
|
733 |
return reordered_past
|
734 |
|
735 |
+
|
736 |
+
def inference_instruct(mycontext, question, device="cuda:0"):
|
737 |
import time
|
738 |
+
MEMORY_SIZE = 32
|
739 |
+
start_time = time.time()
|
740 |
generated_token_ids = []
|
741 |
+
prompt = f" <context>{question}"
|
|
|
|
|
742 |
text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
|
743 |
input_ids = (
|
744 |
+
torch.tensor(
|
745 |
+
text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
|
746 |
+
)
|
747 |
.unsqueeze(0)
|
748 |
.to(device)
|
749 |
)
|
|
|
750 |
# to process the context
|
751 |
context_tokenized = tokenizer(
|
752 |
mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
|
753 |
return_tensors="pt",
|
754 |
)
|
755 |
context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
|
756 |
+
context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
|
|
|
|
|
757 |
# We conduct a inference process
|
758 |
for i in range(context_token_count):
|
|
|
759 |
next_token = (
|
760 |
model(
|
761 |
input_ids,
|
|
|
769 |
break
|
770 |
generated_token_ids.append(next_token.item())
|
771 |
input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
|
772 |
+
result = tokenizer.decode(generated_token_ids)
|
773 |
+
print(f"Time taken: {time.time() - start_time}")
|
774 |
+
return result
|
775 |
|
776 |
|
777 |
if __name__ == "__main__":
|
778 |
# Register your configuration and model
|
779 |
AutoConfig.register("dolphin", DolphinConfig)
|
780 |
AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
|
781 |
+
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
|
782 |
|
783 |
# Load the tokenizer and model
|
784 |
tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
|
785 |
+
model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda:0")
|
786 |
|
787 |
# Run inference example
|
788 |
mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
|
789 |
+
question = "Who founded Nexa AI?"
|
790 |
+
# Pass the context and the correct device string
|
791 |
+
result = inference_instruct(mycontext, question, device=device_name)
|
792 |
+
print("Result:", result)
|