runningSnail committed on
Commit
1e57a45
·
1 Parent(s): 6ad73d9

registration works

Browse files
Files changed (2) hide show
  1. configuration_dolphin.py +6 -123
  2. modeling_dolphin.py +20 -22
configuration_dolphin.py CHANGED
@@ -84,8 +84,8 @@ class DolphinConfig(PretrainedConfig):
84
 
85
  def __init__(
86
  self,
87
- vocab_size=151936,
88
- hidden_size=4096,
89
  intermediate_size=22016,
90
  num_hidden_layers=32,
91
  num_attention_heads=32,
@@ -133,7 +133,7 @@ class DolphinConfig(PretrainedConfig):
133
  )
134
 
135
  encoder_config_dict = {
136
- "_name_or_path": "alexchen4ai/Qwen2-0.5B",
137
  "add_cross_attention": False,
138
  "architectures": ["Qwen2ForCausalLM"],
139
  "attention_dropout": 0.0,
@@ -208,123 +208,6 @@ encoder_config_dict = {
208
  "attn_implementation": None,
209
  }
210
 
211
-
212
- class Qwen2Config(PretrainedConfig):
213
- r"""
214
- This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
215
- Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
216
- with the defaults will yield a similar configuration to that of
217
- Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
218
-
219
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
220
- documentation from [`PretrainedConfig`] for more information.
221
-
222
-
223
- Args:
224
- vocab_size (`int`, *optional*, defaults to 151936):
225
- Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
226
- `inputs_ids` passed when calling [`Qwen2Model`]
227
- hidden_size (`int`, *optional*, defaults to 4096):
228
- Dimension of the hidden representations.
229
- intermediate_size (`int`, *optional*, defaults to 22016):
230
- Dimension of the MLP representations.
231
- num_hidden_layers (`int`, *optional*, defaults to 32):
232
- Number of hidden layers in the Transformer encoder.
233
- num_attention_heads (`int`, *optional*, defaults to 32):
234
- Number of attention heads for each attention layer in the Transformer encoder.
235
- num_key_value_heads (`int`, *optional*, defaults to 32):
236
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
237
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
238
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
239
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
240
- by meanpooling all the original heads within that group. For more details checkout [this
241
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
242
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
243
- The non-linear activation function (function or string) in the decoder.
244
- max_position_embeddings (`int`, *optional*, defaults to 32768):
245
- The maximum sequence length that this model might ever be used with.
246
- initializer_range (`float`, *optional*, defaults to 0.02):
247
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
248
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
249
- The epsilon used by the rms normalization layers.
250
- use_cache (`bool`, *optional*, defaults to `True`):
251
- Whether or not the model should return the last key/values attentions (not used by all models). Only
252
- relevant if `config.is_decoder=True`.
253
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
254
- Whether the model's input and output word embeddings should be tied.
255
- rope_theta (`float`, *optional*, defaults to 10000.0):
256
- The base period of the RoPE embeddings.
257
- use_sliding_window (`bool`, *optional*, defaults to `False`):
258
- Whether to use sliding window attention.
259
- sliding_window (`int`, *optional*, defaults to 4096):
260
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
261
- max_window_layers (`int`, *optional*, defaults to 28):
262
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
263
- attention_dropout (`float`, *optional*, defaults to 0.0):
264
- The dropout ratio for the attention probabilities.
265
-
266
- ```python
267
- >>> from transformers import Qwen2Model, Qwen2Config
268
-
269
- >>> # Initializing a Qwen2 style configuration
270
- >>> configuration = Qwen2Config()
271
-
272
- >>> # Initializing a model from the Qwen2-7B style configuration
273
- >>> model = Qwen2Model(configuration)
274
-
275
- >>> # Accessing the model configuration
276
- >>> configuration = model.config
277
- ```"""
278
-
279
- model_type = "qwen2"
280
- keys_to_ignore_at_inference = ["past_key_values"]
281
-
282
- def __init__(
283
- self,
284
- vocab_size=151936,
285
- hidden_size=4096,
286
- intermediate_size=22016,
287
- num_hidden_layers=32,
288
- num_attention_heads=32,
289
- num_key_value_heads=32,
290
- hidden_act="silu",
291
- max_position_embeddings=32768,
292
- initializer_range=0.02,
293
- rms_norm_eps=1e-6,
294
- use_cache=True,
295
- tie_word_embeddings=False,
296
- rope_theta=10000.0,
297
- use_sliding_window=False,
298
- sliding_window=4096,
299
- max_window_layers=28,
300
- attention_dropout=0.0,
301
- encoder_config=None,
302
- **kwargs,
303
- ):
304
- self.vocab_size = vocab_size
305
- self.max_position_embeddings = max_position_embeddings
306
- self.hidden_size = hidden_size
307
- self.intermediate_size = intermediate_size
308
- self.num_hidden_layers = num_hidden_layers
309
- self.num_attention_heads = num_attention_heads
310
- self.use_sliding_window = use_sliding_window
311
- self.sliding_window = sliding_window
312
- self.max_window_layers = max_window_layers
313
-
314
- # for backward compatibility
315
- if num_key_value_heads is None:
316
- num_key_value_heads = num_attention_heads
317
-
318
- self.num_key_value_heads = num_key_value_heads
319
- self.hidden_act = hidden_act
320
- self.initializer_range = initializer_range
321
- self.rms_norm_eps = rms_norm_eps
322
- self.use_cache = use_cache
323
- self.rope_theta = rope_theta
324
- self.attention_dropout = attention_dropout
325
- self.encoder_config = encoder_config
326
-
327
- super().__init__(
328
- tie_word_embeddings=tie_word_embeddings,
329
- **kwargs,
330
- )
 
84
 
85
  def __init__(
86
  self,
87
+ vocab_size=152064, # Updated to match the checkpoint
88
+ hidden_size=3584, # Updated to match the checkpoint
89
  intermediate_size=22016,
90
  num_hidden_layers=32,
91
  num_attention_heads=32,
 
133
  )
134
 
135
  encoder_config_dict = {
136
+ "_name_or_path": "Qwen/Qwen2-0.5B",
137
  "add_cross_attention": False,
138
  "architectures": ["Qwen2ForCausalLM"],
139
  "attention_dropout": 0.0,
 
208
  "attn_implementation": None,
209
  }
210
 
211
+ if __name__ == "__main__":
212
+ config = DolphinConfig(encoder_config=encoder_config_dict)
213
+ config.save_pretrained("dolphin-config")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modeling_dolphin.py CHANGED
@@ -12,7 +12,7 @@ from transformers.models.qwen2.modeling_qwen2 import (
12
  Qwen2PreTrainedModel, Qwen2Model, Qwen2RMSNorm
13
  )
14
  from transformers.modeling_attn_mask_utils import (
15
- AttentionMaskConverter,
16
  )
17
  from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
18
  from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
@@ -186,7 +186,7 @@ class DolphinModel(Qwen2PreTrainedModel):
186
  Args:
187
  config: DolphinConfig
188
  """
189
- config_class = DolphinConfig
190
 
191
  def __init__(self, config: DolphinConfig):
192
  super().__init__(config)
@@ -732,33 +732,30 @@ class DolphinForCausalLM(Qwen2PreTrainedModel):
732
  )
733
  return reordered_past
734
 
735
- MEMORY_SIZE = 32
736
- def inference_instruct(mycontext, device = "cuda:0"):
737
  import time
738
- start = time.time()
 
739
  generated_token_ids = []
740
- prompt = " <context>Who and when founded the Shanda group?"
741
- print("input prompt: " + prompt)
742
- print("input context: " + mycontext)
743
  text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
744
  input_ids = (
745
- torch.tensor(text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long)
 
 
746
  .unsqueeze(0)
747
  .to(device)
748
  )
749
- # print(input_ids)
750
  # to process the context
751
  context_tokenized = tokenizer(
752
  mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
753
  return_tensors="pt",
754
  )
755
  context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
756
- # print(context_tokenized["input_ids"])
757
- context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
758
- print("length of context: " + str(context_token_count) + " tokens")
759
  # We conduct an inference process
760
  for i in range(context_token_count):
761
- print(f"\rGenerating token {i+1}/{context_token_count}", end="")
762
  next_token = (
763
  model(
764
  input_ids,
@@ -772,23 +769,24 @@ def inference_instruct(mycontext, device = "cuda:0"):
772
  break
773
  generated_token_ids.append(next_token.item())
774
  input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
775
- print("\noutput: " + tokenizer.decode(generated_token_ids))
776
- end = time.time()
777
- print(f"Elapsed time: {end - start:.2f}s")
778
 
779
 
780
  if __name__ == "__main__":
781
  # Register your configuration and model
782
  AutoConfig.register("dolphin", DolphinConfig)
783
  AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
 
784
 
785
  # Load the tokenizer and model
786
  tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
787
- model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
788
 
789
  # Run inference example
790
  mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
791
- inference_instruct(mycontext, "who founded Nexa AI?")
792
- inference_instruct(mycontext, "what is the mission of Nexa AI?")
793
- inference_instruct(mycontext, "what is the performance of Octopus V2 and V3?")
794
- inference_instruct(mycontext, "when is Nexa AI founded?")
 
12
  Qwen2PreTrainedModel, Qwen2Model, Qwen2RMSNorm
13
  )
14
  from transformers.modeling_attn_mask_utils import (
15
+ AttentionMaskConverter
16
  )
17
  from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
18
  from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
 
186
  Args:
187
  config: DolphinConfig
188
  """
189
+ # config_class = DolphinConfig
190
 
191
  def __init__(self, config: DolphinConfig):
192
  super().__init__(config)
 
732
  )
733
  return reordered_past
734
 
735
+
736
+ def inference_instruct(mycontext, question, device="cuda:0"):
737
  import time
738
+ MEMORY_SIZE = 32
739
+ start_time = time.time()
740
  generated_token_ids = []
741
+ prompt = f" <context>{question}"
 
 
742
  text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
743
  input_ids = (
744
+ torch.tensor(
745
+ text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
746
+ )
747
  .unsqueeze(0)
748
  .to(device)
749
  )
 
750
  # to process the context
751
  context_tokenized = tokenizer(
752
  mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
753
  return_tensors="pt",
754
  )
755
  context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
756
+ context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
 
 
757
  # We conduct an inference process
758
  for i in range(context_token_count):
 
759
  next_token = (
760
  model(
761
  input_ids,
 
769
  break
770
  generated_token_ids.append(next_token.item())
771
  input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
772
+ result = tokenizer.decode(generated_token_ids)
773
+ print(f"Time taken: {time.time() - start_time}")
774
+ return result
775
 
776
 
777
  if __name__ == "__main__":
778
  # Register your configuration and model
779
  AutoConfig.register("dolphin", DolphinConfig)
780
  AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
781
+ device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
782
 
783
  # Load the tokenizer and model
784
  tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
785
+ model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda:0")
786
 
787
  # Run inference example
788
  mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
789
+ question = "Who founded Nexa AI?"
790
+ # Pass the context and the correct device string
791
+ result = inference_instruct(mycontext, question, device=device_name)
792
+ print("Result:", result)