farzadab committed
Commit b844485
1 Parent(s): 4f9e82c

pushing the projector weights only

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json CHANGED
@@ -1,9 +1,19 @@
 {
-  "_name_or_path": "artifacts/model-70B-bs6-ga1-cont:v5/",
+  "_name_or_path": "fixie-ai/ultravox-v0_4-llama-3_1-70b",
   "architectures": [
     "UltravoxModel"
   ],
   "audio_model_id": "openai/whisper-medium",
+  "audio_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
   "audio_token_index": 32000,
   "auto_map": {
     "AutoConfig": "ultravox_config.UltravoxConfig",
@@ -27,38 +37,18 @@
   "pad_token_id": 128009,
   "projector_act": "swiglu",
   "stack_factor": 8,
-  "text_config": {
-    "_name_or_path": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    "architectures": [
-      "LlamaForCausalLM"
-    ],
-    "bos_token_id": 128000,
-    "eos_token_id": [
-      128001,
-      128008,
-      128009
-    ],
-    "hidden_size": 8192,
-    "intermediate_size": 28672,
-    "max_position_embeddings": 131072,
-    "model_type": "llama",
-    "num_attention_heads": 64,
-    "num_hidden_layers": 80,
-    "num_key_value_heads": 8,
-    "rms_norm_eps": 1e-05,
-    "rope_scaling": {
-      "factor": 8.0,
-      "high_freq_factor": 4.0,
-      "low_freq_factor": 1.0,
-      "original_max_position_embeddings": 8192,
-      "rope_type": "llama3"
-    },
-    "rope_theta": 500000.0,
-    "torch_dtype": "bfloat16",
-    "vocab_size": 128256
+  "text_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+  "text_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
   },
-  "text_model_id": null,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.0",
+  "transformers_version": "4.45.0",
   "vocab_size": 128256
 }
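With this change the checkpoint no longer embeds the full Llama `text_config`; it only records `text_model_id` and `audio_model_id`, so the text and audio backbones are fetched from their upstream repos at load time while this repo ships just the projector. A minimal loading sketch, assuming the repo's `auto_map` also registers `UltravoxModel` for `AutoModel` (only the `AutoConfig` entry is visible in the hunk above) and that `fixie-ai/ultravox-v0_4-llama-3_1-70b` is the published repo id:

```python
# Loading sketch -- the repo id comes from "_name_or_path" above; the AutoModel
# mapping and dtype handling are assumptions, not part of this commit.
import transformers

model = transformers.AutoModel.from_pretrained(
    "fixie-ai/ultravox-v0_4-llama-3_1-70b",
    trust_remote_code=True,   # pulls ultravox_config.py / ultravox_model.py
    torch_dtype="bfloat16",   # matches "torch_dtype" in config.json
)
# The Llama-3.1-70B and Whisper weights are resolved from text_model_id /
# audio_model_id; only the projector tensors come from this repo.
```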
generation_config.json CHANGED
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.44.0"
+  "transformers_version": "4.45.0"
 }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a09b15fd86a2015f62df808820cea641aa974968732c290d7c725d41631ba67
+size 100696544
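The new `model.safetensors` is an LFS pointer to a roughly 100 MB shard, consistent with the commit message: it holds the projector weights only, not the 70B backbone. A quick inspection sketch, assuming the file has been materialized locally (e.g. via `git lfs pull` or `huggingface_hub`):

```python
# Sketch: list what the projector-only shard contains without loading the
# tensors into memory.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, f.get_slice(name).get_shape())
# Expected (assumption): only multi_modal_projector.* entries, per the
# save_pretrained change in ultravox_model.py below.
```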
special_tokens_map.json CHANGED
@@ -13,5 +13,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|eot_id|>"
+  "pad_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
ultravox_config.py CHANGED
@@ -99,7 +99,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         audio_model_id: Optional[str] = None,
         text_model_id: Optional[str] = None,
         ignore_index: int = -100,
-        audio_token_index: int = 32000,
         hidden_size: int = 4096,
         stack_factor: int = 8,
         norm_init: float = 0.4,
@@ -112,7 +111,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
 
         self.audio_model_id = audio_model_id
         self.text_model_id = text_model_id
-        self.audio_token_index = audio_token_index
 
         self.hidden_size = hidden_size
         self.stack_factor = stack_factor
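Since `audio_token_index` is dropped from the constructor and no longer stored on the instance, the config is now built from the sub-model ids plus the projector hyperparameters. A construction sketch under that assumption, using values from the config.json diff above (the import path is illustrative):

```python
# Sketch: constructing the config after this change. The exact handling of the
# removed audio_token_index argument is an assumption, not stated in the diff.
from ultravox_config import UltravoxConfig

config = UltravoxConfig(
    audio_model_id="openai/whisper-medium",
    text_model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
    stack_factor=8,  # matches "stack_factor" in config.json above
)
```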
ultravox_model.py CHANGED
@@ -51,36 +51,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.audio_tower = self._create_audio_tower(config)
-        self.multi_modal_projector = UltravoxProjector(config)
+        self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)
 
-        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. FSDP throws an error if
-        # some of the layer types are not found in the model.
+        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
+        # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (
-            self.language_model._no_split_modules + self.audio_tower._no_split_modules
+        self._no_split_modules = (self.language_model._no_split_modules or []) + (
+            self.audio_tower._no_split_modules or []
         )
 
         self.loss_config = LossConfig()
         self.post_init()
-        self.multi_modal_projector.to(dtype=config.torch_dtype)
-
-    def save_pretrained(
-        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
-    ):
-        if state_dict is None:
-            state_dict = super().state_dict()
-
-        named_params = dict(self.named_parameters())
-
-        state_dict = {
-            k: v
-            for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
-        }
-
-        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
 
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
@@ -290,6 +272,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
         return model_input
 
+    @classmethod
+    def _create_multi_modal_projector(
+        cls, config: UltravoxConfig
+    ) -> "UltravoxProjector":
+        projector = UltravoxProjector(config)
+        projector.to(config.torch_dtype)
+        return projector
+
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
@@ -311,7 +301,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         audio_tower = transformers.AutoModel.from_config(
-            config.audio_config, torch_dtype=config.torch_dtype
+            config.audio_config
         )
 
         if isinstance(
@@ -341,14 +331,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         language_model = transformers.AutoModelForCausalLM.from_config(
-            config.text_config, attn_implementation=config._attn_implementation
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+            torch_dtype=config.torch_dtype,
         )
 
         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model
 
-    def _add_language_model_weights_to_keep(self):
-        if self.config.text_model_id is not None:
+    def merge_and_unload(self):
+        if isinstance(self.language_model, peft.PeftModel):
+            self.language_model = self.language_model.merge_and_unload()
+            # no need to download base language model weights anymore, so we can remove the id
             self.config.text_model_id = None
             self.keep_params.update(
                 set(
@@ -359,8 +353,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                )
            )
 
-    def _add_audio_tower_weights_to_keep(self):
-        if self.config.audio_model_id is not None:
+        if isinstance(self.audio_tower, peft.PeftModel):
+            self.audio_tower = self.audio_tower.merge_and_unload()
+            # no need to download base audio model weights anymore, so we can remove the id
             self.config.audio_model_id = None
             self.keep_params.update(
                 set(
@@ -371,17 +366,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                )
            )
 
-    def merge_and_unload(self):
-        if isinstance(self.language_model, peft.PeftModel):
-            self.language_model = self.language_model.merge_and_unload()
-            # no need to download base language model weights anymore, so we can remove the id
-            self._add_language_model_weights_to_keep()
-
-        if isinstance(self.audio_tower, peft.PeftModel):
-            self.audio_tower = self.audio_tower.merge_and_unload()
-            # no need to download base audio model weights anymore, so we can remove the id
-            self._add_audio_tower_weights_to_keep()
-
         for param in ["text_model_lora_config", "audio_model_lora_config"]:
             if hasattr(self.config, param):
                 delattr(self.config, param)
@@ -391,6 +375,31 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)
 
+    def save_pretrained(
+        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
+    ):
+        if state_dict is None:
+            state_dict = {}
+            for module, keep in [
+                ("multi_modal_projector", True),
+                ("audio_tower", self.config.audio_model_id is None),
+                ("language_model", self.config.text_model_id is None),
+            ]:
+                if keep:
+                    state_dict.update(
+                        {
+                            f"{module}.{name}": param
+                            for name, param in getattr(self, module)
+                            .state_dict()
+                            .items()
+                        }
+                    )
+
+        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
+
+    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
+        self.keep_params.update(set(state_dict.keys()))
+
     def print_trainable_parameters(self):
         """
         Prints the number of trainable parameters in the model (reuses Peft model's method)
@@ -419,9 +428,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%"
         )
 
-    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
-        self.keep_params.update(set(state_dict.keys()))
-
 
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
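Taken together, the reworked `merge_and_unload` and the new `save_pretrained` give two save paths: while `text_model_id` and `audio_model_id` are still set in the config, only `multi_modal_projector.*` tensors are written (hence the ~100 MB `model.safetensors` above); after `merge_and_unload()` clears those ids, the merged backbones are saved as well. A usage sketch under those assumptions (the `model` variable and output directories are illustrative):

```python
# Projector-only checkpoint: audio_model_id / text_model_id are still set, so
# save_pretrained() keeps only multi_modal_projector.* in the state dict.
model.save_pretrained("./ultravox-projector-only")

# Full checkpoint: merge any LoRA adapters, clear the upstream model ids, then
# save; language_model.* and audio_tower.* are now included too.
model.merge_and_unload()
model.save_pretrained("./ultravox-merged")
```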