pushing the projector weights only
- .gitattributes +1 -0
- config.json +22 -32
- generation_config.json +1 -1
- model.safetensors +3 -0
- special_tokens_map.json +7 -1
- tokenizer.json +0 -0
- ultravox_config.py +0 -2
- ultravox_model.py +49 -43
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json
CHANGED
@@ -1,9 +1,19 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "fixie-ai/ultravox-v0_4-llama-3_1-70b",
   "architectures": [
     "UltravoxModel"
   ],
   "audio_model_id": "openai/whisper-medium",
+  "audio_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
   "audio_token_index": 32000,
   "auto_map": {
     "AutoConfig": "ultravox_config.UltravoxConfig",
@@ -27,38 +37,18 @@
   "pad_token_id": 128009,
   "projector_act": "swiglu",
   "stack_factor": 8,
-  "
-
-  "
-
-
-
-
-
-
-
-    ],
-    "hidden_size": 8192,
-    "intermediate_size": 28672,
-    "max_position_embeddings": 131072,
-    "model_type": "llama",
-    "num_attention_heads": 64,
-    "num_hidden_layers": 80,
-    "num_key_value_heads": 8,
-    "rms_norm_eps": 1e-05,
-    "rope_scaling": {
-      "factor": 8.0,
-      "high_freq_factor": 4.0,
-      "low_freq_factor": 1.0,
-      "original_max_position_embeddings": 8192,
-      "rope_type": "llama3"
-    },
-    "rope_theta": 500000.0,
-    "torch_dtype": "bfloat16",
-    "vocab_size": 128256
+  "text_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+  "text_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
   },
-  "text_model_id": null,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.45.0",
   "vocab_size": 128256
 }
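With the inline text_config removed, config.json now references the base checkpoints by id and keeps only the Ultravox-specific fields in this repo. A minimal sketch of inspecting the updated config, assuming the repo id from _name_or_path above and that the remote ultravox_config.py code is trusted (the sketch itself is not part of this commit):

import transformers

config = transformers.AutoConfig.from_pretrained(
    "fixie-ai/ultravox-v0_4-llama-3_1-70b", trust_remote_code=True
)
# The base towers are now referenced by id rather than embedded in this config:
print(config.text_model_id)   # "meta-llama/Meta-Llama-3.1-70B-Instruct"
print(config.audio_model_id)  # "openai/whisper-medium"
# Ultravox-specific projector settings remain at the top level:
print(config.projector_act, config.stack_factor)  # swiglu 8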
generation_config.json
CHANGED
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.
+  "transformers_version": "4.45.0"
 }
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a09b15fd86a2015f62df808820cea641aa974968732c290d7c725d41631ba67
+size 100696544
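The new model.safetensors is a Git LFS pointer to a roughly 100 MB file, in line with the commit title: only the projector weights are stored in this repo. A quick hypothetical check, assuming the file has been downloaded locally and the safetensors package is installed (not part of this commit):

from safetensors import safe_open

# With audio_model_id and text_model_id set in config.json, the reworked save_pretrained
# (see ultravox_model.py below) keeps only the projector parameters, so every tensor key
# should sit under the multi_modal_projector prefix.
with safe_open("model.safetensors", framework="pt") as f:
    keys = list(f.keys())

print(len(keys), keys[:3])
assert all(k.startswith("multi_modal_projector.") for k in keys)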
special_tokens_map.json
CHANGED
@@ -13,5 +13,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": 
+  "pad_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
ultravox_config.py
CHANGED
@@ -99,7 +99,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         audio_model_id: Optional[str] = None,
         text_model_id: Optional[str] = None,
         ignore_index: int = -100,
-        audio_token_index: int = 32000,
         hidden_size: int = 4096,
         stack_factor: int = 8,
         norm_init: float = 0.4,
@@ -112,7 +111,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
 
         self.audio_model_id = audio_model_id
         self.text_model_id = text_model_id
-        self.audio_token_index = audio_token_index
 
         self.hidden_size = hidden_size
         self.stack_factor = stack_factor
ultravox_model.py
CHANGED
@@ -51,36 +51,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.audio_tower = self._create_audio_tower(config)
-        self.multi_modal_projector = 
+        self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)
 
-        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
-        # some of the layer types are not found in the model.
+        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
+        # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (
-            self.
+        self._no_split_modules = (self.language_model._no_split_modules or []) + (
+            self.audio_tower._no_split_modules or []
         )
 
         self.loss_config = LossConfig()
         self.post_init()
-        self.multi_modal_projector.to(dtype=config.torch_dtype)
-
-    def save_pretrained(
-        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
-    ):
-        if state_dict is None:
-            state_dict = super().state_dict()
-
-        named_params = dict(self.named_parameters())
-
-        state_dict = {
-            k: v
-            for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
-        }
-
-        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
 
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
@@ -290,6 +272,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
         return model_input
 
+    @classmethod
+    def _create_multi_modal_projector(
+        cls, config: UltravoxConfig
+    ) -> "UltravoxProjector":
+        projector = UltravoxProjector(config)
+        projector.to(config.torch_dtype)
+        return projector
+
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
@@ -311,7 +301,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         audio_tower = transformers.AutoModel.from_config(
-            config.audio_config
+            config.audio_config
         )
 
         if isinstance(
@@ -341,14 +331,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         language_model = transformers.AutoModelForCausalLM.from_config(
-            config.text_config,
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+            torch_dtype=config.torch_dtype,
         )
 
         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model
 
-    def 
-        if self.
+    def merge_and_unload(self):
+        if isinstance(self.language_model, peft.PeftModel):
+            self.language_model = self.language_model.merge_and_unload()
+            # no need to download base language model weights anymore, so we can remove the id
             self.config.text_model_id = None
             self.keep_params.update(
                 set(
@@ -359,8 +353,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             )
         )
 
-
-
+        if isinstance(self.audio_tower, peft.PeftModel):
+            self.audio_tower = self.audio_tower.merge_and_unload()
+            # no need to download base audio model weights anymore, so we can remove the id
             self.config.audio_model_id = None
             self.keep_params.update(
                 set(
@@ -371,17 +366,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             )
         )
 
-    def merge_and_unload(self):
-        if isinstance(self.language_model, peft.PeftModel):
-            self.language_model = self.language_model.merge_and_unload()
-            # no need to download base language model weights anymore, so we can remove the id
-            self._add_language_model_weights_to_keep()
-
-        if isinstance(self.audio_tower, peft.PeftModel):
-            self.audio_tower = self.audio_tower.merge_and_unload()
-            # no need to download base audio model weights anymore, so we can remove the id
-            self._add_audio_tower_weights_to_keep()
-
         for param in ["text_model_lora_config", "audio_model_lora_config"]:
             if hasattr(self.config, param):
                 delattr(self.config, param)
@@ -391,6 +375,31 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)
 
+    def save_pretrained(
+        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
+    ):
+        if state_dict is None:
+            state_dict = {}
+            for module, keep in [
+                ("multi_modal_projector", True),
+                ("audio_tower", self.config.audio_model_id is None),
+                ("language_model", self.config.text_model_id is None),
+            ]:
+                if keep:
+                    state_dict.update(
+                        {
+                            f"{module}.{name}": param
+                            for name, param in getattr(self, module)
+                            .state_dict()
+                            .items()
+                        }
+                    )
+
+        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
+
+    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
+        self.keep_params.update(set(state_dict.keys()))
+
     def print_trainable_parameters(self):
         """
         Prints the number of trainable parameters in the model (reuses Peft model's method)
@@ -419,9 +428,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%"
         )
 
-    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
-        self.keep_params.update(set(state_dict.keys()))
-
 
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]