Update ultravox_model.py
Browse files- ultravox_model.py +18 -8
ultravox_model.py
CHANGED
@@ -16,10 +16,7 @@ from .ultravox_config import UltravoxConfig
|
|
16 |
from .whisper_model_modified import WhisperEncoder as ModifiedWhisperEncoder
|
17 |
|
18 |
|
19 |
-
class UltravoxModel(
|
20 |
-
transformers.LlamaPreTrainedModel,
|
21 |
-
transformers.GenerationMixin,
|
22 |
-
):
|
23 |
"""
|
24 |
The Ultravox model which consists of an audio encoder and a language model.
|
25 |
|
@@ -101,7 +98,7 @@ class UltravoxModel(
|
|
101 |
attention_mask: Optional[torch.Tensor] = None,
|
102 |
audio_token_start_idx: Optional[torch.Tensor] = None,
|
103 |
audio_token_len: Optional[torch.Tensor] = None,
|
104 |
-
past_key_values: Optional[Tuple] = None,
|
105 |
**kwargs,
|
106 |
) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
|
107 |
"""
|
@@ -166,7 +163,7 @@ class UltravoxModel(
|
|
166 |
audio_values: Optional[torch.FloatTensor] = None,
|
167 |
audio_token_start_idx: Optional[torch.Tensor] = None,
|
168 |
audio_token_len: Optional[torch.Tensor] = None,
|
169 |
-
past_key_values: Optional[Tuple] = None,
|
170 |
attention_mask: Optional[torch.Tensor] = None,
|
171 |
inputs_embeds: Optional[torch.Tensor] = None,
|
172 |
**kwargs,
|
@@ -179,7 +176,7 @@ class UltravoxModel(
|
|
179 |
**kwargs,
|
180 |
)
|
181 |
|
182 |
-
if past_key_values is None:
|
183 |
# We only want to use audio features in the 1st generation step
|
184 |
model_input["audio_values"] = audio_values
|
185 |
model_input["audio_token_start_idx"] = audio_token_start_idx
|
@@ -320,6 +317,19 @@ class UltravoxModel(
|
|
320 |
)
|
321 |
|
322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
|
324 |
"""
|
325 |
Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
|
@@ -402,6 +412,6 @@ UltravoxModel.register_for_auto_class()
|
|
402 |
|
403 |
transformers.AutoConfig.register("ultravox", UltravoxConfig)
|
404 |
transformers.AutoModel.register(UltravoxConfig, UltravoxModel)
|
405 |
-
# transformers.AutoProcessor.register(UltravoxConfig, UltravoxProcessor) # TODO: make
|
406 |
|
407 |
transformers.activations.ACT2FN["swiglu"] = SwiGLU
|
|
|
16 |
from .whisper_model_modified import WhisperEncoder as ModifiedWhisperEncoder
|
17 |
|
18 |
|
19 |
+
class UltravoxModel(transformers.LlamaPreTrainedModel):
|
|
|
|
|
|
|
20 |
"""
|
21 |
The Ultravox model which consists of an audio encoder and a language model.
|
22 |
|
|
|
98 |
attention_mask: Optional[torch.Tensor] = None,
|
99 |
audio_token_start_idx: Optional[torch.Tensor] = None,
|
100 |
audio_token_len: Optional[torch.Tensor] = None,
|
101 |
+
past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
|
102 |
**kwargs,
|
103 |
) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
|
104 |
"""
|
|
|
163 |
audio_values: Optional[torch.FloatTensor] = None,
|
164 |
audio_token_start_idx: Optional[torch.Tensor] = None,
|
165 |
audio_token_len: Optional[torch.Tensor] = None,
|
166 |
+
past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
|
167 |
attention_mask: Optional[torch.Tensor] = None,
|
168 |
inputs_embeds: Optional[torch.Tensor] = None,
|
169 |
**kwargs,
|
|
|
176 |
**kwargs,
|
177 |
)
|
178 |
|
179 |
+
if is_cache_empty(past_key_values) and audio_values is not None:
|
180 |
# We only want to use audio features in the 1st generation step
|
181 |
model_input["audio_values"] = audio_values
|
182 |
model_input["audio_token_start_idx"] = audio_token_start_idx
|
|
|
317 |
)
|
318 |
|
319 |
|
320 |
+
def is_cache_empty(
    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
) -> bool:
    """Return True when no past key/value states have been cached yet.

    Handles the three cache representations seen at generation time:
    ``None`` (no cache at all), the legacy tuple-of-layer-tuples format,
    and the new-style ``transformers.cache_utils.Cache`` object.
    """
    if past_key_values is None:
        return True
    if isinstance(past_key_values, tuple):
        # Legacy format: one entry per layer. The cache counts as empty
        # only if every layer entry has zero length (an empty outer tuple
        # is vacuously empty).
        for layer_cache in past_key_values:
            if len(layer_cache) != 0:
                return False
        return True
    # New-style Cache object reports its fill level directly.
    return past_key_values.get_seq_length() == 0
|
331 |
+
|
332 |
+
|
333 |
def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
|
334 |
"""
|
335 |
Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
|
|
|
412 |
|
413 |
transformers.AutoConfig.register("ultravox", UltravoxConfig)
|
414 |
transformers.AutoModel.register(UltravoxConfig, UltravoxModel)
|
415 |
+
# transformers.AutoProcessor.register(UltravoxConfig, UltravoxProcessor) # TODO: make processor work standalone
|
416 |
|
417 |
transformers.activations.ACT2FN["swiglu"] = SwiGLU
|