Update ultravox_processing.py
Browse files - ultravox_processing.py +5 -0
ultravox_processing.py
CHANGED
@@ -120,6 +120,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
120 |
audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
|
121 |
data["audio_token_len"] = [audio_embed_frames]
|
122 |
|
|
|
123 |
x = self.audio_processor(
|
124 |
audio,
|
125 |
sampling_rate=sampling_rate,
|
@@ -149,6 +150,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
149 |
)
|
150 |
)
|
151 |
data["audio_token_start_idx"] = [start_idx]
|
|
|
|
|
|
|
|
|
152 |
text = text.replace(
|
153 |
self.audio_placeholder,
|
154 |
self.audio_token_replacement * audio_embed_frames,
|
|
|
120 |
audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
|
121 |
data["audio_token_len"] = [audio_embed_frames]
|
122 |
|
123 |
+
# Main audio processing. The processor is model-specific.
|
124 |
x = self.audio_processor(
|
125 |
audio,
|
126 |
sampling_rate=sampling_rate,
|
|
|
150 |
)
|
151 |
)
|
152 |
data["audio_token_start_idx"] = [start_idx]
|
153 |
+
|
154 |
+
# Replace the audio placeholder with the audio token.
|
155 |
+
# e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
|
156 |
+
# where the number of </s> is the number of audio frames.
|
157 |
text = text.replace(
|
158 |
self.audio_placeholder,
|
159 |
self.audio_token_replacement * audio_embed_frames,
|