vocos-bark

Running

App Files Files Community

ylacombe committed on Oct 13, 2023

Commit

9f58137

1 Parent(s): 8281081

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -208

app.py CHANGED Viewed

@@ -1,209 +1,102 @@
-from typing import Dict, Optional, Tuple, Union
-from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
-from transformers.models.bark.generation_configuration_bark import (
-    BarkCoarseGenerationConfig,
-    BarkFineGenerationConfig,
-    BarkSemanticGenerationConfig,
-)
-from transformers import BarkConfig, AutoModel
-from transformers.modeling_utils import get_parameter_device
-from transformers.utils import (
-    is_accelerate_available,
-)
 import torch
-class BarkModel(BarkPreTrainedModel):
-    config_class = BarkConfig
-    def __init__(self, config):
-        super().__init__(config)
-        self.semantic = BarkSemanticModel(config.semantic_config)
-        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
-        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
-        self.codec_model = AutoModel.from_config(config.codec_config)
-        self.config = config
-    @property
-    def device(self) -> torch.device:
-        """
-        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
-        device).
-        """
-        # for bark_model, device must be verified on its sub-models
-        # if has _hf_hook, has been offloaded so the device has to be found in the hook
-        if not hasattr(self.semantic, "_hf_hook"):
-            return get_parameter_device(self)
-        for module in self.semantic.modules():
-            if (
-                hasattr(module, "_hf_hook")
-                and hasattr(module._hf_hook, "execution_device")
-                and module._hf_hook.execution_device is not None
-            ):
-                return torch.device(module._hf_hook.execution_device)
-    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
-        r"""
-        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
-        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
-        the next sub-model runs.
-        Args:
-            gpu_id (`int`, *optional*, defaults to 0):
-                GPU id on which the sub-models will be loaded and offloaded.
-        """
-        if is_accelerate_available():
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
-        device = torch.device(f"cuda:{gpu_id}")
-        if self.device.type != "cpu":
-            self.to("cpu")
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-        # this layer is used outside the first foward pass of semantic so need to be loaded before semantic
-        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
-        hook = None
-        for cpu_offloaded_model in [
-            self.semantic,
-            self.coarse_acoustics,
-            self.fine_acoustics,
-        ]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-        self.fine_acoustics_hook = hook
-        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
-        # We'll offload the last model manually.
-        self.codec_model_hook = hook
-    def codec_decode(self, fine_output):
-        """Turn quantized audio codes into audio array using encodec."""
-        fine_output = fine_output.transpose(0, 1)
-        emb = self.codec_model.quantizer.decode(fine_output)
-        out = self.codec_model.decoder(emb)
-        audio_arr = out.squeeze(1)  # squeeze the codebook dimension
-        return audio_arr
-    @torch.no_grad()
-    def generate(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
-        **kwargs,
-    ) -> torch.LongTensor:
-        """
-        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
-        Args:
-            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
-                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
-                longest generation among the batch.
-            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
-                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
-            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
-                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
-                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
-                semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.
-                This means you can, for example, specify a generation strategy for all sub-models except one.
-        Returns:
-            torch.LongTensor: Output generated audio.
-        Example:
-        ```python
-        >>> from transformers import AutoProcessor, BarkModel
-        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
-        >>> model = BarkModel.from_pretrained("suno/bark-small")
-        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
-        >>> voice_preset = "v2/en_speaker_6"
-        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
-        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
-        >>> audio_array = audio_array.cpu().numpy().squeeze()
-        ```
-        """
-        # TODO (joao):workaround until nested generation config is compatible with PreTrained Model
-        # todo: dict
-        semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
-        coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
-        fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
-        kwargs_semantic = {
-            # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
-            "attention_mask": kwargs.pop("attention_mask", None)
-        }
-        kwargs_coarse = {}
-        kwargs_fine = {}
-        for key, value in kwargs.items():
-            if key.startswith("semantic_"):
-                key = key[len("semantic_") :]
-                kwargs_semantic[key] = value
-            elif key.startswith("coarse_"):
-                key = key[len("coarse_") :]
-                kwargs_coarse[key] = value
-            elif key.startswith("fine_"):
-                key = key[len("fine_") :]
-                kwargs_fine[key] = value
-            else:
-                # If the key is already in a specific config, then it's been set with a
-                # submodules specific value and we don't override
-                if key not in kwargs_semantic:
-                    kwargs_semantic[key] = value
-                if key not in kwargs_coarse:
-                    kwargs_coarse[key] = value
-                if key not in kwargs_fine:
-                    kwargs_fine[key] = value
-        # 1. Generate from the semantic model
-        semantic_output = self.semantic.generate(
-            input_ids,
-            history_prompt=history_prompt,
-            semantic_generation_config=semantic_generation_config,
-            **kwargs_semantic,
-        )
-        # 2. Generate from the coarse model
-        coarse_output = self.coarse_acoustics.generate(
-            semantic_output,
-            history_prompt=history_prompt,
-            semantic_generation_config=semantic_generation_config,
-            coarse_generation_config=coarse_generation_config,
-            codebook_size=self.generation_config.codebook_size,
-            **kwargs_coarse,
-        )
-        # 3. "generate" from the fine model
-        output = self.fine_acoustics.generate(
-            coarse_output,
-            history_prompt=history_prompt,
-            semantic_generation_config=semantic_generation_config,
-            coarse_generation_config=coarse_generation_config,
-            fine_generation_config=fine_generation_config,
-            codebook_size=self.generation_config.codebook_size,
-            **kwargs_fine,
-        )
-        if getattr(self, "fine_acoustics_hook", None) is not None:
-            # Manually offload fine_acoustics to CPU
-            # and load codec_model to GPU
-            # since bark doesn't use codec_model forward pass
-            self.fine_acoustics_hook.offload()
-            self.codec_model = self.codec_model.to(self.device)
-        return output

 import torch
+from threading import Thread
+from transformers import AutoProcessor
+from transformers import set_seed
+from vocos_bark import BarkModel
+from scipy.io.wavfile import write
+from pydub import AudioSegment
+import numpy as np
+import os
+import gradio as gr
+import uuid
+import io
+from vocos import Vocos
+import os
+# NOTE(review): hard-coded, machine-specific path. Presumably the directory Gradio
+# uses for its temporary files — confirm it exists and is writable wherever this runs.
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
+# Fix the seed to 0 via transformers.set_seed — presumably so sampling is reproducible.
+set_seed(0)
+def _grab_best_device(use_gpu=True):
+    """Return the torch device string this app should run on.
+    Args:
+        use_gpu: When falsy, "cpu" is returned even if CUDA devices are visible.
+    Returns:
+        "cuda" if at least one CUDA device is visible and `use_gpu` is truthy,
+        otherwise "cpu".
+    """
+    cuda_visible = torch.cuda.device_count() > 0
+    return "cuda" if cuda_visible and use_gpu else "cpu"
+# Module-level setup: everything below runs once, at import time.
+device = _grab_best_device()
+HUB_PATH = "suno/bark"
+SAMPLE_RATE = 24_000
+processor = AutoProcessor.from_pretrained(HUB_PATH)
+# Voice presets offered in the UI: the processor's embedding keys that contain "speaker".
+speaker_embeddings = sorted(
+    key for key in processor.speaker_embeddings.keys() if "speaker" in key
+)
+vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
+# Load Bark once; only off-CPU is it moved to the device and converted with
+# to_bettertransformer().
+bark = BarkModel.from_pretrained(HUB_PATH)
+if device != "cpu":
+    bark = bark.to(device)
+    bark = bark.to_bettertransformer()
+# Synchronous (non-streaming) inference: one Bark generation, decoded two ways.
+def generate_audio(text, voice_preset=None, lag=0):
+    """Generate speech for `text` with Bark and decode it with both EnCodec and Vocos.
+    Args:
+        text: The prompt Bark should speak.
+        voice_preset: Name of a voice preset. Any value that is not listed in
+            `speaker_embeddings` (including None) means "no preset".
+        lag: Unused by this function; kept so existing callers that pass it
+            continue to work.
+    Returns:
+        A pair of `(SAMPLE_RATE, waveform)` tuples: first the waveform decoded by
+        `bark.codec_decode`, then the waveform decoded by Vocos. Each waveform is a
+        squeezed NumPy array moved to the CPU.
+    Raises:
+        gr.Error: If `text` is empty or contains only whitespace.
+    """
+    # Fail early with a user-visible message instead of sending an empty prompt to the model.
+    if not text or not text.strip():
+        raise gr.Error("Please enter some text for Bark to say.")
+    if voice_preset not in speaker_embeddings:
+        voice_preset = None
+    inputs = processor([text], voice_preset=voice_preset).to(device)
+    # The whole pipeline is inference-only, so generation and both decoders run without autograd.
+    with torch.no_grad():
+        fine_output = bark.generate(
+            **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
+        )
+        print("Fine tokens generated")
+        encodec_waveform = bark.codec_decode(fine_output)
+        # NOTE(review): the transpose presumably puts the codebook axis first, as Vocos
+        # expects — confirm against the Vocos API.
+        features = vocos.codes_to_features(fine_output.transpose(0, 1))
+        # NOTE(review): bandwidth_id 2 presumably selects the Vocos bandwidth matching
+        # Bark's codes — confirm.
+        vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+    encodec_audio = (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy())
+    vocos_audio = (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
+    return encodec_audio, vocos_audio
+# Gradio blocks demo
+with gr.Blocks() as demo_blocks:
+    # Page header.
+    gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
+    gr.HTML("""<h3 style="text-align:center;">📢Vocos-enhanced TTS 🦾! </h3>""")
+    # Inputs: the prompt text and an optional voice preset, side by side.
+    with gr.Group():
+        with gr.Row():
+            inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
+            dd = gr.Dropdown(
+                choices=speaker_embeddings,
+                value=None,
+                label="Available voice presets",
+                info="Defaults to no speaker embeddings!",
+            )
+    # Trigger button.
+    with gr.Row():
+        btn = gr.Button("Bark with Vocos TTS")
+    # Outputs: the two decodings of the same generation, for comparison.
+    with gr.Row():
+        out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
+        out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
+    # generate_audio returns (encodec, vocos), matching the order of the outputs list.
+    btn.click(
+        fn=generate_audio,
+        inputs=[inp_text, dd],
+        outputs=[out_audio_encodec, out_audio_vocos],
+    )
+demo_blocks.queue().launch(debug=True)