# Spaces: Sleeping  (page-status residue captured with this file; not part of the module)
from typing import Dict, Optional, Tuple, Union

import torch
from transformers import BarkConfig
from transformers.modeling_utils import get_parameter_device
from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
from transformers.models.bark.generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkSemanticGenerationConfig,
)
from transformers.utils import (
    is_accelerate_available,
)
from vocos import Vocos
class BarkModel(BarkPreTrainedModel):
    """
    Bark generation pipeline that decodes its output with a Vocos vocoder.

    The model chains three Bark sub-models (`semantic` -> `coarse_acoustics` -> `fine_acoustics`) and passes the
    result of the fine model to `self.vocos` (`codes_to_features` followed by `decode`) to obtain the audio.
    """

    config_class = BarkConfig

    def __init__(self, config):
        """
        Build the three Bark sub-models from `config` and load the pretrained Vocos vocoder.

        Args:
            config (`BarkConfig`):
                Must expose `semantic_config`, `coarse_acoustics_config` and `fine_acoustics_config`.
        """
        super().__init__(config)

        self.semantic = BarkSemanticModel(config.semantic_config)
        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)

        # The vocoder checkpoint is fetched by name when the model is constructed.
        self.vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")

        self.config = config

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        # `device` must be a property: the rest of the class reads it as an attribute
        # (`self.device.type`, `.to(self.device)`).

        # for bark_model, device must be verified on its sub-models
        # if has _hf_hook, has been offloaded so the device has to be found in the hook
        if not hasattr(self.semantic, "_hf_hook"):
            return get_parameter_device(self)

        for module in self.semantic.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

        # No hook advertises an execution device: report where the parameters actually live
        # instead of implicitly returning `None`.
        return get_parameter_device(self)

    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
        r"""
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
        the next sub-model runs.

        Args:
            gpu_id (`int`, *optional*, defaults to 0):
                GPU id on which the sub-models will be loaded and offloaded.

        Raises:
            ImportError: If `accelerate` is not available.
        """
        if is_accelerate_available():
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")

        device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu")
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        # this layer is used outside the first forward pass of semantic so need to be loaded before semantic
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)

        # Chain the hooks in execution order: each call receives the hook of the previous sub-model.
        hook = None
        for cpu_offloaded_model in [
            self.semantic,
            self.coarse_acoustics,
            self.fine_acoustics,
        ]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        # Kept so that `generate` can offload the fine model explicitly before decoding.
        self.fine_acoustics_hook = hook

        _, hook = cpu_offload_with_hook(self.vocos, device, prev_module_hook=hook)

        # We'll offload the last model manually.
        self.codec_model_hook = hook

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

        Runs under `torch.no_grad()`: this is an inference entry point, and the returned tensor must be
        convertible with `.numpy()` as in the example below.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
                longest generation among the batch.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
                semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.

                This means you can, for example, specify a generation strategy for all sub-models except one.

                `attention_mask` is a special case: it is forwarded to the semantic model only.

        Returns:
            torch.LongTensor: Output generated audio, i.e. the tensor returned by `self.vocos.decode`.
            NOTE(review): a vocoder waveform is presumably floating point rather than `LongTensor` -- confirm
            before relying on the annotation.

        Example:

        ```python
        >>> from transformers import AutoProcessor, BarkModel

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"

        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
        >>> audio_array = audio_array.cpu().numpy().squeeze()
        ```
        """
        # TODO (joao):workaround until nested generation config is compatible with PreTrained Model
        # todo: dict
        semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
        coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
        fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)

        kwargs_semantic = {
            # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
            "attention_mask": kwargs.pop("attention_mask", None)
        }
        kwargs_coarse = {}
        kwargs_fine = {}
        for key, value in kwargs.items():
            if key.startswith("semantic_"):
                key = key[len("semantic_") :]
                kwargs_semantic[key] = value
            elif key.startswith("coarse_"):
                key = key[len("coarse_") :]
                kwargs_coarse[key] = value
            elif key.startswith("fine_"):
                key = key[len("fine_") :]
                kwargs_fine[key] = value
            else:
                # If the key is already in a specific config, then it's been set with a
                # submodules specific value and we don't override
                if key not in kwargs_semantic:
                    kwargs_semantic[key] = value
                if key not in kwargs_coarse:
                    kwargs_coarse[key] = value
                if key not in kwargs_fine:
                    kwargs_fine[key] = value

        # 1. Generate from the semantic model
        semantic_output = self.semantic.generate(
            input_ids,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            **kwargs_semantic,
        )

        # 2. Generate from the coarse model
        coarse_output = self.coarse_acoustics.generate(
            semantic_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_coarse,
        )

        # 3. "generate" from the fine model
        output = self.fine_acoustics.generate(
            coarse_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            fine_generation_config=fine_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_fine,
        )

        if getattr(self, "fine_acoustics_hook", None) is not None:
            # Manually offload fine_acoustics to CPU
            # and load the vocoder to GPU
            # since the vocoder is driven through `codes_to_features`/`decode` rather than its forward pass
            self.fine_acoustics_hook.offload()
            self.vocos = self.vocos.to(self.device)

        # 4. Decode the output and generate audio array
        # NOTE(review): index 2 presumably selects one of the checkpoint's supported bandwidths --
        # confirm the mapping against the Vocos documentation.
        bandwidth_id = torch.tensor([2], device=self.device)

        # Swap the first two axes before handing the codes to the vocoder.
        # NOTE(review): assumes the fine model returns (batch, n_codebooks, seq_len) -- confirm.
        value = output.transpose(0, 1)
        value = self.vocos.codes_to_features(value)
        value = self.vocos.decode(value, bandwidth_id=bandwidth_id)

        if getattr(self, "codec_model_hook", None) is not None:
            # Offload the vocoder to CPU through the hook stored by `enable_cpu_offload`;
            # the vocoder module itself is never given an `offload` method in this class.
            self.codec_model_hook.offload()

        return value