Spaces:
Sleeping
Sleeping
File size: 8,962 Bytes
0810335 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
from vocos import Vocos
from typing import Dict, Optional, Tuple, Union
from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
from transformers.models.bark.generation_configuration_bark import (
BarkCoarseGenerationConfig,
BarkFineGenerationConfig,
BarkSemanticGenerationConfig,
)
from transformers import BarkConfig
from transformers.modeling_utils import get_parameter_device
from transformers.utils import (
is_accelerate_available,
)
import torch
class BarkModel(BarkPreTrainedModel):
config_class = BarkConfig
def __init__(self, config):
super().__init__(config)
self.semantic = BarkSemanticModel(config.semantic_config)
self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
self.vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")
self.config = config
@property
def device(self) -> torch.device:
"""
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
# for bark_model, device must be verified on its sub-models
# if has _hf_hook, has been offloaded so the device has to be found in the hook
if not hasattr(self.semantic, "_hf_hook"):
return get_parameter_device(self)
for module in self.semantic.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
r"""
Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
the next sub-model runs.
Args:
gpu_id (`int`, *optional*, defaults to 0):
GPU id on which the sub-models will be loaded and offloaded.
"""
if is_accelerate_available():
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu")
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
# this layer is used outside the first foward pass of semantic so need to be loaded before semantic
self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
hook = None
for cpu_offloaded_model in [
self.semantic,
self.coarse_acoustics,
self.fine_acoustics,
]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
self.fine_acoustics_hook = hook
_, hook = cpu_offload_with_hook(self.vocos, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.codec_model_hook = hook
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
history_prompt: Optional[Dict[str, torch.Tensor]] = None,
**kwargs,
) -> torch.LongTensor:
"""
Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
Args:
input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
longest generation among the batch.
history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
- Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
- With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.
This means you can, for example, specify a generation strategy for all sub-models except one.
Returns:
torch.LongTensor: Output generated audio.
Example:
```python
>>> from transformers import AutoProcessor, BarkModel
>>> processor = AutoProcessor.from_pretrained("suno/bark-small")
>>> model = BarkModel.from_pretrained("suno/bark-small")
>>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
>>> voice_preset = "v2/en_speaker_6"
>>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
>>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
>>> audio_array = audio_array.cpu().numpy().squeeze()
```
"""
# TODO (joao):workaround until nested generation config is compatible with PreTrained Model
# todo: dict
semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
kwargs_semantic = {
# if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
"attention_mask": kwargs.pop("attention_mask", None)
}
kwargs_coarse = {}
kwargs_fine = {}
for key, value in kwargs.items():
if key.startswith("semantic_"):
key = key[len("semantic_") :]
kwargs_semantic[key] = value
elif key.startswith("coarse_"):
key = key[len("coarse_") :]
kwargs_coarse[key] = value
elif key.startswith("fine_"):
key = key[len("fine_") :]
kwargs_fine[key] = value
else:
# If the key is already in a specific config, then it's been set with a
# submodules specific value and we don't override
if key not in kwargs_semantic:
kwargs_semantic[key] = value
if key not in kwargs_coarse:
kwargs_coarse[key] = value
if key not in kwargs_fine:
kwargs_fine[key] = value
# 1. Generate from the semantic model
semantic_output = self.semantic.generate(
input_ids,
history_prompt=history_prompt,
semantic_generation_config=semantic_generation_config,
**kwargs_semantic,
)
# 2. Generate from the coarse model
coarse_output = self.coarse_acoustics.generate(
semantic_output,
history_prompt=history_prompt,
semantic_generation_config=semantic_generation_config,
coarse_generation_config=coarse_generation_config,
codebook_size=self.generation_config.codebook_size,
**kwargs_coarse,
)
# 3. "generate" from the fine model
output = self.fine_acoustics.generate(
coarse_output,
history_prompt=history_prompt,
semantic_generation_config=semantic_generation_config,
coarse_generation_config=coarse_generation_config,
fine_generation_config=fine_generation_config,
codebook_size=self.generation_config.codebook_size,
**kwargs_fine,
)
if getattr(self, "fine_acoustics_hook", None) is not None:
# Manually offload fine_acoustics to CPU
# and load codec_model to GPU
# since bark doesn't use codec_model forward pass
self.fine_acoustics_hook.offload()
self.vocos = self.vocos.to(self.device)
# 4. Decode the output and generate audio array
bandwidth_id = torch.tensor([2]).to(self.device)
# transpose
value = output.transpose(0,1)
value = self.vocos.codes_to_features(value)
value = self.vocos.decode(value, bandwidth_id=bandwidth_id)
if getattr(self, "codec_model_hook", None) is not None:
# Offload codec_model to CPU
self.vocos.offload()
return value |