ylacombe committed
Commit 9f58137 · 1 Parent(s): 8281081

Update app.py

Files changed (1):
  1. app.py +101 -208
app.py CHANGED
@@ -1,209 +1,102 @@
- from typing import Dict, Optional, Tuple, Union
-
- from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
- from transformers.models.bark.generation_configuration_bark import (
-     BarkCoarseGenerationConfig,
-     BarkFineGenerationConfig,
-     BarkSemanticGenerationConfig,
- )
- from transformers import BarkConfig, AutoModel
- from transformers.modeling_utils import get_parameter_device
- from transformers.utils import (
-     is_accelerate_available,
- )
-
  import torch
-
- class BarkModel(BarkPreTrainedModel):
-     config_class = BarkConfig
-
-     def __init__(self, config):
-         super().__init__(config)
-
-         self.semantic = BarkSemanticModel(config.semantic_config)
-         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
-         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
-
-         self.codec_model = AutoModel.from_config(config.codec_config)
-
-         self.config = config
-
-     @property
-     def device(self) -> torch.device:
-         """
-         `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
-         device).
-         """
-         # for bark_model, the device must be checked on its sub-models
-         # if a sub-module has `_hf_hook`, the model has been offloaded, so the device has to be found in the hook
-         if not hasattr(self.semantic, "_hf_hook"):
-             return get_parameter_device(self)
-         for module in self.semantic.modules():
-             if (
-                 hasattr(module, "_hf_hook")
-                 and hasattr(module._hf_hook, "execution_device")
-                 and module._hf_hook.execution_device is not None
-             ):
-                 return torch.device(module._hf_hook.execution_device)
-
-     def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
-         r"""
-         Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
-         method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains on the GPU
-         until the next sub-model runs.
-
-         Args:
-             gpu_id (`int`, *optional*, defaults to 0):
-                 GPU id on which the sub-models will be loaded and offloaded.
-         """
-         if is_accelerate_available():
-             from accelerate import cpu_offload_with_hook
-         else:
-             raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
-
-         device = torch.device(f"cuda:{gpu_id}")
-
-         if self.device.type != "cpu":
-             self.to("cpu")
-             torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-         # this layer is used outside the first forward pass of the semantic model, so it needs to be loaded before it
-         self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
-
-         hook = None
-         for cpu_offloaded_model in [
-             self.semantic,
-             self.coarse_acoustics,
-             self.fine_acoustics,
-         ]:
-             _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-         self.fine_acoustics_hook = hook
-
-         _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
-
-         # We'll offload the last model manually.
-         self.codec_model_hook = hook
-
-     def codec_decode(self, fine_output):
-         """Turn quantized audio codes into an audio array using encodec."""
-
-         fine_output = fine_output.transpose(0, 1)
-         emb = self.codec_model.quantizer.decode(fine_output)
-         out = self.codec_model.decoder(emb)
-         audio_arr = out.squeeze(1)  # squeeze the codebook dimension
-
-         return audio_arr
-
-     @torch.no_grad()
-     def generate(
-         self,
-         input_ids: Optional[torch.Tensor] = None,
-         history_prompt: Optional[Dict[str, torch.Tensor]] = None,
-         **kwargs,
-     ) -> torch.LongTensor:
-         """
-         Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
-
-         Args:
-             input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
-                 Input ids, truncated to a maximum of 256 tokens. Note that the output audios will be as long as the
-                 longest generation among the batch.
-             history_prompt (`Optional[Dict[str, torch.Tensor]]`, *optional*):
-                 Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
-             kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
-
-                 - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
-                 - With a *semantic_*, *coarse_*, or *fine_* prefix, they will be input to the `generate` method of the
-                   semantic, coarse, and fine sub-models respectively. A prefixed keyword takes priority over the same
-                   keyword without a prefix.
-
-                 This means you can, for example, specify a generation strategy for all sub-models except one.
-         Returns:
-             torch.LongTensor: Output generated audio.
-
-         Example:
-
-         ```python
-         >>> from transformers import AutoProcessor, BarkModel
-
-         >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
-         >>> model = BarkModel.from_pretrained("suno/bark-small")
-
-         >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
-         >>> voice_preset = "v2/en_speaker_6"
-
-         >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
-
-         >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
-         >>> audio_array = audio_array.cpu().numpy().squeeze()
-         ```
-         """
-         # TODO (joao): workaround until nested generation config is compatible with PreTrainedModel
-         # todo: dict
-         semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
-         coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
-         fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
-
-         kwargs_semantic = {
-             # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
-             "attention_mask": kwargs.pop("attention_mask", None)
-         }
-         kwargs_coarse = {}
-         kwargs_fine = {}
-         for key, value in kwargs.items():
-             if key.startswith("semantic_"):
-                 key = key[len("semantic_") :]
-                 kwargs_semantic[key] = value
-             elif key.startswith("coarse_"):
-                 key = key[len("coarse_") :]
-                 kwargs_coarse[key] = value
-             elif key.startswith("fine_"):
-                 key = key[len("fine_") :]
-                 kwargs_fine[key] = value
-             else:
-                 # If the key is already in a specific config, it has been set with a
-                 # sub-model-specific value and we don't override it
-                 if key not in kwargs_semantic:
-                     kwargs_semantic[key] = value
-                 if key not in kwargs_coarse:
-                     kwargs_coarse[key] = value
-                 if key not in kwargs_fine:
-                     kwargs_fine[key] = value
-
-         # 1. Generate from the semantic model
-         semantic_output = self.semantic.generate(
-             input_ids,
-             history_prompt=history_prompt,
-             semantic_generation_config=semantic_generation_config,
-             **kwargs_semantic,
-         )
-
-         # 2. Generate from the coarse model
-         coarse_output = self.coarse_acoustics.generate(
-             semantic_output,
-             history_prompt=history_prompt,
-             semantic_generation_config=semantic_generation_config,
-             coarse_generation_config=coarse_generation_config,
-             codebook_size=self.generation_config.codebook_size,
-             **kwargs_coarse,
-         )
-
-         # 3. "Generate" from the fine model
-         output = self.fine_acoustics.generate(
-             coarse_output,
-             history_prompt=history_prompt,
-             semantic_generation_config=semantic_generation_config,
-             coarse_generation_config=coarse_generation_config,
-             fine_generation_config=fine_generation_config,
-             codebook_size=self.generation_config.codebook_size,
-             **kwargs_fine,
-         )
-
-         if getattr(self, "fine_acoustics_hook", None) is not None:
-             # Manually offload fine_acoustics to CPU and load codec_model to GPU,
-             # since bark doesn't use codec_model's forward pass
-             self.fine_acoustics_hook.offload()
-             self.codec_model = self.codec_model.to(self.device)
-
-         return output
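
For context, the `enable_cpu_offload` helper removed above is driven entirely by `accelerate` hooks: each sub-model is moved to the GPU only while it runs. A minimal usage sketch, assuming `accelerate` is installed and a CUDA device is available (the prompt text is arbitrary):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")  # the class defined above

# Sub-models hop to cuda:0 one at a time; everything else stays on CPU.
model.enable_cpu_offload(gpu_id=0)

inputs = processor("Offloading keeps peak GPU memory low.")
audio_array = model.generate(**inputs)
```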
  import torch
+ from threading import Thread
+
+ from transformers import AutoProcessor
+ from transformers import set_seed
+
+ from vocos_bark import BarkModel
+ from scipy.io.wavfile import write
+ from pydub import AudioSegment
+
+ import numpy as np
+
+ import os
+ import gradio as gr
+ import uuid
+ import io
+ from vocos import Vocos
+
+ os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
+
+
+ set_seed(0)
+
+
+ def _grab_best_device(use_gpu=True):
+     if torch.cuda.device_count() > 0 and use_gpu:
+         device = "cuda"
+     else:
+         device = "cpu"
+     return device
+
+
+ device = _grab_best_device()
+
+ HUB_PATH = "suno/bark"
+
+ processor = AutoProcessor.from_pretrained(HUB_PATH)
+
+ speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])
+
+ SAMPLE_RATE = 24_000
+
+ vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
+
+ # load the model
+ if device == "cpu":
+     bark = BarkModel.from_pretrained(HUB_PATH)
+ else:
+     bark = BarkModel.from_pretrained(HUB_PATH).to(device)
+     bark = bark.to_bettertransformer()
+
+
+ # inference
+ def generate_audio(text, voice_preset=None, lag=0):
+     if voice_preset not in speaker_embeddings:
+         voice_preset = None
+
+     sentences = [
+         text,
+     ]
+     inputs = processor(sentences, voice_preset=voice_preset).to(device)
+
+     # generate the fine codebook tokens (semantic -> coarse -> fine sub-models)
+     fine_output = bark.generate(
+         **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
+     )
+
+     print("Fine tokens generated")
+
+     with torch.no_grad():
+         # decode the same fine tokens twice: with Bark's encodec decoder and with Vocos
+         encodec_waveform = bark.codec_decode(fine_output)
+
+         features = vocos.codes_to_features(fine_output.transpose(0, 1))
+         vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+
+     return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
+
+
+ # Gradio Blocks demo
+ with gr.Blocks() as demo_blocks:
+     gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
+     gr.HTML("""<h3 style="text-align:center;">📢 Vocos-enhanced TTS 🦾!</h3>""")
+     with gr.Group():
+         with gr.Row():
+             inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
+             dd = gr.Dropdown(
+                 speaker_embeddings,
+                 value=None,
+                 label="Available voice presets",
+                 info="Defaults to no speaker embeddings!",
+             )
+
+         with gr.Row():
+             btn = gr.Button("Bark with Vocos TTS")
+
+         with gr.Row():
+             out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
+             out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
+
+     btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])
+
+ demo_blocks.queue().launch(debug=True)
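
For reference, the two decode paths wired into `generate_audio` can also be exercised outside Gradio. A minimal sketch, reusing the `processor`, `bark`, `vocos`, `device`, and `SAMPLE_RATE` objects defined in this `app.py` and saving the Vocos output with scipy's `write` (already imported above); the prompt and preset are arbitrary:

```python
inputs = processor(["A quick smoke test."], voice_preset="v2/en_speaker_6").to(device)
fine_tokens = bark.generate(**inputs, do_sample=True)

with torch.no_grad():
    # Vocos decodes the same EnCodec token grid that bark.codec_decode consumes,
    # using the same transpose as in generate_audio above.
    features = vocos.codes_to_features(fine_tokens.transpose(0, 1))
    waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

write("vocos_sample.wav", SAMPLE_RATE, waveform.cpu().squeeze().numpy())
```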