Spaces:
Runtime error
Runtime error
stanimirovb
committed on
Commit
•
2ef4b5e
1
Parent(s):
95fc384
gradio demo
Browse files- .gitignore +10 -0
- LICENSE +21 -0
- README.md +8 -4
- app.py +263 -0
- requirements.txt +4 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ides, editors
|
2 |
+
.vscode/
|
3 |
+
|
4 |
+
# temporary dev artefacts
|
5 |
+
tmp/
|
6 |
+
|
7 |
+
# Byte-compiled / optimized / DLL files
|
8 |
+
__pycache__/
|
9 |
+
*.py[cod]
|
10 |
+
*$py.class
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 PlayHT
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1,17 @@
|
|
1 |
---
|
2 |
title: Play Voice V0 Demo
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
|
|
|
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Play Voice V0 Demo
|
3 |
+
emoji: 🔊
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: pink
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.8.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
+
models:
|
12 |
+
- PlayHT/play-voice-v0-multi
|
13 |
+
datasets:
|
14 |
+
- PlayHT/play-voice-voices
|
15 |
---
|
16 |
|
17 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torchaudio
|
8 |
+
from huggingface_hub import snapshot_download
|
9 |
+
|
10 |
+
from play_voice_inference.utils.voice_tokenizer import VoiceBpeTokenizer
|
11 |
+
from play_voice_inference.models.play_voice import LanguageIdentifiers, SpeakerAttributes, SpeechAttributes, load_play_voice
|
12 |
+
from play_voice_inference.utils.play_voice_sampler import PlayVoiceSampler
|
13 |
+
from play_voice_inference.utils.pv_diff_sampler import PlayVoiceDiffusionDecoderSampler
|
14 |
+
|
15 |
+
# Inference-only demo: switch autograd off for the whole process.
torch.set_grad_enabled(False)
# Prefer the GPU, but fall back to CPU so the app can still start on
# hardware without CUDA instead of failing while the models are loaded.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Token used for both Hub downloads below; indexing os.environ directly
# fails fast with a KeyError when the variable is not set.
HF_TOKEN = os.environ['HF_TOKEN']
print("Loading models...")

tokenizer = VoiceBpeTokenizer()

# Local directory of the downloaded model repository snapshot.
MODEL_DIR = snapshot_download('PlayHT/play-voice-v0-multi', token=HF_TOKEN)

# Checkpoint handed to load_play_voice; the result is wrapped in the sampler.
PV_AR_PT = os.path.join(MODEL_DIR, 'pv-v1-ar.pth')
play_voice = load_play_voice(PV_AR_PT, device)
sampler = PlayVoiceSampler(play_voice).to(device)

# Diffusion decoder + vocoder checkpoints used to turn latents into audio.
NUM_DIFFUSION_STEPS: int = 150
DIFFUSION_PT = os.path.join(MODEL_DIR, 'pv-v1-diff-xf.pth')
DIFFUSION_VOCODER_PT = os.path.join(MODEL_DIR, 'pv-v1-diff-bigvgan.pt')
vocoder = PlayVoiceDiffusionDecoderSampler.from_path(
    DIFFUSION_PT,
    DIFFUSION_VOCODER_PT,
    steps=NUM_DIFFUSION_STEPS,
    silent=True,
    # Half precision is only requested on CUDA.
    # NOTE(review): assumes the decoder also runs in fp32 on CPU - confirm.
    use_fp16=device.type == 'cuda',
    device=device,
)

print("Preparing voices...")
# Local directory of the downloaded preset-voice dataset snapshot.
VOICES_DIR = snapshot_download('PlayHT/play-voice-voices', repo_type='dataset', token=HF_TOKEN)
|
43 |
+
|
44 |
+
def load_audio(path: str, sr=24000):
    """Load an audio file as a mono waveform at the requested sample rate.

    Args:
        path: Path of the audio file, passed to ``torchaudio.load``.
        sr: Target sample rate in Hz. Audio stored at a different rate is
            resampled to it.

    Returns:
        The waveform tensor from ``torchaudio.load``, averaged down to a
        single channel when the file has several, at ``sr`` Hz.
    """
    audio, orig_sr = torchaudio.load(path)
    # torchaudio.load returns [channels, samples] by default. Average the
    # channels so preset voices have the same single-channel layout that
    # the voice-cloning path produces; mono files are left untouched.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    if orig_sr != sr:
        audio = torchaudio.transforms.Resample(orig_sr, sr)(audio)
    return audio
|
49 |
+
|
50 |
+
def make_pcm(audio: torch.Tensor):
    """Convert a float waveform tensor to 16-bit PCM samples for Gradio.

    Args:
        audio: Waveform tensor. All size-1 dimensions are squeezed away
            before conversion. Presumably the samples lie in [-1.0, 1.0];
            anything outside that range is clipped.

    Returns:
        A NumPy ``int16`` array holding the samples scaled by 2**15 and
        clipped to the int16 range.
    """
    pcm_info = np.iinfo("int16")
    full_scale = 2 ** (pcm_info.bits - 1)  # 32768 for 16-bit audio
    samples = audio.squeeze().cpu().numpy()
    # +1.0 maps to 32768, one past the int16 maximum, so clip before casting
    # to avoid wrap-around.
    return (samples * full_scale).clip(pcm_info.min, pcm_info.max).astype("int16")
|
62 |
+
|
63 |
+
# Every .wav file in the voices snapshot becomes a preset voice named after
# the file (extension removed); the presets are kept ordered by name.
initial_voices = sorted(
    (
        {
            "name": os.path.splitext(filename)[0],
            "audio": load_audio(os.path.join(VOICES_DIR, filename)),
        }
        for filename in os.listdir(VOICES_DIR)
        if filename.endswith(".wav")
    ),
    key=lambda voice: voice["name"],
)
print(f"Found {len(initial_voices)} initial voices")
|
70 |
+
|
71 |
+
def get_voice_labels(voices: list[dict]):
    """Return the ``"name"`` of every voice, in the order of ``voices``.

    Args:
        voices: Voice records; each must provide a ``"name"`` key.

    Returns:
        A new list with one label per voice.
    """
    return [voice["name"] for voice in voices]
|
76 |
+
|
77 |
+
|
78 |
+
with gr.Blocks(analytics_enabled=False, title="Play Voice", mode="tts") as iface:
|
79 |
+
local_voices = gr.State(initial_voices)
|
80 |
+
|
81 |
+
def get_selected_voice_by_label(voices, label: str):
    """Return the first voice whose ``"name"`` equals ``label``.

    Args:
        voices: Voice records, each providing a ``"name"`` key.
        label: Name to look up, as selected in the voice dropdown.

    Returns:
        The matching voice record (the same object that is in ``voices``).

    Raises:
        Exception: If no voice carries that name.
    """
    for voice in voices:
        if voice["name"] == label:
            return voice
    # Format instead of concatenating: with no selection the label is None,
    # and "..." + None would raise a TypeError that hides the real problem.
    raise Exception(f"Voice not found: {label}")
|
87 |
+
|
88 |
+
def make_voice_dropdown(voices):
    """Build the "Voice" dropdown for the given voices.

    The choices are the voice names in list order. The last name is
    preselected, or nothing when the list is empty.
    """
    labels = get_voice_labels(voices)
    preselected = labels[-1] if labels else None
    return gr.Dropdown(choices=labels, value=preselected, label="Voice")
|
95 |
+
|
96 |
+
def make_enum_dropdown(enum, label, default=None, allow_none=False):
    """Build a dropdown whose choices are the member names of ``enum``.

    Args:
        enum: Iterable of members exposing a ``name`` attribute.
        label: Caption shown on the dropdown.
        default: Initially selected choice.
        allow_none: When true, an extra ``"none"`` choice is added after
            the member names.
    """
    options = [member.name for member in enum]
    if allow_none:
        options = [*options, "none"]
    return gr.Dropdown(choices=options, value=default, label=label)
|
105 |
+
|
106 |
+
def get_enum_value(enum, value):
    """Translate a dropdown choice back into a member of ``enum``.

    Args:
        enum: Enumeration to look the name up in.
        value: Selected member name, the ``"none"`` placeholder, or ``None``
            when nothing is selected.

    Returns:
        ``enum[value]``, or ``None`` for ``"none"`` / ``None``.

    Raises:
        KeyError: If ``value`` is not the name of a member of ``enum``.
    """
    # A missing selection is treated like the explicit "none" choice rather
    # than being looked up (enum[None] would raise KeyError).
    if value is None or value == "none":
        return None
    return enum[value]
|
110 |
+
|
111 |
+
gr.Markdown("# Play Voice\n")
|
112 |
+
|
113 |
+
with gr.Tab("TTS"):
|
114 |
+
speak_text = gr.Textbox(lines=2, placeholder="What would you like to say?", label="Text")
|
115 |
+
speak_voice = make_voice_dropdown(initial_voices)
|
116 |
+
|
117 |
+
with gr.Accordion("Settings", open=False):
|
118 |
+
speaker_attributes = make_enum_dropdown(
|
119 |
+
SpeakerAttributes, "Speaker Attributes", "full_sentence", allow_none=True
|
120 |
+
)
|
121 |
+
speech_attributes = make_enum_dropdown(SpeechAttributes, "Speech Attributes", "none", allow_none=True)
|
122 |
+
language = make_enum_dropdown(LanguageIdentifiers, "Language", "none", allow_none=True)
|
123 |
+
|
124 |
+
temperature = gr.Slider(minimum=0, maximum=2.0, value=0.3, label="Temperature")
|
125 |
+
repetition_penalty = gr.Slider(minimum=1.0, maximum=10.0, value=1.8, label="Repetition Penalty")
|
126 |
+
filter_thresh = gr.Slider(minimum=0.1, maximum=1.0, value=0.75, label="Top-p Threshold")
|
127 |
+
|
128 |
+
voice_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.4, label="Voice Guidance")
|
129 |
+
style_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.1, label="Style Guidance")
|
130 |
+
text_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.6, label="Text Guidance")
|
131 |
+
|
132 |
+
speak_submit = gr.Button("Speak!")
|
133 |
+
speak_result = gr.Audio(label="Result", interactive=False)
|
134 |
+
ref_voice = gr.Audio(label="Reference Voice", interactive=False)
|
135 |
+
|
136 |
+
@torch.no_grad()
def handle_speak(
    text,
    voices,
    voice_name,
    voice_guidance,
    speaker_attributes,
    speech_attributes,
    language,
    temperature,
    repetition_penalty,
    top_p,
    style_guidance,
    text_guidance,
):
    """Synthesize ``text`` in the selected voice (click handler of "Speak!").

    Args:
        text: Text to speak; a built-in demo sentence is used when blank.
        voices: Voice records available in this session.
        voice_name: Name of the voice to use; looked up in ``voices``.
        voice_guidance, style_guidance, text_guidance: Guidance strengths
            forwarded to the sampler.
        speaker_attributes, speech_attributes, language: Dropdown choices,
            converted with ``get_enum_value`` before being forwarded.
        temperature, repetition_penalty, top_p: Sampling settings forwarded
            to the sampler.

    Returns:
        A dict mapping the ``speak_result`` and ``ref_voice`` audio
        components to ``(sample_rate, int16 samples)`` tuples.

    Raises:
        Exception: If ``voice_name`` does not match any voice.
    """
    # Stored voices are 24 kHz: load_audio defaults to 24000 and cloned
    # voices are resampled to 24000. Labelling the reference as 22050 Hz
    # would play it back slowed down and lower in pitch.
    voice_sample_rate = 24000

    if not text.strip():
        text = "I am PlayVoice, the voice of the future. Feed me your words and I will speak them, hahahaha!"
    voice = get_selected_voice_by_label(voices, voice_name)
    seed = random.randint(0, 2**32 - 1)

    print(f"Voice: {voice['name']} Text: {text}")

    voice_emb = sampler.get_voice_embedding(voice["audio"])

    # One encoded sentence with a leading batch dimension of 1.
    text_tokens = torch.tensor(tokenizer.encode(text), dtype=torch.int, device=device).unsqueeze(0)

    # Seed torch with a fresh random value right before sampling.
    torch.manual_seed(seed)
    sample_result = sampler.sample_batched(
        text_tokens=text_tokens,
        text_guidance=float(text_guidance),
        voice_emb=voice_emb,
        voice_guidance=float(voice_guidance),
        speaker_attributes=get_enum_value(SpeakerAttributes, speaker_attributes),
        speech_attributes=get_enum_value(SpeechAttributes, speech_attributes),
        language_identifier=get_enum_value(LanguageIdentifiers, language),
        style_guidance=float(style_guidance),
        temperature=float(temperature),
        repetition_penalty=float(repetition_penalty),
        top_p=float(top_p),
    )

    latents = sample_result["latents"]

    # Decode the sampled latents into a waveform, conditioned on the same
    # reference audio the voice embedding was computed from.
    audio = vocoder.sample(text_tokens, latents, ref_wav=voice["audio"])
    audio = make_pcm(audio)

    return {
        speak_result: (vocoder.OUTPUT_FREQUENCY, audio),
        ref_voice: (voice_sample_rate, make_pcm(voice["audio"])),
    }
|
188 |
+
|
189 |
+
speak_submit.click(
|
190 |
+
handle_speak,
|
191 |
+
inputs=[
|
192 |
+
speak_text,
|
193 |
+
local_voices,
|
194 |
+
speak_voice,
|
195 |
+
voice_guidance,
|
196 |
+
speaker_attributes,
|
197 |
+
speech_attributes,
|
198 |
+
language,
|
199 |
+
temperature,
|
200 |
+
repetition_penalty,
|
201 |
+
filter_thresh,
|
202 |
+
style_guidance,
|
203 |
+
text_guidance,
|
204 |
+
],
|
205 |
+
outputs=[
|
206 |
+
speak_result,
|
207 |
+
ref_voice,
|
208 |
+
],
|
209 |
+
)
|
210 |
+
|
211 |
+
with gr.Tab("Clone Voice"):
|
212 |
+
new_voice_name = gr.Textbox(value="cloned-voice", label="Voice Name")
|
213 |
+
new_voice_audio = gr.Audio(label="Voice Audio (20s min, ideally 30s, anything longer will be truncated)",
|
214 |
+
sources=["upload", "microphone"],
|
215 |
+
)
|
216 |
+
new_voice_submit = gr.Button("Create!")
|
217 |
+
new_voice_result = gr.Label("")
|
218 |
+
|
219 |
+
def on_new_voice_submit(voices, name, raw_audio):
    """Register an uploaded or recorded clip as a new voice ("Create!" handler).

    The clip is converted to a float tensor, averaged down to one channel,
    resampled to 24 kHz, cut to at most 30 seconds and stored in ``voices``.

    Args:
        voices: Voice records of this session; modified in place.
        name: Name for the new voice. An existing voice with the same name
            is replaced.
        raw_audio: ``(sample_rate, samples)`` tuple from the audio input, or
            ``None`` when nothing was provided. Multi-dimensional samples
            are treated as ``[samples, channels]``.

    Returns:
        A dict mapping ``speak_voice`` to a rebuilt dropdown (new voice
        preselected) and ``new_voice_result`` to a confirmation message.

    Raises:
        gr.Error: If no audio was provided or its sample rate is below
            16 kHz.
    """
    target_sr = 24000
    max_seconds = 30

    # Raise explicitly: an assert would be stripped under `python -O`.
    if raw_audio is None:
        raise gr.Error("Must provide audio")

    sr, samples = raw_audio[0], np.asarray(raw_audio[1])
    torch_audio = torch.from_numpy(samples).float()
    # Scale integer PCM by the full-scale value of its actual width, so that
    # wider data (e.g. int32) is not left far outside [-1, 1]. 16-bit input
    # is still divided by 32768.
    # NOTE(review): floating-point input is assumed to be normalized already.
    if np.issubdtype(samples.dtype, np.signedinteger):
        torch_audio = torch_audio / float(2 ** (np.iinfo(samples.dtype).bits - 1))

    if torch_audio.ndim == 1:
        torch_audio = torch_audio.unsqueeze(0)
    else:
        # [samples, channels] -> [channels, samples] -> mono [1, samples]
        torch_audio = torch_audio.transpose(0, 1).mean(dim=0, keepdim=True)

    if sr != target_sr:
        if sr < 16000:
            raise gr.Error(
                "Garbage in, garbage out. Please provide audio with a sample rate of at least 16kHz, ideally 24kHz."
            )
        torch_audio = torchaudio.transforms.Resample(sr, target_sr)(torch_audio)

    # trim to 30s
    torch_audio = torch_audio[:, : target_sr * max_seconds]

    # Voices are looked up by name and the first match wins, so drop any
    # earlier voice with this name before adding the new one; otherwise the
    # new clip could never be selected.
    voices[:] = [voice for voice in voices if voice["name"] != name]
    voices.append({"name": name, "audio": torch_audio})

    return {
        speak_voice: make_voice_dropdown(voices),
        new_voice_result: f"Created voice {name}",
    }
|
248 |
+
|
249 |
+
new_voice_submit.click(
|
250 |
+
on_new_voice_submit,
|
251 |
+
inputs = [
|
252 |
+
local_voices,
|
253 |
+
new_voice_name,
|
254 |
+
new_voice_audio
|
255 |
+
],
|
256 |
+
outputs=[
|
257 |
+
speak_voice,
|
258 |
+
new_voice_result
|
259 |
+
]
|
260 |
+
)
|
261 |
+
|
262 |
+
|
263 |
+
iface.launch(show_error=True, share=False)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
torchaudio
|
3 |
+
transformers
|
4 |
+
git+https://${GITHUB_TOKEN}@github.com/playht/play-voice-inference.git@v1.0.0
|