stanimirovb committed on
Commit
2ef4b5e
1 Parent(s): 95fc384

gradio demo

Browse files
Files changed (5) hide show
  1. .gitignore +10 -0
  2. LICENSE +21 -0
  3. README.md +8 -4
  4. app.py +263 -0
  5. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # ides, editors
2
+ .vscode/
3
+
4
+ # temporary dev artefacts
5
+ tmp/
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 PlayHT
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,17 @@
1
  ---
2
  title: Play Voice V0 Demo
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 4.9.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Play Voice V0 Demo
3
+ emoji: 🔊
4
+ colorFrom: red
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.8.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ models:
12
+ - PlayHT/play-voice-v0-multi
13
+ datasets:
14
+ - PlayHT/play-voice-voices
15
  ---
16
 
17
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import torch
7
+ import torchaudio
8
+ from huggingface_hub import snapshot_download
9
+
10
+ from play_voice_inference.utils.voice_tokenizer import VoiceBpeTokenizer
11
+ from play_voice_inference.models.play_voice import LanguageIdentifiers, SpeakerAttributes, SpeechAttributes, load_play_voice
12
+ from play_voice_inference.utils.play_voice_sampler import PlayVoiceSampler
13
+ from play_voice_inference.utils.pv_diff_sampler import PlayVoiceDiffusionDecoderSampler
14
+
15
# This demo only runs inference: disable autograd globally and target the GPU.
torch.set_grad_enabled(False)
device = torch.device("cuda")

# Hugging Face access token; a missing variable raises KeyError at import time.
HF_TOKEN = os.environ["HF_TOKEN"]
print("Loading models...")

tokenizer = VoiceBpeTokenizer()

# Download the model snapshot; the checkpoint paths below are resolved inside it.
MODEL_DIR = snapshot_download("PlayHT/play-voice-v0-multi", token=HF_TOKEN)

PV_AR_PT = f"{MODEL_DIR}/pv-v1-ar.pth"
play_voice = load_play_voice(PV_AR_PT, device)
sampler = PlayVoiceSampler(play_voice).to(device)

# Diffusion decoder / vocoder used to turn sampled latents into a waveform.
NUM_DIFFUSION_STEPS: int = 150
DIFFUSION_PT = f"{MODEL_DIR}/pv-v1-diff-xf.pth"
DIFFUSION_VOCODER_PT = f"{MODEL_DIR}/pv-v1-diff-bigvgan.pt"
vocoder = PlayVoiceDiffusionDecoderSampler.from_path(
    DIFFUSION_PT,
    DIFFUSION_VOCODER_PT,
    steps=NUM_DIFFUSION_STEPS,
    silent=True,
    use_fp16=True,
    device=device,
)

print("Preparing voices...")
# The built-in reference voices live in a dataset repository, not a model one.
VOICES_DIR = snapshot_download(
    "PlayHT/play-voice-voices",
    repo_type="dataset",
    token=HF_TOKEN,
)
43
+
44
def load_audio(path: str, sr=24000):
    """Load an audio file and return its waveform at sample rate ``sr``.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.
        sr: Target sample rate in Hz. The waveform is resampled only when the
            file's own rate differs from it.

    Returns:
        The waveform tensor produced by ``torchaudio.load``, resampled if
        necessary.
    """
    waveform, native_sr = torchaudio.load(path)
    if native_sr == sr:
        return waveform
    resampler = torchaudio.transforms.Resample(native_sr, sr)
    return resampler(waveform)
49
+
50
def make_pcm(audio: torch.Tensor):
    """Convert a floating-point waveform tensor to a 16-bit PCM numpy array.

    Gradio's audio output needs int16 samples. All size-1 dimensions are
    squeezed away, the samples are scaled by 2**15, clipped to the int16
    range and cast.

    Args:
        audio: Waveform tensor; values in [-1.0, 1.0] map onto the int16 range.

    Returns:
        ``numpy.ndarray`` of dtype int16.
    """
    samples = audio.squeeze().cpu().numpy()

    pcm_info = np.iinfo("int16")
    full_scale = 2 ** (pcm_info.bits - 1)
    # For a signed type min == -full_scale, so this offset evaluates to 0.
    zero_offset = pcm_info.min + full_scale

    scaled = samples * full_scale + zero_offset
    return scaled.clip(pcm_info.min, pcm_info.max).astype("int16")
62
+
63
# Every .wav file in the voices snapshot becomes a built-in voice, named after
# the file (extension removed) and kept in alphabetical order by that name.
initial_voices = sorted(
    (
        {
            "name": os.path.splitext(filename)[0],
            "audio": load_audio(os.path.join(VOICES_DIR, filename)),
        }
        for filename in os.listdir(VOICES_DIR)
        if filename.endswith(".wav")
    ),
    key=lambda entry: entry["name"],
)
print(f"Found {len(initial_voices)} initial voices")
70
+
71
def get_voice_labels(voices: list[dict]):
    """Return the ``"name"`` of every voice entry, preserving input order.

    Args:
        voices: Voice records, each a dict with at least a ``"name"`` key.

    Returns:
        A new list of the voice names.
    """
    return [entry["name"] for entry in voices]
76
+
77
+
78
# Sample rate (Hz) of every stored voice waveform: built-in voices are loaded
# at this rate and cloned voices are resampled to it before being stored.
VOICE_SAMPLE_RATE = 24000

# NOTE(review): the original indentation was lost; the nesting below (in
# particular which controls sit inside the "Settings" accordion) is
# reconstructed and should be confirmed against the running app.
with gr.Blocks(analytics_enabled=False, title="Play Voice", mode="tts") as iface:
    # Session voice list, seeded with the built-in voices.
    local_voices = gr.State(initial_voices)

    def get_selected_voice_by_label(voices, label: str):
        """Return the first voice whose name equals ``label``.

        Raises:
            Exception: if no voice carries that name.
        """
        # NOTE: names are not unique (e.g. two clones both called
        # "cloned-voice"); the earliest match wins.
        for voice in voices:
            if voice["name"] == label:
                return voice
        # f-string instead of "+" so a missing selection (label is None)
        # reports "not found" rather than failing with a TypeError.
        raise Exception(f"Voice not found: {label}")

    def make_voice_dropdown(voices):
        """Build the "Voice" dropdown, preselecting the last voice (if any)."""
        choices = get_voice_labels(voices)
        return gr.Dropdown(
            choices=choices,
            value=choices[-1] if choices else None,
            label="Voice",
        )

    def make_enum_dropdown(enum, label, default=None, allow_none=False):
        """Build a dropdown listing the member names of ``enum``.

        When ``allow_none`` is set, an extra "none" choice is appended.
        """
        choices = [e.name for e in enum]
        if allow_none:
            choices.append("none")
        return gr.Dropdown(
            choices=choices,
            value=default,
            label=label,
        )

    def get_enum_value(enum, value):
        """Map a dropdown value back to an enum member; "none" maps to None."""
        if value == "none":
            return None
        return enum[value]

    gr.Markdown("# Play Voice\n")

    with gr.Tab("TTS"):
        speak_text = gr.Textbox(lines=2, placeholder="What would you like to say?", label="Text")
        speak_voice = make_voice_dropdown(initial_voices)

        with gr.Accordion("Settings", open=False):
            speaker_attributes = make_enum_dropdown(
                SpeakerAttributes, "Speaker Attributes", "full_sentence", allow_none=True
            )
            speech_attributes = make_enum_dropdown(SpeechAttributes, "Speech Attributes", "none", allow_none=True)
            language = make_enum_dropdown(LanguageIdentifiers, "Language", "none", allow_none=True)

            temperature = gr.Slider(minimum=0, maximum=2.0, value=0.3, label="Temperature")
            repetition_penalty = gr.Slider(minimum=1.0, maximum=10.0, value=1.8, label="Repetition Penalty")
            filter_thresh = gr.Slider(minimum=0.1, maximum=1.0, value=0.75, label="Top-p Threshold")

            voice_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.4, label="Voice Guidance")
            style_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.1, label="Style Guidance")
            text_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.6, label="Text Guidance")

        speak_submit = gr.Button("Speak!")
        speak_result = gr.Audio(label="Result", interactive=False)
        ref_voice = gr.Audio(label="Reference Voice", interactive=False)

        @torch.no_grad()
        def handle_speak(
            text,
            voices,
            voice_name,
            voice_guidance,
            speaker_attributes,
            speech_attributes,
            language,
            temperature,
            repetition_penalty,
            top_p,
            style_guidance,
            text_guidance,
        ):
            """Synthesize ``text`` in the selected voice.

            Returns a dict keyed by output component: the generated audio and
            the reference voice audio, each as ``(sample_rate, int16 array)``.
            Blank text falls back to a built-in demo sentence.
            """
            if text.strip() == "":
                text = "I am PlayVoice, the voice of the future. Feed me your words and I will speak them, hahahaha!"
            voice = get_selected_voice_by_label(voices, voice_name)
            # A fresh random seed per request, applied right before sampling.
            seed = random.randint(0, 2**32 - 1)

            print(f"Voice: {voice['name']} Text: {text}")

            voice_emb = sampler.get_voice_embedding(voice["audio"])

            # Batch of one: pad_sequence turns the single token sequence into
            # a [1, length] tensor.
            text_tokens = []
            text_tokens.append(torch.tensor(tokenizer.encode(text), dtype=torch.int, device=device))
            text_tokens = torch.nn.utils.rnn.pad_sequence(text_tokens, batch_first=True, padding_value=0)

            torch.manual_seed(seed)
            sample_result = sampler.sample_batched(
                text_tokens=text_tokens,
                text_guidance=text_guidance,
                voice_emb=voice_emb,
                voice_guidance=voice_guidance,
                speaker_attributes=get_enum_value(SpeakerAttributes, speaker_attributes),
                speech_attributes=get_enum_value(SpeechAttributes, speech_attributes),
                language_identifier=get_enum_value(LanguageIdentifiers, language),
                style_guidance=float(style_guidance),
                temperature=float(temperature),
                repetition_penalty=float(repetition_penalty),
                top_p=float(top_p),
            )

            latents = sample_result["latents"]

            audio = vocoder.sample(text_tokens, latents, ref_wav=voice["audio"])
            audio = make_pcm(audio)

            return {
                speak_result: (vocoder.OUTPUT_FREQUENCY, audio),
                # Stored voices are 24 kHz; reporting 22050 Hz here made the
                # reference play back slowed down and pitched lower.
                ref_voice: (VOICE_SAMPLE_RATE, make_pcm(voice["audio"])),
            }

        speak_submit.click(
            handle_speak,
            inputs=[
                speak_text,
                local_voices,
                speak_voice,
                voice_guidance,
                speaker_attributes,
                speech_attributes,
                language,
                temperature,
                repetition_penalty,
                filter_thresh,
                style_guidance,
                text_guidance,
            ],
            outputs=[
                speak_result,
                ref_voice,
            ],
        )

    with gr.Tab("Clone Voice"):
        new_voice_name = gr.Textbox(value="cloned-voice", label="Voice Name")
        new_voice_audio = gr.Audio(
            label="Voice Audio (20s min, ideally 30s, anything longer will be truncated)",
            sources=["upload", "microphone"],
        )
        new_voice_submit = gr.Button("Create!")
        new_voice_result = gr.Label("")

        def on_new_voice_submit(voices, name, raw_audio):
            """Turn uploaded/recorded audio into a new voice entry.

            The audio is converted to mono float, resampled to
            ``VOICE_SAMPLE_RATE``, trimmed to 30 seconds and appended to
            ``voices``. Returns updates for the voice dropdown and the status
            label.

            Raises:
                gr.Error: if no audio was provided.
                Exception: if the audio's sample rate is below 16 kHz.
            """
            # Explicit check instead of ``assert``, which is stripped under -O.
            if raw_audio is None:
                raise gr.Error("Must provide audio")

            sr = raw_audio[0]
            # assumes the component delivers int16 samples — TODO confirm
            torch_audio = torch.from_numpy(raw_audio[1]).float() / 32768.0

            if torch_audio.ndim == 1:
                torch_audio = torch_audio.unsqueeze(0)
            else:
                # [samples, channels] -> [channels, samples], then downmix to mono.
                torch_audio = torch_audio.transpose(0, 1).mean(dim=0, keepdim=True)

            if sr != VOICE_SAMPLE_RATE:
                if sr < 16000:
                    raise Exception(
                        "Garbage in, garbage out. Please provide audio with a sample rate of at least 16kHz, ideally 24kHz."
                    )
                torch_audio = torchaudio.transforms.Resample(sr, VOICE_SAMPLE_RATE)(torch_audio)

            # trim to 30s
            max_samples = VOICE_SAMPLE_RATE * 30
            if torch_audio.shape[1] > max_samples:
                torch_audio = torch_audio[:, :max_samples]

            # add to local voices
            voices.append({"name": name, "audio": torch_audio})

            return {
                speak_voice: make_voice_dropdown(voices),
                new_voice_result: f"Created voice {name}",
            }

        new_voice_submit.click(
            on_new_voice_submit,
            inputs=[
                local_voices,
                new_voice_name,
                new_voice_audio,
            ],
            outputs=[
                speak_voice,
                new_voice_result,
            ],
        )


iface.launch(show_error=True, share=False)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ git+https://${GITHUB_TOKEN}@github.com/playht/play-voice-inference.git@v1.0.0