antoniomae1234 committed on
Commit
80ddb80
1 Parent(s): e0340e5

Upload 9 files

.gitattributes ADDED
@@ -0,0 +1 @@
+ examples/female.wav filter=lfs diff=lfs merge=lfs -text
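The attribute above routes examples/female.wav through Git LFS, so a plain checkout without LFS support sees only a small pointer file (shown further down in this commit). A minimal sketch of fetching the actual audio from the Hub instead, assuming the Space repo is public and reusing the repo_id that app.py references:

```python
# Sketch: fetch the LFS-backed wav directly from the Hub (assumes a public repo).
from huggingface_hub import hf_hub_download

wav_path = hf_hub_download(
    repo_id="coqui/xtts-streaming",  # repo_id used in app.py; the fork's id may differ
    repo_type="space",
    filename="examples/female.wav",
)
print(wav_path)  # local cached path to the actual audio, not the pointer
```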
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: XTTS-streaming
+ emoji: 🐸
+ colorFrom: green
+ colorTo: red
+ sdk: static
+ pinned: false
+ models:
+ - coqui/XTTS-v1
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,550 @@
+ import sys
+ import io, os, stat
+ import subprocess
+ import random
+ from zipfile import ZipFile
+ import uuid
+
+ import time
+ import torch
+ import torchaudio
+ # By using XTTS you agree to the CPML license: https://coqui.ai/cpml
+ os.environ["COQUI_TOS_AGREED"] = "1"
+
+ # langid is used to detect the language of longer texts.
+ # Most users expect the text to be in their own language; a checkbox disables detection.
+ import langid
+
+ import base64
+ import csv
+ from io import StringIO
+ import datetime
+
+ import gradio as gr
+ from scipy.io.wavfile import write
+ from pydub import AudioSegment
+
+ from TTS.api import TTS
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ from huggingface_hub import HfApi
+
+ # The API is used to restart the Space on an unrecoverable error
+ api = HfApi(token=HF_TOKEN)
+ repo_id = "coqui/xtts-streaming"
+
+ # Use a newer ffmpeg binary on Ubuntu 20 so denoising is available for microphone input
+ print("Export newer ffmpeg binary for denoise filter")
+ ZipFile("ffmpeg.zip").extractall()
+ print("Make ffmpeg binary executable")
+ st = os.stat("ffmpeg")
+ os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
+
+ # This triggers downloading the model if it is not cached yet
+ print("Downloading Coqui XTTS v1.1 if not already downloaded")
+ from TTS.utils.manage import ModelManager
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
+ ModelManager().download_model(model_name)
+ model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+ print("XTTS downloaded")
+
+ config = XttsConfig()
+ config.load_json(os.path.join(model_path, "config.json"))
+ model = Xtts.init_from_config(config)
+ model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(model_path, "model.pth"),
+     vocab_path=os.path.join(model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True,
+ )
+ model.cuda()
+
+ # "ja" should already be listed; append it just to be sure
+ if "ja" not in config.languages:
+     config.languages.append("ja")
+
+ # These globals are for debugging purposes only
+ DEVICE_ASSERT_DETECTED = 0
+ DEVICE_ASSERT_PROMPT = None
+ DEVICE_ASSERT_LANG = None
+
+ # supported_languages = ["pt"]
+ supported_languages = config.languages
+
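+ # Note: predict() below is a generator. Each yield hands Gradio a partial output
+ # tuple, so the streaming Audio output starts playing chunks before synthesis finishes.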
+ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree):
+     if agree:
+         if language not in supported_languages:
+             gr.Warning("The language you entered is not in our supported languages; please choose one from the dropdown")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+
+         language_predicted = langid.classify(prompt)[0].strip()  # strip needed as there is a space at the end!
+
+         # tts expects Chinese as zh-cn
+         if language_predicted == "zh":
+             # we use zh-cn
+             language_predicted = "zh-cn"
+         print(f"Detected language:{language_predicted}, Chosen language:{language}")
+
+         # Trigger language detection only for text longer than 15 characters
+         if len(prompt) > 15:
+             # allow any language for short text, as some words are common across languages.
+             # If the user unchecks language auto-detection it will not trigger.
+             # You may remove this completely for your own use.
+             if language_predicted != language and not no_lang_auto_detect:
+                 # Please duplicate the Space and remove this check if you really want this,
+                 # or if the auto-detector fails to identify the language (which it can on short or mixed text)
+                 gr.Warning("It looks like your text is not in the language you chose. If you are sure it is, please tick the checkbox that disables language auto-detection")
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+         if use_mic:
+             if mic_file_path is not None:
+                 speaker_wav = mic_file_path
+             else:
+                 gr.Warning("Please record your voice with the microphone, or uncheck Use Microphone to use the reference audios")
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+         else:
+             speaker_wav = audio_file_pth
+
+         # Filtering for microphone input, as it may have background noise and silence
+         # at the beginning and end. This is fast filtering, not perfect.
+
+         # Apply all filters on demand
+         lowpassfilter = denoise = trim = loudness = True
+
+         if lowpassfilter:
+             lowpass_highpass = "lowpass=8000,highpass=75,"
+         else:
+             lowpass_highpass = ""
+
+         if trim:
+             # better to remove silence at the beginning and end of microphone input
+             trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+         else:
+             trim_silence = ""
+
+         if voice_cleanup:
+             try:
+                 out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # the extension lets ffmpeg infer the output format
+
+                 # we use the newer ffmpeg, as it has the afftdn denoise filter
+                 shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
+
+                 command_result = subprocess.run(shell_command, capture_output=False, text=True, check=True)
+                 speaker_wav = out_filename
+                 print("Filtered microphone input")
+             except subprocess.CalledProcessError:
+                 # There was an error - the command exited with a non-zero code
+                 print("Error: filtering failed, using original microphone input")
+
+         if len(prompt) < 1:
+             gr.Warning("Please give a longer prompt text")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         if len(prompt) > 3000:
+             gr.Warning("Text length is limited to 3000 characters for this demo; please try a shorter text. You can clone this Space and edit the code for your own usage")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         global DEVICE_ASSERT_DETECTED
+         if DEVICE_ASSERT_DETECTED:
+             global DEVICE_ASSERT_PROMPT
+             global DEVICE_ASSERT_LANG
+             # It will likely never get here, as we now restart the Space on the first unrecoverable error
+             print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
+
+         metrics_text = ""
+
+         try:
+             t_latent = time.time()
+             try:
+                 gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+             except Exception as e:
+                 print("Speaker encoding error", str(e))
+                 gr.Warning("It appears something is wrong with the reference audio. Did you unmute your microphone?")
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+             latent_calculation_time = time.time() - t_latent
+             # metrics_text = f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
+             wav_chunks = []
+
+             t_inference = time.time()
+
+             chunks = model.inference_stream(
+                 prompt,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+             )
+
+             first_chunk = True
+             for i, chunk in enumerate(chunks):
+                 if first_chunk:
+                     first_chunk_time = time.time() - t_inference
+                     metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                     first_chunk = False
+
+                 wav_chunks.append(chunk)
+                 print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+                 out_file = f"{i}.wav"
+                 write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+                 audio = AudioSegment.from_file(out_file)
+                 audio.export(out_file, format="wav")
+
+                 yield (None, out_file, metrics_text, None)
+
+         except RuntimeError as e:
+             if "device-side assert" in str(e):
+                 # nothing can be done about a CUDA device-side error; we need to restart
+                 print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
+                 gr.Warning("Unhandled exception encountered, please retry in a minute")
+                 print("CUDA device-assert runtime error encountered, need restart")
+                 if not DEVICE_ASSERT_DETECTED:
+                     DEVICE_ASSERT_DETECTED = 1
+                     DEVICE_ASSERT_PROMPT = prompt
+                     DEVICE_ASSERT_LANG = language
+
+                 # Just before restarting, save what caused the issue so it can be handled in the future.
+                 # Uploading error data only happens for unrecoverable errors.
+                 error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
+                 error_data = [error_time, prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree]
+                 error_data = [str(d) if type(d) != str else d for d in error_data]
+                 print(error_data)
+                 print(speaker_wav)
+                 write_io = StringIO()
+                 csv.writer(write_io).writerows([error_data])
+                 csv_upload = write_io.getvalue().encode()
+
+                 filename = error_time + "_xtts-stream_" + str(uuid.uuid4()) + ".csv"
+                 print("Writing error csv")
+                 error_api = HfApi()
+                 error_api.upload_file(
+                     path_or_fileobj=csv_upload,
+                     path_in_repo=filename,
+                     repo_id="coqui/xtts-flagged-dataset",
+                     repo_type="dataset",
+                 )
+
+                 # speaker_wav
+                 print("Writing error reference audio")
+                 speaker_filename = error_time + "_reference_xtts-stream_" + str(uuid.uuid4()) + ".wav"
+                 error_api = HfApi()
+                 error_api.upload_file(
+                     path_or_fileobj=speaker_wav,
+                     path_in_repo=speaker_filename,
+                     repo_id="coqui/xtts-flagged-dataset",
+                     repo_type="dataset",
+                 )
+
+                 # HF Space specific: this error is unrecoverable, so restart the Space
+                 api.restart_space(repo_id=repo_id)
+             else:
+                 if "Failed to decode" in str(e):
+                     print("Speaker encoding error", str(e))
+                     gr.Warning("It appears something is wrong with the reference audio. Did you unmute your microphone?")
+                 else:
+                     print("RuntimeError: non device-side assert error:", str(e))
+                     gr.Warning("Something unexpected happened, please retry.")
+
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+
+         wav = torch.cat(wav_chunks, dim=0)
+         torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+
+         second_of_silence = AudioSegment.silent()  # use the default duration (1000 ms)
+         second_of_silence.export("sil.wav", format="wav")
+
+         yield (
+             gr.make_waveform(
+                 audio="output.wav",
+             ),
+             "sil.wav",
+             metrics_text,
+             speaker_wav,
+         )
+     else:
+         gr.Warning("Please accept the Terms & Conditions!")
+         return (
+             None,
+             None,
+             None,
+             None,
+         )
+
+
+ title = "Coqui🐸 XTTS - Streaming"
+
+ description = """
+ <div>
+ <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
+ <a style='display:inline-block' href='https://discord.gg/5eXr5seRrv'><img src='https://discord.com/api/guilds/1037326658807533628/widget.png?style=shield' /></a>
+ <a href="https://huggingface.co/spaces/coqui/xtts-streaming?duplicate=true">
+ <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+ </div>
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
+ <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a voice generation model that lets you clone voices into different languages using just a quick 6-second audio clip.
+ <br/>
+ XTTS builds on previous research, like Tortoise, with additional architectural innovations and training that make cross-language voice cloning and multilingual speech generation possible.
+ <br/>
+ This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
+ <br/>
+ Leave a star on GitHub at <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
+ <br/>
+ <p>For faster inference without waiting in the queue, you can duplicate this Space and upgrade to a GPU via the settings.
+ <br/>
+ </p>
+ <p>Language selectors:
+ Arabic: ar, Brazilian Portuguese: pt
+ </p>
+ <p>Notice: autoplay may not work on mobile. If you see a black waveform image on mobile, click it; your audio is there.</p>
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
+ """
+
+ article = """
+ <div style='margin:20px auto;'>
+ <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
+ <p>We collect data only for error cases, for improvement.</p>
+ </div>
+ """
+ examples = [
+     [
+         "Once when I was six years old I saw a magnificent picture",
+         "en",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
+         "fr",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
+         "de",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Cuando tenía seis años, vi una vez una imagen magnífica",
+         "es",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
+         "pt",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
+         "pl",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
+         "it",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
+         "tr",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Когда мне было шесть лет, я увидел однажды удивительную картинку",
+         "ru",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
+         "nl",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
+         "cs",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "当我还只有六岁的时候, 看到了一副精彩的插画",
+         "zh-cn",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "かつて 六歳のとき、素晴らしい絵を見ました",
+         "ja",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+ ]
+
+
+ gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Textbox(
+             label="Text Prompt",
+             info="One or two sentences at a time is better. Up to 3000 text characters.",
+             value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
+         ),
+         gr.Dropdown(
+             label="Language",
+             info="Select an output language for the synthesised speech",
+             choices=[
+                 "pt",
+             ],
+             max_choices=1,
+             value="pt",
+         ),
+         gr.Audio(
+             label="Reference Audio",
+             info="Click on the ✎ button to upload your own target speaker audio",
+             type="filepath",
+             value="examples/female.wav",
+         ),
+         gr.Audio(
+             source="microphone",
+             type="filepath",
+             info="Use your microphone to record audio",
+             label="Use Microphone for Reference",
+         ),
+         gr.Checkbox(
+             label="Use Microphone",
+             value=False,
+             info="Notice: Microphone input may not work properly under heavy traffic",
+         ),
+         gr.Checkbox(
+             label="Cleanup Reference Voice",
+             value=False,
+             info="This check can improve output if your microphone or reference voice is noisy",
+         ),
+         gr.Checkbox(
+             label="Do not use language auto-detect",
+             value=False,
+             info="Check to disable language auto-detection",
+         ),
+         gr.Checkbox(
+             label="Agree",
+             value=False,
+             info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+         ),
+     ],
+     outputs=[
+         gr.Video(label="Waveform Visual"),
+         gr.Audio(label="Synthesised Audio", streaming=True, autoplay=True),
+         gr.Text(label="Metrics"),
+         gr.Audio(label="Reference Audio Used"),
+     ],
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+     cache_examples=False,
+ ).queue().launch(debug=True, show_api=True)
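Since launch() is called with show_api=True, the endpoint can also be exercised programmatically. A minimal client sketch, assuming gradio_client is installed and the Space is reachable; the positional arguments mirror the eight Interface inputs above, and "/predict" is the conventional api_name for a single-function Interface:

```python
# Minimal client sketch (assumptions: public Space, `pip install gradio_client` done).
from gradio_client import Client

client = Client("coqui/xtts-streaming")  # Space id from repo_id in app.py
result = client.predict(
    "Hello there, this is a cloned voice.",  # Text Prompt
    "pt",                                    # Language (this fork's dropdown only offers "pt")
    "examples/female.wav",                   # Reference Audio
    None,                                    # Microphone recording
    False,                                   # Use Microphone
    False,                                   # Cleanup Reference Voice
    False,                                   # Do not use language auto-detect
    True,                                    # Agree to CPML
    api_name="/predict",
)
print(result)  # final update from the generator: waveform video, audio, metrics, reference audio
```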
examples/examples_.DS_Store ADDED
Binary file (6.15 kB). View file
 
examples/female.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89a4fa9a16b6463f852cf9424f72c3d3c87aa83010e89db534c53fcd1ae12c02
+ size 1002030
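This is a Git LFS pointer, not the audio itself: oid is the SHA-256 of the real examples/female.wav and size is its byte count. A small sketch that verifies a locally downloaded copy against those two fields (the local path is a placeholder):

```python
import hashlib
import os

# Placeholder local copy of the LFS object referenced by the pointer above.
path = "examples/female.wav"
expected_oid = "89a4fa9a16b6463f852cf9424f72c3d3c87aa83010e89db534c53fcd1ae12c02"
expected_size = 1002030

# A tiny file here usually means you checked out the pointer, not the audio.
assert os.path.getsize(path) == expected_size, "size mismatch with LFS pointer"
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
assert digest == expected_oid, "content does not match the LFS pointer oid"
print("examples/female.wav matches its LFS pointer")
```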
examples/male.wav ADDED
Binary file (762 kB). View file