erogol committed
Commit
63c45d7
1 Parent(s): 08a6c74

Update app.py

Files changed (1)
  1. app.py +131 -95
app.py CHANGED
@@ -18,7 +18,7 @@ import base64
import csv
from io import StringIO
import datetime
- import re
+ import re

import gradio as gr
from scipy.io.wavfile import write
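
The `re.sub` call in the next hunk (the "temporary comma fix") is easier to read in isolation: it matches a word character or any non-ASCII character followed by a sentence-final mark (`.`, `。`, or `?`) and rewrites the pair as the character, a space, and the mark doubled. A minimal, self-contained sketch of the same substitution (written as a raw string here to avoid the invalid `\w` escape warning that the non-raw string in the diff triggers):

```python
import re

# Pattern from predict(): group 1 is a non-ASCII or word character,
# group 2 is a sentence-final mark (., 。 or ?).
pattern = r"([^\x00-\x7F]|\w)(\.|\。|\?)"

text = "Hello. 你好。Ready?"
# Each matched mark is doubled and separated from the preceding
# character by a space.
print(re.sub(pattern, r"\1 \2\2", text))
# -> Hello .. 你好 。。Ready ??
```
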
@@ -239,7 +239,7 @@ def predict(

    # temporary comma fix
    prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
-
+
    wav_chunks = []
    ## Direct mode
    """
@@ -260,7 +260,7 @@ def predict(
    metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
    torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
    """
-
+
    print("I: Generating new audio in streaming mode...")
    t0 = time.time()
    chunks = model.inference_stream(
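
`inference_stream` in the hunk above is the streaming entry point of the XTTS model in Coqui TTS: it returns a generator that yields audio chunks as they are decoded, which the app collects into `wav_chunks` before concatenating them in the next hunk. A minimal sketch of that consumption pattern, assuming `model`, `prompt`, `language`, `gpt_cond_latent`, and `speaker_embedding` are prepared elsewhere in app.py:

```python
import time

t0 = time.time()
wav_chunks = []
# inference_stream is a generator; chunks arrive while decoding
# continues, so playback can start before the full waveform exists.
chunks = model.inference_stream(
    prompt,             # text to synthesise
    language,           # e.g. "en"
    gpt_cond_latent,    # conditioning latents from the reference clip
    speaker_embedding,  # speaker embedding from the reference clip
)
for i, chunk in enumerate(chunks):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0:.2f} s")
    wav_chunks.append(chunk)
```
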
@@ -287,7 +287,7 @@ def predict(
    #metrics_text += (
    #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
    #)
-
+
    wav = torch.cat(wav_chunks, dim=0)
    print(wav.shape)
    real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
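
The real-time factor computed on the last line above is generation time divided by audio duration: `wav.shape[0]` samples at the 24 kHz XTTS output rate correspond to `wav.shape[0] / 24000` seconds of audio, so an RTF below 1.0 means synthesis runs faster than playback. A worked check with illustrative numbers only:

```python
# Illustrative numbers, to show the formula used in the diff.
elapsed = 1.8        # seconds spent generating, i.e. time.time() - t0
num_samples = 96000  # samples in the concatenated waveform
sample_rate = 24000  # XTTS output sample rate in Hz

# elapsed / num_samples * sample_rate == elapsed / (num_samples / sample_rate)
rtf = elapsed / num_samples * sample_rate
print(f"Real-time factor (RTF): {rtf:.2f}")  # -> 0.45, faster than real time
```
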
@@ -392,29 +392,41 @@ def predict(
title = "Coqui🐸 XTTS"

description = """
- <div>
- <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
- <a style='display:inline-block' href='https://discord.gg/5eXr5seRrv'><img src='https://discord.com/api/guilds/1037326658807533628/widget.png?style=shield' /></a>
- <a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
- <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
- </div>

- <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
<br/>
- XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
+
+ <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a text-to-speech model that lets you clone voices into different languages.
+
<br/>
+
This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
+
<br/>
- Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
+
+ There are 16 languages.
+
+ <p>
+ Arabic: ar, Brazilian Portuguese: pt, Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
+ </p>
+
<br/>
- <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
+
+ Leave a star 🌟 on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
+
<br/>
- </p>
- <p>Language Selectors:
- Arabic: ar, Brazilian Portuguese: pt, Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
- Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
- </p>
+ """
+
+ links = """
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
+
+ | | |
+ | ------------------------------- | --------------------------------------- |
+ | 🐸💬 **CoquiTTS** | [Github](https://github.com/coqui-ai/TTS) <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
+ | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
+ | 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
+ | 🗯 **Community** | [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
+
+
"""

article = """
@@ -577,79 +589,103 @@ examples = [
]


- gr.Interface(
-     fn=predict,
-     inputs=[
-         gr.Textbox(
-             label="Text Prompt",
-             info="One or two sentences at a time is better. Up to 200 text characters.",
-             value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
-         ),
-         gr.Dropdown(
-             label="Language",
-             info="Select an output language for the synthesised speech",
-             choices=[
-                 "en",
-                 "es",
-                 "fr",
-                 "de",
-                 "it",
-                 "pt",
-                 "pl",
-                 "tr",
-                 "ru",
-                 "nl",
-                 "cs",
-                 "ar",
-                 "zh-cn",
-                 "ja",
-                 "ko",
-                 "hu"
-             ],
-             max_choices=1,
-             value="en",
-         ),
-         gr.Audio(
-             label="Reference Audio",
-             info="Click on the ✎ button to upload your own target speaker audio",
-             type="filepath",
-             value="examples/female.wav",
-         ),
-         gr.Audio(
-             source="microphone",
-             type="filepath",
-             info="Use your microphone to record audio",
-             label="Use Microphone for Reference",
-         ),
-         gr.Checkbox(
-             label="Use Microphone",
-             value=False,
-             info="Notice: Microphone input may not work properly under traffic",
-         ),
-         gr.Checkbox(
-             label="Cleanup Reference Voice",
-             value=False,
-             info="This check can improve output if your microphone or reference voice is noisy",
-         ),
-         gr.Checkbox(
-             label="Do not use language auto-detect",
-             value=False,
-             info="Check to disable language auto-detection",
-         ),
-         gr.Checkbox(
-             label="Agree",
-             value=False,
-             info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
-         ),
-     ],
-     outputs=[
-         gr.Video(label="Waveform Visual"),
-         gr.Audio(label="Synthesised Audio", autoplay=True),
-         gr.Text(label="Metrics"),
-         gr.Audio(label="Reference Audio Used"),
-     ],
-     title=title,
-     description=description,
-     article=article,
-     examples=examples,
- ).queue().launch(debug=True, show_api=True)
+
+ with gr.Blocks(analytics_enabled=False) as demo:
+     with gr.Row():
+         with gr.Column(width=2):
+             gr.Markdown(
+                 """
+                 ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+                 """
+             )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(description)
+         with gr.Column():
+             gr.Markdown(links)
+
+     with gr.Row():
+         with gr.Column():
+             input_text_gr = gr.Textbox(
+                 label="Text Prompt",
+                 info="One or two sentences at a time is better. Up to 200 text characters.",
+                 value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
+             )
+             language_gr = gr.Dropdown(
+                 label="Language",
+                 info="Select an output language for the synthesised speech",
+                 choices=[
+                     "en",
+                     "es",
+                     "fr",
+                     "de",
+                     "it",
+                     "pt",
+                     "pl",
+                     "tr",
+                     "ru",
+                     "nl",
+                     "cs",
+                     "ar",
+                     "zh-cn",
+                     "ja",
+                     "ko",
+                     "hu"
+                 ],
+                 max_choices=1,
+                 value="en",
+             )
+             ref_gr = gr.Audio(
+                 label="Reference Audio",
+                 info="Click on the ✎ button to upload your own target speaker audio",
+                 type="filepath",
+                 value="examples/female.wav",
+             )
+             mic_gr = gr.Audio(
+                 source="microphone",
+                 type="filepath",
+                 info="Use your microphone to record audio",
+                 label="Use Microphone for Reference",
+             )
+             use_mic_gr = gr.Checkbox(
+                 label="Use Microphone",
+                 value=False,
+                 info="Notice: Microphone input may not work properly under traffic",
+             )
+             clean_ref_gr = gr.Checkbox(
+                 label="Cleanup Reference Voice",
+                 value=False,
+                 info="This check can improve output if your microphone or reference voice is noisy",
+             )
+             auto_det_lang_gr = gr.Checkbox(
+                 label="Do not use language auto-detect",
+                 value=False,
+                 info="Check to disable language auto-detection",
+             )
+             tos_gr = gr.Checkbox(
+                 label="Agree",
+                 value=False,
+                 info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+             )
+
+             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+         with gr.Column():
+             video_gr = gr.Video(label="Waveform Visual")
+             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+             out_text_gr = gr.Text(label="Metrics")
+             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+     with gr.Row():
+         gr.Examples(examples,
+                     label="Examples",
+                     inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+                     outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
+                     fn=predict,
+                     cache_examples=False,)
+
+     tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
+
+ demo.queue(concurrency_count=16).launch(debug=True, show_api=True)
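
Taken together, the last hunk replaces a single `gr.Interface(...)` call with an explicit `gr.Blocks` layout: the same `predict` function and component list, but arranged in rows and columns, with a separate `links` panel and a manually wired Send button. Reduced to a skeleton (component details elided; `concurrency_count` is a Gradio 3.x `queue()` argument and is omitted here), the wiring pattern looks like this:

```python
import gradio as gr

def predict(text):
    # Stand-in for the real predict(); the wiring pattern is the point.
    return text

with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(label="Text Prompt")
            btn = gr.Button("Send")
        with gr.Column():
            out = gr.Text(label="Metrics")

    # Explicit event wiring replaces gr.Interface's implicit submit.
    btn.click(predict, inputs=[inp], outputs=[out])

demo.queue().launch()
```
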