import sys
import os
import torch
# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
from TTS.api import TTS
# Optional debug output: list the models the TTS model manager can download
# (the exact return type of list_models() varies across TTS versions).
model_names = TTS().list_models()
print(model_names)
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
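# Short alias, inspected in predict() below to special-case language codes per checkpoint.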
m = model_name

# Automatic device detection
if torch.cuda.is_available():
    # NVIDIA GPU available: use the first CUDA device, half precision is fine
    device_type = "cuda"
    device_selection = "cuda:0"
    data_type = torch.float16
else:
    # No CUDA device (CPU-only machine, or e.g. an AMD GPU without CUDA)
    device_type = "cpu"
    device_selection = "cpu"
    data_type = torch.float32

# Load the XTTS checkpoint (downloaded on first use) and move it to the detected device.
tts = TTS(model_name, gpu=torch.cuda.is_available())
tts.to(device_type)

def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
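    """Synthesise `prompt` in `language`, cloning the voice from the reference audio.

    The speaker reference is the microphone recording (mic_file_path) when
    use_mic is set, otherwise the uploaded file (audio_file_pth). Returns a
    (waveform video, output.wav path, information HTML) tuple on success, or
    (None, None, None) after showing a gr.Warning on invalid input.
    """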
    if use_mic:
        if mic_file_path is None:
            gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
            return (
                None,
                None,
                None,
            )
        else:
            speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return (
            None,
            None,
            None,
        )
    if len(prompt) > 50000:
        gr.Warning("Text length is limited to 50000 characters for this demo, please try a shorter text")
        return (
            None,
            None,
            None,
        )
    try:
        # Some checkpoints expect different language codes: models with "your"
        # in the name (YourTTS-style) use "fr-fr" for French, and dedicated
        # French models ("/fr/" in the name) take no language argument at all.
        if language == "fr":
            if m.find("your") != -1:
                language = "fr-fr"
        if m.find("/fr/") != -1:
            language = None
        tts.tts_to_file(
            text=prompt,
            file_path="output.wav",
            speaker_wav=speaker_wav,
            language=language
        )
    except RuntimeError as e:
        if "device-assert" in str(e):
            # A CUDA device-side assert cannot be recovered in-process; exit so
            # the hosting environment restarts the app.
            gr.Warning("Unhandled exception encountered, please retry in a minute")
            print("CUDA device-assert RuntimeError encountered, need restart")
            sys.exit("Exit due to CUDA device-assert")
        else:
            raise e
        
    return (
        gr.make_waveform(
            audio="output.wav",
        ),
        "output.wav",
        None,
    )

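# Build the Gradio UI: prompt/reference inputs on top, synthesis outputs below.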
with gr.Blocks() as interface:
    gr.HTML("Multi-language Text-to-Speech")
    gr.HTML(
        """
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip. 
<br/>
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible. 
<br/>
This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
<br/>
Leave a star on GitHub at <a href="https://github.com/coqui-ai/TTS">TTS</a>, where our open-source inference and training code lives.
<br/>
<p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
<br/>
<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>
        """
    )
    with gr.Column():
        prompt = gr.Textbox(
            label="Text Prompt",
            info="One or two sentences at a time is better",
            value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
        )
        language = gr.Dropdown(
            label="Language",
            info="Select an output language for the synthesised speech",
            choices=[
                    ["Arabic", "ar"],
                    ["Brazilian Portuguese", "pt"],
                    ["Mandarin Chinese", "zh-cn"],
                    ["Czech", "cs"],
                    ["Dutch", "nl"],
                    ["English", "en"],
                    ["French", "fr"],
                    ["German", "de"],
                    ["Italian", "it"],
                    ["Polish", "pl"],
                    ["Russian", "ru"],
                    ["Spanish", "es"],
                    ["Turkish", "tr"]
            ],
            max_choices=1,
            value="en",
        )
        audio_file_pth = gr.Audio(
            label="Reference Audio",
            #info="Click on the ✎ button to upload your own target speaker audio",
            type="filepath",
            value="examples/female.wav",
        )
        mic_file_path = gr.Audio(sources=["microphone"],
                 type="filepath",
                 #info="Use your microphone to record audio",
                 label="Use Microphone for Reference")
        use_mic = gr.Checkbox(label="Check to use Microphone as Reference",
                    value=False,
                    info="Notice: Microphone input may not work properly under heavy traffic")
        with gr.Accordion("Advanced options", open=False):
            # Note: debug_mode is not currently passed to predict(); it is a placeholder.
            debug_mode = gr.Checkbox(label="Debug mode", value=False, info="Show intermediate results")

        submit = gr.Button("🚀 Speak", variant="primary")

        waveform_visual = gr.Video(label="Waveform Visual", autoplay=True)
        synthesised_audio = gr.Audio(label="Synthesised Audio", autoplay=False)
        information = gr.HTML()

    submit.click(predict, inputs=[
        prompt, language, audio_file_pth, mic_file_path, use_mic
    ], outputs=[
        waveform_visual,
        synthesised_audio,
        information
    ], scroll_to_output=True)

interface.queue().launch(debug=True)
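
# To run locally (a sketch, assuming this file is saved as app.py and the
# TTS and gradio packages are installed):
#     pip install TTS gradio
#     python app.py
# The first launch downloads the XTTS v2 checkpoint via the TTS model manager.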