Jekyll2000 committed on
Commit
fc085cc
·
verified ·
1 Parent(s): 36e11f1

Create model.py

Browse files
Files changed (1) hide show
  1. model.py +204 -0
model.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import time
4
+ import uuid
5
+
6
+ import gradio as gr
7
+ import soundfile as sf
8
+
9
+ from model import get_pretrained_model, language_to_models
10
+
11
# Heading rendered at the top of the page via gr.Markdown.
title = "Text-to-speech (TTS)"

# Footer text rendered below the controls via gr.Markdown.
description = """
This space shows how to convert text to speech with Next-gen Kaldi.

It is running on CPU within a docker container provided by Hugging Face.

"""

# Stylesheet handed to gr.Blocks. The classes are the ones emitted by
# build_html_output(). The css style is copied from
# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Example rows for gr.Examples (currently disabled). Each row is
# [language, repo_id, text, sid, speed].
# examples = [
#     ["Min-nan (闽南话)", "csukuangfj/vits-mms-nan", "ài piaǸ chiah ē iaN̂", 0, 1.0],
#     ["Thai", "csukuangfj/vits-mms-tha", "ฉันรักคุณ", 0, 1.0],
# ]
33
+
34
+
35
def update_model_dropdown(language: str):
    """Build the model dropdown for the newly selected language.

    Args:
        language: A key of ``language_to_models``.

    Returns:
        An interactive ``gr.Dropdown`` listing the models registered for
        ``language``, with the first one pre-selected.

    Raises:
        ValueError: If ``language`` is not a key of ``language_to_models``.
    """
    # Reject unknown languages up front so the happy path stays flat.
    if language not in language_to_models:
        raise ValueError(f"Unsupported language: {language}")

    available_models = language_to_models[language]
    return gr.Dropdown(
        choices=available_models,
        value=available_models[0],
        interactive=True,
    )
45
+
46
+
47
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap a message in the result ``<div>`` markup styled by ``css``.

    Args:
        s: Message body. It is inserted verbatim, so it may itself contain
            HTML (``process`` passes ``<br/>`` line breaks).
        style: Extra CSS class placed next to ``result_item`` on the inner
            ``<div>``.

    Returns:
        The HTML fragment as a string.
    """
    # NOTE(review): the whitespace inside this literal is reproduced as it
    # appears in the flattened view of the file - confirm against the
    # original if exact bytes matter (HTML rendering is unaffected).
    markup = f"""
<div class='result'>
<div class='result_item {style}'>
{s}
</div>
</div>
"""
    return markup
55
+
56
+
57
def process(language: str, repo_id: str, text: str, sid: str = "0", speed: float = 1.0):
    """Synthesize ``text`` with the selected model and save it as a WAV file.

    ``sid`` and ``speed`` have defaults because the UI currently passes only
    ``language``, ``repo_id`` and ``text`` (the speaker-ID and speed widgets
    are disabled). Without defaults every button click fails with a
    ``TypeError`` for the two missing positional arguments. The defaults
    mirror the initial values of those disabled widgets.

    Args:
        language: Language selected in the UI. Not used by the synthesis
            itself; it is accepted because the UI passes it.
        repo_id: Model identifier forwarded to ``get_pretrained_model``.
        text: Text to convert to speech.
        sid: Speaker ID; converted with ``int`` before use.
        speed: Speed value forwarded to ``get_pretrained_model``.

    Returns:
        A ``(filename, html)`` tuple: the name of the generated ``.wav`` file
        (written to the current working directory) and an HTML fragment with
        duration, processing time and real-time factor.

    Raises:
        ValueError: If ``sid`` cannot be converted to ``int`` or if the model
            produced no audio samples.
    """
    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    audio = tts.generate(text, sid=sid)
    elapsed_seconds = time.perf_counter() - start

    # Explicit length check: also guards the division by `duration` below.
    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate
    rtf = elapsed_seconds / duration  # real-time factor: processing / audio

    info = f"""
Wave duration : {duration:.3f} s <br/>
Processing time: {elapsed_seconds:.3f} s <br/>
RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
"""

    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")

    # A random UUID keeps output files from concurrent requests distinct.
    filename = f"{uuid.uuid4()}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return filename, build_html_output(info)
95
+
96
+
97
# Top-level Gradio app; `css` styles the HTML produced by build_html_output().
demo = gr.Blocks(css=css)


# NOTE(review): the nesting below was reconstructed from a flattened view of
# the file - confirm the indentation against the original before merging.
with demo:
    gr.Markdown(title)
    language_choices = list(language_to_models)

    # Language selector; the first language is selected initially.
    language_radio = gr.Radio(
        choices=language_choices,
        value=language_choices[0],
        label="Language",
    )

    # Model selector, initially filled with the first language's models.
    model_dropdown = gr.Dropdown(
        label="Select a model",
        choices=language_to_models[language_choices[0]],
        value=language_to_models[language_choices[0]][0],
    )

    # Refresh the model list whenever another language is picked.
    language_radio.change(
        update_model_dropdown,
        inputs=language_radio,
        outputs=model_dropdown,
    )

    with gr.Tabs():
        with gr.TabItem("Please input your text"):
            input_text = gr.Textbox(
                label="Input text",
                info="Your text",
                placeholder="Please input your text here",
                lines=3,
            )

            # Speaker-ID and speed controls are currently disabled.
            # input_sid = gr.Textbox(
            #     label="Speaker ID",
            #     info="Speaker ID",
            #     lines=1,
            #     max_lines=1,
            #     value="0",
            #     placeholder="Speaker ID. Valid only for multi-speaker model",
            # )

            # input_speed = gr.Slider(
            #     minimum=0.1,
            #     maximum=10,
            #     value=1,
            #     step=0.1,
            #     label="Speed (larger->faster; smaller->slower)",
            # )

            input_button = gr.Button("Submit")

            output_audio = gr.Audio(label="Output")

            output_info = gr.HTML(label="Info")

            # Example gallery is disabled together with the controls above.
            # gr.Examples(
            #     examples=examples,
            #     fn=process,
            #     inputs=[
            #         language_radio,
            #         model_dropdown,
            #         input_text,
            #         input_sid,
            #         input_speed,
            #     ],
            #     outputs=[
            #         output_audio,
            #         output_info,
            #     ],
            # )

        # Only three inputs are wired here because the speaker-ID and speed
        # widgets are disabled.
        input_button.click(
            process,
            inputs=[
                language_radio,
                model_dropdown,
                input_text,
                # input_sid,
                # input_speed,
            ],
            outputs=[output_audio, output_info],
        )

    gr.Markdown(description)
186
+
187
+
188
def download_espeak_ng_data():
    """Download the espeak-ng data archive and unpack it in ``/tmp``.

    Runs ``wget`` and then ``tar`` with ``/tmp`` as the working directory.
    The step is best-effort: a failure is logged and the function returns
    without raising. ``tar`` is only run when the download succeeded, and
    ``-O`` makes ``wget`` overwrite the archive on repeated runs instead of
    saving a numbered copy next to a stale one.
    """
    import subprocess  # local import keeps this block self-contained

    # A named logger is used instead of the module-level logging.error():
    # that helper would implicitly configure the root logger and turn the
    # later logging.basicConfig(...) call into a no-op.
    log = logging.getLogger(__name__)

    archive = "espeak-ng-data.tar.bz2"
    url = (
        "https://github.com/k2-fsa/sherpa-onnx/releases/download/"
        "tts-models/espeak-ng-data.tar.bz2"
    )
    commands = (
        ["wget", "-qq", "-O", archive, url],
        ["tar", "xf", archive],
    )

    for cmd in commands:
        try:
            status = subprocess.run(cmd, cwd="/tmp", check=False).returncode
        except OSError as err:
            # Raised e.g. when the executable or the working directory is missing.
            log.error("Could not run %s: %s", cmd[0], err)
            return
        if status != 0:
            log.error("%s exited with status %d", " ".join(cmd), status)
            return
196
+
197
+
198
if __name__ == "__main__":
    # Fetch the espeak-ng data before the app starts.
    download_espeak_ng_data()

    # Timestamp, level and source location on every log record.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    demo.launch()