Spaces:
Running
on
Zero
Running
on
Zero
Contenders tab: query relevant TTS models
Browse files
app.py
CHANGED
@@ -117,6 +117,7 @@ HF_SPACES = {
|
|
117 |
'function': '1',
|
118 |
'text_param_index': 0,
|
119 |
'return_audio_index': 1,
|
|
|
120 |
},
|
121 |
# WhisperSpeech
|
122 |
'collabora/WhisperSpeech': {
|
@@ -124,6 +125,7 @@ HF_SPACES = {
|
|
124 |
'function': '/whisper_speech_demo',
|
125 |
'text_param_index': 0,
|
126 |
'return_audio_index': 0,
|
|
|
127 |
},
|
128 |
# OpenVoice (MyShell.ai)
|
129 |
'myshell-ai/OpenVoice': {
|
@@ -131,6 +133,7 @@ HF_SPACES = {
|
|
131 |
'function': '1',
|
132 |
'text_param_index': 0,
|
133 |
'return_audio_index': 1,
|
|
|
134 |
},
|
135 |
# OpenVoice v2 (MyShell.ai)
|
136 |
'myshell-ai/OpenVoiceV2': {
|
@@ -138,13 +141,15 @@ HF_SPACES = {
|
|
138 |
'function': '1',
|
139 |
'text_param_index': 0,
|
140 |
'return_audio_index': 1,
|
|
|
141 |
},
|
142 |
# MetaVoice
|
143 |
'mrfakename/MetaVoice-1B-v0.1': {
|
144 |
-
'name':'MetaVoice',
|
145 |
'function': '/tts',
|
146 |
'text_param_index': 0,
|
147 |
'return_audio_index': 0,
|
|
|
148 |
},
|
149 |
# xVASynth (CPU)
|
150 |
'Pendrokar/xVASynth-TTS': {
|
@@ -152,6 +157,7 @@ HF_SPACES = {
|
|
152 |
'function': '/predict',
|
153 |
'text_param_index': 0,
|
154 |
'return_audio_index': 0,
|
|
|
155 |
},
|
156 |
# CoquiTTS (CPU)
|
157 |
'coqui/CoquiTTS': {
|
@@ -159,6 +165,7 @@ HF_SPACES = {
|
|
159 |
'function': '0',
|
160 |
'text_param_index': 0,
|
161 |
'return_audio_index': 0,
|
|
|
162 |
},
|
163 |
# HierSpeech_TTS
|
164 |
'LeeSangHoon/HierSpeech_TTS': {
|
@@ -166,6 +173,7 @@ HF_SPACES = {
|
|
166 |
'function': '/predict',
|
167 |
'text_param_index': 0,
|
168 |
'return_audio_index': 0,
|
|
|
169 |
},
|
170 |
# MeloTTS (MyShell.ai)
|
171 |
'mrfakename/MeloTTS': {
|
@@ -173,6 +181,7 @@ HF_SPACES = {
|
|
173 |
'function': '/synthesize',
|
174 |
'text_param_index': 0,
|
175 |
'return_audio_index': 0,
|
|
|
176 |
},
|
177 |
|
178 |
# Parler
|
@@ -182,6 +191,7 @@ HF_SPACES = {
|
|
182 |
'text_param_index': 0,
|
183 |
'return_audio_index': 0,
|
184 |
'is_zero_gpu_space': True,
|
|
|
185 |
},
|
186 |
# Parler Mini
|
187 |
# 'parler-tts/parler_tts': {
|
@@ -190,6 +200,7 @@ HF_SPACES = {
|
|
190 |
# 'text_param_index': 0,
|
191 |
# 'return_audio_index': 0,
|
192 |
# 'is_zero_gpu_space': True,
|
|
|
193 |
# },
|
194 |
# Parler Mini, which uses the Expresso dataset
|
195 |
'parler-tts/parler-tts-expresso': {
|
@@ -198,6 +209,7 @@ HF_SPACES = {
|
|
198 |
'text_param_index': 0,
|
199 |
'return_audio_index': 0,
|
200 |
'is_zero_gpu_space': True,
|
|
|
201 |
},
|
202 |
|
203 |
# Microsoft Edge TTS
|
@@ -207,6 +219,7 @@ HF_SPACES = {
|
|
207 |
'text_param_index': 0,
|
208 |
'return_audio_index': 0,
|
209 |
'is_proprietary': True,
|
|
|
210 |
},
|
211 |
|
212 |
# Fish Speech
|
@@ -215,6 +228,7 @@ HF_SPACES = {
|
|
215 |
'function': '/inference_wrapper',
|
216 |
'text_param_index': 0,
|
217 |
'return_audio_index': 1,
|
|
|
218 |
},
|
219 |
|
220 |
# E2/F5 TTS
|
@@ -224,6 +238,7 @@ HF_SPACES = {
|
|
224 |
'text_param_index': 2,
|
225 |
'return_audio_index': 0,
|
226 |
'is_zero_gpu_space': True,
|
|
|
227 |
},
|
228 |
|
229 |
# TTS w issues
|
@@ -543,6 +558,7 @@ Generated audio clips cannot be redistributed and may be used for personal, non-
|
|
543 |
|
544 |
Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
|
545 |
""".strip()
|
|
|
546 |
LDESC = f"""
|
547 |
## π Leaderboard
|
548 |
|
@@ -552,19 +568,25 @@ The leaderboard displays models in descending order of how natural they sound (b
|
|
552 |
|
553 |
Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
|
554 |
""".strip()
|
|
|
555 |
TTS_INFO = f"""
|
556 |
## π£ Contenders
|
557 |
|
558 |
### Open Source TTS capabilities table
|
559 |
|
560 |
-
See the dataset itself for the legend and more in depth information for each model.
|
561 |
""".strip()
|
562 |
-
|
|
|
|
|
|
|
|
|
|
|
563 |
<iframe
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
></iframe>
|
569 |
""".strip()
|
570 |
|
@@ -1576,7 +1598,7 @@ with gr.Blocks() as about:
|
|
1576 |
gr.Markdown(ABOUT)
|
1577 |
with gr.Blocks() as tts_info:
|
1578 |
gr.Markdown(TTS_INFO)
|
1579 |
-
gr.HTML(
|
1580 |
# with gr.Blocks() as admin:
|
1581 |
# rdb = gr.Button("Reload Audio Dataset")
|
1582 |
# # rdb.click(reload_audio_dataset, outputs=rdb)
|
|
|
117 |
'function': '1',
|
118 |
'text_param_index': 0,
|
119 |
'return_audio_index': 1,
|
120 |
+
'series': 'XTTS',
|
121 |
},
|
122 |
# WhisperSpeech
|
123 |
'collabora/WhisperSpeech': {
|
|
|
125 |
'function': '/whisper_speech_demo',
|
126 |
'text_param_index': 0,
|
127 |
'return_audio_index': 0,
|
128 |
+
'series': 'WhisperSpeech',
|
129 |
},
|
130 |
# OpenVoice (MyShell.ai)
|
131 |
'myshell-ai/OpenVoice': {
|
|
|
133 |
'function': '1',
|
134 |
'text_param_index': 0,
|
135 |
'return_audio_index': 1,
|
136 |
+
'series': 'OpenVoice',
|
137 |
},
|
138 |
# OpenVoice v2 (MyShell.ai)
|
139 |
'myshell-ai/OpenVoiceV2': {
|
|
|
141 |
'function': '1',
|
142 |
'text_param_index': 0,
|
143 |
'return_audio_index': 1,
|
144 |
+
'series': 'OpenVoice',
|
145 |
},
|
146 |
# MetaVoice
|
147 |
'mrfakename/MetaVoice-1B-v0.1': {
|
148 |
+
'name':'MetaVoice-1B',
|
149 |
'function': '/tts',
|
150 |
'text_param_index': 0,
|
151 |
'return_audio_index': 0,
|
152 |
+
'series': 'MetaVoice-1B',
|
153 |
},
|
154 |
# xVASynth (CPU)
|
155 |
'Pendrokar/xVASynth-TTS': {
|
|
|
157 |
'function': '/predict',
|
158 |
'text_param_index': 0,
|
159 |
'return_audio_index': 0,
|
160 |
+
'series': 'xVASynth',
|
161 |
},
|
162 |
# CoquiTTS (CPU)
|
163 |
'coqui/CoquiTTS': {
|
|
|
165 |
'function': '0',
|
166 |
'text_param_index': 0,
|
167 |
'return_audio_index': 0,
|
168 |
+
'series': 'CoquiTTS',
|
169 |
},
|
170 |
# HierSpeech_TTS
|
171 |
'LeeSangHoon/HierSpeech_TTS': {
|
|
|
173 |
'function': '/predict',
|
174 |
'text_param_index': 0,
|
175 |
'return_audio_index': 0,
|
176 |
+
'series': 'HierSpeech++',
|
177 |
},
|
178 |
# MeloTTS (MyShell.ai)
|
179 |
'mrfakename/MeloTTS': {
|
|
|
181 |
'function': '/synthesize',
|
182 |
'text_param_index': 0,
|
183 |
'return_audio_index': 0,
|
184 |
+
'series': 'MeloTTS',
|
185 |
},
|
186 |
|
187 |
# Parler
|
|
|
191 |
'text_param_index': 0,
|
192 |
'return_audio_index': 0,
|
193 |
'is_zero_gpu_space': True,
|
194 |
+
'series': 'Parler',
|
195 |
},
|
196 |
# Parler Mini
|
197 |
# 'parler-tts/parler_tts': {
|
|
|
200 |
# 'text_param_index': 0,
|
201 |
# 'return_audio_index': 0,
|
202 |
# 'is_zero_gpu_space': True,
|
203 |
+
# 'series': 'Parler',
|
204 |
# },
|
205 |
# Parler Mini, which uses the Expresso dataset
|
206 |
'parler-tts/parler-tts-expresso': {
|
|
|
209 |
'text_param_index': 0,
|
210 |
'return_audio_index': 0,
|
211 |
'is_zero_gpu_space': True,
|
212 |
+
'series': 'Parler',
|
213 |
},
|
214 |
|
215 |
# Microsoft Edge TTS
|
|
|
219 |
'text_param_index': 0,
|
220 |
'return_audio_index': 0,
|
221 |
'is_proprietary': True,
|
222 |
+
'series': 'Edge TTS',
|
223 |
},
|
224 |
|
225 |
# Fish Speech
|
|
|
228 |
'function': '/inference_wrapper',
|
229 |
'text_param_index': 0,
|
230 |
'return_audio_index': 1,
|
231 |
+
'series': 'Fish Speech',
|
232 |
},
|
233 |
|
234 |
# E2/F5 TTS
|
|
|
238 |
'text_param_index': 2,
|
239 |
'return_audio_index': 0,
|
240 |
'is_zero_gpu_space': True,
|
241 |
+
'series': 'E2/F5 TTS',
|
242 |
},
|
243 |
|
244 |
# TTS w issues
|
|
|
558 |
|
559 |
Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
|
560 |
""".strip()
|
561 |
+
|
562 |
LDESC = f"""
|
563 |
## π Leaderboard
|
564 |
|
|
|
568 |
|
569 |
Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
|
570 |
""".strip()
|
571 |
+
|
572 |
TTS_INFO = f"""
|
573 |
## π£ Contenders
|
574 |
|
575 |
### Open Source TTS capabilities table
|
576 |
|
577 |
+
See [the below dataset itself](https://huggingface.co/datasets/Pendrokar/open_tts_tracker) for the legend and more in depth information for each model.
|
578 |
""".strip()
|
579 |
+
|
580 |
+
# Build the URL-encoded list of SQL string literals that is spliced into the
# `"Name" IN (...)` clause of the dataset-viewer query in TTS_DATASET_IFRAME.
# Imported here because the file's top-level import block is outside this
# section of the file.
from urllib.parse import quote_plus

# One entry per distinct series, in HF_SPACES order (dict.fromkeys keeps the
# first occurrence, so series shared by several spaces are listed only once).
# Entries without a 'series' key are skipped rather than raising KeyError.
# Each name is wrapped in single quotes (embedded quotes doubled, per SQL
# string-literal rules) and then fully percent-encoded, so spaces, '/', '+'
# and quotes are all safe inside the query string.
model_series = [
    quote_plus("'" + series.replace("'", "''") + "'")
    for series in dict.fromkeys(
        model['series'] for model in HF_SPACES.values() if 'series' in model
    )
]
# '%2C+' is the URL-encoded form of ', ' separating the SQL literals.
TTS_DATASET_IFRAME_ORDER = '%2C+'.join(model_series)
|
584 |
+
TTS_DATASET_IFRAME = f"""
|
585 |
<iframe
|
586 |
+
src="https://huggingface.co/datasets/Pendrokar/open_tts_tracker/embed/viewer/default/train?sql_console=true&sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*%2C+%22Name%22+IN+%28{TTS_DATASET_IFRAME_ORDER}%29+AS+%22In+arena%22+FROM+train+WHERE+%22Insta-clone+%F0%9F%91%A5%22+IS+NOT+NULL+ORDER+BY+%22In+arena%22+DESC+LIMIT+50%3B&views%5B%5D=train"
|
587 |
+
frameborder="0"
|
588 |
+
width="100%"
|
589 |
+
height="650px"
|
590 |
></iframe>
|
591 |
""".strip()
|
592 |
|
|
|
1598 |
gr.Markdown(ABOUT)
|
1599 |
with gr.Blocks() as tts_info:
|
1600 |
gr.Markdown(TTS_INFO)
|
1601 |
+
gr.HTML(TTS_DATASET_IFRAME)
|
1602 |
# with gr.Blocks() as admin:
|
1603 |
# rdb = gr.Button("Reload Audio Dataset")
|
1604 |
# # rdb.click(reload_audio_dataset, outputs=rdb)
|